In [18]:
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
# takes a list of embedding dicts (dict_data)
# and combines the embeddings: each one is averaged on its own, then appended to the others
# converts the result to arrays with labels and returns them
def ensemble_append_mean_reps(dicts, labels):
    """Fuse several embedding dicts into one feature matrix plus a label vector.

    For every key of the first dict, each dict's entry is averaged along
    axis 0, L2-normalised, and the resulting row vectors are joined side by
    side in the order the dicts were given.

    Args:
        dicts: sequence of mappings from key to an array that can be averaged
            along axis 0; every key of ``dicts[0]`` is looked up in each
            mapping.
        labels: mapping from the same keys to a numeric label.

    Returns:
        ``(X, y)`` where ``X`` is a float array holding one concatenated
        vector per key and ``y`` holds the matching labels as floats. Rows
        follow the key order of ``dicts[0]``.
    """
    fused = {}
    for name in dicts[0].keys():
        # One (1, dim) row per source: mean over axis 0, then unit L2 length.
        parts = [
            preprocessing.normalize([np.mean(source[name], axis=0)], norm='l2')
            for source in dicts
        ]
        fused[name] = np.concatenate(parts, axis=1)

    # The width is taken from the first fused vector.
    width = list(fused.values())[0].shape[1]
    n_rows = len(fused)
    X = np.zeros((n_rows, width))
    y = np.zeros(n_rows)

    for row, (name, vector) in enumerate(fused.items()):
        X[row] = vector
        y[row] = labels[name]

    return X, y

In [19]:
# Task name passed to every read_dataset call in this notebook.
task = "remote_homology"

# Labels are read first, then one dataset per embedding model, all for the
# "train" split.
y_train = read_dataset('label', task, "train")
X_train_e, X_train_u, X_train_t = (
    read_dataset(model, task, "train")
    for model in ('elmo', 'unirep', 'transformer')
)


In [21]:
# NOTE(review): this rebinds y_train from the label mapping to the label array,
# so re-running this cell alone would pass the array back in as `labels`.
X_train_app, y_train = ensemble_append_mean_reps(
    [X_train_e, X_train_u, X_train_t],
    y_train,
)

# The scaler is fitted on the training matrix and kept in `scaler` so the same
# transform can be applied to other splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_app)
X_train_app = scaler.transform(X_train_app)

print(X_train_app.shape, y_train.shape, sep="\n")

(12305, 3692)
(12305,)


In [22]:
from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta

# The timed span covers both constructing and fitting the estimator.
start = timer()
clf = LogisticRegression(max_iter=5000).fit(X_train_app, y_train)
end = timer()
print("fit time: ", timedelta(seconds=end - start))


fit time:  0:43:56.453212


In [23]:
import os
# Emit the BEL character three times through the shell so that finishing the
# long-running cell above is audible, then confirm in the output.
for _ring in range(3):
    os.system("printf '\a'")
print("Done!")

Done!


In [24]:
# Score the fitted classifier on the very data it was trained on.
train_score = clf.score(X_train_app, y_train)
print("append model train score: ", train_score)

append model train score:  1.0


In [25]:
# Score the fitted classifier on every non-training split, reusing the scaler
# that was fitted on the training features.
for split in ('valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout'):
    X_test_e, X_test_u, X_test_t = (
        read_dataset(model, task, split)
        for model in ('elmo', 'unirep', 'transformer')
    )
    y_test_dict = read_dataset('label', task, split)

    X_test_app, y_test = ensemble_append_mean_reps(
        [X_test_e, X_test_u, X_test_t],
        y_test_dict,
    )
    X_test_app = scaler.transform(X_test_app)

    test_score = clf.score(X_test_app, y_test)
    print(f"append model {split} score: ", test_score)

append model valid score:  0.3746594005449591
append model test_fold_holdout score:  0.2743732590529248
append model test_superfamily_holdout score:  0.46485623003194887
append model test_family_holdout score:  0.9544025157232704


In [41]:
from sklearn.utils.extmath import randomized_svd
from sklearn import preprocessing
import numpy as np

# work in progress
def ensemble_append_SVD_mean_reps(uni_dict, elmo_dict, labels, d=100):
    """Build one fixed-length vector per sequence from an SVD of joined embeddings.

    For every key in ``labels`` the two per-position embedding matrices are
    L2-normalised row by row, the ELMo matrix is padded with a copy of its
    first and last row so its row count matches the UniRep matrix, and both are
    joined feature-wise. The joined matrix is decomposed with a thin SVD and
    its leading ``d`` left-singular vectors are averaged over positions.

    The result always has exactly ``d`` entries: when the decomposition yields
    fewer than ``d`` singular vectors (short sequences) the remainder is
    zero-filled. The earlier version sliced the first ``d`` rows of ``U`` and
    averaged across columns, which produced vectors whose length depended on
    the sequence length and could not be stacked into one matrix.

    NOTE(review): singular vectors are only defined up to sign and each
    sequence gets its own basis, so components are not guaranteed to be
    comparable across sequences - confirm this is acceptable downstream.

    Args:
        uni_dict: mapping from key to a 2-D array of shape (positions, features).
        elmo_dict: mapping from key to a 2-D array with two fewer rows than the
            matching ``uni_dict`` entry.
        labels: mapping from key to a numeric label; its keys select and order
            the rows of the output.
        d: number of leading singular vectors to keep, i.e. the output width.

    Returns:
        ``(X, y)`` where ``X`` is a float array of shape ``(len(labels), d)``
        and ``y`` holds the labels as floats in the same order.

    Raises:
        ValueError: propagated from ``np.concatenate`` when the padded ELMo
            matrix and the UniRep matrix disagree on the number of rows.
    """
    n_samples = len(labels)
    X = np.zeros((n_samples, d))
    y = np.zeros(n_samples)

    for row, key in enumerate(labels):
        # first normalize the samples (row-wise unit L2 length)
        uni_sample = preprocessing.normalize(uni_dict[key], norm='l2')
        elmo_sample = preprocessing.normalize(elmo_dict[key], norm='l2')

        # extend elmo sample by two rows (repeat first and last) to match unirep
        elmo_sample = np.concatenate(
            [elmo_sample[:1], elmo_sample, elmo_sample[-1:]], axis=0
        )

        # concatenate samples feature-wise: (positions, uni_dim + elmo_dim)
        combined_embd = np.concatenate([uni_sample, elmo_sample], axis=1)

        # A thin SVD is enough: the full form never adds columns beyond the
        # number of positions, so it cannot supply components that are missing.
        U, _, _ = np.linalg.svd(combined_embd, full_matrices=False)

        # Keep the leading d components (columns) and pool over positions.
        pooled = U[:, :d].mean(axis=0)

        # Shorter results leave the trailing entries of the row at zero.
        X[row, :pooled.shape[0]] = pooled
        y[row] = labels[key]

    return X, y


In [20]:
from scipy.linalg import orthogonal_procrustes
from sklearn.decomposition import PCA
# work in progress
def ensemble_PCA_orthogonal_procrustes_reps(m1, m2, min_dim=99999999):
    """Reduce two representation matrices to a common width with PCA.

    The target width is the smallest of ``min_dim``, the row counts and the
    column counts of both inputs. Each matrix that is wider than the target is
    replaced by the projection of a PCA fitted on that same matrix; a matrix
    that is already at the target width is passed through untouched.

    Column counts are part of the minimum so that both outputs end up with the
    same width, and the reduced matrices are returned instead of being
    discarded.

    Work in progress: only the PCA reduction is implemented here; no
    Procrustes alignment is performed yet.

    Args:
        m1: 2-D array of shape (samples, features).
        m2: 2-D array of shape (samples, features).
        min_dim: upper bound on the target width; the default is large enough
            to leave the choice to the input shapes.

    Returns:
        ``(m1, m2)`` after reduction, both with the same number of columns.
    """
    min_dim = min(min_dim, m1.shape[0], m2.shape[0], m1.shape[1], m2.shape[1])
    print("reducing to dim", min_dim)

    if m1.shape[1] > min_dim:
        m1 = PCA(n_components=min_dim).fit(m1).transform(m1)
        print(m1.shape)

    if m2.shape[1] > min_dim:
        m2 = PCA(n_components=min_dim).fit(m2).transform(m2)
        print(m2.shape)

    return m1, m2

# NOTE(review): X_e and X_u are not assigned in any cell visible here; presumably
# they were left over from an earlier kernel session. Define them before this
# call so the cell also runs on a fresh kernel.
ensemble_PCA_orthogonal_procrustes_reps(X_e, X_u)

reducing to dim 734
(734, 734)
(734, 734)
