In [3]:
# run me first!

from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from scipy.linalg import orthogonal_procrustes
from sklearn.decomposition import PCA

task = "remote_homology"

def fit_logistic(X, y):
    """Fit a logistic-regression classifier on (X, y) and report on it.

    Prints the wall-clock fit time and the accuracy on the training data,
    rings the terminal bell three times, prints "Done!", and returns the
    fitted ``LogisticRegression`` instance (``max_iter=5000``).
    """
    t0 = timer()
    model = LogisticRegression(max_iter=5000)
    model.fit(X, y)
    elapsed = timedelta(seconds=timer() - t0)
    print("fit time: ", elapsed)

    print("model train score: ", model.score(X, y))

    # audible notification: the shell's printf emits the bell character
    for _ in range(3):
        os.system("printf '\a'")
    print("Done!")

    return model


In [18]:
def ensemble_append_mean_reps(dicts, labels):
    """Combine several embedding dicts into one feature matrix plus labels.

    For every key of ``dicts[0]``, each dict's entry is averaged over axis 0,
    L2-normalized, and the per-dict vectors are concatenated side by side.

    Parameters
    ----------
    dicts : sequence of dict
        Embedding dicts; every dict is indexed with the keys of ``dicts[0]``.
    labels : mapping
        Label looked up with the same keys.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has one row per key (concatenated embeddings) and ``y`` the
        matching labels, both in the key order of ``dicts[0]``.
    """
    merged = {}
    for key in dicts[0].keys():
        # mean over axis 0, then unit L2 norm, once per embedding source
        parts = [
            preprocessing.normalize([np.mean(source[key], axis=0)], norm='l2')
            for source in dicts
        ]
        merged[key] = np.concatenate(parts, axis=1)

    emb_size = list(merged.values())[0].shape[1]
    n_rows = len(merged)
    X = np.zeros((n_rows, emb_size))
    y = np.zeros(n_rows)

    for row, (key, vec) in enumerate(merged.items()):
        X[row] = vec
        y[row] = labels[key]

    return X, y

In [19]:

# Read the label dict and the three embedding dicts for the training split
# (read order: label, elmo, unirep, transformer).
y_train, X_train_e, X_train_u, X_train_t = (
    read_dataset(kind, task, "train")
    for kind in ('label', 'elmo', 'unirep', 'transformer')
)


In [21]:
# Concatenate the mean-pooled elmo / unirep / transformer embeddings.
# NOTE(review): y_train is rebound here from the value returned by
# read_dataset('label', ...) to a label array, so this cell presumably
# cannot be re-run without re-running the loading cell first — confirm.
X_train_app, y_train = ensemble_append_mean_reps(
    [X_train_e, X_train_u, X_train_t],
    y_train,
)

# Standardize features; the fitted scaler is kept for the evaluation splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train_app)
X_train_app = scaler.transform(X_train_app)

print(X_train_app.shape, y_train.shape, sep="\n")

(12305, 3692)
(12305,)


In [22]:
# Keep the fitted classifier: the evaluation loop that follows calls
# clf.score(...), so discarding the return value would leave `clf` undefined
# (or stale from another cell) after a kernel restart.
clf = fit_logistic(X_train_app, y_train)

fit time:  0:43:56.453212


In [25]:
# Score the append-ensemble classifier on each held-out split.
for split in ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:
    # read order: elmo, unirep, transformer, then labels
    X_test_e, X_test_u, X_test_t, y_test_dict = (
        read_dataset(kind, task, split)
        for kind in ('elmo', 'unirep', 'transformer', 'label')
    )
    X_test_app, y_test = ensemble_append_mean_reps(
        [X_test_e, X_test_u, X_test_t],
        y_test_dict,
    )

    # apply the already-fitted scaler; it is not refit on evaluation data
    X_test_app = scaler.transform(X_test_app)
    test_score = clf.score(X_test_app, y_test)
    print(f"append model {split} score: ", test_score)

append model valid score:  0.3746594005449591
append model test_fold_holdout score:  0.2743732590529248
append model test_superfamily_holdout score:  0.46485623003194887
append model test_family_holdout score:  0.9544025157232704


In [45]:
def SVD_reduce_dim(m, dim):
    """Project the rows of ``m`` onto its leading ``dim`` singular directions.

    Parameters
    ----------
    m : array_like, shape (n_samples, n_features)
        Matrix to reduce.
    dim : int
        Number of leading singular components to keep. Values larger than
        ``min(m.shape)`` are effectively clamped to ``min(m.shape)``.

    Returns
    -------
    np.ndarray, shape (n_samples, min(dim, min(m.shape)))
        ``U[:, :dim] * S[:dim]``: the left singular vectors scaled by their
        singular values.
    """
    # Thin SVD: only the first min(n_samples, n_features) columns of U are
    # computed. The default full_matrices=True would build an
    # (n_samples, n_samples) U whose extra columns are never used here.
    U, S, _ = np.linalg.svd(m, full_matrices=False)

    # Scale each kept column of U by its singular value; broadcasting is
    # equivalent to multiplying by diag(S[:dim]) without materializing it.
    return U[:, :dim] * S[:dim]

# work in progress
def ensemble_SVD_concat(matrices, dim):
    """Concatenate feature matrices column-wise and reduce them with SVD.

    ``matrices`` are joined along axis 1 (so they must share their row
    count), the result is passed to ``SVD_reduce_dim`` with ``dim``, the
    reduced shape is printed, and the reduced matrix is returned.
    """
    stacked = np.concatenate(matrices, axis=1)
    projected = SVD_reduce_dim(stacked, dim)

    print(f"reduced shape {projected.shape}")

    return projected


# # work in progress
# def ensemble_SVD_orthogonal_procrustes_reps(m1, m2, min_dim=99999999):
    
#     print(f"IN m1 shape: {m1.shape}, m2 shape: {m2.shape}")
    
#     min_dim = min(min(min_dim, m1.shape[1]), m2.shape[1])
#     print("reducing to dim", min_dim)
    
#     if m1.shape[1] > min_dim:

#         m1 = PCA(n_components=min_dim).fit(m1).transform(m1)
#         print(m1.shape)
        
#     if m2.shape[1] > min_dim:
#         m2 = PCA(n_components=min_dim).fit(m2).transform(m2)
#         print(m2.shape)
        
#     print(f"OUT m1 shape: {m1.shape}, m2 shape: {m2.shape}")
    

In [47]:
# Target dimensionality for the concat-then-SVD ensemble.
dim = 650
ys, els, trfs, unis = [], [], [], []

# Collect per-split arrays; 'train' comes first so it can be sliced off the
# front of the combined corpus afterwards.
for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

    y_dict = read_dataset('label', task, split)
    # NOTE(review): y is overwritten by each call; this assumes dict_2_arr
    # returns rows and labels in the same order for every embedding type —
    # TODO confirm.
    X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
    X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
    X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)

    ys.append(y)
    els.append(X_e)
    trfs.append(X_t)
    unis.append(X_u)

# Stack all splits row-wise per embedding type, then L2-normalize every row.
e_corpus = preprocessing.normalize(np.concatenate(els, axis=0), norm='l2')
t_corpus = preprocessing.normalize(np.concatenate(trfs, axis=0), norm='l2')
u_corpus = preprocessing.normalize(np.concatenate(unis, axis=0), norm='l2')

# NOTE(review): the SVD below is computed on train + valid + test rows
# together, so the projection has seen the held-out inputs (not their
# labels) — confirm this transductive setup is intended.
combine_corpus = ensemble_SVD_concat([e_corpus, t_corpus, u_corpus], dim)
print(combine_corpus.shape)

# The first len(ys[0]) rows of the corpus are the training split.
y_train = ys[0]
X_train = combine_corpus[:len(y_train)]

print(X_train.shape, y_train.shape, sep="\n")

reduced shape (16281, 650)
(16281, 650)
(12305, 650)
(12305,)


In [48]:
# Standardize the reduced training features. X_train is overwritten with its
# scaled version, and the fitted scaler is kept for the evaluation splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Train the classifier on the scaled features.
clf = fit_logistic(X_train, y_train)

fit time:  0:03:00.446279
model train score:  1.0
Done!


In [49]:
# Held-out splits, in the order their rows follow the training rows inside
# combine_corpus.
remain_splits = ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']
end = len(ys[0])  # training rows occupy [0, end)

for i, split in enumerate(remain_splits):
    # advance the row window by the size of this split (ys[0] is train)
    start, end = end, end + len(ys[i + 1])
    print(start, end)
    X = combine_corpus[start:end]
    y = ys[i + 1]

    print(split, X.shape, y.shape, sep="\n")

    # scale with the scaler fitted on the training rows
    X = scaler.transform(X)

    test_score = clf.score(X, y)
    print(f"conact then SVD-{dim} model {split} score: ", test_score)

12305 13039
valid
(734, 650)
(734,)
conact then SVD-650 model valid score:  0.37329700272479566
13039 13757
test_fold_holdout
(718, 650)
(718,)
conact then SVD-650 model test_fold_holdout score:  0.24512534818941503
13757 15009
test_superfamily_holdout
(1252, 650)
(1252,)
conact then SVD-650 model test_superfamily_holdout score:  0.4472843450479233
15009 16281
test_family_holdout
(1272, 650)
(1272,)
conact then SVD-650 model test_family_holdout score:  0.9520440251572327


In [None]:
# def do_conact_SVD(dim):
#     print(f"=========================Concat-SVD {dim}=============================\n")
#     ys = []
#     els = []
#     trfs = []
#     unis = []

#     print("")
#     for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

#         y_dict = read_dataset('label', task, split)
#         X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
#         X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
#         X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)


#         ys.append(y)
#         els.append(X_e)  
#         trfs.append(X_t)
#         unis.append(X_u)

#     e_corpus = np.concatenate(els, axis=0)
#     t_corpus = np.concatenate(trfs, axis=0)
#     u_corpus = np.concatenate(unis, axis=0)

#     e_corpus = preprocessing.normalize(e_corpus, norm='l2')
#     t_corpus = preprocessing.normalize(t_corpus, norm='l2')
#     u_corpus = preprocessing.normalize(u_corpus, norm='l2')

#     combine_corpus = ensemble_SVD_concat([e_corpus, t_corpus, u_corpus], dim)

#     X_train = combine_corpus[:len(ys[0])]
#     y_train = ys[0]
