In [137]:
from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor

task = "remote_homology"

def fit_logistic(X, y):
    start = timer()
    clf = LogisticRegression(max_iter=5000)
    clf.fit(X, y)
    end = timer()
    print(f"fit time: ", timedelta(seconds=end-start))
    
    train_score = clf.score(X, y)
    print(f"model train score: ", train_score)
    
    # when run will play a ping sound!
    os.system("printf '\a'")
    os.system("printf '\a'")
    os.system("printf '\a'")
    print("Done!")
    
    return clf

In [138]:
def get_single_model_normalized_corpus(model):
    ys = []
    Xs = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X, y = dict_2_arr(read_dataset(model, task, split), y_dict)
        
        ys.append(y)
        Xs.append(X)  
    
    # concat all splits
    corpus = np.concatenate(Xs, axis=0)
    
    return corpus, ys

In [172]:
def get_combined_normalized_corpus_weighted(weights):
    ys = []
    els = []
    trfs = []
    unis = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
        X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
        X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)

        ys.append(y)
        els.append(X_e)  
        trfs.append(X_t)
        unis.append(X_u)
    
    # concat all splits
    e_corpus = np.concatenate(els, axis=0)
    t_corpus = np.concatenate(trfs, axis=0)
    u_corpus = np.concatenate(unis, axis=0)
    
    # normalize each indvidually 
    e_corpus = preprocessing.normalize(e_corpus, norm='l2') * weights[0]
    t_corpus = preprocessing.normalize(t_corpus, norm='l2') * weights[1]
    u_corpus = preprocessing.normalize(u_corpus, norm='l2') * weights[2]
    
    # concatenate all corpuses
    combined = np.concatenate([e_corpus, t_corpus, u_corpus], axis=1)
    
    return combined, ys

In [173]:
def train_test_over_corpus(corpus, ys, model_name):

    all_score = {}

    # get out training slice
    X_train = corpus[:len(ys[0])]
    y_train = ys[0]

    # scale
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # fit!
    clf = fit_logistic(X_train, y_train)

    # record train score
    train_score = clf.score(X_train, y_train)
    print(f"{model_name} train score: ", train_score)

    all_score["train"]  = train_score

    # get slices for remaing splits and score
    remain_splits = ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']
    end = len(ys[0])
    for i in range(len(remain_splits)):
        split = remain_splits[i]
        start = end
        end = len(ys[i+1])+start

        X = corpus[start:end]
        y = ys[i+1]

        X = scaler.transform(X)

        test_score = clf.score(X, y)

        all_score[split]  = test_score

        print(f"{model_name} {split} score: ", test_score)

    return all_score

In [174]:
def train_auto_encoder(X, latent_dim, hidden_layers):

    n_input = X.shape[1]

    full_layers = hidden_layers + [latent_dim] + hidden_layers[::-1]
    print("layers", full_layers)
    
    reg = MLPRegressor(hidden_layer_sizes = full_layers, 
                       activation = 'relu', 
                       solver = 'adam', 
                       learning_rate_init = 0.0001, 
                       max_iter = 100, 
                       tol = 0.0000001, 
                       verbose = False)
    
    print("fitting auto_encoder...")
    reg.fit(X, X)
    
    auto_encoder_train_score = reg.score(X, X)
    print("auto_encoder_train_score: ", auto_encoder_train_score)
    return reg


def encoder(X, reg):
    print("encoding...")
    X = np.asmatrix(X)
    
    encoder1 = X*reg.coefs_[0] + reg.intercepts_[0]
    encoder1 = (np.exp(encoder1) - np.exp(-encoder1))/(np.exp(encoder1) + np.exp(-encoder1))
    
    encoder2 = encoder1*reg.coefs_[1] + reg.intercepts_[1]
    encoder2 = (np.exp(encoder2) - np.exp(-encoder2))/(np.exp(encoder2) + np.exp(-encoder2))
    
    latent = encoder2*reg.coefs_[2] + reg.intercepts_[2]
    latent = (np.exp(latent) - np.exp(-latent))/(np.exp(latent) + np.exp(-latent))
    
    print("encoded as: ", latent.shape)
    
    return np.asarray(latent)

def train_and_encode(X, latent_dim=200, hidden_layers=[500, 300]):
    
    reg = train_auto_encoder(X, latent_dim, hidden_layers)
    X_encoded = encoder(X, reg)
    
    return X_encoded

In [175]:
def do_conact_then_auto_encode_for_dim(X, ys, dim):
    
    hidden_layers = [1200, int((d+1200)/2)]
    X = train_and_encode(X, latent_dim=dim, hidden_layers=hidden_layers)
    scores = train_test_over_corpus(X, ys, f"concat_then_auto_encode dim {dim}")
    return scores

In [176]:
X, ys = get_combined_normalized_corpus_weighted([6,3,1])

In [177]:
dims = []
for d in [10, 20, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700]:
    dims.append(d)
all_dim_all_scores = {}

for d in dims:
    s = do_conact_then_auto_encode_for_dim(X, ys, d)
    all_dim_all_scores[str(d)] = s

layers [1200, 605, 10, 605, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.18793691247078564
encoding...
encoded as:  (16281, 10)
fit time:  0:02:03.444124
model train score:  0.38228362454286874
Done!
concat_then_auto_encode dim 10 train score:  0.38228362454286874
concat_then_auto_encode dim 10 valid score:  0.14032697547683923
concat_then_auto_encode dim 10 test_fold_holdout score:  0.13370473537604458
concat_then_auto_encode dim 10 test_superfamily_holdout score:  0.17252396166134185
concat_then_auto_encode dim 10 test_family_holdout score:  0.5487421383647799
layers [1200, 610, 20, 610, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.2058248321870798
encoding...
encoded as:  (16281, 20)
fit time:  0:02:19.995402
model train score:  0.6007314099959367
Done!
concat_then_auto_encode dim 20 train score:  0.6007314099959367
concat_then_auto_encode dim 20 valid score:  0.21934604904632152
concat_then_auto_encode dim 20 test_fold_holdout score:  0.18802228412256267
concat_then_auto_encode dim 20 test_superfamily_holdout score:  0.2715654952076677
concat_then_auto_encode dim 20 test_family_holdout score:  0.7287735849056604
layers [1200, 625, 50, 625, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.13138658771454115
encoding...
encoded as:  (16281, 50)
fit time:  0:03:27.928876
model train score:  0.8867127184071516
Done!
concat_then_auto_encode dim 50 train score:  0.8867127184071516
concat_then_auto_encode dim 50 valid score:  0.3201634877384196
concat_then_auto_encode dim 50 test_fold_holdout score:  0.22284122562674094
concat_then_auto_encode dim 50 test_superfamily_holdout score:  0.3738019169329074
concat_then_auto_encode dim 50 test_family_holdout score:  0.8781446540880503
layers [1200, 650, 100, 650, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.025393329085391984
encoding...
encoded as:  (16281, 100)
fit time:  0:04:46.080643
model train score:  0.9775700934579439
Done!
concat_then_auto_encode dim 100 train score:  0.9775700934579439
concat_then_auto_encode dim 100 valid score:  0.35013623978201636
concat_then_auto_encode dim 100 test_fold_holdout score:  0.25487465181058494
concat_then_auto_encode dim 100 test_superfamily_holdout score:  0.4049520766773163
concat_then_auto_encode dim 100 test_family_holdout score:  0.9095911949685535
layers [1200, 675, 150, 675, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.037409317666398795
encoding...
encoded as:  (16281, 150)
fit time:  0:05:20.359345
model train score:  0.9947988622511175
Done!
concat_then_auto_encode dim 150 train score:  0.9947988622511175
concat_then_auto_encode dim 150 valid score:  0.3542234332425068
concat_then_auto_encode dim 150 test_fold_holdout score:  0.2381615598885794
concat_then_auto_encode dim 150 test_superfamily_holdout score:  0.43690095846645366
concat_then_auto_encode dim 150 test_family_holdout score:  0.9150943396226415
layers [1200, 700, 200, 700, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.04354044167265998
encoding...
encoded as:  (16281, 200)
fit time:  0:06:05.850699
model train score:  0.9984559122308004
Done!
concat_then_auto_encode dim 200 train score:  0.9984559122308004
concat_then_auto_encode dim 200 valid score:  0.3637602179836512
concat_then_auto_encode dim 200 test_fold_holdout score:  0.24930362116991645
concat_then_auto_encode dim 200 test_superfamily_holdout score:  0.4193290734824281
concat_then_auto_encode dim 200 test_family_holdout score:  0.9182389937106918
layers [1200, 725, 250, 725, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.0014086553638461474
encoding...
encoded as:  (16281, 250)
fit time:  0:05:30.682865
model train score:  0.9997561966680212
Done!
concat_then_auto_encode dim 250 train score:  0.9997561966680212
concat_then_auto_encode dim 250 valid score:  0.3555858310626703
concat_then_auto_encode dim 250 test_fold_holdout score:  0.2576601671309192
concat_then_auto_encode dim 250 test_superfamily_holdout score:  0.41533546325878595
concat_then_auto_encode dim 250 test_family_holdout score:  0.9190251572327044
layers [1200, 750, 300, 750, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.09248075169947526
encoding...
encoded as:  (16281, 300)
fit time:  0:06:48.813903
model train score:  0.9999187322226737
Done!
concat_then_auto_encode dim 300 train score:  0.9999187322226737
concat_then_auto_encode dim 300 valid score:  0.36648501362397823
concat_then_auto_encode dim 300 test_fold_holdout score:  0.22562674094707522
concat_then_auto_encode dim 300 test_superfamily_holdout score:  0.4273162939297125
concat_then_auto_encode dim 300 test_family_holdout score:  0.9276729559748428
layers [1200, 775, 350, 775, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.11037682796386543
encoding...
encoded as:  (16281, 350)
fit time:  0:07:43.093960
model train score:  0.9999187322226737
Done!
concat_then_auto_encode dim 350 train score:  0.9999187322226737
concat_then_auto_encode dim 350 valid score:  0.3801089918256131
concat_then_auto_encode dim 350 test_fold_holdout score:  0.2590529247910863
concat_then_auto_encode dim 350 test_superfamily_holdout score:  0.43929712460063897
concat_then_auto_encode dim 350 test_family_holdout score:  0.9237421383647799
layers [1200, 800, 400, 800, 1200]
fitting auto_encoder...




auto_encoder_train_score:  0.02411004235408671
encoding...
encoded as:  (16281, 400)
fit time:  0:06:35.309682
model train score:  1.0
Done!
concat_then_auto_encode dim 400 train score:  1.0
concat_then_auto_encode dim 400 valid score:  0.3678474114441417
concat_then_auto_encode dim 400 test_fold_holdout score:  0.24373259052924792
concat_then_auto_encode dim 400 test_superfamily_holdout score:  0.4464856230031949
concat_then_auto_encode dim 400 test_family_holdout score:  0.9316037735849056
layers [1200, 825, 450, 825, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.09810211038650651
encoding...
encoded as:  (16281, 450)
fit time:  0:07:33.287856
model train score:  1.0
Done!
concat_then_auto_encode dim 450 train score:  1.0
concat_then_auto_encode dim 450 valid score:  0.3651226158038147
concat_then_auto_encode dim 450 test_fold_holdout score:  0.23119777158774374
concat_then_auto_encode dim 450 test_superfamily_holdout score:  0.4472843450479233
concat_then_auto_encode dim 450 test_family_holdout score:  0.9284591194968553
layers [1200, 850, 500, 850, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.06179153493944744
encoding...
encoded as:  (16281, 500)
fit time:  0:07:43.253034
model train score:  1.0
Done!
concat_then_auto_encode dim 500 train score:  1.0
concat_then_auto_encode dim 500 valid score:  0.38283378746594005
concat_then_auto_encode dim 500 test_fold_holdout score:  0.24930362116991645
concat_then_auto_encode dim 500 test_superfamily_holdout score:  0.4353035143769968
concat_then_auto_encode dim 500 test_family_holdout score:  0.9292452830188679
layers [1200, 875, 550, 875, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.024512166970674015
encoding...
encoded as:  (16281, 550)
fit time:  0:08:54.115537
model train score:  1.0
Done!
concat_then_auto_encode dim 550 train score:  1.0
concat_then_auto_encode dim 550 valid score:  0.3896457765667575
concat_then_auto_encode dim 550 test_fold_holdout score:  0.2562674094707521
concat_then_auto_encode dim 550 test_superfamily_holdout score:  0.43849840255591055
concat_then_auto_encode dim 550 test_family_holdout score:  0.9355345911949685
layers [1200, 900, 600, 900, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.1770244400487152
encoding...
encoded as:  (16281, 600)
fit time:  0:09:42.607633
model train score:  1.0
Done!
concat_then_auto_encode dim 600 train score:  1.0
concat_then_auto_encode dim 600 valid score:  0.3923705722070845
concat_then_auto_encode dim 600 test_fold_holdout score:  0.24512534818941503
concat_then_auto_encode dim 600 test_superfamily_holdout score:  0.44568690095846647
concat_then_auto_encode dim 600 test_family_holdout score:  0.9363207547169812
layers [1200, 925, 650, 925, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.3892306316814977
encoding...
encoded as:  (16281, 650)
fit time:  0:10:16.235997
model train score:  1.0
Done!
concat_then_auto_encode dim 650 train score:  1.0
concat_then_auto_encode dim 650 valid score:  0.38419618528610355
concat_then_auto_encode dim 650 test_fold_holdout score:  0.2562674094707521
concat_then_auto_encode dim 650 test_superfamily_holdout score:  0.4520766773162939
concat_then_auto_encode dim 650 test_family_holdout score:  0.9339622641509434
layers [1200, 950, 700, 950, 1200]
fitting auto_encoder...




auto_encoder_train_score:  -0.14419528035556914
encoding...
encoded as:  (16281, 700)
fit time:  0:09:48.001611
model train score:  1.0
Done!
concat_then_auto_encode dim 700 train score:  1.0
concat_then_auto_encode dim 700 valid score:  0.3814713896457766
concat_then_auto_encode dim 700 test_fold_holdout score:  0.2479108635097493
concat_then_auto_encode dim 700 test_superfamily_holdout score:  0.4496805111821086
concat_then_auto_encode dim 700 test_family_holdout score:  0.9378930817610063


In [178]:
import pickle
with open("concat_then_auto_encode_across_dims_weighted_631.p", "wb") as f:
    pickle.dump(all_dim_all_scores, f)