In [3]:
from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import PCA

task = "remote_homology"

def fit_logistic(X, y):
    start = timer()
    clf = LogisticRegression(max_iter=5000)
    clf.fit(X, y)
    end = timer()
    print(f"fit time: ", timedelta(seconds=end-start))
    
    train_score = clf.score(X, y)
    print(f"model train score: ", train_score)
    
    # when run will play a ping sound!
    os.system("printf '\a'")
    os.system("printf '\a'")
    os.system("printf '\a'")
    print("Done!")
    
    return clf

# takes an array of dict_data
# and combines embeddings by averaging with self then appending with other
# convers to array with labels and returns
def ensemble_append_mean_reps_weighted(dicts, weights, labels):
    
    new_dict = dict()
    keys = dicts[0].keys()
    for key in keys:
        seqs = []
        i = 0
        for d in dicts:
            seq = np.mean(d[key], axis=0)
            seq = preprocessing.normalize([seq], norm='l2') * weights[i]
            seqs.append(seq)
            i += 1
        combined_seqs = np.concatenate(seqs, axis=1)
        new_dict[key] = combined_seqs

    emb_size = list(new_dict.values())[0].shape[1]
    X = np.zeros((len(new_dict), emb_size))
    y = np.zeros(len(new_dict))
    
    i = 0
    for key in new_dict:
        X[i] = new_dict[key]
        y[i] = labels[key]
        i += 1
        
    return X, y

In [12]:
def do_weighted_concat(weights, scale=True):
    
    all_scores = {}
    
    y_train = read_dataset('label', task, "train")
    X_train_e = read_dataset('elmo', task, "train")
    X_train_t = read_dataset('transformer', task, "train")
    X_train_u = read_dataset('unirep', task, "train")

    print("========================================================")
    print(f"weighted CONACT elmo: {weights[0]} transformer: {weights[1]} unirep: {weights[2]}")

    X_train_app, y_train = ensemble_append_mean_reps_weighted([X_train_e, X_train_t, X_train_u], weights, y_train)
    
    if scale:
        scaler = preprocessing.StandardScaler().fit(X_train_app)
        X_train_app = scaler.transform(X_train_app)
        
    clf = fit_logistic(X_train_app, y_train)
    
    all_scores['train'] = clf.score(X_train_app, y_train)
    
    for split in ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:
        X_test_e = read_dataset('elmo', task, split)
        X_test_u = read_dataset('unirep', task, split)
        X_test_t = read_dataset('transformer', task, split)
        y_test_dict = read_dataset('label', task, split)
        X_test_app, y_test = ensemble_append_mean_reps_weighted([X_test_e, X_test_t, X_test_u], weights, y_test_dict)

        if scale:
            X_test_app = scaler.transform(X_test_app)
            
        test_score = clf.score(X_test_app, y_test)
        all_scores[split] = test_score
        
        print(f"weighted CONACT {split} score, scale={scale}: ", test_score)
        
    return all_scores, clf
    

In [6]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([3,2,1])

weighted CONACT elmo: 3 transformer: 2 unirep: 1
fit time:  1:07:21.505920
model train score:  1.0
Done!
weighted CONACT valid score:  0.3746594005449591
weighted CONACT test_fold_holdout score:  0.27298050139275765
weighted CONACT test_superfamily_holdout score:  0.46485623003194887
weighted CONACT test_family_holdout score:  0.9544025157232704


In [7]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([1,3,1])

weighted CONACT elmo: 1 transformer: 3 unirep: 1
fit time:  1:06:33.164358
model train score:  1.0
Done!
weighted CONACT valid score:  0.3746594005449591
weighted CONACT test_fold_holdout score:  0.2743732590529248
weighted CONACT test_superfamily_holdout score:  0.46485623003194887
weighted CONACT test_family_holdout score:  0.9544025157232704


In [10]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([1,12,1])

weighted CONACT elmo: 1 transformer: 12 unirep: 1
fit time:  1:04:17.974441
model train score:  1.0
Done!
weighted CONACT valid score:  0.3746594005449591
weighted CONACT test_fold_holdout score:  0.2743732590529248
weighted CONACT test_superfamily_holdout score:  0.46485623003194887
weighted CONACT test_family_holdout score:  0.9544025157232704


In [14]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([1,10,1], scale=False)

weighted CONACT elmo: 1 transformer: 10 unirep: 1
fit time:  0:32:20.049799
model train score:  0.9919544900446973
Done!
weighted CONACT valid score, scale=False:  0.3569482288828338
weighted CONACT test_fold_holdout score, scale=False:  0.25487465181058494
weighted CONACT test_superfamily_holdout score, scale=False:  0.41134185303514376
weighted CONACT test_family_holdout score, scale=False:  0.9205974842767296


In [15]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([1,3,1], scale=False)

weighted CONACT elmo: 1 transformer: 3 unirep: 1
fit time:  0:09:45.717715
model train score:  0.8186103210077205
Done!
weighted CONACT valid score, scale=False:  0.36239782016348776
weighted CONACT test_fold_holdout score, scale=False:  0.27019498607242337
weighted CONACT test_superfamily_holdout score, scale=False:  0.4169329073482428
weighted CONACT test_family_holdout score, scale=False:  0.9158805031446541


In [16]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([3,2,1], scale=False)

weighted CONACT elmo: 3 transformer: 2 unirep: 1
fit time:  0:09:59.401634
model train score:  0.9036976838683461
Done!
weighted CONACT valid score, scale=False:  0.3678474114441417
weighted CONACT test_fold_holdout score, scale=False:  0.28551532033426186
weighted CONACT test_superfamily_holdout score, scale=False:  0.4353035143769968
weighted CONACT test_family_holdout score, scale=False:  0.9355345911949685


In [17]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([6,3,1], scale=False)

weighted CONACT elmo: 6 transformer: 3 unirep: 1
fit time:  0:18:43.966390
model train score:  0.9959366111336855
Done!
weighted CONACT valid score, scale=False:  0.3787465940054496
weighted CONACT test_fold_holdout score, scale=False:  0.2883008356545961
weighted CONACT test_superfamily_holdout score, scale=False:  0.4528753993610224
weighted CONACT test_family_holdout score, scale=False:  0.940251572327044


In [18]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([8,4,1], scale=False)

weighted CONACT elmo: 8 transformer: 4 unirep: 1
fit time:  0:30:51.901065
model train score:  0.9996749288906949
Done!
weighted CONACT valid score, scale=False:  0.3801089918256131
weighted CONACT test_fold_holdout score, scale=False:  0.27715877437325903
weighted CONACT test_superfamily_holdout score, scale=False:  0.4536741214057508
weighted CONACT test_family_holdout score, scale=False:  0.9418238993710691


In [19]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([10,5,1], scale=False)

weighted CONACT elmo: 10 transformer: 5 unirep: 1
fit time:  0:32:15.722564
model train score:  1.0
Done!
weighted CONACT valid score, scale=False:  0.3801089918256131
weighted CONACT test_fold_holdout score, scale=False:  0.27298050139275765
weighted CONACT test_superfamily_holdout score, scale=False:  0.4512779552715655
weighted CONACT test_family_holdout score, scale=False:  0.9426100628930818


In [20]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([6,6,1], scale=False)

weighted CONACT elmo: 6 transformer: 6 unirep: 1
fit time:  0:20:49.086698
model train score:  0.9991060544494108
Done!
weighted CONACT valid score, scale=False:  0.385558583106267
weighted CONACT test_fold_holdout score, scale=False:  0.2827298050139276
weighted CONACT test_superfamily_holdout score, scale=False:  0.4488817891373802
weighted CONACT test_family_holdout score, scale=False:  0.949685534591195


In [21]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([6,2,1], scale=False)

weighted CONACT elmo: 6 transformer: 2 unirep: 1
fit time:  0:17:20.777833
model train score:  0.9934173100365705
Done!
weighted CONACT valid score, scale=False:  0.3773841961852861
weighted CONACT test_fold_holdout score, scale=False:  0.2841225626740947
weighted CONACT test_superfamily_holdout score, scale=False:  0.4472843450479233
weighted CONACT test_family_holdout score, scale=False:  0.9363207547169812


In [None]:
all_scores_all_weights = {}
all_scores, clf = do_weighted_concat([6,4,1], scale=False)

weighted CONACT elmo: 6 transformer: 4 unirep: 1
