In [137]:
from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor

task = "remote_homology"

def fit_logistic(X, y):
    start = timer()
    clf = LogisticRegression(max_iter=5000)
    clf.fit(X, y)
    end = timer()
    print(f"fit time: ", timedelta(seconds=end-start))
    
    train_score = clf.score(X, y)
    print(f"model train score: ", train_score)
    
    # when run will play a ping sound!
    os.system("printf '\a'")
    os.system("printf '\a'")
    os.system("printf '\a'")
    print("Done!")
    
    return clf

In [138]:
def get_single_model_normalized_corpus(model):
    ys = []
    Xs = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X, y = dict_2_arr(read_dataset(model, task, split), y_dict)
        
        ys.append(y)
        Xs.append(X)  
    
    # concat all splits
    corpus = np.concatenate(Xs, axis=0)
    
    return corpus, ys

In [139]:
def get_combined_normalized_corpus():
    ys = []
    els = []
    trfs = []
    unis = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
        X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
        X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)

        ys.append(y)
        els.append(X_e)  
        trfs.append(X_t)
        unis.append(X_u)
    
    # concat all splits
    e_corpus = np.concatenate(els, axis=0)
    t_corpus = np.concatenate(trfs, axis=0)
    u_corpus = np.concatenate(unis, axis=0)
    
    # normalize each indvidually 
    e_corpus = preprocessing.normalize(e_corpus, norm='l2')
    t_corpus = preprocessing.normalize(t_corpus, norm='l2')
    u_corpus = preprocessing.normalize(u_corpus, norm='l2')
    
    # concatenate all corpuses
    combined = np.concatenate([e_corpus, t_corpus, u_corpus], axis=1)
    
    return combined, ys

In [140]:
def train_test_over_corpus(corpus, ys, model_name):

    all_score = {}

    # get out training slice
    X_train = corpus[:len(ys[0])]
    y_train = ys[0]

    # scale
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # fit!
    clf = fit_logistic(X_train, y_train)

    # record train score
    train_score = clf.score(X_train, y_train)
    print(f"{model_name} train score: ", train_score)

    all_score["train"]  = train_score

    # get slices for remaing splits and score
    remain_splits = ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']
    end = len(ys[0])
    for i in range(len(remain_splits)):
        split = remain_splits[i]
        start = end
        end = len(ys[i+1])+start

        X = corpus[start:end]
        y = ys[i+1]

        X = scaler.transform(X)

        test_score = clf.score(X, y)

        all_score[split]  = test_score

        print(f"{model_name} {split} score: ", test_score)

    return all_score

In [162]:
def train_auto_encoder(X, latent_dim, hidden_layers):

    n_input = X.shape[1]

    full_layers = hidden_layers + [latent_dim] + hidden_layers[::-1]
    print("layers", full_layers)
    
    reg = MLPRegressor(hidden_layer_sizes = full_layers, 
                       activation = 'relu', 
                       solver = 'adam', 
                       learning_rate_init = 0.0001, 
                       max_iter = 100, 
                       tol = 0.0000001, 
                       verbose = True)
    
    print("fitting auto_encoder...")
    reg.fit(X, X)
    
    auto_encoder_train_score = reg.score(X, X)
    print("auto_encoder_train_score: ", final_loss)
    return reg


def encoder(X, reg):
    print("encoding...")
    X = np.asmatrix(X)
    
    encoder1 = X*reg.coefs_[0] + reg.intercepts_[0]
    encoder1 = (np.exp(encoder1) - np.exp(-encoder1))/(np.exp(encoder1) + np.exp(-encoder1))
    
    encoder2 = encoder1*reg.coefs_[1] + reg.intercepts_[1]
    encoder2 = (np.exp(encoder2) - np.exp(-encoder2))/(np.exp(encoder2) + np.exp(-encoder2))
    
    latent = encoder2*reg.coefs_[2] + reg.intercepts_[2]
    latent = (np.exp(latent) - np.exp(-latent))/(np.exp(latent) + np.exp(-latent))
    
    print("encoded as: ", latent.shape)
    
    return np.asarray(latent)

def train_and_encode(X, latent_dim=200, hidden_layers=[500, 300]):
    
    reg = train_auto_encoder(X, latent_dim, hidden_layers)
    X_encoded = encoder(X, reg)
    
    return X_encoded

In [61]:
X_e, ys = get_single_model_normalized_corpus('elmo')
X_t, ys = get_single_model_normalized_corpus('transformer')
X_u, ys = get_single_model_normalized_corpus('unirep')

X_t_encoded = train_and_encode(X_t)
X_u_encoded = train_and_encode(X_u)
X_e_encoded = train_and_encode(X_e)

conact_encoded = np.concatenate([X_t_encoded, X_u_encoded, X_e_encoded], axis=1)

scores = train_test_over_corpus(conact_encoded, ys, "auto_encode_then_concat")

In [163]:
def do_conact_then_auto_encode_for_dim(X, ys, dim):
    
    hidden_layers = [1200, int((d+1200)/2)]
    X = train_and_encode(X, latent_dim=dim, hidden_layers=hidden_layers)
    scores = train_test_over_corpus(X, ys, f"concat_then_auto_encode dim {dim}")

In [160]:
X, ys = get_combined_normalized_corpus()

(16281, 3692)


In [160]:
dims = []
max_dim = 800
for d in range(50, max_dim+50, 50):
    dims.append(d)
all_dim_all_scores = {}

for d in dims:
    s = do_conact_then_auto_encode_for_dim(X, ys, d)
    all_dim_all_scores[str(d)] = s

layers [900, 50, 900]
fitting auto_encoder...
auto_encoder_train_score:  0.6166138346822073
encoding...
encoded as:  (16281, 50)
fit time:  0:04:37.974206
model train score:  0.8786672084518489
Done!
concat_then_auto_encode dim 50 train score:  0.8786672084518489
concat_then_auto_encode dim 50 valid score:  0.3079019073569482
concat_then_auto_encode dim 50 test_fold_holdout score:  0.2395543175487465
concat_then_auto_encode dim 50 test_superfamily_holdout score:  0.3610223642172524
concat_then_auto_encode dim 50 test_family_holdout score:  0.8828616352201258
layers [900, 100, 900]
fitting auto_encoder...


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/julesberman/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-161-f892573d1cac>", line 8, in <module>
    s = do_conact_then_auto_encode_for_dim(X, ys, d)
  File "<ipython-input-159-d5be782d5241>", line 4, in do_conact_then_auto_encode_for_dim
    X = train_and_encode(X, latent_dim=dim, hidden_layers=hidden_layers)
  File "<ipython-input-158-dae7b0be38cf>", line 43, in train_and_encode
    reg = train_auto_encoder(X, latent_dim, hidden_layers)
  File "<ipython-input-158-dae7b0be38cf>", line 19, in train_auto_encoder
    auto_encoder_train_score = reg.score(X, X)
  File "/Users/julesberman/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 552, in score
    return r2_score(y, y_pred, sample_weight=sample_weight)
  File "/Users/julesberman/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/valid

TypeError: object of type 'NoneType' has no len()

In [None]:
import pickle
with open("concat_then_auto_encode_across_dims.p", "wb") as f:
    pickle.dump(all_dim_all_scores, f)