In [31]:
from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor

task = "remote_homology"

def fit_logistic(X, y):
    start = timer()
    clf = LogisticRegression(max_iter=5000)
    clf.fit(X, y)
    end = timer()
    print(f"fit time: ", timedelta(seconds=end-start))
    
    train_score = clf.score(X, y)
    print(f"model train score: ", train_score)
    
    # when run will play a ping sound!
    os.system("printf '\a'")
    os.system("printf '\a'")
    os.system("printf '\a'")
    print("Done!")
    
    return clf

In [32]:
def get_single_model_normalized_corpus(model):
    ys = []
    Xs = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X, y = dict_2_arr(read_dataset(model, task, split), y_dict)
        
        ys.append(y)
        Xs.append(X)  
    
    # concat all splits
    corpus = np.concatenate(Xs, axis=0)
    
    return corpus, ys

In [63]:
def get_combined_normalized_corpus():
    ys = []
    els = []
    trfs = []
    unis = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
        X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
        X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)

        ys.append(y)
        els.append(X_e)  
        trfs.append(X_t)
        unis.append(X_u)
    
    # concat all splits
    e_corpus = np.concatenate(els, axis=0)
    t_corpus = np.concatenate(trfs, axis=0)
    u_corpus = np.concatenate(unis, axis=0)
    
    # normalize each indvidually 
    e_corpus = preprocessing.normalize(e_corpus, norm='l2')
    t_corpus = preprocessing.normalize(t_corpus, norm='l2')
    u_corpus = preprocessing.normalize(u_corpus, norm='l2')
    
    # concatenate all corpuses
    combined = np.concatenate([e_corpus, t_corpus, u_corpus], axis=1)
    
    return combined, ys

In [33]:
def train_test_over_corpus(corpus, ys, model_name):

    all_score = {}

    # get out training slice
    X_train = corpus[:len(ys[0])]
    y_train = ys[0]

    # scale
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    # fit!
    clf = fit_logistic(X_train, y_train)

    # record train score
    train_score = clf.score(X_train, y_train)
    print(f"{model_name} train score: ", train_score)

    all_score["train"]  = train_score

    # get slices for remaing splits and score
    remain_splits = ['valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']
    end = len(ys[0])
    for i in range(len(remain_splits)):
        split = remain_splits[i]
        start = end
        end = len(ys[i+1])+start

        X = corpus[start:end]
        y = ys[i+1]

        X = scaler.transform(X)

        test_score = clf.score(X, y)

        all_score[split]  = test_score

        print(f"{model_name} {split} score: ", test_score)

    return all_score

In [67]:
def train_auto_encoder(X, latent_dim, hidden_layers):

    n_input = X.shape[1]

    full_layers = hidden_layers + [latent_dim] + hidden_layers[::-1]
    print("layers", full_layers)
    
    reg = MLPRegressor(hidden_layer_sizes = full_layers, 
                       activation = 'relu', 
                       solver = 'adam', 
                       learning_rate_init = 0.0001, 
                       max_iter = 100, 
                       tol = 0.0000001, 
                       verbose = True)
    
    print("fitting auto_encoder...")
    reg.fit(X, X)
    
    auto_encoder_train_score = reg.score(X, X)
    print("auto_encoder_train_score:", auto_encoder_train_score)
    return reg


def encoder(X, reg):
    print("encoding...")
    X = np.asmatrix(X)
    
    encoder1 = X*reg.coefs_[0] + reg.intercepts_[0]
    encoder1 = (np.exp(encoder1) - np.exp(-encoder1))/(np.exp(encoder1) + np.exp(-encoder1))
    
    encoder2 = encoder1*reg.coefs_[1] + reg.intercepts_[1]
    encoder2 = (np.exp(encoder2) - np.exp(-encoder2))/(np.exp(encoder2) + np.exp(-encoder2))
    
    latent = encoder2*reg.coefs_[2] + reg.intercepts_[2]
    latent = (np.exp(latent) - np.exp(-latent))/(np.exp(latent) + np.exp(-latent))
    
    print("encoded as: ", latent.shape)
    
    return np.asarray(latent)

def train_and_encode(X, latent_dim=200, hidden_layers=[500, 300]):
    
    reg = train_auto_encoder(X, latent_dim, hidden_layers)
    X_encoded = encoder(X, reg)
    
    return X_encoded

In [61]:
X_e, ys = get_single_model_normalized_corpus('elmo')
X_t, ys = get_single_model_normalized_corpus('transformer')
X_u, ys = get_single_model_normalized_corpus('unirep')

X_t_encoded = train_and_encode(X_t)
X_u_encoded = train_and_encode(X_u)
X_e_encoded = train_and_encode(X_e)

conact_encoded = np.concatenate([X_t_encoded, X_u_encoded, X_e_encoded], axis=1)

scores = train_test_over_corpus(conact_encoded, ys, "auto_encode_then_concat")

In [68]:
X, ys = get_combined_normalized_corpus()

In [69]:
X = train_and_encode(X, hidden_layers=[2000, 1000, 500])

scores = train_test_over_corpus(X, ys, "concat_then_auto_encode")

layers [2000, 1000, 500, 200, 500, 1000, 2000]
fitting auto_encoder...
Iteration 1, loss = 0.00273546
Iteration 2, loss = 0.00258025
Iteration 3, loss = 0.00255058
Iteration 4, loss = 0.00253404
Iteration 5, loss = 0.00252201
Iteration 6, loss = 0.00251289
Iteration 7, loss = 0.00250583
Iteration 8, loss = 0.00250029
Iteration 9, loss = 0.00249586
Iteration 10, loss = 0.00249227
Iteration 11, loss = 0.00248926
Iteration 12, loss = 0.00248679
Iteration 13, loss = 0.00248467
Iteration 14, loss = 0.00248287
Iteration 15, loss = 0.00248134
Iteration 16, loss = 0.00248000
Iteration 17, loss = 0.00247886
Iteration 18, loss = 0.00247785
Iteration 19, loss = 0.00247697
Iteration 20, loss = 0.00247617
Iteration 21, loss = 0.00247545
Iteration 22, loss = 0.00247478
Iteration 23, loss = 0.00247416
Iteration 24, loss = 0.00247358
Iteration 25, loss = 0.00247301
Iteration 26, loss = 0.00247246
Iteration 27, loss = 0.00247191
Iteration 28, loss = 0.00247136
Iteration 29, loss = 0.00247081
Iteration 



final loss:  0.7976091125686712
encoding...
encoded as:  (16281, 500)
fit time:  0:10:03.322228
model train score:  1.0
Done!
concat_then_auto_encode train score:  1.0
concat_then_auto_encode valid score:  0.35967302452316074
concat_then_auto_encode test_fold_holdout score:  0.25487465181058494
concat_then_auto_encode test_superfamily_holdout score:  0.4353035143769968
concat_then_auto_encode test_family_holdout score:  0.934748427672956


In [98]:
import keras
from keras import layers
from keras import backend as K
from keras.layers import Lambda, Concatenate

def build_autoencoder(X):
    input_dim = X.shape[1]
    x = keras.Input(shape=(input_dim,))
    encoded = layers.Dense(500, activation='relu')(x)
    encoded = layers.Dense(300, activation='relu')(encoded)
    encoded = layers.Dense(latent_dim, activation='relu')(encoded)
    decoded = layers.Dense(300, activation='relu')(encoded)
    decoded = layers.Dense(500, activation='relu')(decoded)
    decoded = layers.Dense(input_dim, activation='sigmoid')(decoded)
    encoder = keras.Model(x1, encoded1)  
    return encoder, decoded

def build_DAEME(X1, X2, X3, latent_dim):
    

    encoder1, decoded1 = build_autoencoder(X1)
    

    encoder1 = keras.Model(x1, encoded1)
    encoder2 = keras.Model(x2, encoded2)
    
    encoder3 = keras.x3(x_full, encoded3)
    
    encoder_full = keras.Model(x_full, encoded_full)
    auto_encoder = keras.Model(x_full, decoded_full)    
    
    def DAEME_loss(e1, e2, e3, x1, x2, x3):

        def loss(y_true, y_pred):
            
            x1_e = e1.predict(x1)
            x2_e = e2.predict(x2)
            x3_e = e3.predict(x3)
            
            return K.sum(K.square(x1_e - x2_e), axis=-1)
        
        return loss
    
    auto_encoder.compile(optimizer='adam',
                  loss=DAEME_loss(encoder1, encoder2, encoder3, x1, x2, x3),
                  metrics=['accuracy'])

    return auto_encoder, encoder_full

In [93]:
X, ys = get_combined_normalized_corpus()