In [8]:
# run me first!

from sklearn.linear_model import LogisticRegression
from timeit import default_timer as timer
from datetime import timedelta
import os
from sklearn import preprocessing
import numpy as np
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import PCA

task = "remote_homology"

In [9]:
def get_combined_normalized_corpus():
    ys = []
    els = []
    trfs = []
    unis = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X_e, y = dict_2_arr(read_dataset('elmo', task, split), y_dict)
        X_t, y = dict_2_arr(read_dataset('transformer', task, split), y_dict)
        X_u, y = dict_2_arr(read_dataset('unirep', task, split), y_dict)

        ys.append(y)
        els.append(X_e)  
        trfs.append(X_t)
        unis.append(X_u)
    
    # concat all splits
    e_corpus = np.concatenate(els, axis=0)
    t_corpus = np.concatenate(trfs, axis=0)
    u_corpus = np.concatenate(unis, axis=0)
    
    # normalize each indvidually 
    e_corpus = preprocessing.normalize(e_corpus, norm='l2')
    t_corpus = preprocessing.normalize(t_corpus, norm='l2')
    u_corpus = preprocessing.normalize(u_corpus, norm='l2')
    
    # concatenate all corpuses
    combined = np.concatenate([e_corpus, t_corpus, u_corpus], axis=1)
    
    return combined

In [10]:
def get_single_model_normalized_corpus(model):
    ys = []
    Xs = []

    # read all splits
    for split in ['train', 'valid', 'test_fold_holdout', 'test_superfamily_holdout', 'test_family_holdout']:

        y_dict = read_dataset('label', task, split)
        X, y = dict_2_arr(read_dataset(model, task, split), y_dict)
        
        ys.append(y)
        Xs.append(X)  
    
    # concat all splits
    corpus = np.concatenate(Xs, axis=0)
    
    return corpus, ys

In [11]:
X_e, y = get_single_model_normalized_corpus('elmo')
X_t, y = get_single_model_normalized_corpus('transformer')
X_u, y = get_single_model_normalized_corpus('unirep')

In [86]:
lr = 0.001
max_iter = 15
reg = 1

def one_to_N(embs_list, meta_dim):

    Ms = []
    Vs = []
    for Vi in embs_list:
        Mi = np.random.rand(Vi.shape[1], meta_dim)
        Ms.append(Mi)
        Vs.append(np.transpose(Vi))
    
    n_examples = Vs[0].shape[1]    
    Vstar = np.random.rand(meta_dim, n_examples)
    
    r = len(Ms)
    itr = 0
    while itr < max_iter:
        wn = Vstar
        term1 = np.zeros((meta_dim,n_examples))
        term2 = np.zeros((meta_dim,n_examples))
        for i in range(r):

            wi = Vs[i]
        
            wnT = np.transpose(wn)
            Mi = Ms[i]
            # M_i for next itr
            t1 = Mi.dot(wn.dot(wnT)) 
            t2 = reg*Mi
            t3 = wi.dot(wnT)
            Mip1 = Mi - lr*(t1 + t2 - t3)
            Ms[i] = Mip1
            
            # w for next itr
            MiT = np.transpose(Mi)
            res = MiT.dot(Mi.dot(wn)) 
            term1 = term1 + res
            term2 = term2 + MiT.dot(wi)


        Vstar = wn - lr*(term1 - term2)
        
        # compute cost over epoch
        cost = 0
        proj_norm = 0
        for i in range(r):
            Mi = Ms[i]
            wi = Vs[i]
            wi_hat = Mi.dot(Vstar)
            cost += np.linalg.norm(wi_hat - wi)**2 + reg*(np.linalg.norm(Mi)**2)
            proj_norm += np.linalg.norm(wi_hat - wi)
        
        print(f"Iteration {itr}, cost {cost}, proj_norm {proj_norm}")
        itr += 1
        
    return Vstar

In [87]:

embed = one_to_N([X_e, X_t, X_u], 500)

Iteration 0, cost 8.219392923166093e+23, proj_norm 1541323679058.655
Iteration 1, cost 5.552891963110048e+59, proj_norm 1.2668744904090345e+30


KeyboardInterrupt: 