In [4]:
import numpy as np
from read_script import read_dataset

In [5]:
# takes an array of dict_data
# and combines embeddings by averaging with self then appending with other
# convers to array with labels and returns
def ensemble_append_mean_reps(dicts, labels):
    
    new_dict = dict()
    keys = dicts[0].keys()
    for key in keys:
        seqs = []
        for d in dicts:
            seq = np.mean(d[key], axis=0)
            seqs.append(seq)
        combined_seqs = np.concatenate(seqs, axis=0)
        new_dict[key] = combined_seqs
        
    emb_size = list(new_dict.values())[0].shape[0]
    X = np.zeros((len(new_dict), emb_size))
    y = np.zeros(len(new_dict))
    
    i = 0
    for key in new_dict:
        X[i] = np.mean(new_dict[key], axis=0)
        y[i] = labels[key]
        i += 1
        
    return X, y

In [6]:
%%time
task = 'remote_homology'

X_train_e = read_dataset('elmo', task, 'train')
X_train_u = read_dataset('unirep', task, 'train')
y_train_dict = read_dataset('label', task, 'train')

CPU times: user 13.4 s, sys: 15 s, total: 28.3 s
Wall time: 28.4 s


In [7]:
%%time
X_train_app, y_train = ensemble_append_mean_reps([X_train_e, X_train_u], y_train_dict)
print(X_train_app.shape)

(12305, 2924)
CPU times: user 4.89 s, sys: 414 ms, total: 5.31 s
Wall time: 5.31 s


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [9]:
clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X_train_app, y_train)



In [14]:
X_valid_e = read_dataset('elmo', task, 'valid')
X_valid_u = read_dataset('unirep', task, 'valid')
y_valid_dict = read_dataset('label', task, 'valid')
X_valid_app, y_valid = ensemble_append_mean_reps([X_valid_e, X_valid_u], y_valid_dict)

valid_score = clf.score(X_valid_app, y_valid)
print(f"append model valid score: ", valid_score)

append model valid score:  0.03814713896457766


In [15]:
X_test_e = read_dataset('elmo', task, 'test')
X_test_u = read_dataset('unirep', task, 'test')
y_test_dict = read_dataset('label', task, 'test')
X_test_app, y_test = ensemble_append_mean_reps([X_test_e, X_test_u], y_test_dict)

test_score = clf.score(X_test_app, y_test)
print(f"append model test score: ", test_score)

append model test score:  0.054317548746518104


In [13]:
%%time
clf_app = OneVsRestClassifier(LinearSVC())
clf_app.fit(X_train_app, y_train)

CPU times: user 1h 43min 52s, sys: 1min 13s, total: 1h 45min 5s
Wall time: 2h 53min 58s


In [16]:
X_valid_e = read_dataset('elmo', task, 'valid')
X_valid_u = read_dataset('unirep', task, 'valid')
y_valid_dict = read_dataset('label', task, 'valid')
X_valid_app, y_valid = ensemble_append_mean_reps([X_valid_e, X_valid_u], y_valid_dict)

valid_score = clf_app.score(X_valid_app, y_valid)
print(f"append model valid score: ", valid_score)

append model valid score:  0.03678474114441417


In [17]:
X_test_e = read_dataset('elmo', task, 'test')
X_test_u = read_dataset('unirep', task, 'test')
y_test_dict = read_dataset('label', task, 'test')
X_test_app, y_test = ensemble_append_mean_reps([X_test_e, X_test_u], y_test_dict)

test_score = clf_app.score(X_test_app, y_test)
print(f"append model test score: ", test_score)

append model test score:  0.057103064066852366
