In [4]:
import numpy as np
from timeit import default_timer as timer
from datetime import timedelta
from read_scripts import dict_2_arr 
from read_scripts import read_dataset 
from sklearn import preprocessing

def classifer_base_eval(model, task, classifer, scale=True, avgr=lambda x: np.mean(x, axis=0),
                        eval_splits=('valid', 'test_fold_holdout', 'test_superfamily_holdout',
                                     'test_family_holdout')):
    """Fit ``classifer`` on the train split and print its score on every split.

    For each split the features are read with ``read_dataset(model, task, split)``
    and the labels with ``read_dataset('label', task, split)``; ``dict_2_arr``
    combines the two into the ``X, y`` pair handed to the classifier.

    Parameters
    ----------
    model : str
        Dataset name forwarded to ``read_dataset`` for the features; it is also
        used as the prefix of the printed log lines.
    task : str
        Task name forwarded to ``read_dataset`` for both features and labels.
    classifer : object
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    scale : bool, default True
        When true, a ``StandardScaler`` is fitted on the train split only and
        then applied to the train split and to every evaluation split.
    avgr : callable, default mean over axis 0
        Forwarded unchanged to ``dict_2_arr``.
        NOTE(review): presumably this pools each sample's embedding into a
        fixed-size vector -- confirm against ``read_scripts.dict_2_arr``.
    eval_splits : iterable of str
        Names of the splits scored after fitting, in order. The default is the
        list that was previously hard-coded in the loop.

    Returns
    -------
    object
        The ``classifer`` object that was passed in, after ``fit`` was called.
    """

    def load_split(split_name):
        # Single place that pairs a split's features with its labels.
        return dict_2_arr(
            read_dataset(model, task, split_name),
            read_dataset('label', task, split_name),
            avgr,
        )

    # The header used to read "svm_base_eval", which mislabelled runs that use
    # other classifiers; it now carries this function's actual name.
    print(f"classifer_base_eval for model: {model}, task: {task}")
    print("using classifier: ", classifer)
    print("==========================================================")

    print("\nTRAIN")
    split = 'train'
    X, y = load_split(split)
    scaler = None
    if scale:
        print(f"scaling {split}")
        # Scaling statistics come from the train split only, so the held-out
        # splits never influence the transform.
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

    print("fitting classifer...")
    start = timer()
    classifer.fit(X, y)
    end = timer()
    print(f"{model} {split} fit time: ", timedelta(seconds=end-start))
    score = classifer.score(X, y)
    print(f"{model} {split} score: \n", score)

    for split in eval_splits:
        X, y = load_split(split)
        if scaler is not None:
            # Reuse the scaler fitted on train.
            X = scaler.transform(X)

        score = classifer.score(X, y)
        print(f"{model} {split} score: \n", score)

    return classifer


In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the "elmo" features for remote homology.
# max_iter=5000 raises the solver's iteration cap for this run.
clf = LogisticRegression(max_iter=5000)
clf = classifer_base_eval(model="elmo", task="remote_homology", classifer=clf)

svm_base_eval for model: elmo, task: remote_homology
using classifier:  LogisticRegression(max_iter=5000)

TRAIN
transforming data...
scaling train
fitting classifer...
elmo train fit time:  0:16:47.748088
elmo train score: 
 1.0
transforming data...
elmo valid score: 
 0.3555858310626703
transforming data...
elmo test_fold_holdout score: 
 0.24233983286908078
transforming data...
elmo test_superfamily_holdout score: 
 0.43370607028753994
transforming data...
elmo test_family_holdout score: 
 0.9316037735849056


In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the "unirep" features for remote homology.
# max_iter=5000 raises the solver's iteration cap for this run.
clf = LogisticRegression(max_iter=5000)
clf = classifer_base_eval(model="unirep", task="remote_homology", classifer=clf)

svm_base_eval for model: unirep, task: remote_homology
using classifier:  LogisticRegression(max_iter=5000)

TRAIN
transforming data...
scaling train
fitting classifer...
unirep train fit time:  0:53:31.188048
unirep train score: 
 1.0
transforming data...
unirep valid score: 
 0.279291553133515
transforming data...
unirep test_fold_holdout score: 
 0.22423398328690808
transforming data...
unirep test_superfamily_holdout score: 
 0.33706070287539935
transforming data...
unirep test_family_holdout score: 
 0.8584905660377359


In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the "transformer" features for remote homology.
# max_iter=5000 raises the solver's iteration cap for this run.
clf = LogisticRegression(max_iter=5000)
clf = classifer_base_eval(model="transformer", task="remote_homology", classifer=clf)

svm_base_eval for model: transformer, task: remote_homology
using classifier:  LogisticRegression(max_iter=5000)

TRAIN
transforming data...
scaling train
fitting classifer...
transformer train fit time:  0:20:05.775981
transformer train score: 
 1.0
transforming data...
transformer valid score: 
 0.3528610354223433
transforming data...
transformer test_fold_holdout score: 
 0.23259052924791088
transforming data...
transformer test_superfamily_holdout score: 
 0.3985623003194888
transforming data...
transformer test_family_holdout score: 
 0.9135220125786163


In [None]:
import os
# Audible "finished" signal: ask the shell to print the BEL character three
# times, then report completion. Python turns '\a' into BEL (0x07) before the
# shell sees the command; whether a sound is actually heard depends on the
# terminal the kernel is attached to.
for _ in range(3):
    os.system("printf '\a'")
print("Done!")