In [43]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from read_script import read_dataset

In [21]:
from tqdm import tqdm_notebook as tqdm

In [24]:
# Converts the dict to an array by averaging each protein embedding across
# its sequence axis (axis 0).
def converDictToArrAndAverageEmbedding(data_dict, labels):
    """Mean-pool every entry of ``data_dict`` into one row of a feature matrix.

    Parameters
    ----------
    data_dict : mapping
        Maps a key to a 2-D array. The width (``shape[1]``) of the first value
        determines the number of feature columns.
    labels : mapping
        Maps the same keys to their target values.

    Returns
    -------
    X : np.ndarray, shape (len(data_dict), emb_size)
        Row ``i`` is the mean over axis 0 of the ``i``-th entry, in the
        iteration order of ``data_dict``.
    y : np.ndarray, shape (len(data_dict),)
        The label of each row, stored as float.
    """
    n_items = len(data_dict)
    first_entry = list(data_dict.values())[0]
    emb_size = first_entry.shape[1]

    X = np.zeros((n_items, emb_size))
    y = np.zeros(n_items)

    for row, key in enumerate(data_dict):
        X[row] = np.mean(data_dict[key], axis=0)
        y[row] = labels[key]

    return X, y


In [25]:
def convertDictToArrAndRandomAverage(data_dict, labels, sample_size=50, seed=42):
    """Average a random subset of rows of every entry into one feature row.

    For each key, up to ``sample_size`` rows of the 2-D array are drawn
    without replacement and averaged over axis 0. Entries with fewer rows
    than ``sample_size`` use all of their rows.

    Parameters
    ----------
    data_dict : mapping
        Maps a key to a 2-D array. The width (``shape[1]``) of the first value
        determines the number of feature columns.
    labels : mapping
        Maps the same keys to their target values.
    sample_size : int, default 50
        Maximum number of rows sampled per entry.
    seed : int, default 42
        Seed of the private random generator, so every call with the same
        input draws the same rows.

    Returns
    -------
    X : np.ndarray, shape (len(data_dict), emb_size)
        One averaged row per entry, in the iteration order of ``data_dict``.
    y : np.ndarray, shape (len(data_dict),)
        The label of each row, stored as float.
    """
    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, but leaves the process-wide NumPy RNG
    # untouched, so calling this converter no longer resets global randomness.
    rng = np.random.RandomState(seed)

    emb_size = list(data_dict.values())[0].shape[1]
    X = np.zeros((len(data_dict), emb_size))
    y = np.zeros(len(data_dict))

    for i, key in enumerate(tqdm(data_dict)):
        x = data_dict[key]
        n_rows, _ = x.shape
        random_indices = rng.choice(n_rows, size=min(sample_size, n_rows), replace=False)

        X[i] = np.mean(x[random_indices, :], axis=0)
        y[i] = labels[key]

    return X, y

In [40]:
def converDictToArrAndAverageMax(data_dict, labels, scaler=None):
    """Build standardised [max, mean] features for every entry of ``data_dict``.

    For each key the column-wise maximum and the column-wise mean of the 2-D
    array are concatenated (max first, then mean), giving ``2 * emb_size``
    features per entry. The resulting matrix is then standardised.

    Parameters
    ----------
    data_dict : mapping
        Maps a key to a 2-D array. The width (``shape[1]``) of the first value
        determines ``emb_size``.
    labels : mapping
        Maps the same keys to their target values.
    scaler : fitted scaler or None, default None
        When ``None`` a fresh ``StandardScaler`` is fit on this call's
        features (the original behaviour). When given, its ``transform`` is
        applied instead, so validation/test features can be scaled with the
        statistics learned on the training split.

    Returns
    -------
    X : np.ndarray, shape (len(data_dict), 2 * emb_size)
        Standardised features, in the iteration order of ``data_dict``.
    y : np.ndarray, shape (len(data_dict),)
        The label of each row, stored as float.
    """
    emb_size = list(data_dict.values())[0].shape[1]
    X = np.zeros((len(data_dict), emb_size * 2))
    y = np.zeros(len(data_dict))

    for i, key in enumerate(data_dict):
        x = data_dict[key]
        ma = np.amax(x, axis=0)
        me = np.mean(x, axis=0)

        # axis=None flattens both parts into a single 1-D feature vector.
        X[i] = np.concatenate((ma, me), axis=None)
        y[i] = labels[key]

    if scaler is None:
        # NOTE: fitting per call means each split is scaled with its own
        # statistics; pass a scaler fitted on the training features to avoid
        # that mismatch between train and evaluation data.
        X = StandardScaler().fit_transform(X)
    else:
        X = scaler.transform(X)

    return X, y


In [26]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

def fit_multiclass_svm(X_dict, y_dict, converter):
    """Convert the raw inputs to arrays and fit a one-vs-rest linear SVM.

    Parameters
    ----------
    X_dict, y_dict
        Raw features and labels, passed unchanged to ``converter``.
    converter : callable
        ``converter(X_dict, y_dict)`` must return ``(X, y)`` arrays.

    Returns
    -------
    The fitted ``OneVsRestClassifier`` wrapping a ``LinearSVC``.
    """
    print("reshape data...")
    features, targets = converter(X_dict, y_dict)
    for tag, arr in (("X shape: ", features), ("y shape: ", targets)):
        print(tag, arr.shape)

    print("fitting svm...")
    model = OneVsRestClassifier(LinearSVC())
    model.fit(features, targets)
    return model

In [27]:
def score_clf(X_dict, y_dict, clf, converter):
    """Convert the raw inputs to arrays and score a fitted classifier on them.

    Parameters
    ----------
    X_dict, y_dict
        Raw features and labels, passed unchanged to ``converter``.
    clf
        Fitted estimator exposing ``score(X, y)``.
    converter : callable
        ``converter(X_dict, y_dict)`` must return ``(X, y)`` arrays.

    Returns
    -------
    Whatever ``clf.score(X, y)`` returns.
    """
    print("reshape data...")
    features, targets = converter(X_dict, y_dict)
    print("X shape: ", features.shape)
    print("y shape: ", targets.shape)

    print("scoring svm...")
    return clf.score(features, targets)
    

In [30]:
from timeit import default_timer as timer
from datetime import timedelta


def svm_base_eval(model, task, converter=None):
    """Fit an SVM on the train split and report valid/test scores.

    Reads the ``train``, ``valid`` and ``test`` splits for ``model``/``task``
    via ``read_dataset`` (labels are read with the model name ``'label'``),
    fits a classifier with ``fit_multiclass_svm`` and prints the fit time and
    the scores returned by ``score_clf``.

    Parameters
    ----------
    model : str
        Name passed to ``read_dataset`` to select the embeddings
        (e.g. ``"elmo"``, ``"unirep"``).
    task : str
        Task name passed to ``read_dataset`` (e.g. ``"remote_homology"``).
    converter : callable or None, default None
        ``converter(X_dict, y_dict) -> (X, y)`` used for every split. When
        omitted, ``converDictToArrAndAverageEmbedding`` is used, so the
        two-argument baseline calls in this notebook keep working.

    Returns
    -------
    None. Results are printed.
    """
    if converter is None:
        # Resolved at call time so the default follows the notebook's
        # current definition of the mean-pooling converter.
        converter = converDictToArrAndAverageEmbedding

    print(f"svm_base_eval for model: {model}, task: {task}")
    print("===============================================")

    print("\nTRAIN")
    # fit the SVM on the train split of the requested model/task
    X_train = read_dataset(model, task, 'train')
    y_train = read_dataset('label', task, 'train')

    start = timer()
    clf = fit_multiclass_svm(X_train, y_train, converter)
    end = timer()
    print("fit time: ", timedelta(seconds=end-start))

    print("\nVALID")
    # validate the SVM on the valid split
    X_valid = read_dataset(model, task, 'valid')
    y_valid = read_dataset('label', task, 'valid')

    valid_score = score_clf(X_valid, y_valid, clf, converter)

    print(f"{model}  valid score: ", valid_score)

    print("\nTEST")
    # test the SVM on the test split
    X_test = read_dataset(model, task, 'test')
    y_test = read_dataset('label', task, 'test')

    test_score = score_clf(X_test, y_test, clf, converter)

    print(f"{model} test score: ", test_score)


In [31]:
%%time
# Fit and evaluate the SVM on ELMo embeddings for remote homology, pooling each
# protein by averaging a random sample of at most 50 of its rows.
svm_base_eval("elmo", "remote_homology", convertDictToArrAndRandomAverage)

svm_base_eval for model: elmo, task: remote_homology

TRAIN
reshape data...


HBox(children=(IntProgress(value=0, max=12305), HTML(value='')))


X shape:  (12305, 1024)
y shape:  (12305,)
fitting svm...
fit time:  0:09:22.084535

VALID
reshape data...


HBox(children=(IntProgress(value=0, max=734), HTML(value='')))


X shape:  (734, 1024)
y shape:  (734,)
scoring svm...
elmo  valid score:  0.37329700272479566

TEST
reshape data...


HBox(children=(IntProgress(value=0, max=718), HTML(value='')))


X shape:  (718, 1024)
y shape:  (718,)
scoring svm...
elmo test score:  0.22841225626740946
CPU times: user 9min 27s, sys: 10.1 s, total: 9min 38s
Wall time: 9min 38s


# baselines

In [16]:
# Baseline on UniRep embeddings. The converter is passed explicitly because
# svm_base_eval takes it as a third argument; the recorded output (1900-wide
# features, no progress bar) corresponds to plain mean pooling.
svm_base_eval("unirep", "remote_homology", converDictToArrAndAverageEmbedding)

svm_base_eval for model: unirep, task: remote_homology

TRAIN
reshape data...
X shape:  (12305, 1900)
y shape:  (12305,)
fitting svm...
fit time:  0:21:43.167036

VALID
reshape data...
X shape:  (734, 1900)
y shape:  (734,)
scoring svm...
unirep  valid score:  0.2847411444141689

TEST
reshape data...
X shape:  (718, 1900)
y shape:  (718,)
scoring svm...
unirep test score:  0.21030640668523676


In [7]:
# Baseline on ELMo embeddings. The converter is passed explicitly because
# svm_base_eval takes it as a third argument; the recorded output (1024-wide
# features, no progress bar) corresponds to plain mean pooling.
svm_base_eval("elmo", "remote_homology", converDictToArrAndAverageEmbedding)

svm_base_eval for model: elmo, task: remote_homology

TRAIN
reshape data...
X shape:  (12305, 1024)
y shape:  (12305,)
fitting svm...
fit time:  0:06:55.138425

VALID
reshape data...
X shape:  (734, 1024)
y shape:  (734,)
scoring svm...
elmo  valid score:  0.38419618528610355

TEST
reshape data...
X shape:  (718, 1024)
y shape:  (718,)
scoring svm...
elmo test score:  0.233983286908078
