In [1]:
import os
import numpy as np
import pandas as pd
from IPython.display import display, clear_output
from sklearn.metrics import mean_absolute_error, r2_score
from src.seq2features import Transformer, GetModels, W2V_Model
from src.runBuilder import RunBuilder
from src.regress import get_CV_MAE, get_CV_MAE_combined, get_test_score,get_test_score_combine


In [2]:
def getTop(df, of, at):
    """Return the rows of ``df`` that hold the smallest ``at`` value in their group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    of : str or list of str
        Column name(s) used as the ``groupby`` key.
    at : str
        Column whose per-group minimum selects the rows to keep.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index and columns preserved) where
        ``df[at]`` equals the minimum of ``at`` within the row's group.
        Rows that tie for the minimum are all kept.
    """
    # Use the string alias 'min' rather than the builtin ``min``: pandas
    # deprecates passing builtin callables to groupby transform/agg and
    # recommends the string to keep the current (groupby-min) behaviour.
    group_minimum = df.groupby(of)[at].transform('min')
    return df[group_minimum == df[at]]

In [3]:
# Load the ProtVec model(s) matching this parameter grid
# (alphabet='prot_vec', kGram=3, window=5, vecSize=100) from model_loc.
model_loc = './model-creator/models/'
modelComb = dict(
    alphabet=['prot_vec'],
    kGram=[3],
    window=[5],
    vecSize=[100],
)
protVecs = GetModels.from_param_dict(model_loc, modelComb)

In [4]:
# Load both reduced-alphabet models ('hydro' and 'conf_simil') in a single
# call, using the same kGram/window/vecSize settings as the ProtVec model.
model_loc = './model-creator/models/'
modelComb = dict(
    alphabet=['hydro', 'conf_simil'],
    kGram=[3],
    window=[5],
    vecSize=[100],
)
alphabets = GetModels.from_param_dict(model_loc, modelComb)

In [5]:
# Load the two alphabets separately so they can later be passed as two
# independent model lists (e.g. to get_CV_MAE_combined).
model_loc = './model-creator/models/'

# 'hydro' alphabet only.
modelComb = dict(alphabet=['hydro'], kGram=[3], window=[5], vecSize=[100])
alphabets1 = GetModels.from_param_dict(model_loc, modelComb)

# 'conf_simil' alphabet only.
modelComb = dict(alphabet=['conf_simil'], kGram=[3], window=[5], vecSize=[100])
alphabets2 = GetModels.from_param_dict(model_loc, modelComb)



# Dataset

In [6]:
# Read the T50 regression table and preview its first five rows.
dset = pd.read_csv(filepath_or_buffer='./dataset/regression/T50.txt')
dset.head(5)


Unnamed: 0,name,sequence,T50,is_train,m
0,0,MTIKEMPQPKTFGELKNLPLLNTDKPVQALMKIADELGEIFKFEAP...,55.0,True,0
1,11111111,KETSPIPQPKTFGPLGNLPLIDKDKPTLSLIKLAEEQGPIFQIHTP...,43.0,True,0
2,22222222,KQASAIPQPKTYGPLKNLPHLEKEQLSQSLWRIADELGPIFRFDFP...,49.0,True,0
3,21122121,KQASAIPQPKTYGPLKNLPHLEKEQLSQSLWRIADELGPIFRFDFP...,39.8,True,70
4,21202122,KQASAIPQPKTYGPLKNLPHLEKEQLSQSLWRIADELGPIFRFDFP...,52.9,True,55


In [7]:
# Model inputs come from the 'sequence' column, the target from 'T50'.
xData, yData = dset['sequence'], dset['T50']

# Boolean row masks derived from the 'is_train' flag stored in the dataset.
Train_indx = dset['is_train'].eq(True)
Test_indx = dset['is_train'].eq(False)


In [8]:
# Inspect the list of alphabet models returned by GetModels.from_param_dict.
alphabets

[<src.seq2features.W2V_Model at 0x29e12590fd0>,
 <src.seq2features.W2V_Model at 0x29e125bb080>]

# Only ProtVec

In [9]:
# Cross-validation scores for the ProtVec model on its own; the trailing None
# shows up as protVec == 'w/o' in the resulting table.
onlyProtVec = get_CV_MAE(
    xData, yData,
    Train_indx, Test_indx,
    protVecs, None,
)
onlyProtVec

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,prot_vec,3,5,100,w/o,2.469304,2.466089,0


In [10]:
# For every (Model, protVec) pair keep the run(s) with the lowest mean CV MAE.
top_onlyProtVec = getTop(
    onlyProtVec,
    of=['Model', 'protVec'],
    at='mean_cv_mean_absolute_error',
)
top_onlyProtVec

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,prot_vec,3,5,100,w/o,2.469304,2.466089,0


# Only Alphabets

In [11]:
# Cross-validation scores for each alphabet model on its own. The model list
# is alphabets1 followed by alphabets2; runID in the result follows that order.
onlyAlphabets = get_CV_MAE(
    xData, yData,
    Train_indx, Test_indx,
    alphabets1 + alphabets2, None,
)
onlyAlphabets

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,w/o,2.808596,2.839244,0
1,conf_simil,3,5,100,w/o,2.43454,2.434674,1


In [12]:
# For every (Model, protVec) pair keep the run(s) with the lowest mean CV MAE.
top_onlyAlphabets = getTop(
    onlyAlphabets,
    of=['Model', 'protVec'],
    at='mean_cv_mean_absolute_error',
)
top_onlyAlphabets

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,w/o,2.808596,2.839244,0
1,conf_simil,3,5,100,w/o,2.43454,2.434674,1


# Alphabets with Top protVec

In [13]:
# runID of the first best-scoring row indexes into the protVecs list.
topProtVec = protVecs[top_onlyProtVec['runID'].iloc[0]]

In [14]:
# Cross-validation scores for every alphabet model paired with every ProtVec
# model (the full protVecs list is passed, not only topProtVec).
alpha_with_ProtVec = get_CV_MAE_combined(
    xData, yData,
    Train_indx, Test_indx,
    alphabets,
    protVecs,
)
alpha_with_ProtVec

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,prot_vec_G3_S100_W5,2.540585,2.540623,0
1,conf_simil,3,5,100,prot_vec_G3_S100_W5,2.393596,2.396276,1


In [15]:
# For every alphabet Model keep the pairing with the lowest mean CV MAE.
top_alpha_with_ProtVec = getTop(
    alpha_with_ProtVec,
    of=['Model'],
    at='mean_cv_mean_absolute_error',
)
top_alpha_with_ProtVec

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,prot_vec_G3_S100_W5,2.540585,2.540623,0
1,conf_simil,3,5,100,prot_vec_G3_S100_W5,2.393596,2.396276,1


In [16]:
# Cross-validation scores for the hydro models (alphabets1) paired with the
# conf_simil models (alphabets2).
alpha_comb = get_CV_MAE_combined(
    xData, yData,
    Train_indx, Test_indx,
    alphabets1,
    alphabets2,
)
alpha_comb

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,conf_simil_G3_S100_W5,2.512195,2.510173,0


In [17]:
# For every Model keep the alphabet pairing with the lowest mean CV MAE.
top_alpha_comb = getTop(
    alpha_comb,
    of=['Model'],
    at='mean_cv_mean_absolute_error',
)
top_alpha_comb

Unnamed: 0,Model,kGram,window,vecSize,protVec,mean_cv_mean_absolute_error,median_cv_mean_absolute_error,runID
0,hydro,3,5,100,conf_simil_G3_S100_W5,2.512195,2.510173,0


# Test Scores for Top Models

## only ProtVec

In [18]:
# Held-out test scores for the best ProtVec model on its own. The model is
# wrapped in a one-element list, and no second embedding is supplied (None).
top_onlyProtVec_test = get_test_score(
    xData, yData,
    Train_indx, Test_indx,
    [topProtVec], None,
)
top_onlyProtVec_test

Unnamed: 0,Model,kGram,window,vecSize,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau
0,prot_vec,3,5,100,w/o,242,19,C:10 epsilon:0.0001 gamma:0.005 kernel:rbf,1.543,2.728796,0.470119,0.464714


## only Alphabet

In [19]:
# The runID values in top_onlyAlphabets were produced by
# get_CV_MAE(..., alphabets1+alphabets2, None), so they index that list.
# Look the models up in the same list instead of `alphabets`, which only
# matches when both lists happen to share the same order.
cv_alphabet_models = alphabets1 + alphabets2
top_onlyAlphabets_models = [cv_alphabet_models[x] for x in top_onlyAlphabets.runID]

# Held-out test scores for each top alphabet model on its own (no ProtVec).
top_onlyAlphabets_test = get_test_score(xData, yData, Train_indx, Test_indx, top_onlyAlphabets_models, None)
top_onlyAlphabets_test

Unnamed: 0,Model,kGram,window,vecSize,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau
0,hydro,3,5,100,w/o,242,19,C:100 epsilon:0.0001 gamma:0.001 kernel:rbf,1.733123,3.492769,0.470391,0.417654
1,conf_simil,3,5,100,w/o,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.602339,2.97599,0.369879,0.523538


## Alphabet with top ProtVec

In [20]:
# Split each winning runID of the alphabets x protVecs grid into its two
# list positions: quotient -> alphabet model, remainder -> ProtVec model.
top_alpha_with_ProtVec_models = [
    alphabets[run_id // len(protVecs)]
    for run_id in top_alpha_with_ProtVec['runID'].to_numpy()
]
top_protvec_with_alpha_models = [
    protVecs[run_id % len(protVecs)]
    for run_id in top_alpha_with_ProtVec['runID'].to_numpy()
]

# Held-out test scores for each (alphabet, ProtVec) pairing selected above.
top_alpha_with_ProtVec_test = get_test_score_combine(
    xData, yData,
    Train_indx, Test_indx,
    top_alpha_with_ProtVec_models,
    top_protvec_with_alpha_models,
)
top_alpha_with_ProtVec_test

Unnamed: 0,Model,kGram,window,vecSize,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau
0,hydro,3,5,100,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.485334,2.888166,0.490477,0.500009
1,conf_simil,3,5,100,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.417516,2.718703,0.381847,0.523538


# Alphabet Model Combination

In [21]:
# Decode the winning runID of the alphabets1 x alphabets2 grid into its two
# models. This follows the same decoding this notebook uses for the
# alphabets x protVecs grid: first index = runID // len(second list),
# second index = runID % len(second list). The divisor must therefore be
# len(alphabets2) for both; the original divided by len(alphabets1), which is
# only correct when both lists have the same length.
# NOTE(review): assumes get_CV_MAE_combined numbers runs row-major over
# (first list, second list) -- confirm against src.regress.
alpha1 = alphabets1[(top_alpha_comb.runID // len(alphabets2)).values[0]]
alpha2 = alphabets2[(top_alpha_comb.runID % len(alphabets2)).values[0]]

# A nested list asks get_test_score to evaluate the two models together.
top_alpha_combi = [[alpha1, alpha2]]

top_alpha_combi_test = get_test_score(xData, yData, Train_indx, Test_indx, top_alpha_combi, None)
top_alpha_combi_test

Unnamed: 0,Model0,Model1,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau
0,hydro_G3_S100_W5,conf_simil_G3_S100_W5,w/o,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.575778,2.861585,0.41555,0.558833


# Alphabet Model Combination with ProtVec

In [22]:
# The runID values in top_onlyAlphabets index the list that was given to
# get_CV_MAE (alphabets1 + alphabets2), so resolve the models from that same
# list rather than from `alphabets`, which matches only by coincidence of order.
cv_alphabet_models = alphabets1 + alphabets2
top_alpha_combi = [[cv_alphabet_models[x] for x in top_onlyAlphabets.runID]]

# Held-out test scores for the combined alphabet models together with the best
# ProtVec model.
top_alpha_combi_with_ProtVec_test = get_test_score(xData, yData, Train_indx, Test_indx, top_alpha_combi, topProtVec)
top_alpha_combi_with_ProtVec_test

Unnamed: 0,Model0,Model1,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau
0,hydro_G3_S100_W5,conf_simil_G3_S100_W5,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.473863,3.057121,0.350615,0.511774


In [23]:
# Stack all test-score tables into one summary. sort=False keeps the column
# order of first appearance; columns missing from a table are filled with NaN.
test_csv = pd.concat(
    [
        top_onlyProtVec_test,
        top_onlyAlphabets_test,
        top_alpha_with_ProtVec_test,
        top_alpha_combi_test,
        top_alpha_combi_with_ProtVec_test,
    ],
    sort=False,
)
test_csv

Unnamed: 0,Model,kGram,window,vecSize,protVec,Train,Test,Parameters,Train_mean_absolute_error,mean_absolute_error,r2_score,kendalltau,Model0,Model1
0,prot_vec,3.0,5.0,100.0,w/o,242,19,C:10 epsilon:0.0001 gamma:0.005 kernel:rbf,1.543,2.728796,0.470119,0.464714,,
0,hydro,3.0,5.0,100.0,w/o,242,19,C:100 epsilon:0.0001 gamma:0.001 kernel:rbf,1.733123,3.492769,0.470391,0.417654,,
1,conf_simil,3.0,5.0,100.0,w/o,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.602339,2.97599,0.369879,0.523538,,
0,hydro,3.0,5.0,100.0,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.485334,2.888166,0.490477,0.500009,,
1,conf_simil,3.0,5.0,100.0,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.417516,2.718703,0.381847,0.523538,,
0,,,,,w/o,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.575778,2.861585,0.41555,0.558833,hydro_G3_S100_W5,conf_simil_G3_S100_W5
0,,,,,prot_vec_G3_S100_W5,242,19,C:100 epsilon:0.1 gamma:0.001 kernel:rbf,1.473863,3.057121,0.350615,0.511774,hydro_G3_S100_W5,conf_simil_G3_S100_W5
