## Evaluation different embedding methods
### 1. Import packages

In [7]:
import pandas as pd
import numpy as np
import config as cfg
import benchmark_common as bcommon
import benchmark_train as btrain
from sklearn.model_selection import train_test_split
import tools.funclib as funclib

### 2. Loading data

In [8]:
print('step 1 loading task data')
data_task1_train = pd.read_feather(cfg.FILE_TASK1_TRAIN)
data_task1_test_2020 = pd.read_feather(cfg.FILE_TASK1_TEST_2020)
data_task1_test_2022 = pd.read_feather(cfg.FILE_TASK1_TEST_2022)

step 1 loading task data


In [9]:
embd_method = 'esm32'
print(f'step 2: Loading features, embdding method={embd_method}')
feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

print('step 3: train isEnzyme model')
task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
task1_test_X_2020, task1_test_Y_2020 = btrain.get_train_X_Y(traindata=data_task1_test_2020, feature_bankfile=feature_df, task=1)
task1_test_X_2022, task1_test_Y_2022 = btrain.get_train_X_Y(traindata=data_task1_test_2022, feature_bankfile=feature_df, task=1)

t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
# t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X_2020, task1_test_Y_2020)]

# methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
# # methods=['xg', 'dt', 'rf', 'gbdt']
# for method in methods:
#     funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

funclib.evaluate_2(baslineName='knn', X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X_2020, Y_test=task1_test_Y_2020, type='binary')
funclib.evaluate_2(baslineName='knn', X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X_2022, Y_test=task1_test_Y_2022, type='binary')

step 2: Loading features, embdding method=esm32
step 3: train isEnzyme model
knn 		0.922546 	0.936865 		0.911117 	0.893765 	0.914808 	 tp: 2953 fp: 199 fn: 351 tn: 3598
knn 		0.917656 	0.938159 		0.900502 	0.887498 	0.912125 	 tp: 4536 fp: 299 fn: 575 tn: 5204


### 3. Evaluation

In [3]:
EMBEDDING_METHODs = ['unirep', 'esm0', 'esm33', 'one-hot' , 'esm32']
for embd_method in EMBEDDING_METHODs:
    print(f'step 2: Loading features, embdding method={embd_method}')
    feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

    print('step 3: train isEnzyme model')
    task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
    task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test, feature_bankfile=feature_df, task=1)
    t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
    t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

    methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
    # methods=['xg', 'dt', 'rf', 'gbdt']
    for method in methods:
        funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

step 2: Loading features, embdding method=unirep
step 3: train isEnzyme model
knn 		0.882325 	0.889943 		0.875662 	0.862258 	0.875882 	 tp: 4407 fp: 545 fn: 704 tn: 4958
lr 		0.869324 	0.899914 		0.845418 	0.819800 	0.857991 	 tp: 4190 fp: 466 fn: 921 tn: 5037
xg 		0.866403 	0.914664 		0.831521 	0.796909 	0.851736 	 tp: 4073 fp: 380 fn: 1038 tn: 5123
dt 		0.819013 	0.849781 		0.795837 	0.758169 	0.801365 	 tp: 3875 fp: 685 fn: 1236 tn: 4818
rf 		0.887319 	0.933747 		0.852975 	0.824496 	0.875727 	 tp: 4214 fp: 299 fn: 897 tn: 5204
gbdt 		0.857076 	0.911778 		0.818880 	0.778517 	0.839894 	 tp: 3979 fp: 385 fn: 1132 tn: 5118
step 2: Loading features, embdding method=esm0
step 3: train isEnzyme model
knn 		0.812983 	0.797148 		0.828539 	0.820387 	0.808601 	 tp: 4193 fp: 1067 fn: 918 tn: 4436
lr 		0.760693 	0.751320 		0.769413 	0.751908 	0.751614 	 tp: 3843 fp: 1272 fn: 1268 tn: 4231
xg 		0.819861 	0.827565 		0.813296 	0.790648 	0.808685 	 tp: 4041 fp: 842 fn: 1070 tn: 4661
dt 		0.748634 	0

In [4]:
EMBEDDING_METHODs = ['unirep', 'esm0', 'esm33', 'one-hot' , 'esm32']
for embd_method in EMBEDDING_METHODs:
    print(f'step 2: Loading features, embdding method={embd_method}')
    feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

    print('step 3: train isEnzyme model')
    task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
    task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test_2022, feature_bankfile=feature_df, task=1)
    t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
    t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

    methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
    # methods=['xg', 'dt', 'rf', 'gbdt']
    for method in methods:
        funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

step 2: Loading features, embdding method=unirep
step 3: train isEnzyme model
knn 		0.882325 	0.889943 		0.875662 	0.862258 	0.875882 	 tp: 4407 fp: 545 fn: 704 tn: 4958
lr 		0.869324 	0.899914 		0.845418 	0.819800 	0.857991 	 tp: 4190 fp: 466 fn: 921 tn: 5037
xg 		0.866403 	0.914664 		0.831521 	0.796909 	0.851736 	 tp: 4073 fp: 380 fn: 1038 tn: 5123
dt 		0.815904 	0.845329 		0.793646 	0.756016 	0.798182 	 tp: 3864 fp: 707 fn: 1247 tn: 4796
rf 		0.887319 	0.933747 		0.852975 	0.824496 	0.875727 	 tp: 4214 fp: 299 fn: 897 tn: 5204
gbdt 		0.857076 	0.911778 		0.818880 	0.778517 	0.839894 	 tp: 3979 fp: 385 fn: 1132 tn: 5118
step 2: Loading features, embdding method=esm0
step 3: train isEnzyme model
knn 		0.813360 	0.797415 		0.829036 	0.820974 	0.809023 	 tp: 4196 fp: 1066 fn: 915 tn: 4437
lr 		0.760317 	0.751027 		0.768951 	0.751321 	0.751174 	 tp: 3840 fp: 1273 fn: 1271 tn: 4230
xg 		0.821274 	0.829034 		0.814660 	0.792213 	0.810205 	 tp: 4049 fp: 835 fn: 1062 tn: 4668
dt 		0.740531 	0

In [None]:
print(f'step 2: Loading features, embdding method={embd_method}')
feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

print('step 3: train isEnzyme model')
task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test, feature_bankfile=feature_df, task=1)
t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
# methods=['xg', 'dt', 'rf', 'gbdt']
for method in methods:
    funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

In [3]:
bcommon.load_data_embedding(embedding_type='esm0')

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,P84233,-0.146607,0.001917,0.023837,0.032904,-0.007226,0.019266,0.002471,-0.056562,0.004020,...,0.076818,-0.100577,-0.017798,0.084299,0.003090,0.097957,0.004955,0.001013,-0.010846,0.083537
1,P0A7F3,-0.198304,0.008354,0.019647,0.029310,-0.018305,0.045471,0.075197,-0.044946,-0.003504,...,0.057923,-0.086899,-0.014135,0.082852,0.006422,0.123944,0.019837,0.004477,0.002239,0.069839
2,P03212,-0.198705,0.011568,0.028825,0.039880,-0.014772,0.022454,0.045277,0.001027,0.002064,...,0.053603,-0.093371,-0.010378,0.082583,0.001769,0.104650,0.023149,0.009615,-0.015051,0.070366
3,P01158,0.429650,0.021301,0.058200,0.021442,-0.042830,-0.407126,-0.226068,0.249569,-0.051777,...,-0.001685,-0.192207,0.037732,-0.007199,0.187262,0.146447,0.024233,-0.028932,0.168984,0.008123
4,P0A6U4,-0.120271,0.002771,0.017928,0.021672,-0.045205,0.112135,-0.020785,0.029727,-0.018858,...,0.007001,-0.082027,-0.018403,0.026921,0.032905,0.038215,0.009250,-0.001736,-0.123505,0.047905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499435,A5WW24,-0.016626,0.000310,0.004762,0.003728,-0.016473,0.018177,-0.012397,-0.029998,-0.006616,...,0.005417,-0.049407,0.001244,0.020916,0.013201,0.023115,-0.000529,0.003491,-0.028188,0.004180
499436,Q7SBA0,-0.192217,-0.003872,0.013206,0.033440,0.000400,-0.059870,0.041330,0.018678,-0.018491,...,0.015614,-0.092371,-0.026066,0.078957,0.017766,0.081856,0.024689,0.004242,0.007449,0.099686
499437,P0DW10,-0.204385,-0.003039,0.014855,0.028831,0.003722,-0.068608,0.051244,0.035182,-0.020905,...,0.017614,-0.089416,-0.030291,0.071393,0.028361,0.087015,0.021463,0.003824,0.001678,0.085389
499438,C9DG80,-0.139306,0.004557,0.018049,0.025205,-0.000576,0.008378,0.031189,0.036802,-0.026377,...,0.018757,-0.103098,-0.020845,0.076714,0.029975,0.126174,0.027243,-0.001471,0.047446,0.096782


In [4]:
bcommon.load_data_embedding(embedding_type='esm32')

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,P84233,-0.146607,0.001917,0.023837,0.032904,-0.007226,0.019266,0.002471,-0.056562,0.004020,...,0.076818,-0.100577,-0.017798,0.084299,0.003090,0.097957,0.004955,0.001013,-0.010846,0.083537
1,P0A7F3,-0.198304,0.008354,0.019647,0.029310,-0.018305,0.045471,0.075197,-0.044946,-0.003504,...,0.057923,-0.086899,-0.014135,0.082852,0.006422,0.123944,0.019837,0.004477,0.002239,0.069839
2,P03212,-0.198705,0.011568,0.028825,0.039880,-0.014772,0.022454,0.045277,0.001027,0.002064,...,0.053603,-0.093371,-0.010378,0.082583,0.001769,0.104650,0.023149,0.009615,-0.015051,0.070366
3,P01158,0.429650,0.021301,0.058200,0.021442,-0.042830,-0.407126,-0.226068,0.249569,-0.051777,...,-0.001685,-0.192207,0.037732,-0.007199,0.187262,0.146447,0.024233,-0.028932,0.168984,0.008123
4,P0A6U4,-0.120271,0.002771,0.017928,0.021672,-0.045205,0.112135,-0.020785,0.029727,-0.018858,...,0.007001,-0.082027,-0.018403,0.026921,0.032905,0.038215,0.009250,-0.001736,-0.123505,0.047905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503248,A5WW24,12.929022,6.188353,-14.868094,-24.106243,-6.759967,-0.711146,2.120794,-14.952251,-2.365670,...,12.572624,7.570918,-14.053569,-8.367974,6.807021,22.575987,4.863638,13.445599,-13.337751,-5.242069
503249,Q7SBA0,-0.403294,4.927026,6.466229,3.745415,-8.426816,-5.004267,-10.816579,-3.676574,-17.562426,...,18.616261,-3.098237,-7.562515,-3.094149,-14.189250,-2.007169,15.152642,-4.724200,-6.876630,14.522579
503250,P0DW10,-0.309852,4.897358,6.892168,3.506357,-9.486552,-5.952254,-10.688879,-3.418377,-15.975063,...,20.828060,-3.560479,-6.202727,-3.415660,-13.150867,-0.568852,16.026691,-4.340854,-8.537683,15.001244
503251,C9DG80,5.572292,1.661716,1.281703,5.881115,-12.469779,-0.236342,-1.271194,0.140252,-6.617912,...,18.437107,-1.917273,3.563461,-5.729240,-24.487797,1.210020,7.129851,0.015560,-4.438664,1.182109
