## Evaluation different embedding methods
### 1. Import packages

In [7]:
import pandas as pd
import numpy as np
import config as cfg
import benchmark_common as bcommon
import benchmark_train as btrain
from sklearn.model_selection import train_test_split
import tools.funclib as funclib

### 2. Loading data

In [8]:
print('step 1 loading task data')
data_task1_train = pd.read_feather(cfg.FILE_TASK1_TRAIN)
data_task1_test_2020 = pd.read_feather(cfg.FILE_TASK1_TEST_2020)
data_task1_test_2022 = pd.read_feather(cfg.FILE_TASK1_TEST_2022)

step 1 loading task data


In [9]:
embd_method = 'esm32'
print(f'step 2: Loading features, embdding method={embd_method}')
feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

print('step 3: train isEnzyme model')
task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
task1_test_X_2020, task1_test_Y_2020 = btrain.get_train_X_Y(traindata=data_task1_test_2020, feature_bankfile=feature_df, task=1)
task1_test_X_2022, task1_test_Y_2022 = btrain.get_train_X_Y(traindata=data_task1_test_2022, feature_bankfile=feature_df, task=1)

t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
# t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X_2020, task1_test_Y_2020)]

# methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
# # methods=['xg', 'dt', 'rf', 'gbdt']
# for method in methods:
#     funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

funclib.evaluate_2(baslineName='knn', X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X_2020, Y_test=task1_test_Y_2020, type='binary')
funclib.evaluate_2(baslineName='knn', X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X_2022, Y_test=task1_test_Y_2022, type='binary')

step 2: Loading features, embdding method=esm32
step 3: train isEnzyme model
knn 		0.922546 	0.936865 		0.911117 	0.893765 	0.914808 	 tp: 2953 fp: 199 fn: 351 tn: 3598
knn 		0.917656 	0.938159 		0.900502 	0.887498 	0.912125 	 tp: 4536 fp: 299 fn: 575 tn: 5204


### 3. Evaluation

In [3]:
EMBEDDING_METHODs = ['unirep', 'esm0', 'esm33', 'one-hot' , 'esm32']
for embd_method in EMBEDDING_METHODs:
    print(f'step 2: Loading features, embdding method={embd_method}')
    feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

    print('step 3: train isEnzyme model')
    task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
    task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test, feature_bankfile=feature_df, task=1)
    t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
    t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

    methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
    # methods=['xg', 'dt', 'rf', 'gbdt']
    for method in methods:
        funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

step 2: Loading features, embdding method=unirep
step 3: train isEnzyme model
knn 		0.882325 	0.889943 		0.875662 	0.862258 	0.875882 	 tp: 4407 fp: 545 fn: 704 tn: 4958
lr 		0.869324 	0.899914 		0.845418 	0.819800 	0.857991 	 tp: 4190 fp: 466 fn: 921 tn: 5037
xg 		0.866403 	0.914664 		0.831521 	0.796909 	0.851736 	 tp: 4073 fp: 380 fn: 1038 tn: 5123
dt 		0.819013 	0.849781 		0.795837 	0.758169 	0.801365 	 tp: 3875 fp: 685 fn: 1236 tn: 4818
rf 		0.887319 	0.933747 		0.852975 	0.824496 	0.875727 	 tp: 4214 fp: 299 fn: 897 tn: 5204
gbdt 		0.857076 	0.911778 		0.818880 	0.778517 	0.839894 	 tp: 3979 fp: 385 fn: 1132 tn: 5118
step 2: Loading features, embdding method=esm0
step 3: train isEnzyme model
knn 		0.812983 	0.797148 		0.828539 	0.820387 	0.808601 	 tp: 4193 fp: 1067 fn: 918 tn: 4436
lr 		0.760693 	0.751320 		0.769413 	0.751908 	0.751614 	 tp: 3843 fp: 1272 fn: 1268 tn: 4231
xg 		0.819861 	0.827565 		0.813296 	0.790648 	0.808685 	 tp: 4041 fp: 842 fn: 1070 tn: 4661
dt 		0.748634 	0

In [4]:
EMBEDDING_METHODs = ['unirep', 'esm0', 'esm33', 'one-hot' , 'esm32']
for embd_method in EMBEDDING_METHODs:
    print(f'step 2: Loading features, embdding method={embd_method}')
    feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

    print('step 3: train isEnzyme model')
    task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
    task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test_2022, feature_bankfile=feature_df, task=1)
    t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
    t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

    methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
    # methods=['xg', 'dt', 'rf', 'gbdt']
    for method in methods:
        funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')

step 2: Loading features, embdding method=unirep
step 3: train isEnzyme model
knn 		0.882325 	0.889943 		0.875662 	0.862258 	0.875882 	 tp: 4407 fp: 545 fn: 704 tn: 4958
lr 		0.869324 	0.899914 		0.845418 	0.819800 	0.857991 	 tp: 4190 fp: 466 fn: 921 tn: 5037
xg 		0.866403 	0.914664 		0.831521 	0.796909 	0.851736 	 tp: 4073 fp: 380 fn: 1038 tn: 5123
dt 		0.815904 	0.845329 		0.793646 	0.756016 	0.798182 	 tp: 3864 fp: 707 fn: 1247 tn: 4796
rf 		0.887319 	0.933747 		0.852975 	0.824496 	0.875727 	 tp: 4214 fp: 299 fn: 897 tn: 5204
gbdt 		0.857076 	0.911778 		0.818880 	0.778517 	0.839894 	 tp: 3979 fp: 385 fn: 1132 tn: 5118
step 2: Loading features, embdding method=esm0
step 3: train isEnzyme model
knn 		0.813360 	0.797415 		0.829036 	0.820974 	0.809023 	 tp: 4196 fp: 1066 fn: 915 tn: 4437
lr 		0.760317 	0.751027 		0.768951 	0.751321 	0.751174 	 tp: 3840 fp: 1273 fn: 1271 tn: 4230
xg 		0.821274 	0.829034 		0.814660 	0.792213 	0.810205 	 tp: 4049 fp: 835 fn: 1062 tn: 4668
dt 		0.740531 	0

In [None]:
print(f'step 2: Loading features, embdding method={embd_method}')
feature_df = bcommon.load_data_embedding(embedding_type=embd_method)

print('step 3: train isEnzyme model')
task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test, feature_bankfile=feature_df, task=1)
t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

methods=['knn','lr', 'xg', 'dt', 'rf', 'gbdt']
# methods=['xg', 'dt', 'rf', 'gbdt']
for method in methods:
    funclib.evaluate_2(baslineName=method, X_train_std=t1_x_train, Y_train=t1_y_train, X_test_std=task1_test_X, Y_test=task1_test_Y, type='binary')