
# Make and fine-tune taks models

<!--
 * @Author: Zhenkun Shi
 * @Date: 2022-10-09 03:06:45
 * @LastEditors: Zhenkun Shi
 * @LastEditTime: 2022-10-09 03:08:45
 * @FilePath: /DMLF/make_task_model.ipynb
 * @Description: 
 * 
 * Copyright (c) 2022 by tibd, All Rights Reserved. 
-->

### 1. Import packages

In [3]:
import pandas as pd
import numpy as np
import joblib, os,time, argparse
import benchmark_common as bcommon
import config as cfg
import benchmark_test as btest
import benchmark_train as btrain
import benchmark_evaluation as eva
import tools.funclib as funclib
import tools.embedding_esm as esmebd
from tqdm import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from gc import callbacks
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping

from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() 

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 104 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 2. Load task data and features

In [4]:
EMBEDDING_METHOD = 'esm32'
print('step 1 loading task data')

data_task1_train = pd.read_feather(cfg.FILE_TASK1_TRAIN)
data_task2_train = pd.read_feather(cfg.FILE_TASK2_TRAIN)
data_task3_train = pd.read_feather(cfg.FILE_TASK3_TRAIN)

data_task1_test = pd.read_feather(cfg.FILE_TASK1_TEST)
data_task2_test = pd.read_feather(cfg.FILE_TASK2_TEST)
data_task3_test = pd.read_feather(cfg.FILE_TASK3_TEST)

print(f'step 2: Loading features, embdding method={EMBEDDING_METHOD}')
feature_df = bcommon.load_data_embedding(embedding_type=EMBEDDING_METHOD)

step 1 loading task data
step 2: Loading features, embdding method=esm32


### 3. Task1 Model

In [5]:
print('step 3: train isEnzyme model')
task1_train_X, task1_train_Y = btrain.get_train_X_Y(traindata=data_task1_train, feature_bankfile=feature_df, task=1)
task1_test_X, task1_test_Y = btrain.get_train_X_Y(traindata=data_task1_test, feature_bankfile=feature_df, task=1)
t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(task1_train_X,np.array(task1_train_Y).ravel(),test_size=cfg.VALIDATION_RATE,random_state=1)
t1_eval_set = [(t1_x_train, t1_y_train), (t1_x_vali, t1_y_vali), (task1_test_X, task1_test_Y)]

step 3: train isEnzyme model


In [17]:
cfg.TRAIN_ISENZYME_LEARNING_STEPS = 20000

early_stop = EarlyStopping( rounds=10, 
                    save_best=True,
                    maximize=False,
                    data_name = 'validation_1',
                    metric_name='auc'
                )

model = XGBClassifier(
    objective='binary:logistic', 
    random_state=13, 
    use_label_encoder=False, 
    n_jobs=-2, 
    eval_metric='auc',
    max_depth=6,
    # callbacks = [early_stop],
    n_estimators= cfg.TRAIN_ISENZYME_LEARNING_STEPS,
    tree_method = 'gpu_hist',
    learning_rate = 0.01,
    gpu_id=0
    )

# print(model)
# model.fit(X, Y.ravel())
history = model.fit(t1_x_train, t1_y_train,  eval_set=t1_eval_set, verbose=100)
joblib.dump(model, cfg.ISENZYME_MODEL)
print('XGBoost模型训练完成')

[0]	validation_0-auc:0.84186	validation_1-auc:0.84140	validation_2-auc:0.86341
[100]	validation_0-auc:0.88185	validation_1-auc:0.88011	validation_2-auc:0.89027
[200]	validation_0-auc:0.89308	validation_1-auc:0.89064	validation_2-auc:0.89717
[300]	validation_0-auc:0.90176	validation_1-auc:0.89837	validation_2-auc:0.89980
[400]	validation_0-auc:0.90862	validation_1-auc:0.90446	validation_2-auc:0.90188
[500]	validation_0-auc:0.91398	validation_1-auc:0.90909	validation_2-auc:0.90341
[600]	validation_0-auc:0.91817	validation_1-auc:0.91262	validation_2-auc:0.90431
[700]	validation_0-auc:0.92168	validation_1-auc:0.91545	validation_2-auc:0.90477
[800]	validation_0-auc:0.92471	validation_1-auc:0.91784	validation_2-auc:0.90520
[900]	validation_0-auc:0.92737	validation_1-auc:0.91989	validation_2-auc:0.90554
[1000]	validation_0-auc:0.92988	validation_1-auc:0.92180	validation_2-auc:0.90589
[1100]	validation_0-auc:0.93225	validation_1-auc:0.92358	validation_2-auc:0.90619
[1200]	validation_0-auc:0.93

In [18]:
model_tst = joblib.load(cfg.ISENZYME_MODEL)
predict = model_tst.predict(task1_test_X)
predictprob = model_tst.predict_proba(task1_test_X)
print('baslineName', '\t\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=task1_test_Y, predict=predict, baselineName='XGBoost', type='binary')

baslineName 		 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
XGBoost 		0.845204 	0.860199 		0.832759 	0.810213 	0.834458 	 tp: 4141 fp: 673 fn: 970 tn: 4830


In [1]:
import xgboost

In [51]:
res_deepec = pd.read_csv(cfg.FILE_DEEPEC_RESULTS, sep='\t',names=['id', 'ec_number'], header=0 )
res_deepec.ec_number=res_deepec.apply(lambda x: x['ec_number'].replace('EC:',''), axis=1)
res_deepec.columns = ['id','ec_deepec']
# res_deepec['isemzyme_deepec']=res_deepec.ec_deepec.apply(lambda x: True if str(x)!='nan' else False)
# res_deepec['functionCounts_deepec'] = res_deepec.ec_deepec.apply(lambda x :len(str(x).split(',')))
res_deepec

Unnamed: 0,id,ec_deepec
0,P9WG69,2.3.1.9
1,P9WQ38,6.2.1.26
2,Q9CZN7,2.1.2.1
3,O51767,3.6.4.13
4,Q9LRI8,1.3.3.4
...,...,...
1512,B8AJL3,1.14.13.76
1513,D8MIA0,3.4.21.95
1514,A7S5D9,1.21.1.1
1515,I1HL09,1.14.13.112


In [52]:
aa=data_task1_test.merge(res_deepec, on='id', how='left')
aa=aa.drop_duplicates(subset='id', keep='first').reset_index(drop=True)
aa['isenzyme_pred']=aa.ec_deepec.apply(lambda x: True if str(x)!='nan' else False)
aa

Unnamed: 0,id,seq,isenzyme,ec_deepec,isenzyme_pred
0,P63153,MRGREFPLVLLALVLCQAPRGPAAPVSVGGGTVLAKMYPRGNHWAV...,False,,False
1,P62861,MQLFVRAQELHTFEVTGQETVAQIKAHVASLEGIAPEDQVVLLAGA...,False,,False
2,P69031,MAFLKKSLFLVLFLGLVSLSICEQEKREEENQEEDEENEAASEEKR...,False,,False
3,P69019,MAFLKKSLFLVLFLGLVSLSICEKEKRQNEEDEDENEAANHEEGSE...,False,,False
4,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,False,,False
...,...,...,...,...,...
10609,A0A2K5TU92,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVWQSSH...,True,,False
10610,A0A3R0A696,MKHWKKMAASLIAISTMVAVVPTTYAMESEDSQPQTTDTATVQTTK...,True,,False
10611,Q5ZV91,MDEIKKDDELSQWLSTYGTITAERILGRYNISLPQDEILEAINIPS...,False,,False
10612,M1H607,MDAIKKKMQAMKLEKDDAMDRADTLEQQNKEANIRAEKAEEEVHNL...,False,,False


In [54]:
aa[aa.isenzyme==aa.isenzyme_pred]

Unnamed: 0,id,seq,isenzyme,ec_deepec,isenzyme_pred
0,P63153,MRGREFPLVLLALVLCQAPRGPAAPVSVGGGTVLAKMYPRGNHWAV...,False,,False
1,P62861,MQLFVRAQELHTFEVTGQETVAQIKAHVASLEGIAPEDQVVLLAGA...,False,,False
2,P69031,MAFLKKSLFLVLFLGLVSLSICEQEKREEENQEEDEENEAASEEKR...,False,,False
3,P69019,MAFLKKSLFLVLFLGLVSLSICEKEKRQNEEDEDENEAANHEEGSE...,False,,False
4,Q5RF96,MARGGDTGCTGPSETSASGAVAIAFPGLEGPPADAQYQTLALTVPK...,False,,False
...,...,...,...,...,...
10605,I1HL09,MEALSMVGSGGVYSWPAALLVAAIVVSASVRWWGIKRQPTTTESKA...,True,1.14.13.112,True
10606,Q95ZS0,MNDPEQYEPSSSTESVLMPPPALPQYFQRPAAAPQVYSTLEPSVQN...,False,,False
10607,A2WLP4,MVSAAAGWAAPAFAVAAVVIWVVLCGELLRRRRRGAGSGKGDAAAA...,True,1.14.13.112,True
10611,Q5ZV91,MDEIKKDDELSQWLSTYGTITAERILGRYNISLPQDEILEAINIPS...,False,,False


In [46]:
str(aa.ec_deepec[0])=='nan'

True

In [59]:

print('baslineName', '\t\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')
eva.caculateMetrix(groundtruth=aa.isenzyme, predict=aa.isenzyme_pred, baselineName='deepec', type='binary')

baslineName 		 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
deepec 		0.638308 	0.944134 		0.590612 	0.264527 	0.413266 	 tp: 1352 fp: 80 fn: 3759 tn: 5423
