
# Make and fine-tune task models

<!--
 * @Author: Zhenkun Shi
 * @Date: 2022-10-09 03:06:45
 * @LastEditors: Zhenkun Shi
 * @LastEditTime: 2022-10-09 03:08:45
 * @FilePath: /DMLF/make_task_model.ipynb
 * @Description: 
 * 
 * Copyright (c) 2022 by tibd, All Rights Reserved. 
-->

### 1. Import packages

In [1]:
import pandas as pd
import numpy as np
import joblib, os,time, argparse
import benchmark_common as bcommon
import config as cfg
import benchmark_test as btest
import benchmark_train as btrain
import benchmark_evaluation as eva
import tools.funclib as funclib
import tools.embedding_esm as esmebd
from tqdm import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
from gc import callbacks
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping

from pandarallel import pandarallel # pandarallel entry point (initialized on the next line)
# Start pandarallel; the cell output reports the worker count and transfer mode it chose.
pandarallel.initialize() 

# IPython autoreload in mode 2: re-import modules before each cell runs, so edits to the
# project modules imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 2. Load task data and features

In [3]:
# Embedding flavour handed to the feature loader at the end of this cell.
EMBEDDING_METHOD = 'esm32'

print('step 1 loading task data')

# Train tables for the three tasks, read from the feather files named in config.
data_task1_train, data_task2_train, data_task3_train = (
    pd.read_feather(feather_path)
    for feather_path in (cfg.FILE_TASK1_TRAIN, cfg.FILE_TASK2_TRAIN, cfg.FILE_TASK3_TRAIN)
)

# Matching test tables, read in the same task order.
data_task1_test, data_task2_test, data_task3_test = (
    pd.read_feather(feather_path)
    for feather_path in (cfg.FILE_TASK1_TEST, cfg.FILE_TASK2_TEST, cfg.FILE_TASK3_TEST)
)

print(f'step 2: Loading features, embdding method={EMBEDDING_METHOD}')
feature_df = bcommon.load_data_embedding(embedding_type=EMBEDDING_METHOD)

step 1 loading task data
step 2: Loading features, embdding method=esm32


### 3. Task1 Model

In [4]:
print('step 3: train isEnzyme model')

# Build (X, Y) for the Task 1 train and test tables from the shared feature table.
task1_train_X, task1_train_Y = btrain.get_train_X_Y(
    traindata=data_task1_train,
    feature_bankfile=feature_df,
    task=1,
)
task1_test_X, task1_test_Y = btrain.get_train_X_Y(
    traindata=data_task1_test,
    feature_bankfile=feature_df,
    task=1,
)

# Flatten the labels to 1-D, then carve a validation split out of the training data.
task1_train_labels = np.array(task1_train_Y).ravel()
t1_x_train, t1_x_vali, t1_y_train, t1_y_vali = train_test_split(
    task1_train_X,
    task1_train_labels,
    test_size=cfg.VALIDATION_RATE,
    random_state=1,
)

# Order matters: position 1 (the validation split) is what the training log labels
# 'validation_1'; position 2 is the held-out test set, included for monitoring.
t1_eval_set = [
    (t1_x_train, t1_y_train),
    (t1_x_vali, t1_y_vali),
    (task1_test_X, task1_test_Y),
]

step 3: train isEnzyme model


In [16]:
# Cap on the number of boosting rounds; this overrides the config value for the session.
cfg.TRAIN_ISENZYME_LEARNING_STEPS = 30000

# Early stopping watches logloss on 'validation_1' (the second entry of t1_eval_set)
# and gives up after 20 rounds without improvement, keeping the best model.
early_stop = EarlyStopping(
    rounds=20,
    save_best=True,
    maximize=False,
    data_name='validation_1',
    metric_name='logloss',
)

# Hyperparameters of the binary enzyme / non-enzyme classifier, trained on GPU 0.
xgb_params = dict(
    objective='binary:logistic',
    random_state=13,
    use_label_encoder=False,
    n_jobs=-2,
    eval_metric='logloss',
    max_depth=6,
    callbacks=[early_stop],
    n_estimators=cfg.TRAIN_ISENZYME_LEARNING_STEPS,
    tree_method='gpu_hist',
    learning_rate=0.005,
    gpu_id=0,
)
model = XGBClassifier(**xgb_params)

# Train with a progress line every 100 rounds, then persist the fitted model to disk.
history = model.fit(
    t1_x_train,
    t1_y_train,
    eval_set=t1_eval_set,
    verbose=100,
)
joblib.dump(model, cfg.ISENZYME_MODEL)
# The message below reads "XGBoost model training finished".
print('XGBoost模型训练完成')

[0]	validation_0-logloss:0.69077	validation_1-logloss:0.69077	validation_2-logloss:0.69126
[100]	validation_0-logloss:0.51812	validation_1-logloss:0.51883	validation_2-logloss:0.55879
[200]	validation_0-logloss:0.41999	validation_1-logloss:0.42139	validation_2-logloss:0.48445
[300]	validation_0-logloss:0.35701	validation_1-logloss:0.35905	validation_2-logloss:0.43798
[400]	validation_0-logloss:0.31206	validation_1-logloss:0.31483	validation_2-logloss:0.40230
[500]	validation_0-logloss:0.27867	validation_1-logloss:0.28213	validation_2-logloss:0.37677
[600]	validation_0-logloss:0.25286	validation_1-logloss:0.25696	validation_2-logloss:0.35605
[700]	validation_0-logloss:0.23182	validation_1-logloss:0.23650	validation_2-logloss:0.33897
[800]	validation_0-logloss:0.21436	validation_1-logloss:0.21956	validation_2-logloss:0.32476
[900]	validation_0-logloss:0.19991	validation_1-logloss:0.20561	validation_2-logloss:0.31271
[1000]	validation_0-logloss:0.18748	validation_1-logloss:0.19373	validat

In [5]:
# Evaluate the Task 1 (isEnzyme) model on the test set twice:
#   1. XGBoost predictions alone;
#   2. "integrated" predictions, where a sufficiently similar DIAMOND hit against the
#      training set overrides the XGBoost call with the label of the matched sequence.

# Reload the persisted model so the evaluation uses exactly what was written to disk.
model_tst = joblib.load(cfg.ISENZYME_MODEL)
predict = model_tst.predict(task1_test_X)
predictprob = model_tst.predict_proba(task1_test_X)

# Header row shared by both metric reports (printed with the default space separator).
metric_header = ('baslineName', '\t\t', 'accuracy','\t', 'precision(PPV) \t NPV \t\t', 'recall','\t', 'f1', '\t\t', '\t confusion Matrix')

print('task1_single:\n----------------')
print(*metric_header)
eva.caculateMetrix(groundtruth=task1_test_Y, predict=predict, baselineName='XGBoost', type='binary')


# Align every test sequence against the training sequences with DIAMOND.
diamond_task1 = funclib.getblast(train=data_task1_train[['id','seq']], test=data_task1_test[['id','seq']])

# Minimum percent identity (pident) a hit needs before its label is trusted.
# NOTE(review): the name suggests a bit-score cutoff, but it has always been applied to
# `pident`; that behaviour is preserved here -- confirm which quantity was intended.
# The earlier, weaker `pident > 30` pre-filter was redundant with this one and is gone.
THRES_BIT = 35

# Keep confident hits and attach the training label of the matched subject sequence.
# After the merge, `id_x` is the query (test) id and `id_y` the subject (train) id.
res_task1_diamond = (
    diamond_task1.loc[
        diamond_task1.pident > THRES_BIT,
        ['id', 'sseqid', 'pident', 'mismatch', 'bitscore'],
    ]
    .merge(data_task1_train[['id', 'isenzyme']], how='left', left_on='sseqid', right_on='id')
    [['id_x', 'isenzyme', 'pident', 'mismatch', 'bitscore']]
    .rename(columns={'id_x': 'id', 'isenzyme': 'isenzyme_pred_diamond'})
)

# Per-sequence table holding both predictions side by side.
res_task1_xg = data_task1_test.copy()
res_task1_xg['isenzyme_pred_xg'] = predict
res_task1_xg.isenzyme_pred_xg = res_task1_xg.isenzyme_pred_xg.astype('bool')
res_task1_xg = res_task1_xg.merge(res_task1_diamond, on='id', how='left')

# Sequences without a usable hit get neutral alignment statistics and the '-' sentinel
# (the sentinel is also relied on by later inspection cells).
res_task1_xg = res_task1_xg.fillna({
    'pident': 0,
    'mismatch': 100000,
    'bitscore': 0,
    'isenzyme_pred_diamond': '-',
})

# Integrated call: DIAMOND label where a hit exists, XGBoost prediction otherwise.
# Vectorized replacement for the former row-wise apply; the result is a bool column.
has_diamond_hit = res_task1_xg.isenzyme_pred_diamond != '-'
res_task1_xg['isenzyme_pred_inti'] = (
    res_task1_xg.isenzyme_pred_diamond
    .where(has_diamond_hit, res_task1_xg.isenzyme_pred_xg)
    .astype(bool)
)


print('task1_inti:\n----------------')
print(*metric_header)
eva.caculateMetrix(groundtruth=res_task1_xg.isenzyme, predict=res_task1_xg.isenzyme_pred_inti, baselineName='Integrate', type='binary')

task1_single:
----------------
baslineName 		 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
XGBoost 		0.923215 	0.953739 		0.898639 	0.883389 	0.917217 	 tp: 4515 fp: 219 fn: 596 tn: 5284
Write finished
Write finished
diamond makedb --in /tmp/train.fasta -d /tmp/train.dmnd --quiet
diamond blastp -d /tmp/train.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
task1_inti:
----------------
baslineName 		 accuracy 	 precision(PPV) 	 NPV 		 recall 	 f1 		 	 confusion Matrix
Integrate 		0.924063 	0.956328 		0.898253 	0.882606 	0.917989 	 tp: 4511 fp: 206 fn: 600 tn: 5297


In [11]:
# Test rows that received a DIAMOND label ('-' marks "no hit") and where that label
# agrees with the ground truth.
has_diamond_label = res_task1_xg.isenzyme_pred_diamond != '-'
diamond_matches_truth = res_task1_xg.isenzyme == res_task1_xg.isenzyme_pred_diamond
res_task1_xg[has_diamond_label & diamond_matches_truth]

Unnamed: 0,id,seq,isenzyme,isenzyme_pred_xg,isenzyme_pred_diamond,pident,mismatch,bitscore,isenzyme_pred_inti
0,P63153,MRGREFPLVLLALVLCQAPRGPAAPVSVGGGTVLAKMYPRGNHWAV...,False,False,False,71.9,41.0,215.0,False
1,P62861,MQLFVRAQELHTFEVTGQETVAQIKAHVASLEGIAPEDQVVLLAGA...,False,False,False,100.0,0.0,140.0,False
2,P69031,MAFLKKSLFLVLFLGLVSLSICEQEKREEENQEEDEENEAASEEKR...,False,False,False,73.4,16.0,57.4,False
3,P69019,MAFLKKSLFLVLFLGLVSLSICEKEKRQNEEDEDENEAANHEEGSE...,False,False,False,75.4,17.0,77.4,False
5,Q9UM00,MPRKRKCDLRAVRVGLLLGGGGVYGSRFRFTFPGCRALSPWRVRVQ...,False,False,False,100.0,0.0,344.0,False
...,...,...,...,...,...,...,...,...,...
10606,Q95ZS0,MNDPEQYEPSSSTESVLMPPPALPQYFQRPAAAPQVYSTLEPSVQN...,False,False,False,51.5,47.0,96.3,False
10607,A2WLP4,MVSAAAGWAAPAFAVAAVVIWVVLCGELLRRRRRGAGSGKGDAAAA...,True,True,True,99.2,4.0,927.0,True
10608,A0A250YGJ5,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVRQSSN...,True,True,True,87.3,45.0,618.0,True
10609,A0A2K5TU92,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVWQSSH...,True,True,True,93.5,23.0,657.0,True


In [12]:
# NOTE(review): hard-coded counts with no visible derivation -- presumably
# (rows where the DIAMOND label matches the ground truth) / (rows with a DIAMOND hit).
# Confirm, and compute both numbers from res_task1_xg instead of typing them in.
5894/6285

0.937788385043755