In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce
from Bio import SeqIO
import joblib
sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_common as bcommon
import config as cfg
import benchmark_evaluation as eva
import embedding_esm as esmebd

import production as pdc

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() # init

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
start =time.process_time()

# 1. 读入数据
print('step 1: loading data') 
input_df = funclib.load_fasta_to_table(cfg.DATADIR +'test1.fasta') # test fasta
latest_sprot = pd.read_feather(cfg.FILE_LATEST_SPROT_FEATHER) #sprot db
end = time.process_time()
print('loading time: %s Seconds'%(end-start))

# 2. 查找数据
print('step 2: find existing data')
find_data =input_df.merge(latest_sprot, on='seq', how='left')
exist_data= find_data[~find_data.name.isnull()].iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})
noExist_data = find_data[find_data.name.isnull()]
noExist_data.reset_index(drop=True, inplace=True)
noExist_data = noExist_data.iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})

end = time.process_time()
print('matching time: %s Seconds'%(end-start))

# 3. EMBedding
print('step 3: Embedding')
rep0, rep32, rep33 = esmebd.get_rep_multi_sequence(sequences=noExist_data, model='esm1b_t33_650M_UR50S',seqthres=1022)

# 4. sequence alignment
print('step 4: sequence alignment')
if ~os.path.exists(cfg.FILE_BLAST_PRODUCTION_DB):
    funclib.table2fasta(latest_sprot, cfg.FILE_BLAST_PRODUCTION_FASTA)
    cmd = r'diamond makedb --in {0} -d {1}'.format(cfg.FILE_BLAST_PRODUCTION_FASTA, cfg.FILE_BLAST_PRODUCTION_DB)
    os.system(cmd)

blast_res = funclib.getblast_usedb(db=cfg.FILE_BLAST_PRODUCTION_DB, test=noExist_data)
blast_res = blast_res[['id', 'sseqid']].merge(latest_sprot, left_on='sseqid', right_on='id', how='left').iloc[:,np.r_[0,2:14]]
blast_res = blast_res.iloc[:,np.r_[0,1,11,12,6,8:11]].rename(columns={'id_x':'id','id_y':'id_uniprot'})
end = time.process_time()
print('alignment time: %s Seconds'%(end-start))


# 5. isEnzyme Prediction
print('step 5: predict isEnzyme')
model_isEnzyme = joblib.load(cfg.ISENZYME_MODEL)
pred_isEnzyme = pd.DataFrame()
pred_isEnzyme['id']=rep32.id
pred_isEnzyme['isEnzyme_pred'] = model_isEnzyme.predict(rep32.iloc[:,1:])

# 6. How many Prediction
print('step 6: predict function counts')
pred_howmany = pdc.predict_function_counts(rep32)

# 7. EC Prediction
print('step 7: predict EC')
pred_ec = pdc.predict_ec_slice(test_data=rep32)
pred_ec = noExist_data[['id','seq','seqlength']].merge(pred_ec, on='id', how='left')
pred_ec['seqlength']=pred_ec.seq.parallel_apply(lambda x: len(x) )

print('step 8: integrate results')

output_df = pdc.integrate_out_put(existing_table=exist_data,
                              blast_table=blast_res,
                              isEnzyme_pred_table = pred_isEnzyme, 
                              how_many_table = pred_howmany, 
                              ec_table = pred_ec
                            )
print('step 9: writting results')                
# output_df.to_csv( args.o, sep='\t')

end = time.process_time()
print('All done running time: %s Seconds'%(end-start))



step 1: loading data
loading time: 1.4482937819999986 Seconds
step 2: find existing data
matching time: 1.831895981999999 Seconds
step 3: Embedding
Transferred model to GPU


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 22.75it/s]


step 4: sequence alignment
Write finished


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /home/shizhenkun/codebase/DMLF/data/production_blast.fasta
Opening the database file...  [0.062s]
Loading sequences...  [0.976s]
Masking sequences...  [0.345s]
Writing sequences...  [0.185s]
Hashing sequences...  [0.058s]
Loading sequences...  [0s]
Writing trailer...  [0.003s]
Closing the input file...  [0.002s]
Closing the database file...  [0.233s]
Database hash = d47ea1d106d79f6ffaaaf02f417a2936
Processed 565254 sequences, 203850821 letters.
Total time = 1.867s


Write finished
diamond blastp -d /home/shizhenkun/codebase/DMLF/data/production_blast.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
alignment time: 84.217107472 Seconds
step 5: predict isEnzyme
step 6: predict function counts
step 7: predict EC
slice files prepared success
./slice_predict /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_12_11_10_26.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_12_11_10_26.tsv -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.004369
Total prediction time: 0.41775 s
Prediction time per point: 29.8393 ms
step 8: integrate results
step 9: writting results
All done running time: 135.963755133 Seconds


In [3]:
output_df

Unnamed: 0,id,id_uniprot,seq,seqlength,ec_number,date_integraged,date_sequence_update,date_annotation_update,res_type
0,t3,Q5RF96,MARGGDTGCTGPSETSASMMMMMMMMMGAVAIAFPGLEGPPADAQY...,178.0,3.4.-.-,30-AUG-2005,16-OCT-2019,02-JUN-2021,blast_match
1,t2,P69019,MAFLKKSLFLVLFLGKKKKKKKKKKKLVSLSICEKEKRQNEEDEDE...,83.0,-,01-FEB-2005,17-JUN-2020,02-JUN-2021,blast_match
2,t1,P69031,MAFLKRRDLISIKSLFLVLFLGLVSLSICEQEKREEENQEEDEENE...,78.0,-,01-FEB-2005,17-JUN-2020,02-JUN-2021,blast_match
3,P69031,P69031,MAFLKKSLFLVLFLGLVSLSICEQEKREEENQEEDEENEAASEEKR...,71.0,-,01-FEB-2005,17-JUN-2020,02-JUN-2021,db_match
4,Q29PU2,Q29PU2,MAKGGNKLMKLKSVLKKLNSFNTKPNQPPAQTNHSRSSAVSAFPSE...,127.0,-,25-APR-2018,04-APR-2006,02-JUN-2021,db_match
...,...,...,...,...,...,...,...,...,...
459,r1,,HLYCMFAQQATADYGGGNQYSYPYMPPGCWLPWLYNQCKQDHDDCN...,500.0,-,,,,dmlf_pred
460,t4,,KERKTGMFVMHEIGTPMEVRESGCDDKHHPHLKMMRYDEFADAILW...,60.0,-,,,,dmlf_pred
461,r9,,PFKGSAWHYEQIVIYHKHYPARHQYTQWQQWHCCTKAFNVVNECKW...,500.0,-,,,,dmlf_pred
462,r4,,LNDFITDTFYVDMKNAPKPEGGEPEIERDGMSMAKGEPRTQWNSAY...,500.0,-,,,,dmlf_pred


In [5]:
len(noExist_data)

14