In [9]:
import numpy as np
import pandas as pd
import time
import datetime
import sys
import os
from tqdm import tqdm
from functools import reduce
from Bio import SeqIO
import joblib
sys.path.append("../tools/")
import funclib

sys.path.append("../")
import benchmark_train as btrain
import benchmark_test as btest
import benchmark_common as bcommon
import config as cfg
import benchmark_evaluation as eva
import embedding_esm as esmebd

import production as pdc

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from pandarallel import pandarallel #  import pandaralle
pandarallel.initialize() # init

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 80 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:

input_file = cfg.DATADIR + 'test1.fasta'
output_file = cfg.RESULTSDIR + 'ec_res.tsv'
compute_mode = 'p'
topk = 10

pdc.step_by_step_run(   input_fasta=input_file, 
                    output_tsv=output_file, 
                    mode=compute_mode, 
                    topnum=topk
                )

run in annoation mode
step 1: loading data
step 2: find existing data
step 3: Embedding
Transferred model to GPU


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00, 13.69it/s]


step 4: sequence alignment
Write finished


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /home/shizhenkun/codebase/DMLF/data/production_blast.fasta
Opening the database file...  [0.066s]
Loading sequences...  [0.973s]
Masking sequences...  [0.4s]
Writing sequences...  [0.185s]
Hashing sequences...  [0.059s]
Loading sequences...  [0s]
Writing trailer...  [0.003s]
Closing the input file...  [0.002s]
Closing the database file...  [0.202s]
Database hash = d47ea1d106d79f6ffaaaf02f417a2936
Processed 565254 sequences, 203850821 letters.
Total time = 1.894s


Write finished
diamond blastp -d /home/shizhenkun/codebase/DMLF/data/production_blast.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
step 5: predict isEnzyme
step 6: predict function counts
step 7: predict EC
slice files prepared success
./slice_predict /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_19_03_37_33.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_19_03_37_33.tsv -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.004848
Total prediction time: 0.544204 s
Prediction time per point: 38.8717 ms
step 8: integrate results
step 9: writting results
All done running time: 255.406469339 Seconds


In [2]:
input_file = cfg.DATADIR + 'test.fasta'
output_file = cfg.RESULTSDIR + 'ec_res.tsv'
mode = 'p'
topk = 10

In [3]:
start = time.process_time()
if mode =='p':
    print('run in annoation mode')
if mode =='r':
    print('run in annoation mode')

# 1. 读入数据
print('step 1: loading data')
input_df = funclib.load_fasta_to_table(input_file) # test fasta
latest_sprot = pd.read_feather(cfg.FILE_LATEST_SPROT_FEATHER) #sprot db

# 2. 查找数据
print('step 2: find existing data')
find_data =input_df.merge(latest_sprot, on='seq', how='left')
exist_data= find_data[~find_data.name.isnull()].iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})
noExist_data = find_data[find_data.name.isnull()]
noExist_data.reset_index(drop=True, inplace=True)
noExist_data = noExist_data.iloc[:,np.r_[0,2,1,12,7,9:12]].rename(columns={'id_x':'id','id_y':'id_uniprot'})

if len(noExist_data) == 0:
    exist_data.to_csv(output_tsv, sep='\t')
    end = time.process_time()
    print('All done running time: %s Seconds'%(end-start))
    

# 3. EMBedding
print('step 3: Embedding')
if mode =='p':
    rep0, rep32, rep33 = esmebd.get_rep_multi_sequence(sequences=noExist_data, model='esm1b_t33_650M_UR50S',seqthres=1022)
if mode == 'r':
    rep0, rep32, rep33 = esmebd.get_rep_multi_sequence(sequences=input_df, model='esm1b_t33_650M_UR50S',seqthres=1022)

# 4. sequence alignment
print('step 4: sequence alignment')
if ~os.path.exists(cfg.FILE_BLAST_PRODUCTION_DB):
    funclib.table2fasta(latest_sprot, cfg.FILE_BLAST_PRODUCTION_FASTA)
    cmd = r'diamond makedb --in {0} -d {1}'.format(cfg.FILE_BLAST_PRODUCTION_FASTA, cfg.FILE_BLAST_PRODUCTION_DB)
    os.system(cmd)
if mode =='p':
    blast_res = funclib.getblast_usedb(db=cfg.FILE_BLAST_PRODUCTION_DB, test=noExist_data)
if mode == 'r':
    blast_res = funclib.getblast_usedb(db=cfg.FILE_BLAST_PRODUCTION_DB, test=input_df)
blast_res = blast_res[['id', 'sseqid']].merge(latest_sprot, left_on='sseqid', right_on='id', how='left').iloc[:,np.r_[0,2:14]]
blast_res = blast_res.iloc[:,np.r_[0,1,11,12,6,8:11]].rename(columns={'id_x':'id','id_y':'id_uniprot'})

# 5. isEnzyme Prediction
print('step 5: predict isEnzyme')
model_isEnzyme = joblib.load(cfg.ISENZYME_MODEL)
pred_isEnzyme = pd.DataFrame()
pred_isEnzyme['id']=rep32.id
pred_isEnzyme['isEnzyme_pred'] = model_isEnzyme.predict(rep32.iloc[:,1:])

# 6. How many Prediction
print('step 6: predict function counts')
pred_howmany = pdc.predict_function_counts(rep32)

# 7. EC Prediction
print('step 7: predict EC')
pred_ec = pdc.predict_ec_slice(test_data=rep32)
if mode=='p':
    pred_ec = noExist_data[['id','seq']].merge(pred_ec, on='id', how='left')
# if mode == 'r':
#     pred_ec = input_df[['id', 'seq']].merge(pred_ec, on='id', how='left')
# pred_ec['seqlength']=pred_ec.seq.parallel_apply(lambda x: len(x) )

run in annoation mode
step 1: loading data
step 2: find existing data
step 3: Embedding
Transferred model to GPU


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 18.36it/s]


step 4: sequence alignment
Write finished


diamond v2.0.8.146 (C) Max Planck Society for the Advancement of Science
Documentation, support and updates available at http://www.diamondsearch.org

#CPU threads: 80
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database input file: /home/shizhenkun/codebase/DMLF/data/production_blast.fasta
Opening the database file...  [0.034s]
Loading sequences...  [0.977s]
Masking sequences...  [0.489s]
Writing sequences...  [0.184s]
Hashing sequences...  [0.059s]
Loading sequences...  [0s]
Writing trailer...  [0.003s]
Closing the input file...  [0.003s]
Closing the database file...  [0.214s]
Database hash = d47ea1d106d79f6ffaaaf02f417a2936
Processed 565254 sequences, 203850821 letters.
Total time = 1.966s


Write finished
diamond blastp -d /home/shizhenkun/codebase/DMLF/data/production_blast.dmnd  -q  /tmp/test.fasta -o /tmp/test_fasta_results.tsv -b5 -c1 -k 1 --quiet
step 5: predict isEnzyme
step 6: predict function counts
step 7: predict EC
slice files prepared success
./slice_predict /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_17_08_01_58.txt /home/shizhenkun/codebase/DMLF/model/slice_esm32 /home/shizhenkun/codebase/DMLF/tmp/ptest_2021_11_17_08_01_58.tsv -o 32 -b 0 -t 32 -q 0


Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


Time taken to find approx nearest neighbors = 0.005432
Total prediction time: 0.59104 s
Prediction time per point: 147.76 ms


In [8]:
pred_ec

Unnamed: 0,id,seq,top0,top1,top2,top3,top4,top5,top6,top7,...,top11,top12,top13,top14,top15,top16,top17,top18,top19,seqlength
0,C0HLG3,MAETVQYYNSYSDASIASCAFVDSGKDKIDKTKLVTYTSRLAASPA...,3.2.2.22,2.3.3.9,2.8.4.1,3.1.21.4,3.1.27.3,6.2.1.14,3.4.22.44,3.4.22.45,...,2.7.7.19,1.3.7.2,3.6.4.-,1.3.7.5,4.7.1.1,3.4.22.48,3.4.21.97,1.14.13.25,2.4.99.20,136
1,U4PCM1,MTAVSNIASFNMNGGMMRGNQMPNVTLTIQPSTSSMQNSQPRIMNN...,3.6.4.13,2.3.2.27,3.6.4.12,2.7.11.23,2.7.11.1,2.7.11.22,3.1.13.4,3.1.-.-,...,2.7.11.25,3.1.27.-,3.1.3.16,2.1.1.-,3.4.21.-,3.-.-.-,3.5.1.98,3.1.1.97,3.6.4.3,820
2,F2SH39,MGLFSKTQTTSELPVKETDVESSATSAQPSKGPSINDLSDDTREAS...,1.1.1.1,2.7.11.1,2.7.11.22,1.2.1.3,6.3.5.7,3.4.21.-,2.4.1.16,2.6.1.1,...,2.7.11.23,1.2.1.36,3.4.21.4,3.1.3.4,1.1.1.284,3.2.1.21,3.6.4.12,2.7.1.192,3.6.3.44,583
3,W6QRN8,MRVEAGGDMRDKLMWIRLYILGNVGQTFGDMKRYIGMWSGMLFPIS...,1.-.-.-,3.1.-.-,3.-.-.-,3.1.1.81,2.4.1.-,1.14.14.-,1.14.-.-,2.7.1.-,...,1.14.13.177,1.14.13.174,1.14.13.78,1.5.3.7,1.14.99.-,1.1.1.330,2.-.-.-,1.14.13.12,1.14.13.198,932


In [5]:
if mode=='p':
    pred_ec = noExist_data[['id','seq']].merge(pred_ec, on='id', how='left')

In [7]:
pred_ec['seqlength']=pred_ec.seq.parallel_apply(lambda x: len(x) )

In [125]:
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)

for module in mpl, np, sklearn, tf, keras:
    print(module.__name__, module.__version__)

fashion_mnist = keras.datasets.fashion_mnist
(x_train_all, y_train_all), (x_test, y_test) = fashion_mnist.load_data()
x_valid, x_train = x_train_all[:5000]/255, x_train_all[5000:]/255
y_valid, y_train = y_train_all[:5000], y_train_all[5000:]

print(x_valid.shape, y_valid.shape)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

def show_single_image(img_arr):
    plt.imshow(img_arr, cmap = "binary")
    plt.show()
    
#show_single_image(x_train[0])

def show_imgs(n_rows, n_cols, x_data, y_data, class_names):
    assert len(x_data) == len(y_data)
    assert n_rows * n_cols < len(x_data)
    
    plt.figure(figsize = (n_cols * 1.4, n_rows * 1.6))
    
    for row in range(n_rows):
        for col in range(n_cols):
            index = n_cols * row + col
            plt.subplot(n_rows, n_cols, index+1)
            plt.imshow(x_data[index], cmap="binary", interpolation = 'nearest')
            plt.axis('off')
            plt.title(class_names[y_data[index]])
            
    plt.show()
    
class_names = ['T-shirt', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'shirt', 'Sneaker',
             'Bag', 'Ankle boot']

#show_imgs(3,5,x_train,y_train,class_names)

# tf.keras.models.Sequential()

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape = [28, 28]))
model.add(keras.layers.Dense(300, activation = 'relu'))  #  relu: y = max(0, x)
model.add(keras.layers.Dense(100, activation = 'relu'))
model.add(keras.layers.Dense(10,activation = 'softmax')) # softmax: y = [e^x1/sum e^x2/sum ...]

# reason for sparse: y->one_hot
model.compile(loss = "sparse_categorical_crossentropy",
             optimizer = "sgd",
             metrics = ["accuracy"])

model.layers

model.summary()

history = model.fit(x_train, y_train, epochs=10,
                   validation_data = (x_valid, y_valid))



2.6.1
matplotlib 3.2.2
numpy 1.19.5
sklearn 0.24.2
tensorflow 2.6.1
keras.api._v2.keras 2.6.0
(5000, 28, 28) (5000,)
(55000, 28, 28) (55000,)
(10000, 28, 28) (10000,)
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 300)               235500    
_________________________________________________________________
dense_5 (Dense)              (None, 100)               30100     
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1010      
Total params: 266,610
Trainable params: 266,610
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Ep