# Update Production Data and Model

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-12-24  

This notebook contains the code that updates the data and the model on the production server. The update should be scheduled to run every eight weeks.

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import sys

import config as cfg
from functools import reduce
import joblib

sys.path.append("./tools/")
import funclib
import exact_ec_from_uniprot as exactec
import minitools as mtool
import benchmark_common as bcommon
import embedding_esm as esmebd

from pandarallel import pandarallel 
pandarallel.initialize() 
import benchmark_train as btrain

%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'joblib'

## 2. Define Functions

In [2]:
# install axel for download dataset
def install_axel():
    """Make sure the `axel` download tool is available on PATH.

    Looks `axel` up on PATH and, when it is missing, tries to install it
    with `sudo apt install axel -y`.

    Returns:
        bool: True when `axel` is already present or is present after the
        installation attempt, False otherwise.
    """
    # Local imports keep the function usable without touching the import cell.
    import shutil
    import subprocess

    # shutil.which returns None for a missing executable. The earlier
    # `!which axel` capture could be an empty list in that case, so indexing
    # it failed before the installation was ever attempted.
    if shutil.which('axel'):
        return True

    result = subprocess.run(
        ['sudo', 'apt', 'install', 'axel', '-y'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        check=False,
    )
    # Echo the installer output so it is visible in the notebook.
    print(result.stdout)

    # Report whether the installation actually made axel available.
    return shutil.which('axel') is not None

# add missing '-' for ec number
def refill_ec(ec):
    """Pad an EC number to four levels, writing '-' for every missing level.

    Args:
        ec (str): a single EC number such as '1.2.3.4', '1.2.3.' or '1.2',
            or '-' for "no EC number".

    Returns:
        str: the EC number with at least four dot-separated levels, where
        every empty level is replaced by '-'. '-' and the empty string are
        both returned as '-'.
    """
    if ec in ('-', ''):
        return '-'

    levels = ec.split('.')
    # Entries with fewer than four levels (e.g. '1.2.3') used to raise
    # IndexError on levels[3]; pad them instead.
    levels += [''] * (4 - len(levels))
    # Fill every empty level, not only the last one, so '1.2..' -> '1.2.-.-'.
    return '.'.join(level if level != '' else '-' for level in levels)

def specific_ecs(ecstr):
    """Drop incomplete EC numbers that are refined by another EC in the list.

    An incomplete EC number (one containing '-') is removed only when another
    entry of the same list starts with the same specified levels and goes at
    least one level deeper, e.g. '1.2.-.-' is removed next to '1.2.3.4' but
    kept next to '1.3.4.5'.

    Args:
        ecstr (str): comma-separated EC numbers.

    Returns:
        str: comma-separated EC numbers in their original order, without the
        redundant incomplete entries. The input is returned unchanged when it
        contains no '-', is shorter than 4 characters, or holds one entry.
    """
    if '-' not in ecstr or len(ecstr) < 4:
        return ecstr
    ecs = ecstr.split(',')
    if len(ecs) == 1:
        return ecstr

    def _specified_levels(ec):
        # Levels that precede the first '-' placeholder.
        levels = []
        for level in ec.split('.'):
            if level == '-':
                break
            levels.append(level)
        return levels

    prefixes = [_specified_levels(ec) for ec in ecs]

    reslist = []
    for idx, (ec, prefix) in enumerate(zip(ecs, prefixes)):
        # Compare against each other EC as a whole row. The previous
        # column-wise membership test dropped '1.2.-.-' whenever level 1 and
        # level 2 appeared anywhere among the other entries, even in
        # different EC numbers.
        # Entries without any specified level (e.g. '-') are always kept.
        is_refined = (
            '-' in ec
            and len(prefix) > 0
            and any(
                other_idx != idx
                and len(other) > len(prefix)
                and other[:len(prefix)] == prefix
                for other_idx, other in enumerate(prefixes)
            )
        )
        if not is_refined:
            reslist.append(ec)
    return ','.join(reslist)

#format ec
def format_ec(ecstr):
    """Normalise a comma-separated EC string.

    Each entry is stripped of surrounding blanks and passed through
    refill_ec; duplicates are then removed.

    Args:
        ecstr (str): comma-separated EC numbers.

    Returns:
        str: the normalised, de-duplicated EC numbers joined by ',' in
        first-seen order.
    """
    cleaned = [refill_ec(part.strip()) for part in ecstr.split(',')]
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # orders strings by their randomized hashes, so the result could differ
    # between worker processes and between runs.
    return ','.join(dict.fromkeys(cleaned))

## 3. Download the latest data from UniProt

In [8]:
# download location ./tmp

# ! mv $cfg.DATADIR'uniprot_sprot_latest.dat.gz' $cfg.TEMPDIR$currenttime'_uniprot_sprot_latest.dat.gz'
# install_axel()
! axel -n 10 https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz -o ./data/uniprot_sprot_latest.dat.gz -q -c

## 4. Preprocessing

In [9]:
# Extract the downloaded Swiss-Prot flat file into a TSV table
exactec.run_exact_task(
    infile=cfg.DATADIR + 'uniprot_sprot_latest.dat.gz',
    outfile=cfg.DATADIR + 'sprot_latest.tsv',
)

# Load the data and convert the time format
sprot_latest = pd.read_csv(cfg.DATADIR + 'sprot_latest.tsv', sep='\t', header=0)  # read the file
sprot_latest = mtool.convert_DF_dateTime(inputdf=sprot_latest)

# Keep the first record of every distinct sequence and renumber the rows
sprot_latest = sprot_latest.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)

# Format the EC numbers, drop redundant incomplete ones, count functions per protein
sprot_latest['ec_number'] = sprot_latest['ec_number'].parallel_apply(format_ec)
sprot_latest['ec_number'] = sprot_latest['ec_number'].parallel_apply(specific_ecs)
sprot_latest['functionCounts'] = sprot_latest['ec_number'].parallel_apply(
    lambda ecs: len(ecs.split(',')) if ecs != '-' else 0
)

# Trim strings
with pd.option_context('mode.chained_assignment', None):
    sprot_latest['ec_number'] = sprot_latest['ec_number'].parallel_apply(lambda value: str(value).strip())  # ec trim
    sprot_latest['seq'] = sprot_latest['seq'].parallel_apply(lambda value: str(value).strip())  # seq trim

# Persist the cleaned table for the following steps
sprot_latest.to_feather(cfg.DATADIR + 'latest_sprot.feather')


566996it [04:36, 2050.78it/s]


finished use time 275.733 s


## 5. Calculate Features

In [13]:
# Reload the cleaned Swiss-Prot table written by the preprocessing step
train = pd.read_feather(cfg.DATADIR + 'latest_sprot.feather')
print(f'train size: {len(train)}')

train size: 478954


In [None]:
# Move representation files left in DATADIR by an earlier run into featureBank/
# before new ones are written below. When a file is absent, `mv` only reports
# "cannot stat" and the cell carries on.
# NOTE(review): nothing in this cell writes sprot_latest_unirep.feather - confirm
# where that file is produced.
! mv $cfg.DATADIR'sprot_latest_rep0.feather' $cfg.DATADIR'featureBank/sprot_latest_rep0.feather'
! mv $cfg.DATADIR'sprot_latest_rep32.feather' $cfg.DATADIR'featureBank/sprot_latest_rep32.feather'
! mv $cfg.DATADIR'sprot_latest_rep33.feather' $cfg.DATADIR'featureBank/sprot_latest_rep33.feather'
! mv $cfg.DATADIR'sprot_latest_unirep.feather' $cfg.DATADIR'featureBank/sprot_latest_unirep.feather'

# !pip install fair-esm
# Compute representations for the sequences in `train` with the
# 'esm1b_t33_650M_UR50S' model; the call returns three frames, each saved to
# its own feather file in DATADIR.
# NOTE(review): presumably the frames hold the layer 0 / 32 / 33 embeddings and
# seqthres=1022 caps the sequence length - confirm in embedding_esm.
tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=train, model='esm1b_t33_650M_UR50S',seqthres=1022)
tr_rep0.to_feather(cfg.DATADIR + 'sprot_latest_rep0.feather')
tr_rep32.to_feather(cfg.DATADIR + 'sprot_latest_rep32.feather')
tr_rep33.to_feather(cfg.DATADIR + 'sprot_latest_rep33.feather')



mv: cannot stat '/home/shizhenkun/codebase/DMLF/data/sprot_latest_rep0.feather': No such file or directory
mv: cannot stat '/home/shizhenkun/codebase/DMLF/data/sprot_latest_rep32.feather': No such file or directory
mv: cannot stat '/home/shizhenkun/codebase/DMLF/data/sprot_latest_rep33.feather': No such file or directory
mv: cannot stat '/home/shizhenkun/codebase/DMLF/data/sprot_latest_unirep.feather': No such file or directory
Transferred model to GPU


 10%|█████████                                                                                  | 47813/478954 [1:00:08<12:13:55,  9.79it/s]

In [16]:
# Preview the first frame returned by get_rep_multi_sequence (one row per id,
# feature columns f1..f1280; the notebook truncates the display).
tr_rep0

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,P60995,0.170989,0.025657,0.025744,0.020174,-0.076020,-0.191066,0.085143,0.504738,-0.036427,...,0.210663,-0.102772,0.087211,-0.048700,0.193667,0.169694,0.027922,-0.060504,0.285435,-0.069696
1,P02396,-0.111111,0.002945,0.018040,0.028491,-0.006897,-0.017328,-0.033452,-0.107921,-0.010232,...,0.037690,-0.116853,-0.023483,0.064209,0.017497,0.096947,0.004760,0.000524,0.056006,0.069281
2,P02362,-0.188286,-0.007029,0.006670,0.029186,-0.002706,0.045382,0.045775,0.001362,-0.016976,...,0.034140,-0.077455,-0.041554,0.071670,-0.001690,0.097198,0.019849,0.001559,-0.008701,0.083810
3,P02565,-0.026709,-0.000562,0.013712,0.003235,-0.038873,0.023068,-0.035896,-0.070689,-0.016175,...,0.015740,-0.104805,-0.004712,0.036699,0.034201,0.067125,-0.000467,0.003438,-0.048334,0.014916
4,P02827,-0.117281,0.004121,0.021203,0.024243,-0.045509,0.132612,-0.007418,0.035875,-0.022678,...,-0.003237,-0.081184,-0.021881,0.024642,0.038222,0.041906,0.009969,0.000780,-0.109430,0.042850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478949,C0P9J6,-0.092407,0.010308,0.022830,0.019094,-0.056988,0.064407,-0.032028,0.017303,-0.020415,...,-0.028978,-0.085610,-0.015938,0.048861,0.018688,0.082718,0.016897,-0.003126,-0.093333,0.065130
478950,C0PTH8,-0.130944,0.006237,0.021503,0.022638,-0.044376,0.128483,0.023310,0.020793,-0.019937,...,-0.005362,-0.079125,-0.017278,0.019771,0.032946,0.054444,0.021701,-0.001576,-0.117068,0.037637
478951,D5KXD2,-0.098420,0.001806,0.023656,0.018481,-0.055579,0.117249,0.030040,0.014601,-0.021143,...,0.004568,-0.075909,-0.011751,0.036392,0.019632,0.073973,0.020997,-0.001993,-0.101203,0.055563
478952,A0A2Z4HPZ0,-0.123796,0.002207,0.021613,0.022929,-0.055302,0.112782,0.004356,0.053342,-0.014881,...,0.000617,-0.074901,-0.015275,0.033363,0.018851,0.058429,0.019783,-0.003662,-0.126644,0.052289


In [18]:
# Attach the saved rep32 features to the training table; the left join on 'id'
# keeps every row of `train`.
train_esm_latest = train.merge(
    pd.read_feather(cfg.DATADIR + 'sprot_latest_rep32.feather'),
    on='id',
    how='left',
)

## 6. Split X Y

In [19]:
# Build the feature matrices (X) and targets (Y) for the three tasks.
# NOTE(review): the positional slices below assume the first 12 columns of
# train_esm_latest are metadata and columns 12..1291 are the 1280 features -
# confirm against the column layout of the merged frame.

# task 1: enzyme vs. non-enzyme, over all proteins
X_train_task1 = np.array(train_esm_latest.iloc[:,12:])
Y_train_task1 = np.array(train_esm_latest.isenzyme.astype('int')).flatten()
train_enzyme = train_esm_latest[train_esm_latest.isenzyme].reset_index(drop=True)

# task 2 (s): single-function (0) vs. multi-function (1), over enzymes only
X_train_task2_s = np.array(train_enzyme.iloc[:,12:])
Y_train_task2_s = train_enzyme.functionCounts.apply(lambda x : 0 if x==1 else 1).astype('int').values

# task 2 (m): number of functions for multi-function enzymes, shifted so 2 -> 0
train_task2M=train_enzyme[train_enzyme.functionCounts>=2].reset_index(drop=True)
X_train_task2_m = np.array(train_task2M.iloc[:,12:])
Y_train_task2_m = np.array(train_task2M.functionCounts.astype('int')-2).flatten()

# task 3: one row per (protein, EC number), re-joined with the features by id
train_set_task3= funclib.split_ecdf_to_single_lines(train_enzyme.iloc[:,np.r_[0,10,5]])
train_set_task3=train_set_task3.merge(train_esm_latest.iloc[:,np.r_[0,12:1292]], on='id', how='left')

# Loading EC Numbers
print('loading ec to label dict')
dict_ec_label = btrain.make_ec_label(train_label=train_set_task3['ec_number'], test_label=train_set_task3['ec_number'], file_save= cfg.FILE_EC_LABEL_DICT, force_model_update=cfg.UPDATE_MODEL)

train_set_task3['ec_label']=train_set_task3.ec_number.parallel_apply(lambda x: dict_ec_label.get(x))
# ec_label was just appended as the last column, so an open-ended iloc[:,3:]
# would copy the target into the feature matrix. Drop it before slicing.
X_train_task3 = np.array(train_set_task3.drop(columns=['ec_label']).iloc[:,3:])
Y_train_task3 = np.array(train_set_task3.ec_label.astype('int')).flatten()

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 231834/231834 [05:02<00:00, 766.36it/s]


loading ec to label dict


## 7. Train Model