# Prepare Production Data For Web-Service

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2024-08-13  

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
import config as cfg

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import minitools as mtool
from tools import uniprottool as unitool
from tools import embedding_esm as esmebd
from tools import embdding_onehot as onehotebd

from pandarallel import pandarallel # 导入pandaralle
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Download latest from unisprot

In [2]:
url = 'https://rest.uniprot.org/uniprotkb/search?query=reviewed=true&format=tsv&fields=accession,id,ec,sequence,date_created,date_modified,date_sequence_modified&size=500'

In [12]:
# Download latest UniProt data and save it as a feather file.
uniprot_latest = unitool.get_batch_data_from_uniprot_rest_api(url=url)

pd.DataFrame(uniprot_latest, columns=['id', 'name', 'ec_number', 'seq', 'date_integraged', 'date_annotation_update', 'date_sequence_update']).to_feather(cfg.TEMPDIR+'usplatest.feather')

## 3. Preprocessing

In [35]:
web_uniprot_latest = pd.read_feather(cfg.TEMPDIR+'usplatest.feather')
#加载数据并转换时间格式
web_uniprot_latest = mtool.convert_DF_dateTime(inputdf = web_uniprot_latest)

# Trim string
with pd.option_context('mode.chained_assignment', None):
    web_uniprot_latest.ec_number = web_uniprot_latest.ec_number.parallel_apply(lambda x : str(x).strip()) #ec trim
    web_uniprot_latest.seq = web_uniprot_latest.seq.parallel_apply(lambda x : str(x).strip()) #seq trim
    
web_uniprot_latest.ec_number = web_uniprot_latest.ec_number.replace('','-').replace(';',',') #ec replace


#format EC numbers
web_uniprot_latest['ec_number'] = web_uniprot_latest.ec_number.parallel_apply(lambda x: mtool.format_ec(x))
web_uniprot_latest['ec_number'] = web_uniprot_latest.ec_number.parallel_apply(lambda x: mtool.specific_ecs(x)) #按最高级别指定EC
web_uniprot_latest['functionCounts'] = web_uniprot_latest.ec_number.parallel_apply(lambda x: 0 if x=='-'  else len(x.split(',')))
    
web_uniprot_latest.head(3)

Unnamed: 0,id,name,ec_number,seq,date_integraged,date_annotation_update,date_sequence_update,functionCounts
0,P62594,BLAT_SALTI,3.5.2.6,MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIE...,1986-07-21,2024-05-29,1986-07-21,1
1,P62970,TRH_BOMOR,-,QHP,1986-07-21,2022-12-14,1986-07-21,0
2,P02713,ACHG_CHICK,-,MRCSDLLLLFLLALCVLPGISCRNQEEKLLQDLMTNYNRHLRPALR...,1986-07-21,2024-05-29,1987-08-13,0


In [37]:
web_uniprot_latest[web_uniprot_latest.functionCounts>1]

Unnamed: 0,id,name,ec_number,seq,date_integraged,date_annotation_update,date_sequence_update,functionCounts


In [33]:
web_uniprot_latest[web_uniprot_latest.ec_number.str.contains(';')]

Unnamed: 0,id,name,ec_number,seq,date_integraged,date_annotation_update,date_sequence_update,functionCounts
47,P03680,DPOL_BPPH2,2.7.7.7; 3.1.11.-,MKHMPRKMYSCDFETTTKVEDCRVWAYGYMNIEDHSEYKIGNSLDE...,1986-07-21,2024-05-29,1986-07-21,1
50,P03523,L_VSIVA,2.1.1.375; 2.7.7.48; 2.7.7.88; 3.6.1.-,MEVHDFETDEFNDFNEDDYATREFLNPDERMTYLNHADYNLNSPLI...,1986-07-21,2024-07-24,1995-11-01,1
101,P03587,RDRP_TOML,2.1.1.-; 2.7.7.-; 2.7.7.48; 3.6.4.13,MAYTQTATSSALLETVRGNNTLVNDLAKRRLYDTAVDEFNARDRRP...,1986-07-21,2024-07-24,2000-12-01,1
140,P03305,POLG_FMDVO,2.7.7.48; 3.4.22.28; 3.4.22.46; 3.6.1.15,MNTTDCFIALVQAIREIKALFLSRTTGKMELTLYNGEKKTFYSRPN...,1986-07-21,2024-07-24,1986-07-21,1
141,P03303,POLG_HRV14,2.7.7.48; 3.4.22.28; 3.4.22.29; 3.6.1.15,MGAQVSTQKSGSHENQNILTNGSNQTFTVINYYKDAASTSSAGQSL...,1986-07-21,2024-07-24,2007-01-23,1
...,...,...,...,...,...,...,...,...
571812,P74535,ARGZ_SYNY3,3.5.3.27; 4.3.1.12,MADDIRILMCPPDHYDVDYVINPWMEGNIHKSSQERAVEQWKKLHQ...,2024-07-24,2024-07-24,1997-02-01,1
571814,A0A1L9WQI2,VRCA_ASPA1,2.5.1.-; 4.2.3.-,MEFKFSAVVDPSTYQTQGLCDGLTVRYHKNTELEEIDCLRCQEHWR...,2024-07-24,2024-07-24,2017-03-15,1
571820,Q6PMV1,POLG_FMDVP,2.7.7.48; 3.4.22.28; 3.4.22.46; 3.6.1.15,MNTTDCFIALLYALREIKALFLSRTQGKMEFTLYNGEKKVFYSRPN...,2024-07-24,2024-07-24,2004-07-05,1
571839,A2I7M2,POLG_FMDA1,2.7.7.48; 3.4.22.28; 3.4.22.46; 3.6.1.15,MNTTDCFVALIHIFREIKALFLSRTQGKMEFTLHNGEKKTFYSRPN...,2024-07-24,2024-07-24,2007-02-20,1


In [5]:
#加载数据并转换时间格式
sprot_latest = pd.read_csv(f'{cfg.DIR_UNIPROT}sprot_latest.tsv', sep='\t',header=0) #读入文件
sprot_latest = mtool.convert_DF_dateTime(inputdf = sprot_latest)

# 6.1 Drop Duplicates
sprot_latest.drop_duplicates(subset=['seq'], keep='first', inplace=True)
sprot_latest.reset_index(drop=True, inplace=True)

sprot_latest = sprot_latest.iloc[:,np.r_[0,2:7,10:12]]

# Trim string
with pd.option_context('mode.chained_assignment', None):
    sprot_latest.ec_number = sprot_latest.ec_number.parallel_apply(lambda x : str(x).strip()) #ec trim
    sprot_latest.seq = sprot_latest.seq.parallel_apply(lambda x : str(x).strip()) #seq trim


#format EC numbers
sprot_latest['ec_number'] = sprot_latest.ec_number.parallel_apply(lambda x: mtool.format_ec(x))
sprot_latest['ec_number'] = sprot_latest.ec_number.parallel_apply(lambda x: mtool.specific_ecs(x)) #按最高级别指定EC
sprot_latest['functionCounts'] = sprot_latest.ec_number.parallel_apply(lambda x: 0 if x=='-'  else len(x.split(',')))

#save file
sprot_latest.to_feather(cfg.DIR_UNIPROT + 'sprot_latest.feather')

sprot_latest.head(3)

Unnamed: 0,id,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,seq,seqlength
0,P02762,False,False,0,-,0,MKMLLLLCLGLTLVCVHAEEASSTGRNFNVEKINGEWHTIILASDK...,180
1,P58773,False,False,0,-,0,MDAIKKKMQMLKLDKENALDRAEQAEADKKAAEERSKQLEDELVAL...,284
2,P68925,False,False,0,-,0,MKNGFYATYRSKNKGKDKRSINLSVFLNSLLADNHHLQVGSNYLYI...,279


In [11]:
pd.read_csv(f'{cfg.DIR_UNIPROT}sprot_latest.tsv', sep='\t')

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q6GZX4,001R_FRG3G,False,False,0,-,0,28-JUN-2011,19-JUL-2004,02-JUN-2021,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,256
1,Q6GZX3,002L_FRG3G,False,False,0,-,0,28-JUN-2011,19-JUL-2004,14-DEC-2022,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,320
2,Q197F8,002R_IIV3,False,False,0,-,0,16-JUN-2009,11-JUL-2006,23-FEB-2022,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,458
3,Q197F7,003L_IIV3,False,False,0,-,0,16-JUN-2009,11-JUL-2006,12-AUG-2020,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,156
4,Q6GZX2,003R_FRG3G,False,False,0,-,0,28-JUN-2011,19-JUL-2004,12-AUG-2020,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,438
...,...,...,...,...,...,...,...,...,...,...,...,...
568739,Q6UY62,Z_SABVB,False,False,0,-,0,20-JAN-2009,05-JUL-2004,12-OCT-2022,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,100
568740,P08105,Z_SHEEP,False,False,0,-,0,01-AUG-1988,01-AUG-1988,25-MAY-2022,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,79
568741,Q88470,Z_TACVF,False,False,0,-,0,06-DEC-2005,23-JAN-2007,12-OCT-2022,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,95
568742,A9JR22,Z_TAMVU,False,False,0,-,0,20-JAN-2009,05-FEB-2008,03-AUG-2022,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,95


## 5 Update Feature Bank
### 5.1 Load Exsiting Features

In [26]:
# load data
sprot_latest = pd.read_feather(cfg.DIR_UNIPROT + 'sprot_latest.feather')

# load features
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    feature_onehot = pd.read_feather(cfg.FILE_FEATURE_ONEHOT)
    #caculate embedding list
    needesm = sprot_latest[~sprot_latest.id.isin(list(set(feature_esm33.id)))]
    needunirep = sprot_latest[~sprot_latest.id.isin(list(set(feature_unirep.id)))]
    needonehot = sprot_latest[~sprot_latest.id.isin(list(set(feature_onehot.id)))]
else:
    needesm = sprot_latest
    needunirep = sprot_latest
    needonehot = sprot_latest

In [5]:
pd.read_feather(cfg.FILE_FEATURE_ESM0)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,X6R8R1,-0.089050,0.006082,0.027113,0.020443,-0.058691,0.028277,-0.024602,0.102531,-0.018018,...,-0.016573,-0.092569,-0.014087,0.047702,-0.005135,0.121687,0.029113,-0.006858,-0.089662,0.063289
1,X6R8D5,-0.147871,0.012179,0.024843,0.040460,-0.005072,-0.024804,0.047539,0.061463,-0.002784,...,0.068238,-0.093530,-0.005282,0.059619,-0.011701,0.125361,0.023701,-0.002634,0.023725,0.050738
2,X5M8U1,-0.036810,0.000630,0.006120,0.003848,-0.018212,0.012790,-0.010132,-0.036419,-0.006744,...,-0.003781,-0.056431,0.000772,0.021638,0.015857,0.022233,0.002920,0.003550,-0.035073,0.007439
3,X5M5N0,-0.098374,0.000973,0.019109,0.009071,-0.037709,0.046117,-0.000138,-0.098571,-0.012081,...,-0.009432,-0.099681,-0.007209,0.032135,0.028557,0.042570,0.010058,0.002823,-0.065502,0.007961
4,X5JB51,-0.093689,0.004022,0.025616,0.005294,-0.046547,0.026919,-0.033440,-0.062409,-0.014050,...,-0.015097,-0.121174,-0.013965,0.029741,0.057090,0.063444,0.015059,0.002577,-0.059735,0.014819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505485,A0A017SEY2,-0.173912,0.005573,0.031189,0.029572,-0.031872,0.072243,0.009888,0.017371,-0.032710,...,-0.015069,-0.091455,-0.033778,0.069308,0.038649,0.135328,0.029984,-0.002217,0.035023,0.078774
505486,A0A017SEX7,-0.072138,-0.000170,0.011328,0.009451,-0.026131,0.010254,-0.036061,-0.031905,-0.011295,...,-0.008188,-0.089992,-0.001387,0.034956,0.031941,0.026984,0.006219,0.003577,-0.037853,0.016109
505487,A0A017SEF3,-0.075038,0.001168,0.032608,0.022068,-0.049269,0.034600,-0.002145,0.066551,-0.025950,...,-0.005576,-0.092709,-0.016841,0.059589,0.017101,0.125306,0.027483,-0.000778,-0.022447,0.063800
505488,A0A017SE85,-0.148619,0.011337,0.035423,0.022539,-0.059372,0.028088,0.031627,0.015070,-0.030167,...,-0.010247,-0.085584,-0.024509,0.059508,-0.005355,0.119766,0.035571,-0.002045,-0.027042,0.065159


### 5.2 ESM embedding 

In [6]:
# !pip install fair-esm
if len(needesm)>0:
    tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=needesm, model='esm1b_t33_650M_UR50S',seqthres=1022)
    #merge existing
    feature_esm0 = pd.concat([feature_esm0, tr_rep0], axis=0).reset_index(drop=True)
    feature_esm32 = pd.concat([feature_esm0, tr_rep32], axis=0).reset_index(drop=True)
    feature_esm33 = pd.concat([feature_esm0, tr_rep33], axis=0).reset_index(drop=True)

    #save
    feature_esm0.to_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32.to_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33.to_feather(cfg.FILE_FEATURE_ESM33)

NameError: name 'needesm' is not defined

### 5.3 Unirep Embedding

In [15]:
if len(needunirep) > 0:
    from tools import embedding_unirep as unirep
    tr_unirep = unirep.getunirep(needunirep, 200)

    feature_unirep = pd.concat([feature_unirep, tr_unirep],axis=0).reset_index(drop=True)
    feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
100%|██████████| 6/6 [36:29<00:00, 364.92s/it]

length not match





### 5.4 One-hot

In [19]:
if len(needonehot) > 0:
    tr_onehot = onehotebd.get_onehot(sequences=needonehot, padding=True, padding_window=1500)
    feature_onehot = pd.concat([feature_onehot, tr_onehot],axis=0).reset_index(drop=True)
    feature_onehot.to_feather(cfg.FILE_FEATURE_ONEHOT)

## 6. Build production datasets

In [34]:
# task1
task1 = sprot_latest.iloc[:,np.r_[0,7,1]]
task1.to_feather(cfg.DIR_DATASETS + 'production/task1.feather')

#task2
task2 = sprot_latest[sprot_latest.functionCounts >0].reset_index(drop=True)
task2 = task2.iloc[:,np.r_[0,7,3]]
task2.to_feather(cfg.DIR_DATASETS + 'production/task2.feather')

#task3
task3 = sprot_latest[sprot_latest.functionCounts >0].reset_index(drop=True)
task3 = task3.iloc[:,np.r_[0,7,4]]
task3.to_feather(cfg.DIR_DATASETS + 'production/task3.feather')