# Prepare Production Data For Web-Service

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2023-02-02  

## 1. Import packages

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import config as cfg
from functools import reduce

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import funclib
from tools import minitools as mtool
from tools import embedding_esm as esmebd
from tools import embdding_onehot as onehotebd

from pandarallel import pandarallel # pandarallel supplies the Series.parallel_apply used in the preprocessing cell
# Initialise pandarallel once per kernel, with per-worker progress bars turned off.
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Download the latest Swiss-Prot release from UniProt

In [2]:
# Fetch the archive behind cfg.URL_SPROT_LATEST and store it at
# cfg.FILE_SPROT_LATEST (the download location).
ftool.wget(
    save_file=cfg.FILE_SPROT_LATEST,
    download_url=cfg.URL_SPROT_LATEST,
)

wget -q https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot_leatest.dat.gz


## 3. Extract records from the raw data

In [3]:
# Run the extraction task on the downloaded archive; the result is written
# to sprot_latest.tsv under cfg.DIR_UNIPROT and read back in section 4.
exactec.run_exact_task(
    infile=cfg.FILE_SPROT_LATEST,
    outfile=f'{cfg.DIR_UNIPROT}sprot_latest.tsv',
)

568744it [03:47, 2499.02it/s]

finished use time 227.108 s





## 4. Preprocessing

In [5]:
# Load the extracted table and convert its date/time format
# (conversion is delegated to mtool.convert_DF_dateTime).
sprot_latest = pd.read_csv(f'{cfg.DIR_UNIPROT}sprot_latest.tsv', sep='\t', header=0)
sprot_latest = mtool.convert_DF_dateTime(inputdf=sprot_latest)

# Columns kept for the web service. These are the names of the positions
# formerly selected with np.r_[0, 2:7, 10:12] (see the preview below); the
# order matters because section 6 builds the task tables from this frame.
keep_columns = [
    'id',
    'isenzyme',
    'isMultiFunctional',
    'functionCounts',
    'ec_number',
    'ec_specific_level',
    'seq',
    'seqlength',
]

# Drop duplicated sequences (first occurrence wins), renumber the rows and
# keep only the needed columns. The explicit .copy() makes the result an
# independent frame, so the column assignments below are ordinary writes and
# no chained-assignment warning has to be suppressed.
sprot_latest = (
    sprot_latest
    .drop_duplicates(subset=['seq'], keep='first')
    .reset_index(drop=True)
    .loc[:, keep_columns]
    .copy()
)

# Trim surrounding whitespace. str(x) is kept on purpose so that non-string
# cells (e.g. NaN) are stringified exactly as before.
sprot_latest['ec_number'] = sprot_latest.ec_number.parallel_apply(lambda x: str(x).strip())  # EC trim
sprot_latest['seq'] = sprot_latest.seq.parallel_apply(lambda x: str(x).strip())  # sequence trim

# Format EC numbers
sprot_latest['ec_number'] = sprot_latest.ec_number.parallel_apply(lambda x: mtool.format_ec(x))
# Assign the EC at the most specific level (per the original author's note).
sprot_latest['ec_number'] = sprot_latest.ec_number.parallel_apply(lambda x: mtool.specific_ecs(x))
# '-' marks an entry without EC number; otherwise count the comma-separated ECs.
sprot_latest['functionCounts'] = sprot_latest.ec_number.parallel_apply(lambda x: 0 if x=='-'  else len(x.split(',')))

# Save the cleaned table; it is reloaded in section 5.1.
sprot_latest.to_feather(cfg.DIR_UNIPROT + 'sprot_latest.feather')

sprot_latest.head(3)

Unnamed: 0,id,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,seq,seqlength
0,P02762,False,False,0,-,0,MKMLLLLCLGLTLVCVHAEEASSTGRNFNVEKINGEWHTIILASDK...,180
1,P58773,False,False,0,-,0,MDAIKKKMQMLKLDKENALDRAEQAEADKKAAEERSKQLEDELVAL...,284
2,P68925,False,False,0,-,0,MKNGFYATYRSKNKGKDKRSINLSVFLNSLLADNHHLQVGSNYLYI...,279


## 5. Update Feature Bank
### 5.1 Load Existing Features

In [26]:
# Load the preprocessed Swiss-Prot table written in section 4.
sprot_latest = pd.read_feather(cfg.DIR_UNIPROT + 'sprot_latest.feather')

# Load the cached feature tables. The presence of the ESM0 file is used as
# the marker that the whole feature bank exists.
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    feature_onehot = pd.read_feather(cfg.FILE_FEATURE_ONEHOT)
    # Entries whose id is not yet present in a bank still need embedding.
    # ESM coverage is judged from the esm33 table only.
    needesm = sprot_latest[~sprot_latest.id.isin(feature_esm33.id)]
    needunirep = sprot_latest[~sprot_latest.id.isin(feature_unirep.id)]
    needonehot = sprot_latest[~sprot_latest.id.isin(feature_onehot.id)]
else:
    # No feature bank yet: everything must be embedded. The bank variables
    # are still defined (as None) because the following sections pass them to
    # pd.concat, which silently drops None entries; leaving them undefined
    # raised NameError on a first run.
    feature_esm0 = feature_esm32 = feature_esm33 = None
    feature_unirep = None
    feature_onehot = None
    needesm = sprot_latest
    needunirep = sprot_latest
    needonehot = sprot_latest

### 5.2 ESM embedding 

In [9]:
# !pip install fair-esm
# Embed only the entries that are missing from the ESM bank.
if len(needesm)>0:
    # seqthres=1022 is passed through to the helper; presumably the maximum
    # sequence length fed to the model -- TODO confirm in tools.embedding_esm.
    tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=needesm, model='esm1b_t33_650M_UR50S',seqthres=1022)

    # Merge the new rows into the bank of the SAME representation.
    # (Previously rep32 and rep33 were concatenated onto feature_esm0, which
    # replaced those two banks with rep0 data plus the new rows.)
    feature_esm0 = pd.concat([feature_esm0, tr_rep0], axis=0).reset_index(drop=True)
    feature_esm32 = pd.concat([feature_esm32, tr_rep32], axis=0).reset_index(drop=True)
    feature_esm33 = pd.concat([feature_esm33, tr_rep33], axis=0).reset_index(drop=True)

    # Persist the updated banks.
    feature_esm0.to_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32.to_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33.to_feather(cfg.FILE_FEATURE_ESM33)

100%|██████████| 1963/1963 [19:21<00:00,  1.69it/s]


### 5.3 Unirep Embedding

In [15]:
# Embed only the entries that are missing from the UniRep bank.
if len(needunirep) > 0:
    # Imported here rather than in the import cell, as in the original notebook.
    from tools import embedding_unirep as unirep

    # 200 is the helper's second positional argument; its meaning is not
    # visible in this notebook -- TODO confirm in tools.embedding_unirep.
    tr_unirep = unirep.getunirep(needunirep, 200)

    # Append the new rows to the bank with a fresh 0..n-1 index, then persist.
    feature_unirep = pd.concat(
        [feature_unirep, tr_unirep],
        axis=0,
        ignore_index=True,
    )
    feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
100%|██████████| 6/6 [36:29<00:00, 364.92s/it]

length not match





### 5.4 One-hot

In [19]:
# Encode only the entries that are missing from the one-hot bank.
if len(needonehot) > 0:
    tr_onehot = onehotebd.get_onehot(
        sequences=needonehot,
        padding=True,
        padding_window=1500,
    )

    # Append the new rows to the bank with a fresh 0..n-1 index, then persist.
    feature_onehot = pd.concat(
        [feature_onehot, tr_onehot],
        axis=0,
        ignore_index=True,
    )
    feature_onehot.to_feather(cfg.FILE_FEATURE_ONEHOT)

## 6. Build production datasets

In [34]:
# Columns are selected by name instead of by position. The names correspond
# to the former positional picks on the 8-column frame previewed in section 4:
#   [0, 7, 1] -> id, seqlength, isenzyme
#   [0, 7, 3] -> id, seqlength, functionCounts
#   [0, 7, 4] -> id, seqlength, ec_number
# NOTE(review): position 7 is `seqlength`, not `seq` (position 6) -- confirm
# that the sequence length, not the sequence itself, is what the tasks need.
production_dir = cfg.DIR_DATASETS + 'production/'

# task1: every entry, labelled with the `isenzyme` column.
task1 = sprot_latest[['id', 'seqlength', 'isenzyme']]
task1.to_feather(production_dir + 'task1.feather')

# task2 and task3 share the same subset: entries with at least one EC number.
enzymes = sprot_latest[sprot_latest.functionCounts > 0].reset_index(drop=True)

# task2: number of EC numbers per entry.
task2 = enzymes[['id', 'seqlength', 'functionCounts']]
task2.to_feather(production_dir + 'task2.feather')

# task3: the EC numbers themselves.
task3 = enzymes[['id', 'seqlength', 'ec_number']]
task3.to_feather(production_dir + 'task3.feather')