# Construct Datasets for benchmarking tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2022-10-05  

## Dataset 1. Enzyme / Non-enzyme Dataset
The enzyme dataset consists of two parts: a training set and a testing set.   
The training set is taken from the Feb-2018 snapshot and excludes items that were deleted in the Feb-2022 snapshot.    
The training set consists of 467,973 records, of which 222,290 are enzymes and 245,683 are non-enzymes.   
The testing set is taken from the Feb-2022 snapshot and excludes items that already appeared in the Feb-2018 snapshot.   
The testing set consists of 8,033 records, of which 3,579 are enzymes and 4,454 are non-enzymes.   
Unlike previous works, we did not filter sequences by length or homology, so that the data are more inclusive. Each sequence is labelled 1 for enzyme and 0 for non-enzyme.   

## Dataset2. Enzyme Quantity Dataset
The enzyme quantity dataset contains only enzyme data and comprises 13,108 records. The number of functions per enzyme ranges from 2 to 10.

## Dataset 3: EC number Dataset

Similar to the enzyme quantity dataset, the EC number dataset consists of 225,221 enzyme records (221,642 in the training set and the remaining 3,579 in the testing set), covering 4,852 EC numbers. Up to Feb 2020, there are still 267 EC numbers that the model cannot handle in the benchmark, so sequences with these 267 EC numbers are excluded from the evaluation. This problem does not arise in the production scenario, because there we use the entire Swiss-Prot data. The EC coverage is now 5,307 and is extended automatically, because the model is retrained each time Swiss-Prot publishes a new release (every 8 weeks). 

## 1. Import packages

In [2]:
import numpy as np
import pandas as pd
import sys,os
from tqdm import tqdm
import config as cfg
from functools import reduce

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import funclib
from tools import minitools as mtool
from tools import embedding_esm as esmebd

from pandarallel import pandarallel # import pandarallel; the cells below call .parallel_apply on pandas Series
pandarallel.initialize(progress_bar=False)  # start the worker pool with the progress bar disabled


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 104 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Define Functions

In [2]:
def refill_ec(ec):
    """Normalise one EC number to four dot-separated levels.

    Empty levels are replaced by '-' and missing trailing levels are padded
    with '-', e.g. '3.1.2.' -> '3.1.2.-' and '3.1.2' -> '3.1.2.-'.

    Args:
        ec: A single EC number string, or '-' meaning "no EC number".

    Returns:
        The normalised EC string. '-' and the empty string are returned
        unchanged.
    """
    if not ec or ec == '-':
        return ec
    levels = [level if level else '-' for level in ec.split('.')]
    # Pad short EC numbers rather than failing with IndexError on levels[3].
    levels += ['-'] * (4 - len(levels))
    return '.'.join(levels)

def specific_ecs(ecstr):
    """Drop partial EC numbers that are superseded by a more specific one.

    A partial EC (one containing '-' levels, e.g. '2.3.1.-') is removed only
    when another EC *in the same string* shares all of its specified leading
    levels and defines the first level that the partial EC leaves open
    (e.g. '2.3.1.286'). Fully specified ECs are always kept.

    The comparison is done per EC number. Checking each level against the
    pooled levels of all other ECs would wrongly drop '2.3.1.-' next to
    '2.4.1.5' and '1.3.1.7', although neither of them refines it.

    Args:
        ecstr: Comma-separated EC numbers (no surrounding blanks), or '-'.

    Returns:
        The retained EC numbers joined with ',', in their original order.
        Strings without '-', shorter than 4 characters, or holding a single
        EC number are returned unchanged.
    """
    if '-' not in ecstr or len(ecstr) < 4:
        return ecstr
    ecs = ecstr.split(',')
    if len(ecs) == 1:
        return ecstr

    all_levels = [ec.split('.') for ec in ecs]
    kept = []
    for idx, (ec, levels) in enumerate(zip(ecs, all_levels)):
        if '-' not in levels:
            kept.append(ec)
            continue
        # Position of the first unspecified level; everything before it is
        # the prefix a more specific EC has to share.
        depth = levels.index('-')
        prefix = levels[:depth]
        superseded = any(
            len(other) > depth and other[:depth] == prefix and other[depth] != '-'
            for pos, other in enumerate(all_levels)
            if pos != idx
        )
        if not superseded:
            kept.append(ec)
    return ','.join(kept)

def format_ec(ecstr):
    """Normalise a comma-separated EC number string.

    Each entry is stripped of surrounding whitespace and completed with
    `refill_ec`; duplicate entries are then removed.

    Args:
        ecstr: Comma-separated EC numbers, e.g. '2.3.1.-, 2.3.1.286'.

    Returns:
        The normalised entries joined with ',' in first-occurrence order.
    """
    entries = [refill_ec(part.strip()) for part in ecstr.split(',')]
    # dict.fromkeys removes duplicates while keeping the input order;
    # list(set(...)) made the order of the joined string vary between runs.
    return ','.join(dict.fromkeys(entries))

## 3. Download raw data from UniProt

In [3]:
# Fetch both release archives in order: snapshot 2018-02, then snapshot 2022-02.
for _url, _archive in (
    (cfg.URL_SPROT_SNAP201802, cfg.FILE_SPROT_SNAP201802),
    (cfg.URL_SPROT_SNAP202202, cfg.FILE_SPROT_SNAP202202),
):
    ftool.wget(download_url=_url, save_file=_archive)

wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2018_02/knowledgebase/uniprot_sprot-only2018_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2018_02.tar.gz
wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_releases/release-2022_02/knowledgebase/uniprot_sprot-only2022_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2022_02.tar.gz


## 4. Extract records from rawdata

In [4]:
# Shell commands that unpack each archive into cfg.DIR_UNIPROT, rename the
# flat file (uniprot_sprot.dat.gz) to a snapshot-specific name and delete the
# other extracted members. Both archives extract to the same file names, so
# the 2018 steps have to succeed before the 2022 archive is unpacked.
cmd_array = [
    # 2018 data
    f'tar -zxvf {cfg.FILE_SPROT_SNAP201802} -C {cfg.DIR_UNIPROT}',
    f'mv {cfg.DIR_UNIPROT}uniprot_sprot.dat.gz {cfg.DIR_UNIPROT}sprot2018.data.gz', 
    f'rm -f {cfg.DIR_UNIPROT}uniprot_sprot.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot_varsplic.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot.xml.gz',
    # 2022 data
    f'tar -zxvf {cfg.FILE_SPROT_SNAP202202} -C {cfg.DIR_UNIPROT}',
    f'mv {cfg.DIR_UNIPROT}uniprot_sprot.dat.gz {cfg.DIR_UNIPROT}sprot2022.data.gz', 
    f'rm -f {cfg.DIR_UNIPROT}uniprot_sprot.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot_varsplic.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot.xml.gz'
]

# Run the commands one by one and stop at the first non-zero exit status, so
# a failed step cannot silently mix files from the two snapshots.
exit_codes = []
for item in cmd_array:
    status = os.system(item)
    if status != 0:
        raise RuntimeError(f'command failed with exit status {status}: {item}')
    exit_codes.append(status)

exit_codes

uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz


[0, 0, 0, 0, 0, 0]

In [34]:
# (flat-file input, TSV output) for snapshot 2018-02 and snapshot 2022-02.
_snapshot_io = (
    (f'{cfg.DIR_UNIPROT}sprot2018.data.gz', f'{cfg.DIR_UNIPROT}sprot2018.tsv'),
    (f'{cfg.DIR_UNIPROT}sprot2022.data.gz', f'{cfg.DIR_UNIPROT}sprot2022.tsv'),
)
for _dat_path, _tsv_path in _snapshot_io:
    exactec.run_exact_task(infile=_dat_path, outfile=_tsv_path)

556825it [03:03, 3034.27it/s]


finished use time 183.900 s


567483it [03:39, 2581.31it/s]

finished use time 220.102 s





## 5. Load records & Drop Duplicates

In [5]:
def _load_snapshot(tsv_path):
    """Read one extracted snapshot table and keep the first row per sequence."""
    # Load the file and convert the time format
    frame = pd.read_csv(tsv_path, sep='\t', header=0)
    frame = mtool.convert_DF_dateTime(inputdf=frame)
    # Drop rows whose 'seq' value already occurred, then renumber the index
    return frame.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)


sprot2018 = _load_snapshot(f'{cfg.DIR_UNIPROT}sprot2018.tsv')
sprot2022 = _load_snapshot(f'{cfg.DIR_UNIPROT}sprot2022.tsv')


In [50]:
# Notebook preview of the 2018 snapshot table.
sprot2018

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154
3,P02194,MYG_MACRU,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNIWGKVETDEGGHGKDVLIRLFKGHPETLEKFDK...,154
4,P01915,HB22_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2017-10-25,MVWLPRVPCVAAVILLLTVLSPPVALVRDTRPRFLEYVTSECHFYN...,264
...,...,...,...,...,...,...,...,...,...,...,...,...
469129,Q21221,AHO3_CAEEL,True,False,1,3.1.2.22,4,2018-02-28,2004-11-23,2018-02-28,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,332
469130,Q6QJ72,PDL2_ARATH,True,False,1,4.2.1.96,4,2018-02-28,2004-07-05,2018-02-28,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,187
469131,C0HL68,ES1GA_ODOGR,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GLFSKPAGKGIKNLIPKGVKHIGKEVGKDVIRTGIDVAGCKIKGEC,46
469132,C0HK74,VKT3_HETMG,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GSICLEPKVVGPCTAYFPRFYFDSETGKCTPFIYGGCEGNGNNFET...,56


In [6]:
# Notebook preview of the 2022 snapshot table.
sprot2022

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P00250,FER_APHSA,False,False,0,-,0,1986-07-21,2007-01-23,2022-05-25,MASYKVTLKTPDGDNVITVPDDEYILDVAEEEGLDLPYSCRAGACS...,97
1,P03420,FUS_HRSVA,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MELLILKANAITTILTAVTFCFASGQNITEEFYQSTCSAVSKGYLS...,574
2,P0ACF7,DBHB_SHIFL,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MNKSQLIDKIAAGADISKAAAGRALDAIIASVTESLKEGDDVALVG...,90
3,P01901,HA1B_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MVPCTLLLLLAAALAPTQTRAGPHSLRYFVTAVSRPGLGEPRYMEV...,369
4,P01245,SOMA_HORSE,False,False,0,-,0,1986-07-21,1995-11-01,2022-05-25,MAAGPRTSVLLAFGLLCLPWPQDVGAFPAMPLSSLFANAVLRAQHL...,216
...,...,...,...,...,...,...,...,...,...,...,...,...
479421,A0A2K5TU92,SIR6_MACFA,True,True,3,"2.3.1.-, 2.3.1.286, 2.4.2.-",4,2022-05-25,2021-06-02,2022-05-25,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVWQSSH...,355
479422,A0A3R0A696,ARAFA_BIFL2,True,False,1,3.2.1.55,4,2022-05-25,2019-04-10,2022-05-25,MKHWKKMAASLIAISTMVAVVPTTYAMESEDSQPQTTDTATVQTTK...,1065
479423,Q5ZV91,DOTZ_LEGPH,False,False,0,-,0,2022-05-25,2004-11-23,2022-05-25,MDEIKKDDELSQWLSTYGTITAERILGRYNISLPQDEILEAINIPS...,294
479424,M1H607,TPM_PORPE,False,False,0,-,0,2022-05-25,2013-05-01,2022-05-25,MDAIKKKMQAMKLEKDDAMDRADTLEQQNKEANIRAEKAEEEVHNL...,284


## 6. Preprocessing
### 6.1 format EC

In [7]:
def _count_functions(ecs):
    """Return 0 for the '-' placeholder, otherwise the number of comma-separated ECs."""
    return 0 if ecs == '-' else len(ecs.split(','))


# Same three steps for each snapshot: normalise the EC strings, drop
# superseded partial ECs, then recount the functions per record.
for _frame, _done_msg in (
    (sprot2018, 'sprot 2018 finished'),
    (sprot2022, 'sprot 2022 finished'),
):
    _frame['ec_number'] = _frame.ec_number.parallel_apply(format_ec)
    _frame['ec_number'] = _frame.ec_number.parallel_apply(specific_ecs)
    _frame['functionCounts'] = _frame.ec_number.parallel_apply(_count_functions)
    print(_done_msg)

sprot 2018 finished
sprot 2022 finished


In [13]:
# Persist both snapshot tables as Feather files.
for _frame, _fname in (
    (sprot2018, '/snap201802.feather'),
    (sprot2022, '/snap202202.feather'),
):
    _frame.to_feather(cfg.DIR_UNIPROT + _fname)

### 6.2 Split Train Test

In [8]:
# Column positions kept for both splits: 0, 2-7, 10 and 11.
# NOTE(review): judging by the table previews these are presumably id,
# isenzyme ... date_integraged, seq and seqlength. Confirm against the TSV header.
_keep_cols = [0, *range(2, 8), 10, 11]

train = sprot2018.iloc[:, _keep_cols]
# Test candidates: 2022 records whose sequence does not occur in the training set.
test = sprot2022.iloc[:, _keep_cols]
test = test[~test.seq.isin(train.seq)].reset_index(drop=True)

### 6.3 Remove changed sequences in test set

In [10]:
# Drop test records whose id also occurs in the training set. A direct
# membership test selects the same ids as the inner merge on 'id', without
# materialising the joined table.
test = test[~test.id.isin(train.id)].reset_index(drop=True)

### 6.4 Trim string

In [11]:
# Strip surrounding whitespace from the EC and sequence strings of both splits
# (train first, then test). The chained-assignment warning is switched off for
# the duration of the assignments.
with pd.option_context('mode.chained_assignment', None):
    for _frame in (train, test):
        for _col in ('ec_number', 'seq'):
            _frame[_col] = _frame[_col].parallel_apply(lambda x: str(x).strip())

### 6.5 Save train test

In [12]:
# Write the two splits below cfg.DATADIR.
for _frame, _rel_path in (
    (train, 'datasets/train.feather'),
    (test, 'datasets/test.feather'),
):
    _frame.to_feather(cfg.DATADIR + _rel_path)

## 7. Build benchmarking datasets
### 7.1 Task 1 isEnzyme

In [3]:
# Reload the saved splits from cfg.DATADIR.
train, test = (
    pd.read_feather(cfg.DATADIR + _rel_path)
    for _rel_path in ('datasets/train.feather', 'datasets/test.feather')
)

In [6]:
# Task 1 keeps column positions 0, 7 and 1, in that order.
# NOTE(review): presumably id, seq and isenzyme. Confirm against the split's columns.
_task1_cols = [0, 7, 1]
task1_train = train.iloc[:, _task1_cols]
task1_test = test.iloc[:, _task1_cols]

for _frame, _rel_path in (
    (task1_train, 'task1/train.feather'),
    (task1_test, 'task1/test.feather'),
):
    _frame.to_feather(cfg.DATADIR + _rel_path)

### 7.2 Task2 Function Counts

In [8]:
# Task 2 keeps rows with at least one function and column positions 0, 7, 3.
# NOTE(review): presumably id, seq and functionCounts. Confirm against the split's columns.
# NOTE(review): the dataset description speaks of 2 to 10 functions per record,
# while this filter keeps every record with functionCounts > 0. Confirm which
# one is intended.
_task2_cols = [0, 7, 3]

task2_train = train[train.functionCounts > 0].reset_index(drop=True).iloc[:, _task2_cols]
task2_test = test[test.functionCounts > 0].reset_index(drop=True).iloc[:, _task2_cols]

for _frame, _rel_path in (
    (task2_train, 'task2/train.feather'),
    (task2_test, 'task2/test.feather'),
):
    _frame.to_feather(cfg.DATADIR + _rel_path)


### 7.3 Task3 EC Number

In [4]:
# Task 3 keeps rows with at least one function and column positions 0, 7, 4.
# NOTE(review): presumably id, seq and ec_number. Confirm against the split's columns.
_task3_cols = [0, 7, 4]

task3_train = train[train.functionCounts > 0].reset_index(drop=True).iloc[:, _task3_cols]
task3_test = test[test.functionCounts > 0].reset_index(drop=True).iloc[:, _task3_cols]

for _frame, _rel_path in (
    (task3_train, 'task3/train.feather'),
    (task3_test, 'task3/test.feather'),
):
    _frame.to_feather(cfg.DATADIR + _rel_path)

## 8 Make Feature Bank

### 8.1 ESM embedding 

In [2]:
# loading sprot data
snap18 = pd.read_feather(cfg.DIR_UNIPROT + '/snap201802.feather')
snap22 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202202.feather')
# merge both snapshots and keep one (id, seq) row per id: after the descending
# sort, the first row of each id carries its largest date_annotation_update
full_snap_data = pd.concat([snap18, snap22], axis=0)
full_snap_data = full_snap_data.sort_values(by=['id', 'date_annotation_update'], ascending=False)
full_snap_data = full_snap_data[['id', 'seq']].drop_duplicates(subset='id', keep='first')
full_snap_data.reset_index(drop=True, inplace=True)


# loading existing features
# NOTE(review): only the ESM0 file is checked, but all four feature files are
# read. Confirm they are always written together.
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    # only ids without a stored embedding still need to be computed
    needesm = full_snap_data[~full_snap_data.id.isin(feature_esm33.id)]
    needunirep = full_snap_data[~full_snap_data.id.isin(feature_unirep.id)]
else:
    # No feature bank yet: define the feature tables as None so that later
    # pd.concat([feature_x, new_features]) calls work (pd.concat drops None
    # entries) instead of failing with NameError.
    feature_esm0 = feature_esm32 = feature_esm33 = feature_unirep = None
    needesm = full_snap_data
    needunirep = full_snap_data



In [35]:
# !pip install fair-esm
# Embed the sequences that have no stored ESM features yet.
# NOTE(review): the three results are presumably the layer-0, layer-32 and
# layer-33 representations. Confirm in tools.embedding_esm.
tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=needesm, model='esm1b_t33_650M_UR50S',seqthres=1022)

# merge existing: each new table is appended to its own feature bank
# (esm32 and esm33 were previously concatenated onto feature_esm0 by mistake)
feature_esm0 = pd.concat([feature_esm0, tr_rep0], axis=0).reset_index(drop=True)
feature_esm32 = pd.concat([feature_esm32, tr_rep32], axis=0).reset_index(drop=True)
feature_esm33 = pd.concat([feature_esm33, tr_rep33], axis=0).reset_index(drop=True)

#save
feature_esm0.to_feather(cfg.FILE_FEATURE_ESM0)
feature_esm32.to_feather(cfg.FILE_FEATURE_ESM32)
feature_esm33.to_feather(cfg.FILE_FEATURE_ESM33)

Transferred model to GPU


100%|██████████| 17710/17710 [11:28<00:00, 25.73it/s]


In [3]:
from tools import embedding_unirep as unirep

# Embed the sequences that have no stored UniRep features yet.
# NOTE(review): the meaning of the second argument (40) is not visible here.
# Confirm in tools.embedding_unirep.
tr_unirep = unirep.getunirep(needunirep, 40)

# Append to the existing bank, renumber the rows and write the file back.
feature_unirep = pd.concat([feature_unirep, tr_unirep], axis=0)
feature_unirep = feature_unirep.reset_index(drop=True)
feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)


100%|██████████| 459/459 [2:10:53<00:00, 17.11s/it]  

length not match



