# Prepare Datasets for Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2022-10-05  


## Dataset 1. Enzyme / Non-enzyme Dataset
The enzyme dataset consists of two parts: <u>a training set</u> and <u>a testing set</u>.   
The training set is taken from the Feb-2018 snapshot and ***excludes*** the <u>items that were deleted</u> and the <u>items whose sequences changed</u> in the Feb-2022 snapshot.    
The training set consists of ***469,134*** records, of which ***222,567*** are enzymes and ***246,567*** are non-enzymes.   
The testing set is taken from the Feb-2022 snapshot and excludes the items that already appeared in the Feb-2018 snapshot.   
The testing set consists of ***10,614*** records, of which ***5,111*** are enzymes and ***5,503*** are non-enzymes.   
Unlike previous works, we did not filter any sequences by length or homology, in order to make the data more inclusive. Each sequence is given a label: 1 for enzyme and 0 for non-enzyme.   

## Dataset2. Enzyme Function Quantity Dataset
The enzyme function quantity dataset contains only enzyme data, ***222,567*** records in total. The function quantity (the number of EC numbers annotated for an enzyme) ranges from 1 to 8.

## Dataset 3: EC Dataset

The EC dataset consists of 227,678 enzyme records: 222,567 form the training set and the remaining 5,111 form the testing set, covering 6,031 EC numbers. As of Feb 2022, ***compared with [ExplorEnz](https://www.enzyme-database.org/stats.php) (CURRENT EC = 6674)***, there are still 643 EC numbers that the model cannot handle in the benchmarking. Thus, we exclude the sequences with these 267 EC numbers in the evaluation process. However, this problem can be resolved in the production scenario because we use the entire data from Swiss-Prot. The EC coverage is currently 6,031 and can be extended automatically, because training follows each Swiss-Prot release, which is published every 8 weeks.

## 1. Import packages

In [2]:
import numpy as np
import pandas as pd
import sys,os
from tqdm import tqdm
import config as cfg
from functools import reduce

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import funclib
from tools import minitools as mtool

from tools import embdding_onehot as onehotebd

from pandarallel import pandarallel  # import pandarallel (provides the parallel_apply used below)
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 2. Download raw data from UniProt

> If this is the first run, uncomment the disabled download entries in the cell below.

In [2]:
# Swiss-Prot snapshot archives to fetch on this run, as
# (download URL, local archive path) pairs, oldest release first.
# Entries that are commented out are not downloaded; enable them on a
# first run, when the archives are not on disk yet.
snapshot_downloads = [
    # (cfg.URL_SPROT_SNAP201802, cfg.FILE_SPROT_SNAP201802),  # snapshot 2018-02
    (cfg.URL_SPROT_SNAP201902, cfg.FILE_SPROT_SNAP201902),    # snapshot 2019-02
    # (cfg.URL_SPROT_SNAP202006, cfg.FILE_SPROT_SNAP202006),  # snapshot 2020-06
    (cfg.URL_SPROT_SNAP202102, cfg.FILE_SPROT_SNAP202102),    # snapshot 2021-02
    # (cfg.URL_SPROT_SNAP202202, cfg.FILE_SPROT_SNAP202202),  # snapshot 2022-02
]

for snapshot_url, archive_path in snapshot_downloads:
    ftool.wget(download_url=snapshot_url, save_file=archive_path)

wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2019_02/knowledgebase/uniprot_sprot-only2019_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2019_02.tar.gz
wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2021_02/knowledgebase/uniprot_sprot-only2021_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2021_02.tar.gz


## 3. Extract records from rawdata

In [19]:
# Unpack every downloaded release archive into cfg.DIR_UNIPROT.
#
# Each archive unpacks to the same member names (see the tar listing in
# the cell output), so after every extraction the flat file is renamed to
# a per-year name and the members this notebook never reads are removed.

# (year tag used in the renamed file, downloaded archive), oldest first.
snapshot_archives = [
    ('2018', cfg.FILE_SPROT_SNAP201802),
    ('2019', cfg.FILE_SPROT_SNAP201902),
    ('2020', cfg.FILE_SPROT_SNAP202006),
    ('2021', cfg.FILE_SPROT_SNAP202102),
    ('2022', cfg.FILE_SPROT_SNAP202202),
]

# Archive members that are deleted right after extraction.
unused_members = [
    'uniprot_sprot.fasta.gz',
    'uniprot_sprot_varsplic.fasta.gz',
    'uniprot_sprot.xml.gz',
]

cmd_array = []
for year, archive in snapshot_archives:
    cmd_array += [
        f'tar -zxvf {archive} -C {cfg.DIR_UNIPROT}',
        f'mv {cfg.DIR_UNIPROT}uniprot_sprot.dat.gz {cfg.DIR_UNIPROT}sprot{year}.data.gz',
        'rm -f ' + ' '.join(f'{cfg.DIR_UNIPROT}{member}' for member in unused_members),
    ]

# Run the commands in order and stop at the first failure. Because all
# archives share the member name uniprot_sprot.dat.gz, carrying on after a
# failed tar/mv would leave a missing or mislabelled sprotYYYY.data.gz that
# the following cells would then process without any error.
exit_codes = []
for cmd in cmd_array:
    status = os.system(cmd)
    if status != 0:
        raise RuntimeError(f'command failed with status {status}: {cmd}')
    exit_codes.append(status)

# Displayed as the cell result, as before: one exit status per command.
exit_codes

uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [20]:
# Run the extraction task once per snapshot, oldest release first
# (2018-02, 2019-02, 2020-06, 2021-02, 2022-02): each call reads
# sprot<year>.data.gz and writes sprot<year>.tsv in cfg.DIR_UNIPROT.
for snapshot_year in (2018, 2019, 2020, 2021, 2022):
    exactec.run_exact_task(
        infile=f'{cfg.DIR_UNIPROT}sprot{snapshot_year}.data.gz',
        outfile=f'{cfg.DIR_UNIPROT}sprot{snapshot_year}.tsv',
    )

556825it [02:37, 3538.33it/s]


finished use time 157.696 s


559228it [02:42, 3436.71it/s]


finished use time 163.000 s


563972it [03:02, 3087.93it/s]


finished use time 183.079 s


564638it [03:03, 3077.24it/s]


finished use time 183.914 s


567483it [03:11, 2963.10it/s]

finished use time 191.940 s





## 4. Load records & Drop Duplicates

In [21]:
def _load_snapshot(year):
    """Load one extracted snapshot table and drop repeated sequences.

    Reads ``sprot<year>.tsv`` from ``cfg.DIR_UNIPROT``, passes it through
    ``mtool.convert_DF_dateTime`` (date-format conversion), keeps only the
    first record for every distinct ``seq`` value and renumbers the index
    from zero.
    """
    records = pd.read_csv(f'{cfg.DIR_UNIPROT}sprot{year}.tsv', sep='\t', header=0)  # read the file
    records = mtool.convert_DF_dateTime(inputdf=records)
    return records.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)


# Load the data, convert the date format and drop duplicates, per snapshot.
sprot2018 = _load_snapshot(2018)
sprot2019 = _load_snapshot(2019)
sprot2020 = _load_snapshot(2020)
sprot2021 = _load_snapshot(2021)
sprot2022 = _load_snapshot(2022)


In [22]:
# Preview the first three records of the 2018 snapshot.
sprot2018.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154


In [23]:
# Preview the first three records of the 2019 snapshot.
sprot2019.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P03375,ENV_HV1B1,False,False,0,-,0,1986-07-21,1986-07-21,2019-01-16,MRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWK...,856
1,P03356,POL_MLVAV,True,True,6,"3.4.23.-, 2.7.7.49, 2.7.7.7, 3.1.26.4, 2.7.7.-...",4,1986-07-21,2018-01-31,2019-02-13,MGQTVTTPLSLTLEHWEDVQRIASNQSVDVKKRRWVTFCSAEWPTF...,1734
2,P02879,RICI_RICCO,True,False,1,3.2.2.22,4,1986-07-21,1987-08-13,2019-01-16,MKPGGNTIVIWMYAVATWLCFGSTSGWSFTLEDNNIFPKQYPIINF...,576


In [24]:
# Preview the first three records of the 2020 snapshot.
sprot2020.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P03711,SCAF_LAMBD,True,False,1,3.4.21.-,3,1986-07-21,1986-07-21,2020-12-02,MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDA...,439
1,P01027,CO3_MOUSE,False,False,0,-,0,1986-07-21,2011-07-27,2020-12-02,MGPASGSQLLVLLLLLASSPLALGIPMYSIITPNVLRLESEETIVL...,1663
2,P02706,ASGR1_RAT,False,False,0,-,0,1986-07-21,2007-01-23,2020-12-02,MTKDYQDFQHLDNENDHHQLQRGPPPAPRLLQRLCSGFRLFLLSLG...,284


In [25]:
# Preview the first three records of the 2021 snapshot.
sprot2021.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P02802,MT1_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2021-04-07,MDPNCSCSTGGSCTCTSSCACKNCKCTSCKKSCCSCCPVGCSKCAQ...,61
1,P02732,ANP3_PAGBO,False,False,0,-,0,1986-07-21,1986-07-21,2019-12-11,AATAATAATAATAATAATAATAATAATAATA,31
2,P02733,ANP3_PSEAM,False,False,0,-,0,1986-07-21,1986-07-21,2019-12-11,DTASDAAAAAALTAABAAAAAKLTABBAAAAAAATAA,37


In [26]:
# Preview the first three records of the 2022 snapshot.
sprot2022.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P00250,FER_APHSA,False,False,0,-,0,1986-07-21,2007-01-23,2022-05-25,MASYKVTLKTPDGDNVITVPDDEYILDVAEEEGLDLPYSCRAGACS...,97
1,P03420,FUS_HRSVA,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MELLILKANAITTILTAVTFCFASGQNITEEFYQSTCSAVSKGYLS...,574
2,P0ACF7,DBHB_SHIFL,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MNKSQLIDKIAAGADISKAAAGRALDAIIASVTESLKEGDDVALVG...,90


## 5. Preprocessing
### 5.1 format EC

In [27]:
# Normalise the EC annotation of every snapshot and recount the functions.
# mtool.format_ec and mtool.specific_ecs are applied in that order to each
# ec_number string (their exact rules live in tools/minitools -- not shown
# here); functionCounts is then the number of comma-separated EC entries,
# with '-' (no EC number) counted as 0.
for year_label, snapshot in (
    ('2018', sprot2018),
    ('2019', sprot2019),
    ('2020', sprot2020),
    ('2021', sprot2021),
    ('2022', sprot2022),
):
    snapshot['ec_number'] = snapshot.ec_number.parallel_apply(lambda ec: mtool.format_ec(ec))
    snapshot['ec_number'] = snapshot.ec_number.parallel_apply(lambda ec: mtool.specific_ecs(ec))
    snapshot['functionCounts'] = snapshot.ec_number.parallel_apply(
        lambda ec: len(ec.split(',')) if ec != '-' else 0
    )
    print(f'sprot {year_label} finished')

sprot 2018 finished
sprot 2019 finished
sprot 2020 finished
sprot 2021 finished
sprot 2022 finished


In [28]:
# Persist every preprocessed snapshot as a feather file in cfg.DIR_UNIPROT;
# the file name carries the release year and month.
for snapshot, feather_name in (
    (sprot2018, '/snap201802.feather'),
    (sprot2019, '/snap201902.feather'),
    (sprot2020, '/snap202006.feather'),
    (sprot2021, '/snap202102.feather'),
    (sprot2022, '/snap202202.feather'),
):
    snapshot.to_feather(cfg.DIR_UNIPROT + feather_name)

### 5.2 Split Train / Test

In [16]:
# Display the 2018 snapshot (pandas truncates the rendering to head and tail rows).
sprot2018

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154
3,P02194,MYG_MACRU,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNIWGKVETDEGGHGKDVLIRLFKGHPETLEKFDK...,154
4,P01915,HB22_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2017-10-25,MVWLPRVPCVAAVILLLTVLSPPVALVRDTRPRFLEYVTSECHFYN...,264
...,...,...,...,...,...,...,...,...,...,...,...,...
469129,Q21221,AHO3_CAEEL,True,False,1,3.1.2.22,4,2018-02-28,2004-11-23,2018-02-28,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,332
469130,Q6QJ72,PDL2_ARATH,True,False,1,4.2.1.96,4,2018-02-28,2004-07-05,2018-02-28,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,187
469131,C0HL68,ES1GA_ODOGR,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GLFSKPAGKGIKNLIPKGVKHIGKEVGKDVIRTGIDVAGCKIKGEC,46
469132,C0HK74,VKT3_HETMG,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GSICLEPKVVGPCTAYFPRFYFDSETGKCTPFIYGGCEGNGNNFET...,56


In [29]:
# Columns carried into the benchmark tables, selected by name rather than
# by position so the split does not depend on the column order of the TSV.
# This drops name, date_sequence_update and date_annotation_update.
# The ORDER of this list matters: later cells pick columns from train/test
# by position (e.g. iloc[:, [0, 7, 1]] -> id, seq, isenzyme).
DATASET_COLUMNS = [
    'id',
    'isenzyme',
    'isMultiFunctional',
    'functionCounts',
    'ec_number',
    'ec_specific_level',
    'date_integraged',
    'seq',
    'seqlength',
]

# The 2018 snapshot is the training set. The explicit copy makes `train`
# an independent frame, so assigning to its columns later does not write
# into a slice of sprot2018.
train = sprot2018[DATASET_COLUMNS].copy()


def _unseen_sequences(snapshot):
    """Return the records of `snapshot` whose sequence is not in `train`.

    Only DATASET_COLUMNS are kept and the index is renumbered from zero.
    """
    candidates = snapshot[DATASET_COLUMNS]
    return candidates[~candidates.seq.isin(train.seq)].reset_index(drop=True)


# Every later snapshot yields one test set: its sequences absent from train.
test_2019 = _unseen_sequences(sprot2019)
test_2020 = _unseen_sequences(sprot2020)
test_2021 = _unseen_sequences(sprot2021)
test_2022 = _unseen_sequences(sprot2022)

In [33]:
# Display the 2019 test set after removing sequences already present in train.
test_2019

Unnamed: 0,id,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,seq,seqlength
0,P02340,False,False,0,-,0,1986-07-21,MTAMEESQSDISLELPLSQETFSGLWKLLPPEDILPSPHCMDDLLL...,390
1,P01848,False,False,0,-,0,1986-07-21,IQNPDPAVYQLRDSKSSDKSVCLFTDFDSQTNVSQSKDSDVYITDK...,140
2,P01850,False,False,0,-,0,1986-07-21,DLNKVFPPEVAVFEPSEAEISHTQKATLVCLATGFFPDHVELSWWV...,176
3,P00939,True,True,2,"4.2.3.3,5.3.1.1",4,1986-07-21,MAGTGQEAEFRFSAFYISRQRPQPRPHGGTDLQCAGPSAMAPSRKF...,288
4,P01733,False,False,0,-,0,1986-07-21,MDSWTFCCVSLCILVAKHTDAGVIQSPRHEVTEMGQEVTLRCKPIS...,115
...,...,...,...,...,...,...,...,...,...
2877,Q5B8A2,True,False,1,2.5.1.-,3,2019-02-13,MYSKSWILNALPAPLVPYCELTRVGYLPIGVLVSYLPVLVAILHVA...,309
2878,Q5YTV5,True,False,1,1.14.13.-,3,2019-02-13,MIDVIIAGGGPTGLMLAGELRLHGVRTVVLEKEPTPNQHSRSRGLH...,473
2879,B4F779,False,False,0,-,0,2019-02-13,MPAVDKLLLEEALQDSPQTRSLLSVFEEDAGTLTDYTNQLLQAMQR...,662
2880,G4N287,True,False,1,1.-.-.-,1,2019-02-13,MKSFSLLASAGLATLASLPLTMAGVITPSYFDKHPLSRRQLSDAQV...,520


### 5.3 Remove changed sequences from the test sets

In [34]:
# Drop every test record whose id also occurs in train. The test sets
# already exclude sequences found in train, so an id present in both
# tables belongs to an entry whose sequence changed between snapshots.
#
# isin(train.id) selects exactly the ids an inner merge on 'id' would
# return, without materialising the merged full-width table (which
# includes the long `seq` column) just to read one column from it.
test_2019 = test_2019[~test_2019.id.isin(train.id)].reset_index(drop=True)
test_2020 = test_2020[~test_2020.id.isin(train.id)].reset_index(drop=True)
test_2021 = test_2021[~test_2021.id.isin(train.id)].reset_index(drop=True)
test_2022 = test_2022[~test_2022.id.isin(train.id)].reset_index(drop=True)


### 5.4 Trim string

In [35]:
# Strip surrounding whitespace from the EC label and the sequence of every
# split. Values are passed through str() first, so non-string entries are
# stringified. The chained-assignment warning is silenced because the
# frames may be slices of the snapshot tables.
with pd.option_context('mode.chained_assignment', None):
    for split in (train, test_2019, test_2020, test_2021, test_2022):
        for column in ('ec_number', 'seq'):
            split[column] = split[column].parallel_apply(lambda value: str(value).strip())

### 5.5 Save train test

In [36]:
# Persist the cleaned splits as feather files under <cfg.DATADIR>datasets/.
for split, relative_path in (
    (train, 'datasets/train.feather'),
    (test_2019, 'datasets/test_2019.feather'),
    (test_2020, 'datasets/test_2020.feather'),
    (test_2021, 'datasets/test_2021.feather'),
    (test_2022, 'datasets/test_2022.feather'),
):
    split.to_feather(cfg.DATADIR + relative_path)

## 6. Build benchmarking datasets
### 6.1 Task 1 isEnzyme

In [37]:
# Reload the splits from cfg.DIR_DATASETS.
# NOTE(review): the save cell in section 5.5 builds its paths from
# cfg.DATADIR + 'datasets/' -- confirm that equals cfg.DIR_DATASETS.
train = pd.read_feather(cfg.DIR_DATASETS + 'train.feather')
test_2019 = pd.read_feather(cfg.DIR_DATASETS + 'test_2019.feather')
test_2020 = pd.read_feather(cfg.DIR_DATASETS + 'test_2020.feather')
test_2021 = pd.read_feather(cfg.DIR_DATASETS + 'test_2021.feather')
test_2022 = pd.read_feather(cfg.DIR_DATASETS + 'test_2022.feather')

# Task 1 (enzyme vs. non-enzyme) keeps three columns, picked by position:
# 0 -> id, 7 -> seq, 1 -> isenzyme.
task1_columns = [0, 7, 1]
task1_train = train.iloc[:, task1_columns]
task1_test_2019 = test_2019.iloc[:, task1_columns]
task1_test_2020 = test_2020.iloc[:, task1_columns]
task1_test_2021 = test_2021.iloc[:, task1_columns]
task1_test_2022 = test_2022.iloc[:, task1_columns]

# (table, feather target, FASTA target) for every task-1 dataset.
task1_outputs = [
    (task1_train, cfg.FILE_TASK1_TRAIN, cfg.FILE_TASK1_TRAIN_FASTA),
    (task1_test_2019, cfg.FILE_TASK1_TEST_2019, cfg.FILE_TASK1_TEST_2019_FASTA),
    (task1_test_2020, cfg.FILE_TASK1_TEST_2020, cfg.FILE_TASK1_TEST_2020_FASTA),
    (task1_test_2021, cfg.FILE_TASK1_TEST_2021, cfg.FILE_TASK1_TEST_2021_FASTA),
    (task1_test_2022, cfg.FILE_TASK1_TEST_2022, cfg.FILE_TASK1_TEST_2022_FASTA),
]

# Write all feather tables first, then all FASTA files.
for table, feather_path, fasta_path in task1_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task1_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


### 6.2 Task2 Function Counts

In [38]:
def _task2_table(dataset):
    """Build the task-2 table from one split.

    Keeps only the records with at least one EC number
    (functionCounts > 0), renumbers the index from zero and selects the
    columns at positions 0, 7 and 3 (id, seq, functionCounts).
    """
    enzymes = dataset[dataset.functionCounts > 0].reset_index(drop=True)
    return enzymes.iloc[:, [0, 7, 3]]


task2_train = _task2_table(train)
task2_test_2019 = _task2_table(test_2019)
task2_test_2020 = _task2_table(test_2020)
task2_test_2021 = _task2_table(test_2021)
task2_test_2022 = _task2_table(test_2022)

# (table, feather target, FASTA target) for every task-2 dataset.
task2_outputs = [
    (task2_train, cfg.FILE_TASK2_TRAIN, cfg.FILE_TASK2_TRAIN_FASTA),
    (task2_test_2019, cfg.FILE_TASK2_TEST_2019, cfg.FILE_TASK2_TEST_2019_FASTA),
    (task2_test_2020, cfg.FILE_TASK2_TEST_2020, cfg.FILE_TASK2_TEST_2020_FASTA),
    (task2_test_2021, cfg.FILE_TASK2_TEST_2021, cfg.FILE_TASK2_TEST_2021_FASTA),
    (task2_test_2022, cfg.FILE_TASK2_TEST_2022, cfg.FILE_TASK2_TEST_2022_FASTA),
]

# Write all feather tables first, then all FASTA files.
for table, feather_path, fasta_path in task2_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task2_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


### 6.3 Task3 EC Number

In [39]:
def _task3_table(dataset):
    """Build the task-3 table from one split.

    Keeps only the records with at least one EC number
    (functionCounts > 0), renumbers the index from zero and selects the
    columns at positions 0, 7 and 4 (id, seq, ec_number).
    """
    enzymes = dataset[dataset.functionCounts > 0].reset_index(drop=True)
    return enzymes.iloc[:, [0, 7, 4]]


task3_train = _task3_table(train)
task3_test_2019 = _task3_table(test_2019)
task3_test_2020 = _task3_table(test_2020)
task3_test_2021 = _task3_table(test_2021)
task3_test_2022 = _task3_table(test_2022)

# (table, feather target, FASTA target) for every task-3 dataset.
task3_outputs = [
    (task3_train, cfg.FILE_TASK3_TRAIN, cfg.FILE_TASK3_TRAIN_FASTA),
    (task3_test_2019, cfg.FILE_TASK3_TEST_2019, cfg.FILE_TASK3_TEST_2019_FASTA),
    (task3_test_2020, cfg.FILE_TASK3_TEST_2020, cfg.FILE_TASK3_TEST_2020_FASTA),
    (task3_test_2021, cfg.FILE_TASK3_TEST_2021, cfg.FILE_TASK3_TEST_2021_FASTA),
    (task3_test_2022, cfg.FILE_TASK3_TEST_2022, cfg.FILE_TASK3_TEST_2022_FASTA),
]

# Write all feather tables first, then all FASTA files.
for table, feather_path, fasta_path in task3_outputs:
    table.to_feather(feather_path)

for table, feather_path, fasta_path in task3_outputs:
    funclib.table2fasta(table=table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished
Write finished
Write finished


## 7 Make Feature Bank

### 7.1 ESM embedding 

In [12]:
# loading sprot data
snap18 = pd.read_feather(cfg.DIR_UNIPROT + '/snap201802.feather')
snap19 = pd.read_feather(cfg.DIR_UNIPROT + '/snap201902.feather')
snap20 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202006.feather')
snap21 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202102.feather')
snap22 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202202.feather')
# merge
full_snap_data = pd.concat([snap18, snap19,snap20,snap21,snap22], axis=0)
full_snap_data = full_snap_data.sort_values(by=['id', 'date_annotation_update'], ascending=False)
full_snap_data = full_snap_data[['id', 'seq']].drop_duplicates(subset='id', keep='first')
full_snap_data.reset_index(drop=True, inplace=True)


# loading exsisting features
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    feature_onehot = pd.read_feather(cfg.FILE_FEATURE_ONEHOT)
    #caculate embedding list
    needesm = full_snap_data[~full_snap_data.id.isin(list(set(feature_esm33.id)))]
    needunirep = full_snap_data[~full_snap_data.id.isin(list(set(feature_unirep.id)))]
    needonehot = full_snap_data[~full_snap_data.id.isin(list(set(feature_onehot.id)))]
else:
    needesm = full_snap_data
    needunirep = full_snap_data
    needonehot = full_snap_data



In [3]:
# Reload the feature table stored at cfg.FILE_FEATURE_ESM33 for inspection.
feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)

In [4]:
# Display the table: one row per id with feature columns f1..f1280.
feature_esm33

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f1271,f1272,f1273,f1274,f1275,f1276,f1277,f1278,f1279,f1280
0,X6R8R1,0.122700,0.226933,0.159202,0.120144,0.011594,-0.004360,-0.383417,-0.000875,-0.251191,...,0.380638,-0.105144,-0.204226,-0.034873,-0.375223,0.041795,0.045982,0.112148,-0.136823,-0.061542
1,X6R8D5,0.030997,0.157779,-0.004480,0.116845,-0.090119,-0.053348,-0.197258,-0.057675,0.012594,...,0.151833,-0.146131,-0.054271,0.032418,-1.204727,-0.032018,0.151907,0.079865,0.043225,0.160457
2,X5M8U1,-0.027750,-0.048004,0.102815,0.125720,-0.017340,-0.098527,-0.124109,0.129336,0.090217,...,0.127853,-0.082465,-0.074933,0.137480,0.284578,0.030022,-0.075125,-0.023453,0.061830,-0.028551
3,X5M5N0,-0.012435,0.093123,0.088707,0.037654,0.014413,-0.085524,-0.048758,0.130967,-0.041706,...,0.075575,0.033063,-0.125976,0.072138,-0.569141,-0.046442,-0.018624,-0.064436,-0.048540,0.240793
4,X5JB51,-0.038531,0.160082,0.201200,0.057422,0.046277,-0.066395,-0.031294,0.076849,-0.002647,...,0.211565,-0.042334,-0.122524,0.027503,-0.553735,0.100470,0.212672,0.003233,-0.216341,0.085664
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503264,A0A0N9NCU6,0.052276,0.212493,-0.030868,0.041476,-0.233532,-0.201280,0.027373,-0.098918,-0.124127,...,0.102771,0.010940,-0.196845,0.098058,-0.826236,-0.086817,0.071059,-0.029248,-0.149321,0.148098
503265,A0A0J9YXY3,0.035284,0.050764,-0.137996,0.024225,0.030510,-0.225019,-0.275360,0.057804,0.024257,...,0.270679,-0.187581,-0.155288,-0.252891,-0.184232,0.107679,0.000649,0.094124,-0.130814,-0.276972
503266,A0A0H3N8G7,0.116379,0.007588,0.021867,0.050561,-0.082611,-0.087392,0.109281,-0.059609,-0.083992,...,0.042052,0.015968,-0.084732,0.047217,-0.922598,0.037985,0.037964,-0.026476,0.083009,0.065470
503267,A0A0H3K6Z6,0.134652,0.084429,-0.112628,0.101439,-0.180682,-0.106040,0.069843,0.045752,-0.034431,...,0.107869,-0.029719,-0.011126,0.062325,-0.827122,-0.001456,0.076949,0.056494,-0.101931,0.127485


In [4]:
# !pip install fair-esm
from tools import embedding_esm as esmebd
if len(needesm)>0:
    tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=needesm, model='esm1b_t33_650M_UR50S',seqthres=1022)

    #merge existing
    feature_esm0 = pd.concat([feature_esm0, tr_rep0], axis=0).reset_index(drop=True)
    feature_esm32 = pd.concat([feature_esm32, tr_rep32], axis=0).reset_index(drop=True)
    feature_esm33 = pd.concat([feature_esm33, tr_rep33], axis=0).reset_index(drop=True)


    #save
    feature_esm0.to_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32.to_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33.to_feather(cfg.FILE_FEATURE_ESM33)

Transferred model to GPU


100%|██████████| 5249/5249 [03:26<00:00, 25.37it/s]


### 7.2 Unirep

In [7]:
# Embed only the records that have no stored UniRep features yet.
if len(needunirep) > 0:
    from tools import embedding_unirep as unirep

    # NOTE(review): the second argument (40) looks like a batch/step size
    # -- confirm in tools/embedding_unirep.
    tr_unirep = unirep.getunirep(needunirep, 40)

    # Append the new rows, renumber the index and write the bank to disk.
    feature_unirep = pd.concat([feature_unirep, tr_unirep], axis=0, ignore_index=True)
    feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)


  train_h_avg, train_h_final, train_c_final= get_reps(list(enzyme_noemzyme.seq[i:i+step]))
100%|██████████| 55/55 [55:15<00:00, 60.28s/it]

length not match





### 7.3 one-hot

In [11]:
# NOTE(review): these two assignments discard any cached one-hot bank and
# any pending-record list built by the loading cell, so the one-hot
# features are rebuilt for the whole merged snapshot table on every run
# -- confirm this is intended.
needonehot = full_snap_data
feature_onehot = pd.DataFrame()

if len(needonehot) > 0:
    # NOTE(review): the result is stored under the name `tr_unirep`,
    # which overwrites the UniRep variable of the previous section.
    tr_unirep = onehotebd.get_onehot(
        sequences=needonehot,
        padding=True,
        padding_window=1500,
    )
    feature_onehot = pd.concat([feature_onehot, tr_unirep], axis=0, ignore_index=True)
    feature_onehot.to_feather(cfg.FILE_FEATURE_ONEHOT)

In [1]:
import xml.etree.ElementTree as et 

In [15]:
# Try pandas' XML reader on the ExplorEnz dump; the output shows it returns
# a single top-level row, and the cells below parse the file with ElementTree.
pd.read_xml(f'{cfg.DATADIR}explorenz/enzyme-data.xml')

Unnamed: 0,name,table_structure,table_data
0,enzymeda_1,,


In [3]:
# Parse the ExplorEnz XML dump and keep a handle on its root element.
explorenz_xml = f'{cfg.DATADIR}explorenz/enzyme-data.xml'
xtree = et.parse(explorenz_xml)
xroot = xtree.getroot()

In [14]:
# Exploratory peek at the direct children of the XML root: print the
# attribute names and the 'row' attribute of at most the first 14 nodes.
# `counter` is always one more than the number of nodes printed so far.
counter = 1
for counter, node in enumerate(xroot, start=2):
    print(dir(node))
    print(node.attrib.get('row'))

    if counter == 15:
        break

['__class__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', 'append', 'attrib', 'clear', 'extend', 'find', 'findall', 'findtext', 'get', 'getchildren', 'getiterator', 'insert', 'items', 'iter', 'iterfind', 'itertext', 'keys', 'makeelement', 'remove', 'set', 'tag', 'tail', 'text']
None
