# Prepare Datasets for Benchmarking Tasks

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2022-10-05  


## Dataset 1: Enzyme / Non-enzyme Dataset
The enzyme dataset consists of two parts: <u>a training set</u> and <u>a testing set</u>.   
The training set is taken from the Feb-2018 snapshot and ***excludes*** the <u>deleted items</u> and the <u>items whose sequences changed</u> in the Feb-2022 snapshot.    
The training set consists of ***469,134*** records, of which ***222,567*** are enzymes and ***246,567*** are non-enzymes.   
The testing set is taken from the Feb-2022 snapshot and excludes the items that already appeared in the Feb-2018 snapshot.   
The testing set consists of ***10,614*** records, of which ***5,111*** are enzymes and ***5,503*** are non-enzymes.   
Unlike previous works, we did not filter any sequences by length or homology, in order to make the data more inclusive. Each sequence is given a label: 1 for enzyme and 0 for non-enzyme.   

## Dataset2. Enzyme Function Quantity Dataset
The enzyme quantity dataset contains only enzyme data, ***222,567*** records in total. The function quantity ranges from 1 to 8.

## Dataset 3: EC Dataset

The EC dataset consists of 227,678 enzyme records: 222,567 form the training set and the remaining 5,111 form the testing set, together covering 6,031 EC numbers. As of Feb 2022, ***compared with [ExplorEnz](https://www.enzyme-database.org/stats.php), where CURRENT EC = 6674***, there are still 643 EC numbers that the model cannot handle in the benchmarking. Thus, we exclude the sequences with these 267 EC numbers in the evaluation process. However, this problem can be resolved in the production scenario because we use the entire data from Swiss-Prot. The EC coverage is currently 6,031 and can be extended automatically, because training follows each Swiss-Prot release, which is published every 8 weeks. 

## 1. Import packages

In [3]:
import numpy as np
import pandas as pd
import sys,os
from tqdm import tqdm
import config as cfg
from functools import reduce

from tools import filetool as ftool
from tools import exact_ec_from_uniprot as exactec
from tools import funclib
from tools import minitools as mtool
# from tools import embedding_esm as esmebd
from tools import embdding_onehot as onehotebd

from pandarallel import pandarallel # import pandarallel; it supplies the parallel_apply used in the cells below
# Start pandarallel's workers with the progress bar switched off.
pandarallel.initialize(progress_bar=False)


%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 52 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Download raw data from UniProt

> If this is the first run, please uncomment the download calls in the cell below.

In [2]:
# Download the Swiss-Prot snapshot archives configured in `cfg`.
# One call per release; uncomment the snapshots that still need to be fetched.

# snapshot 2018-02
# ftool.wget(download_url=cfg.URL_SPROT_SNAP201802, save_file=cfg.FILE_SPROT_SNAP201802)

# snapshot 2019-02
ftool.wget(download_url=cfg.URL_SPROT_SNAP201902, save_file=cfg.FILE_SPROT_SNAP201902)

# snapshot 2020-06
# ftool.wget(download_url=cfg.URL_SPROT_SNAP202006, save_file=cfg.FILE_SPROT_SNAP202006)

# snapshot 2021-02
ftool.wget(download_url=cfg.URL_SPROT_SNAP202102, save_file=cfg.FILE_SPROT_SNAP202102)

# snapshot 2022-02
# ftool.wget(download_url=cfg.URL_SPROT_SNAP202202, save_file=cfg.FILE_SPROT_SNAP202202)

wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2019_02/knowledgebase/uniprot_sprot-only2019_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2019_02.tar.gz
wget -q https://ftp.uniprot.org/pub/databases/uniprot/previous_major_releases/release-2021_02/knowledgebase/uniprot_sprot-only2021_02.tar.gz -O /home/shizhenkun/codebase/DMLF/data/uniprot/uniprot_sprot-only2021_02.tar.gz


## 3. Extract records from rawdata

In [4]:
# Every archive unpacks to the same file names, so for each snapshot we:
#   1. extract the archive into the UniProt data directory,
#   2. rename the flat file (uniprot_sprot.dat.gz) to a year-specific name,
#   3. remove the fasta/xml files that came with it.
snapshot_archives = (
    (cfg.FILE_SPROT_SNAP201802, 'sprot2018'),  # 2018 data
    (cfg.FILE_SPROT_SNAP202006, 'sprot2020'),  # 2020 data
    (cfg.FILE_SPROT_SNAP202202, 'sprot2022'),  # 2022 data
)

cmd_array = []
for archive_file, target_stem in snapshot_archives:
    cmd_array.append(f'tar -zxvf {archive_file} -C {cfg.DIR_UNIPROT}')
    cmd_array.append(f'mv {cfg.DIR_UNIPROT}uniprot_sprot.dat.gz {cfg.DIR_UNIPROT}{target_stem}.data.gz')
    cmd_array.append(f'rm -f {cfg.DIR_UNIPROT}uniprot_sprot.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot_varsplic.fasta.gz {cfg.DIR_UNIPROT}uniprot_sprot.xml.gz')

# Run the commands in order; the resulting list of exit codes is the cell output.
list(map(os.system, cmd_array))

uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz
uniprot_sprot.dat.gz
uniprot_sprot.fasta.gz
uniprot_sprot_varsplic.fasta.gz
uniprot_sprot.xml.gz


[0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Run the extraction task once per snapshot (2018-02, 2020-06, 2022-02):
# input is the renamed flat file, output is a TSV table next to it.
for snapshot_year in ('2018', '2020', '2022'):
    exactec.run_exact_task(
        infile=f'{cfg.DIR_UNIPROT}sprot{snapshot_year}.data.gz',
        outfile=f'{cfg.DIR_UNIPROT}sprot{snapshot_year}.tsv',
    )

## 4. Load records & Drop Duplicates

In [6]:
# Load each snapshot table and convert its date columns
# (the conversion itself is done by mtool.convert_DF_dateTime).
sprot2018 = mtool.convert_DF_dateTime(
    inputdf=pd.read_csv(f'{cfg.DIR_UNIPROT}sprot2018.tsv', sep='\t', header=0)  # read the file
)
sprot2020 = mtool.convert_DF_dateTime(
    inputdf=pd.read_csv(f'{cfg.DIR_UNIPROT}sprot2020.tsv', sep='\t', header=0)  # read the file
)
sprot2022 = mtool.convert_DF_dateTime(
    inputdf=pd.read_csv(f'{cfg.DIR_UNIPROT}sprot2022.tsv', sep='\t', header=0)  # read the file
)

# Drop duplicates: keep the first record of every distinct sequence and
# renumber the rows from zero.
sprot2018 = sprot2018.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)
sprot2020 = sprot2020.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)
sprot2022 = sprot2022.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)


In [9]:
# Preview the first three de-duplicated records of the 2018 snapshot.
sprot2018.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154


In [10]:
# Preview the first three de-duplicated records of the 2020 snapshot.
sprot2020.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P03711,SCAF_LAMBD,True,False,1,3.4.21.-,3,1986-07-21,1986-07-21,2020-12-02,MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDA...,439
1,P01027,CO3_MOUSE,False,False,0,-,0,1986-07-21,2011-07-27,2020-12-02,MGPASGSQLLVLLLLLASSPLALGIPMYSIITPNVLRLESEETIVL...,1663
2,P02706,ASGR1_RAT,False,False,0,-,0,1986-07-21,2007-01-23,2020-12-02,MTKDYQDFQHLDNENDHHQLQRGPPPAPRLLQRLCSGFRLFLLSLG...,284


In [6]:
# Preview the first three de-duplicated records of the 2022 snapshot.
sprot2022.head(3)

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P00250,FER_APHSA,False,False,0,-,0,1986-07-21,2007-01-23,2022-05-25,MASYKVTLKTPDGDNVITVPDDEYILDVAEEEGLDLPYSCRAGACS...,97
1,P03420,FUS_HRSVA,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MELLILKANAITTILTAVTFCFASGQNITEEFYQSTCSAVSKGYLS...,574
2,P0ACF7,DBHB_SHIFL,False,False,0,-,0,1986-07-21,1986-07-21,2022-05-25,MNKSQLIDKIAAGADISKAAAGRALDAIIASVTESLKEGDDVALVG...,90


## 5. Preprocessing
### 5.1 format EC

In [7]:
# Normalise the EC annotation of every snapshot in place:
#   1. mtool.format_ec, then mtool.specific_ecs, rewrite the ec_number column;
#   2. functionCounts is recomputed from the result: 0 for '-', otherwise the
#      number of comma-separated EC entries.
for snapshot_label, snapshot_df in (
    ('2018', sprot2018),
    ('2020', sprot2020),
    ('2022', sprot2022),
):
    snapshot_df['ec_number'] = snapshot_df.ec_number.parallel_apply(mtool.format_ec)
    snapshot_df['ec_number'] = snapshot_df.ec_number.parallel_apply(mtool.specific_ecs)
    snapshot_df['functionCounts'] = snapshot_df.ec_number.parallel_apply(
        lambda ec: 0 if ec == '-' else len(ec.split(','))
    )
    print(f'sprot {snapshot_label} finished')

sprot 2018 finished
sprot 2020 finished
sprot 2022 finished


In [15]:
# Persist the pre-processed snapshots as feather files in the UniProt directory.
for snapshot_df, snapshot_tag in (
    (sprot2018, '201802'),
    (sprot2020, '202006'),
    (sprot2022, '202202'),
):
    snapshot_df.to_feather(cfg.DIR_UNIPROT + f'/snap{snapshot_tag}.feather')

### 5.2 Split Train/Test

In [16]:
# Display the 2018 snapshot table after EC formatting.
sprot2018

Unnamed: 0,id,name,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,Q3J1A3,LHB1_RHOS4,False,False,0,-,0,1986-07-21,2007-01-23,2017-10-25,MADKSDLGYTGLTDEQAQELHSVYMSGLWLFSAVAIVAHLAVYIWRPWF,49
1,P02157,MYG_MELME,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNVWGKVEADLAGHGQEVLIRLFKGHPETLEKFDK...,154
2,P02178,MYG_MEGNO,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MVLSDAEWQLVLNIWAKVEADVAGHGQDILIRLFKGHPETLEKFDK...,154
3,P02194,MYG_MACRU,False,False,0,-,0,1986-07-21,2007-01-23,2017-11-22,MGLSDGEWQLVLNIWGKVETDEGGHGKDVLIRLFKGHPETLEKFDK...,154
4,P01915,HB22_MOUSE,False,False,0,-,0,1986-07-21,1986-07-21,2017-10-25,MVWLPRVPCVAAVILLLTVLSPPVALVRDTRPRFLEYVTSECHFYN...,264
...,...,...,...,...,...,...,...,...,...,...,...,...
469129,Q21221,AHO3_CAEEL,True,False,1,3.1.2.22,4,2018-02-28,2004-11-23,2018-02-28,MSSGAPSGSSMSSTPGSPPPRAGGPNSVSFKDLCCLFCCPPFPSSI...,332
469130,Q6QJ72,PDL2_ARATH,True,False,1,4.2.1.96,4,2018-02-28,2004-07-05,2018-02-28,MSRLLLPKLFSISRTQVPAASLFNNLYRRHKRFVHWTSKMSTDSVR...,187
469131,C0HL68,ES1GA_ODOGR,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GLFSKPAGKGIKNLIPKGVKHIGKEVGKDVIRTGIDVAGCKIKGEC,46
469132,C0HK74,VKT3_HETMG,False,False,0,-,0,2018-02-28,2018-02-28,2018-02-28,GSICLEPKVVGPCTAYFPRFYFDSETGKCTPFIYGGCEGNGNNFET...,56


In [17]:
# Column positions shared by the train and test tables: 0, 2-7 and 10-11.
kept_positions = np.r_[0, 2:8, 10:12]

# The whole 2018 snapshot is the training set.
train = sprot2018.iloc[:, kept_positions]

# The later snapshots become test sets once every sequence that already
# occurs in the training set has been removed.
test_2020 = sprot2020.iloc[:, kept_positions]
test_2020 = test_2020.loc[~test_2020.seq.isin(train.seq)].reset_index(drop=True)

test_2022 = sprot2022.iloc[:, kept_positions]
test_2022 = test_2022.loc[~test_2022.seq.isin(train.seq)].reset_index(drop=True)

In [27]:
# Display the 2022 test table after removing sequences shared with train.
test_2022

Unnamed: 0,id,isenzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,seq,seqlength
0,P02812,False,False,0,-,0,1986-07-21,MLLILLSVALLALSSAQNLNEDVSQEESPSLIAGNPQGAPPQGGNK...,416
1,P02883,False,False,0,-,0,1986-07-21,MAATTCFFFLFPFLLLLTLSRAATFEIVNRCSYTVWAAASKGDAAL...,235
2,P01160,False,False,0,-,0,1986-07-21,MSSFSTTTVSFLLLLAFQLLGQTRANPMYNAVSNADLMDFKNLLDH...,151
3,P00780,True,False,1,3.4.21.62,4,1986-07-21,MMRKKSFWLGMLTAFMLVFTMAFSDSASAAQPAKNVEKDYIVGFKS...,379
4,P01523,False,False,0,-,0,1986-07-21,MMSKLGVLLTICLLLFPLTALPMDGDEPANRPVERMQDNISSEQYP...,75
...,...,...,...,...,...,...,...,...,...
12053,A0A2K5TU92,True,True,2,"2.3.1.286,2.4.2.-",4,2022-05-25,MSVNYAAGLSPYADKGKCGLPEIFDPPEELERKVWELARLVWQSSH...,355
12054,A0A3R0A696,True,False,1,3.2.1.55,4,2022-05-25,MKHWKKMAASLIAISTMVAVVPTTYAMESEDSQPQTTDTATVQTTK...,1065
12055,Q5ZV91,False,False,0,-,0,2022-05-25,MDEIKKDDELSQWLSTYGTITAERILGRYNISLPQDEILEAINIPS...,294
12056,M1H607,False,False,0,-,0,2022-05-25,MDAIKKKMQAMKLEKDDAMDRADTLEQQNKEANIRAEKAEEEVHNL...,284


### 5.3 Remove changed sequences from the test set

In [30]:
# Drop every test record whose id also occurs in the training set.
# Identical sequences were already removed in 5.2, so an id shared with
# train here means the entry's sequence differs between snapshots.
# A direct membership test on train.id selects exactly the ids the previous
# inner merge on 'id' produced, without building the joined table.
test_2020 = test_2020[~test_2020.id.isin(train.id)]
test_2020.reset_index(drop=True, inplace=True)


test_2022 = test_2022[~test_2022.id.isin(train.id)]
test_2022.reset_index(drop=True, inplace=True)


### 5.4 Trim string

In [31]:
# Strip surrounding whitespace from the EC and sequence columns of all three
# tables. Values are passed through str() first. The chained-assignment
# warning is silenced because the tables are slices of the snapshot frames.
with pd.option_context('mode.chained_assignment', None):
    for split_df in (train, test_2020, test_2022):
        for text_column in ('ec_number', 'seq'):  # ec trim, then seq trim
            split_df[text_column] = split_df[text_column].parallel_apply(
                lambda value: str(value).strip()
            )

### 5.5 Save train test

In [32]:
# Write the train/test splits as feather files.
# NOTE(review): these are written under cfg.DATADIR + 'datasets/' but section 6.1
# reads them back from cfg.DIR_DATASETS; confirm both point to the same directory.
for split_df, split_name in (
    (train, 'train'),
    (test_2020, 'test_2020'),
    (test_2022, 'test_2022'),
):
    split_df.to_feather(cfg.DATADIR + f'datasets/{split_name}.feather')

## 6. Build benchmarking datasets
### 6.1 Task 1 isEnzyme

In [34]:
# Reload the saved splits.
train = pd.read_feather(f'{cfg.DIR_DATASETS}train.feather')
test_2020 = pd.read_feather(f'{cfg.DIR_DATASETS}test_2020.feather')
test_2022 = pd.read_feather(f'{cfg.DIR_DATASETS}test_2022.feather')

# Task 1 keeps columns 0, 7 and 1, in that order
# (id, seq, isenzyme according to the table shown in 5.2).
task1_positions = [0, 7, 1]
task1_train = train.iloc[:, task1_positions]
task1_test_2020 = test_2020.iloc[:, task1_positions]
task1_test_2022 = test_2022.iloc[:, task1_positions]

# Save each table as feather ...
for task_table, feather_path in (
    (task1_train, cfg.FILE_TASK1_TRAIN),
    (task1_test_2020, cfg.FILE_TASK1_TEST_2020),
    (task1_test_2022, cfg.FILE_TASK1_TEST_2022),
):
    task_table.to_feather(feather_path)

# ... and export its id/seq pairs as FASTA.
for task_table, fasta_path in (
    (task1_train, cfg.FILE_TASK1_TRAIN_FASTA),
    (task1_test_2020, cfg.FILE_TASK1_TEST_2020_FASTA),
    (task1_test_2022, cfg.FILE_TASK1_TEST_2022_FASTA),
):
    funclib.table2fasta(table=task_table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished


### 6.2 Task2 Function Counts

In [35]:
# Task 2 uses only records with at least one function (functionCounts > 0)
# and keeps columns 0, 7 and 3 (id, seq, functionCounts per the table in 5.2).
task2_positions = [0, 7, 3]

task2_train = train.loc[train.functionCounts > 0].reset_index(drop=True)
task2_train = task2_train.iloc[:, task2_positions]

task2_test_2020 = test_2020.loc[test_2020.functionCounts > 0].reset_index(drop=True)
task2_test_2020 = task2_test_2020.iloc[:, task2_positions]

task2_test_2022 = test_2022.loc[test_2022.functionCounts > 0].reset_index(drop=True)
task2_test_2022 = task2_test_2022.iloc[:, task2_positions]

# Save each table as feather ...
for task_table, feather_path in (
    (task2_train, cfg.FILE_TASK2_TRAIN),
    (task2_test_2020, cfg.FILE_TASK2_TEST_2020),
    (task2_test_2022, cfg.FILE_TASK2_TEST_2022),
):
    task_table.to_feather(feather_path)

# ... and export its id/seq pairs as FASTA.
for task_table, fasta_path in (
    (task2_train, cfg.FILE_TASK2_TRAIN_FASTA),
    (task2_test_2020, cfg.FILE_TASK2_TEST_2020_FASTA),
    (task2_test_2022, cfg.FILE_TASK2_TEST_2022_FASTA),
):
    funclib.table2fasta(table=task_table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished


### 6.3 Task3 EC Number

In [37]:
# Task 3 uses only records with at least one function (functionCounts > 0)
# and keeps columns 0, 7 and 4 (id, seq, ec_number per the table in 5.2).
task3_positions = [0, 7, 4]

task3_train = train.loc[train.functionCounts > 0].reset_index(drop=True)
task3_train = task3_train.iloc[:, task3_positions]

task3_test_2020 = test_2020.loc[test_2020.functionCounts > 0].reset_index(drop=True)
task3_test_2020 = task3_test_2020.iloc[:, task3_positions]

task3_test_2022 = test_2022.loc[test_2022.functionCounts > 0].reset_index(drop=True)
task3_test_2022 = task3_test_2022.iloc[:, task3_positions]

# Save each table as feather ...
for task_table, feather_path in (
    (task3_train, cfg.FILE_TASK3_TRAIN),
    (task3_test_2020, cfg.FILE_TASK3_TEST_2020),
    (task3_test_2022, cfg.FILE_TASK3_TEST_2022),
):
    task_table.to_feather(feather_path)

# ... and export its id/seq pairs as FASTA.
for task_table, fasta_path in (
    (task3_train, cfg.FILE_TASK3_TRAIN_FASTA),
    (task3_test_2020, cfg.FILE_TASK3_TEST_2020_FASTA),
    (task3_test_2022, cfg.FILE_TASK3_TEST_2022_FASTA),
):
    funclib.table2fasta(table=task_table[['id', 'seq']], file_out=fasta_path)

Write finished
Write finished
Write finished


## 7 Make Feature Bank

### 7.1 ESM embedding 

In [9]:
# loading sprot data
snap18 = pd.read_feather(cfg.DIR_UNIPROT + '/snap201802.feather')
snap20 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202006.feather')
snap22 = pd.read_feather(cfg.DIR_UNIPROT + '/snap202202.feather')
# merge: stack the snapshots and keep one (id, seq) pair per id. Sorting in
# descending order puts the row with the largest date_annotation_update first
# for each id, and that is the row drop_duplicates keeps.
full_snap_data = pd.concat([snap18, snap20,snap22], axis=0)
full_snap_data = full_snap_data.sort_values(by=['id', 'date_annotation_update'], ascending=False)
full_snap_data = full_snap_data[['id', 'seq']].drop_duplicates(subset='id', keep='first')
full_snap_data.reset_index(drop=True, inplace=True)


# loading existing features
# NOTE(review): only the ESM0 file is checked; the other four feature files are
# assumed to exist whenever it does.
if ftool.isfileExists(cfg.FILE_FEATURE_ESM0):
    feature_esm0 = pd.read_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32 = pd.read_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33 = pd.read_feather(cfg.FILE_FEATURE_ESM33)
    feature_unirep = pd.read_feather(cfg.FILE_FEATURE_UNIREP)
    feature_onehot = pd.read_feather(cfg.FILE_FEATURE_ONEHOT)
    # calculate embedding list: only ids that have no stored feature yet
    needesm = full_snap_data[~full_snap_data.id.isin(feature_esm33.id)]
    needunirep = full_snap_data[~full_snap_data.id.isin(feature_unirep.id)]
    needonehot = full_snap_data[~full_snap_data.id.isin(feature_onehot.id)]
else:
    # No feature bank on disk yet: start from empty tables so the pd.concat
    # steps in the embedding cells can append to them instead of raising
    # NameError, and embed every record.
    feature_esm0 = pd.DataFrame()
    feature_esm32 = pd.DataFrame()
    feature_esm33 = pd.DataFrame()
    feature_unirep = pd.DataFrame()
    feature_onehot = pd.DataFrame()
    needesm = full_snap_data
    needunirep = full_snap_data
    needonehot = full_snap_data



In [None]:
# !pip install fair-esm
if len(needesm)>0:
    # The top-level import of this module is commented out, so import it here,
    # only when there is something to embed (same pattern as the Unirep cell).
    from tools import embedding_esm as esmebd

    # Compute the three ESM representation tables for the records without stored features.
    tr_rep0, tr_rep32, tr_rep33 = esmebd.get_rep_multi_sequence(sequences=needesm, model='esm1b_t33_650M_UR50S',seqthres=1022)

    #merge existing
    feature_esm0 = pd.concat([feature_esm0, tr_rep0], axis=0).reset_index(drop=True)
    feature_esm32 = pd.concat([feature_esm32, tr_rep32], axis=0).reset_index(drop=True)
    feature_esm33 = pd.concat([feature_esm33, tr_rep33], axis=0).reset_index(drop=True)


    #save
    feature_esm0.to_feather(cfg.FILE_FEATURE_ESM0)
    feature_esm32.to_feather(cfg.FILE_FEATURE_ESM32)
    feature_esm33.to_feather(cfg.FILE_FEATURE_ESM33)

### 7.2 Unirep

In [4]:
# Embed the records that have no stored Unirep feature, append them to the
# existing table and write the table back to disk.
if len(needunirep) > 0:
    from tools import embedding_unirep as unirep

    tr_unirep = unirep.getunirep(needunirep, 40)
    feature_unirep = pd.concat([feature_unirep, tr_unirep], axis=0)
    feature_unirep = feature_unirep.reset_index(drop=True)
    feature_unirep.to_feather(cfg.FILE_FEATURE_UNIREP)


100%|██████████| 49/49 [14:24<00:00, 17.65s/it]

length not match





### 7.3 one-hot

In [10]:
# Rebuild the one-hot feature table for every record: both the stored table
# and the "needed" list are reset here, so nothing is reused.
# NOTE(review): this overrides the incremental selection made when the feature
# bank was loaded; confirm the full rebuild is intended.
needonehot = full_snap_data
feature_onehot = pd.DataFrame()
if len(needonehot) > 0:
    # The result is stored in tr_unirep although it holds one-hot features.
    tr_unirep = onehotebd.get_onehot(
        sequences=needonehot,
        padding=True,
        padding_window=1500,
    )
    feature_onehot = pd.concat([feature_onehot, tr_unirep], axis=0)
    feature_onehot = feature_onehot.reset_index(drop=True)
    feature_onehot.to_feather(cfg.FILE_FEATURE_ONEHOT)