# 任务3重构

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-07-14  

以快照数据为时间节点重新梳理任务

构建流程：

> 2018年2月的快照数据作为训练集（469,134条）  
> 2020年6月的快照数据（476,011条）减去 2018年2月快照数据中已出现的序列（469,134条），差集作为测试集（8,033条，其中7开头的数据 3,619 条）

## 1. 导入必要的包

In [27]:
import numpy as np
import pandas as pd
import sys
import os
from tqdm import tqdm
sys.path.append("../../tools/")
import ucTools
import funclib
from pandarallel import pandarallel
pandarallel.initialize()

%load_ext autoreload
%autoreload 2

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. 定义函数功能

In [2]:
def ec_appears_statistic(data):
    """
    Count how often each EC number appears in ``data``.

    Args:
        data: Iterable of strings, each holding one or more EC numbers
            separated by commas (whitespace around each EC number is ignored).

    Returns:
        pandas.DataFrame indexed by the EC numbers returned by
        ``funclib.get_distinct_ec(data)``, with a single column ``0`` that
        holds the number of occurrences of each EC number.
    """
    from collections import Counter

    eclist = list(funclib.get_distinct_ec(data))

    # Tally every EC token in one pass instead of scanning the whole frame
    # with a boolean mask once per token.
    counts = Counter(
        token.strip()
        for item in data
        for token in item.split(',')
    )

    # Tokens that are not in eclist are ignored; EC numbers in eclist that
    # never occur get a count of 0 (Counter returns 0 for missing keys).
    return pd.DataFrame(
        {0: [counts[ec] for ec in eclist]},
        index=eclist,
        dtype='int64',
    )

## 3. 读取数据

In [29]:
# 2018 snapshot: keep the first row for every distinct sequence
snap2018 = (
    pd.read_csv('../../data/201802/sprot_full.tsv', sep='\t', header=None, names=funclib.table_head)
    .drop_duplicates(subset=['seq'], keep='first')
    .reset_index(drop=True)
)

# 2020 snapshot: same de-duplication by sequence
snap2020 = (
    pd.read_csv('../../data/202006/sprot_full.tsv', sep='\t', header=None, names=funclib.table_head)
    .drop_duplicates(subset=['seq'], keep='first')
    .reset_index(drop=True)
)


In [5]:
# Expanded snapshot data (presumably one EC number per row, judging by the
# file name — confirm)
snap2018M = pd.read_csv('../../data/201802/snap2018_siglelineec.tsv', sep='\t', header=0)
snap2020M = pd.read_csv('../../data/202006/snap2020_siglelineec.tsv', sep='\t', header=0)

In [13]:
# snap2018M['ec_number']=snap2018M.ec_number.apply(lambda x:x.replace('_', '-'))
# snap2020M['ec_number']=snap2020M.ec_number.apply(lambda x:x.replace('_', '-'))

# snap2018M.to_csv('../../data/201802/snap2018_siglelineec.tsv', sep='\t', index=None)
# snap2020M.to_csv('../../data/202006/snap2020_siglelineec.tsv', sep='\t', index=None)

In [8]:
# Write both expanded snapshots to the database, one table per snapshot
uctools =  ucTools.ucTools('172.16.25.20')
cnx_mimic = uctools.db_conn()
uctools.saveToDB(snap2018M, 'tb_snap_2018', cnx_mimic)
uctools.saveToDB(snap2020M, 'tb_snap_2020', cnx_mimic)

In [30]:
# Handle EC numbers that end with '.': append '-' so the string no longer
# ends in a bare dot; all other EC numbers are left unchanged
snap2018['ec_number']=snap2018.parallel_apply(lambda x: x.ec_number+'-' if x.ec_number.endswith('.') else x.ec_number, axis=1)
snap2020['ec_number']=snap2020.parallel_apply(lambda x: x.ec_number+'-' if x.ec_number.endswith('.') else x.ec_number, axis=1)

## 4. 计算-统计数据

In [5]:
# Difference set: sequences present in the 2020 snapshot but not in 2018
list_subtraction = list(set(snap2020.seq) - set(snap2018.seq))
data_subtraction = snap2020[snap2020.seq.isin(list_subtraction)].reset_index(drop=True)

# Distinct EC numbers per snapshot, plus the added / vanished / shared ones
ec_2018 = funclib.get_distinct_ec(snap2018.ec_number)
ec_2020 = funclib.get_distinct_ec(snap2020.ec_number)
ec_set_2018, ec_set_2020 = set(ec_2018), set(ec_2020)
ec_added = sorted(ec_set_2020 - ec_set_2018)
ec_notappear = sorted(ec_set_2018 - ec_set_2020)
ec_common = sorted(ec_set_2018 & ec_set_2020)

In [10]:
ec_counts_2018 =ec_appears_statistic(snap2018.ec_number)
ec_counts_2020 =ec_appears_statistic(snap2020.ec_number)

## 5. 保存数据

In [203]:
# Save every EC list as a TSV with a single 'ec_number' column
for ec_values, out_path in (
    (ec_2018, '../../results/task3r/ec2018.tsv'),
    (ec_2020, '../../results/task3r/ec2020.tsv'),
    (ec_added, '../../results/task3r/ecadded.tsv'),
    (ec_notappear, '../../results/task3r/ecnotappear.tsv'),
    (ec_common, '../../results/task3r/eccommon.tsv'),
):
    pd.DataFrame(ec_values, columns=['ec_number']).to_csv(out_path, sep='\t')


In [240]:
ec_counts_2018.to_csv('../../results/task3r/eccounts_2018.tsv', sep='\t')
ec_counts_2020.to_csv('../../results/task3r/eccounts_2020.tsv', sep='\t')

In [4]:
pd.DataFrame(columns=snap2018.columns.values)

Unnamed: 0,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength


# SECTION 2 从数据库中读取并分析数据

In [19]:
469129-246562

222567

# SECTION 3 unirep

In [34]:
uniprep = pd.read_feather('../../data/sprot_unirep_bank.feather')

In [73]:
# unirep = pd.read_hdf('../../data/emzyme_noemzyme_uncertain_with_unirep.h5', mode='r', key='data')
uniprep=pd.read_feather('../../data/sprot_unirep.feather')

In [21]:
len(set(uniprep.seq))

476664

In [29]:
from jax_unirep import get_reps

In [30]:
def getunirep(enzyme_noemzyme, step):
    """
    Compute UniRep features batch by batch and append them to the input rows.

    Args:
        enzyme_noemzyme: Table with a ``seq`` column that supports ``len()``
            and row slicing (presumably a pandas DataFrame — confirm).
        step: Number of rows passed to ``get_reps`` per batch.

    Returns:
        A NumPy array holding, for every input row, its original columns
        followed by the ``h_final`` values returned by ``get_reps``.
        Returns an empty list when the input has no rows.
    """
    batches = []
    for counter, start in enumerate(tqdm(range(0, len(enzyme_noemzyme), step)), start=1):
        stop = start + step
        train_h_avg, train_h_final, train_c_final = get_reps(list(enzyme_noemzyme.seq[start:stop]))
        checkpoint = np.hstack((np.array(enzyme_noemzyme[start:stop]), train_h_final))

        # Collect the batches and concatenate once at the end, rather than
        # re-copying the growing result on every iteration.
        batches.append(checkpoint)

        # Persist every batch as a checkpoint. NOTE: np.save appends '.npy'
        # when the name lacks it, so the file on disk ends in '.tsv.npy'.
        np.save(r'/tmp/task_unirep_'+str(counter)+'.tsv', checkpoint)

        # Also fires for a final batch shorter than `step` (assuming get_reps
        # returns one row per input sequence — confirm).
        if len(train_h_final) != step:
            print('length not match')

    if not batches:
        return []
    return np.concatenate(batches)

In [26]:
need_unirep = snap2018[~snap2018.seq.isin(uniprep.seq)]

# SECTION 4 构建训练测试数据

## 4.1 训练数据

In [31]:
DATAROOT= '/home/shizhenkun/codebase/BioUniprot/data/benchmark/data/'

In [32]:
# 写入「是否是酶」训练数据
funclib.table2fasta(snap2018, DATAROOT+'train.fasta')

Write finished


In [35]:
# Join the 2018 snapshot with the UniRep table on the sequence column
# (left join, so every snapshot row is kept)
train_rep=snap2018.merge(uniprep, on='seq', how='left')

# Write the enzyme training data: columns 0, 2, 4, 5 and positions 12..1911
train = train_rep.iloc[:, np.r_[0,2,4,5, 12:1912]]
train.to_feather(DATAROOT+'train.feather')

## 4.2 测试数据

In [36]:
test = snap2020[~snap2020.seq.isin(snap2018.seq)]

In [37]:
#test fasta
funclib.table2fasta(test, DATAROOT+'test.fasta')

Write finished


In [38]:
# Join the test rows with the UniRep table on the sequence column
# (left join, so every test row is kept)
uniprep = pd.read_feather('../../data/sprot_unirep_bank.feather')
test_rep=test.merge(uniprep, on='seq', how='left')

In [39]:
# Write the enzyme test data: columns 0, 2, 4, 5 and positions 12..1911
test_isenzyme = test_rep.iloc[:, np.r_[0,2,4,5, 12:1912]]
test_isenzyme.to_feather(DATAROOT+'test.feather')

## 4.3 EC字典

In [7]:
# Fetch the sorted list of distinct EC numbers that occur in either the
# 2018 or the 2020 snapshot table
sql="""with tb1 as (
SELECT DISTINCT ec_number FROM tb_snap_2018  UNION SELECT DISTINCT ec_number from tb_snap_2020)
SELECT DISTINCT ec_number FROM tb1 ORDER BY ec_number"""
uctools =  ucTools.ucTools('172.16.25.20')
cnx_mimic = uctools.db_conn()
res= pd.read_sql_query(sql,cnx_mimic)

In [18]:
# Map every EC number to its row index in `res` and persist the mapping
emzyme_label_dict = dict(zip(res.ec_number, res.index))
np.save(DATAROOT+'ec_label_dict.npy', emzyme_label_dict)

In [41]:
isEmzyme_dict1 = np.load(DATAROOT+'ec_label_dict.npy', allow_pickle=True).item()

In [42]:
isEmzyme_dict1.get('-')

5309

In [43]:
len(isEmzyme_dict1)

5592

In [67]:
max(isEmzyme_dict1.values())

5591

In [116]:
# Write the "how many functions does the enzyme have" data set
# NOTE(review): `enzyme` is not defined anywhere in the visible part of this
# notebook — confirm where it comes from before re-running this cell.
howmany = enzyme.iloc[:,np.r_[0,4,10,12:1912]]
# howmany.to_feather(DATAROOT+'train_howmany.feather')
funclib.table2fasta(howmany, DATAROOT+'train_howmany.fasta')

Write finished


In [120]:
ectrain = train_rep.iloc[:, np.r_[0,5, 12:1912]]

In [119]:
funclib.table2fasta(ectrain, DATAROOT+'train_ec.fasta')

Write finished


In [122]:
ectrain.to_feather(DATAROOT+'train_ec.feather')

In [109]:
# For each leading EC digit 1-7, print how many distinct EC numbers in
# tb_snap_2020 start with that digit and how many rows they cover in total
uctools =  ucTools.ucTools('172.16.25.20')
cnx_mimic = uctools.db_conn()
for i in range(1,8):
    # str.format leaves '%%' untouched; presumably the DB driver turns it
    # into a single LIKE wildcard '%' — confirm
    sql='''with tb1 as (
                SELECT ec_number, count(ec_number) from tb_snap_2020 GROUP BY ec_number ORDER BY ec_number
                ) SELECT count(ec_number), sum(count) from tb1 WHERE ec_number like '{0}.%%';'''.format(i)
    res= pd.read_sql_query(sql,cnx_mimic)
    print(i, '\t',int(res.values[0][0]), '\t', int(res.values[0][1]))

1 	 1501 	 31265
2 	 1564 	 87931
3 	 1076 	 57991
4 	 611 	 22331
5 	 269 	 12366
6 	 200 	 25013
7 	 85 	 11495


In [None]:
pd.read_sql_query(sql,cnx_mimic)

In [17]:
test_rep

Unnamed: 0,id,name,isemzyme,isMultiFunctional,functionCounts,ec_number,ec_specific_level,date_integraged,date_sequence_update,date_annotation_update,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
0,P02462,CO4A1_HUMAN,False,False,0,-,0,1986-07-21,2018-05-23,2020-12-02,...,-0.010307,0.254226,0.088533,-0.009481,-0.094310,-0.155860,-0.032777,-0.141111,0.403599,0.038656
1,P00939,TPIS_RABIT,True,True,2,"5.3.1.1, 4.2.3.3",4,1986-07-21,2020-10-07,2020-12-02,...,0.380862,-0.059595,-0.008574,-0.059051,-0.067673,0.042937,-0.034098,0.182489,0.055264,0.063145
2,P02340,P53_MOUSE,False,False,0,-,0,1986-07-21,2018-10-10,2020-12-02,...,0.046348,0.070390,-0.225478,0.052106,-0.054457,-0.023183,-0.009764,-0.126373,0.027127,0.151444
3,P01848,TRAC_HUMAN,False,False,0,-,0,1986-07-21,2018-07-18,2020-08-12,...,0.034023,-0.010156,-0.237000,0.016486,-0.114827,0.272844,0.143286,0.086096,0.022759,-0.010570
4,P01733,TVBL3_HUMAN,False,False,0,-,0,1986-07-21,2018-07-18,2020-08-12,...,0.110813,0.034311,0.008858,0.002125,-0.283349,0.194242,0.050730,0.107122,0.151845,0.050503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,A0A061ACU2,PIEZ1_CAEEL,False,False,0,-,0,2020-12-02,2014-09-03,2020-12-02,...,0.007575,0.088760,-0.033027,0.111776,-0.303977,-0.034524,0.048425,0.022641,-0.144014,-0.013284
8029,Q8GYS8,LTG17_ARATH,False,False,0,-,0,2020-12-02,2003-03-01,2020-12-02,...,-0.039894,0.028112,0.017320,-0.007484,-0.238741,0.062047,-0.224393,0.023556,-0.157620,-0.001258
8030,Q6NRV0,TRAIP_XENLA,True,False,1,2.3.2.27,4,2020-12-02,2020-12-02,2020-12-02,...,0.010593,-0.011309,-0.014276,0.018208,-0.093581,-0.180773,0.081044,0.018184,0.379403,0.077395
8031,C5DLH0,DXO_LACTC,True,True,2,"3.6.1.-, 3.6.1.-",3,2020-12-02,2009-07-28,2020-12-02,...,0.105460,-0.045830,-0.279623,0.013364,-0.023051,-0.025641,-0.018461,-0.011174,0.054235,-0.008912


In [20]:
# Keep only the rows flagged as enzymes and renumber them from 0
test_enzyme = test_isenzyme[test_rep.isemzyme].reset_index(drop=True)

In [21]:
test_enzyme

Unnamed: 0,id,isemzyme,f1,f2,f3,f4,f5,f6,f7,f8,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
0,P00939,True,0.063588,0.601079,0.325486,-0.085601,-0.253584,-0.036647,-0.029784,-0.005847,...,0.380862,-0.059595,-0.008574,-0.059051,-0.067673,0.042937,-0.034098,0.182489,0.055264,0.063145
1,P00780,True,0.007205,-0.123280,-0.034474,-0.289568,-0.647642,0.000935,0.371740,-0.028273,...,0.085781,-0.273059,-0.221449,-0.223586,-0.606882,0.184678,-0.018794,0.002871,-0.130372,-0.004079
2,P11509,True,0.001756,-0.083595,-0.034843,-0.033445,-0.026055,-0.021621,-0.011789,-0.176618,...,0.008384,0.032996,0.008304,-0.134407,-0.874854,0.438699,-0.004966,0.038901,-0.160794,0.087475
3,P03318,True,-0.018868,-0.001680,0.023652,-0.000368,0.458131,0.078176,-0.093273,-0.005638,...,0.002054,0.660758,-0.310641,0.056559,-0.505239,-0.019711,0.189862,-0.019588,-0.032944,-0.000018
4,P03317,True,-0.000369,-0.062202,0.053365,-0.001488,0.568943,0.005530,-0.312891,-0.002194,...,0.016664,0.175515,-0.057868,0.044472,-0.478046,-0.031133,-0.013975,0.016027,-0.055946,-0.040853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3574,A0A2R6Q324,True,0.000103,-0.002359,0.043916,0.022434,-0.197716,0.009035,0.105923,-0.008771,...,0.039002,0.095009,0.038163,0.036562,-0.561948,-0.003115,0.278384,0.010972,-0.301565,-0.044138
3575,Q6VE93,True,0.064012,-0.023088,0.024043,-0.005472,0.112231,-0.010082,0.283041,-0.002689,...,0.033600,-0.012503,-0.141233,0.047683,-0.082917,-0.016987,-0.001321,-0.037108,0.026485,-0.046931
3576,A0A509AKI1,True,0.009049,-0.012200,-0.043129,-0.001106,-0.037016,0.009201,-0.680708,-0.004776,...,0.032895,0.016874,-0.038663,0.039172,0.090116,-0.042056,-0.042705,0.006708,0.570306,0.161003
3577,Q6NRV0,True,0.001207,-0.053773,0.025933,-0.003389,0.003366,0.015046,-0.077923,-0.004771,...,0.010593,-0.011309,-0.014276,0.018208,-0.093581,-0.180773,0.081044,0.018184,0.379403,0.077395


# SECTION 5 构建EC迁移数据

In [127]:
filein = DATAROOT + 'ec_discription_20210719.txt'

In [164]:
import re

In [161]:
# Collect the lines of the EC description file that start with 'EC' and
# either mention 'deleted' / 'transferred' or contain 'EC' more than once.
counter = 0  # number of lines read from the file
reslist = []
# The context manager closes the file even if the loop raises.
with open(filein) as f:
    for line in tqdm(f):
        line = line.strip()  # also removes the trailing newline
        counter += 1
        # Empty lines fail this check too, so no separate length test is needed.
        if not line.startswith('EC'):
            continue
        if 'deleted' in line or 'transferred' in line or line.count('EC') > 1:
            reslist.append(line)

8793it [00:00, 835213.33it/s]


In [173]:
# Write the filtered lines one per row; the context manager closes the file
# even when a write fails.
with open(DATAROOT + 'ec_discription_20210719_r1.txt', 'w') as writer:
    writer.writelines(line + '\n' for line in reslist)

In [182]:
# Split the filtered lines into three buckets; 'deleted' takes precedence
# over 'transferred', and everything else goes to `other`.
deleted, transferred, other = [], [], []
for line in reslist:
    if 'deleted' in line:
        deleted.append(line)
        continue
    if 'transferred' in line:
        transferred.append(line)
        continue
    other.append(line)

In [176]:
len(deleted)

326

In [177]:
len(transferred)

133

In [171]:
reslist[1:15]

['EC 1.1.1.63 transferred, now EC 1.1.1.239',
 'EC 1.1.1.68 now EC 1.7.99.5',
 'EC 1.1.1.70 deleted, included in EC 1.2.1.3',
 'EC 1.1.1.74 deleted',
 'EC 1.1.1.89 deleted, included in EC 1.1.1.86',
 'EC 1.1.1.109 now EC 1.3.1.28',
 'EC 1.1.1.128 deleted covered by EC 1.1.1.264',
 'EC 1.1.1.139 deleted, included in EC 1.1.1.21',
 'EC 1.1.1.155 identical to EC 1.1.1.87',
 'EC 1.1.1.158 transferred now EC 1.3.1.98',
 'EC 1.1.1.161 deleted, covered by EC 1.14.13.15',
 'EC 1.1.1.171 now EC 1.5.1.20',
 'EC 1.1.1.180 deleted, included in EC 1.1.1.131',
 'EC 1.1.1.182 deleted, included in EC 1.1.1.198, EC 1.1.1.227 and EC 1.1.1.228']

In [None]:
w

In [169]:
# For every 'transferred' line, list the EC tokens it mentions followed by
# the tag 'transferred'.
translist = []
for item in reslist:
    if 'transferred' not in item:
        continue
    subres = re.findall(r'EC[ ]*[0-9.]*', item)
    subres.append('transferred')
    translist.append(subres)

In [166]:
re.findall(r'EC[ ]*[0-9.]*', 'EC 1.1.1.63 transferred, now EC 1.1.1.239' )

['EC 1.1.1.63', 'EC 1.1.1.239']

In [61]:
train.iloc[240:255,:]

Unnamed: 0,id,isemzyme,functionCounts,ec_number,f1,f2,f3,f4,f5,f6,...,f1891,f1892,f1893,f1894,f1895,f1896,f1897,f1898,f1899,f1900
240,P03141,False,0,-,0.001659,0.005037,0.03424,-0.009932,0.301492,0.131113,...,-0.000538,-0.160595,-0.199464,-0.029431,-0.047798,-0.044202,0.274341,0.044814,-0.131824,-0.010497
241,P03140,False,0,-,0.000867,0.004244,0.02681,-0.006595,0.26994,0.103676,...,-0.001108,-0.133161,-0.121784,-0.021571,-0.039129,-0.261838,0.208429,0.041954,-0.073013,-0.010056
242,P03139,False,0,-,0.00382,0.008146,0.048195,-0.00697,0.254542,0.13393,...,-0.000376,-0.060732,-0.33716,-0.033123,-0.031427,0.050323,0.087816,0.097321,-0.095659,-0.014205
243,P03138,False,0,-,0.002133,0.004716,0.04048,-0.009079,0.333546,0.147972,...,-0.000554,-0.10019,-0.2737,-0.038758,-0.038821,0.014683,0.194105,0.041437,-0.12784,-0.011162
244,P03143,False,0,-,0.000884,0.000953,0.022431,-0.0121,0.185756,0.15187,...,-0.002725,-0.100221,-0.082165,-0.012705,-0.044696,-0.352842,0.12217,0.029388,-0.088356,-0.014819
245,P01103,False,0,-,0.00155,-0.010215,0.084963,-0.002161,-0.106861,0.02424,...,0.014058,-0.139779,-0.051309,0.02375,0.076201,-0.027757,0.444871,0.005618,0.087728,0.076237
246,P01104,False,0,-,0.007767,-0.011116,0.033822,-0.00062,-0.116797,0.008301,...,0.009997,0.014019,-0.052277,0.016994,0.027973,-0.029086,-0.031344,0.00327,0.109752,0.123043
247,P01105,False,0,-,0.003333,-0.039282,0.089289,-0.002111,-0.176063,0.009939,...,-0.009,0.154131,-0.038165,0.008573,0.262453,-0.056291,-0.011338,0.002546,0.066857,-0.044863
248,P00959,True,1,6.1.1.10,0.000385,0.027466,0.048272,-0.124654,0.316184,0.002738,...,-0.074817,0.015226,-0.074829,0.232996,-0.182579,0.062484,0.017667,0.001724,-0.112301,-0.012241
249,P00348,True,1,1.1.1.35,0.059642,-0.236925,-0.050564,-0.107688,-0.017693,0.040009,...,-0.007338,0.211365,-0.085157,0.259117,-0.191403,0.006713,0.001691,-0.287444,-0.011161,0.045252


In [62]:
aaa = pd.read_csv(DATAROOT+'slice_train_y.txt', skiprows=1, header=None, sep=':')

In [53]:
len(set(aaa[1]))

1

In [1]:
! python ../../baselines/deepec/deepec.py -i ../../data/benchmark/data/test.fasta -o ../../data/benchmark/results/deepec

2021-07-26 16:30:00.081337: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-26 16:38:10.396227: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-26 16:38:10.436460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:17:00.0 name: RTX A6000 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 84 deviceMemorySize: 47.54GiB deviceMemoryBandwidth: 715.34GiB/s
2021-07-26 16:38:10.438204: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 1 with properties: 
pciBusID: 0000:73:00.0 name: RTX A6000 computeCapability: 8.6
coreClock: 1.8GHz coreCount: 84 deviceMemorySize: 47.54GiB deviceMemoryBandwidth: 715.34GiB/s
2021-07-26 16:38:10.438281: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-26 16:38:10