# 构建不同任务的数据集

> author: Shizhenkun   
> email: zhenkun.shi@tib.cas.cn   
> date: 2021-05-25  

# Task1. 预测是酶还是非酶数据集构建



## 任务简介
该任务通过给定的蛋白序列，预测该蛋白是酶还是非酶。本任务所使用的数据集为Sprot：先对数据集中的数据进行学习，然后对新给定的蛋白序列预测其是酶还是非酶。


## 数据统计
- 数据源Sprot，共有数据564,638条，其中有EC号的数据270,236条，无EC号的数据294,402条。
- 将数据集中的所有数据按照时间排序，～90%作为训练集，～10%作为测试集，找到对应时间节点为2010年2月9日。
- 以2010年2月10日为时间节点，之前的数据为训练集，之后的数据为测试集，具体数据集统计如下： 





|     Items    | 酶     |   非酶    |合计                            |
| ------------ | -------| -------- |-------------------------------|
| 训练集        | 245771 | 264719   | 510490（510490/564638≈90.41%） |
| 测试集        | 24465  | 29683    | 54148（54148/564638≈9.59%）    |

## 数据集构建方法

* 根据蛋白注释信息，将Sprot中的蛋白分为「酶」和 「非酶」。
* 为了保证蛋白的唯一性，按照序列对Sprot中的数据进行去重处理，使数据集中每条序列只出现一次。
* 将注释中含有3级及3级以上EC号的蛋白作为酶数据，收录于数据集中，当作正样本。
* 将注释中不含有EC号的数据作为非酶数据，并进行如下处理后收录于数据集中，当做是负样本。
> 1. 非酶数据与酶数据进行同源比对（DIAMOND），过滤掉相似性>40%的数据
> 2. 过滤掉长度<50的短序列片段（对应代码中的 seqlength >= 50 过滤条件）
> 3. 对酶数据进行CD-HIT比较，按照40%的阈值进行去冗余<span style='background:yellow'>（？有必要吗）</span>

## 1. 导入必要的包

In [10]:
import numpy as np
import pandas as pd
import random
import time
import gzip
import re
from Bio import SeqIO
import datetime
import sys
from tqdm import tqdm

from functools import reduce
import matplotlib.pyplot as plt

sys.path.append("./tools/")
import commontools
import funclib

from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType

from jax_unirep import get_reps

# Thres >=100 <=600|500

## 2. 读入数据

In [72]:
# Column names handed to read_csv via `names`. The spellings (e.g. 'isemzyme',
# 'date_integraged') are kept verbatim because they are the column identifiers
# the rest of the notebook accesses.
table_head = [
    'id',
    'isemzyme',
    'isMultiFunctional',
    'functionCounts',
    'ec_number',
    'date_integraged',
    'date_sequence_update',
    'date_annotation_update',
    'seq',
    'seqlength',
]

# Read the tab-separated table, then run each of the three date columns
# through pd.to_datetime.
sprot = pd.read_csv('./data/sprot_full.tsv', sep='\t', names=table_head)
for date_column in ('date_integraged', 'date_sequence_update', 'date_annotation_update'):
    sprot[date_column] = pd.to_datetime(sprot[date_column])

# Preview the first two rows.
sprot.head(2)

Unnamed: 0,id,isemzyme,isMultiFunctional,functionCounts,ec_number,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P02802,False,False,1,-,1986-07-21,1986-07-21,2021-04-07,MDPNCSCSTGGSCTCTSSCACKNCKCTSCKKSCCSCCPVGCSKCAQ...,61
1,P02732,False,False,1,-,1986-07-21,1986-07-21,2019-12-11,AATAATAATAATAATAATAATAATAATAATA,31


## 3. 数据去重

In [73]:
# Count how often each sequence occurs in `sprot`.
# Series.value_counts() is used instead of the top-level pd.value_counts(),
# which newer pandas releases deprecate.
table_repeat = sprot.seq.value_counts(sort=True).to_frame()
table_repeat['sequence'] = table_repeat.index
# After renaming: 'repeat' = occurrence count, 'seq' = the sequence itself.
table_repeat.columns = ['repeat', 'seq']
table_repeat = table_repeat.reset_index(drop=True)

# Plot data: for every occurrence count ('x'), how many distinct sequences
# have that count ('y').
figure_data = table_repeat.repeat.value_counts(sort=True).to_frame()
figure_data['x'] = figure_data.index
figure_data.columns = ['y', 'x']

# Bar chart of the distribution above, with a data-zoom control.
bar = (
    Bar(init_opts=opts.InitOpts(width="1700px",
                                height="750px",
                                page_title="sprot",
                                theme=ThemeType.CHALK))
    .add_xaxis(list(figure_data.x))
    .add_yaxis("重复次数", list(figure_data.y))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="SPROT重复序列数据统计"),
        datazoom_opts=opts.DataZoomOpts()
    )
)

# Last expression of the cell: renders the chart inline in the notebook.
bar.render_notebook()


In [79]:
# Drop duplicated sequences (the first occurrence of each is kept) and
# renumber the index from zero.
sprot = sprot.drop_duplicates(subset=['seq'], keep='first').reset_index(drop=True)

# Working table: only rows whose seqlength is at least 50.
base_data = sprot.loc[sprot['seqlength'] >= 50]

In [80]:
# Display the length-filtered table (rows of sprot with seqlength >= 50).
base_data

Unnamed: 0,id,isemzyme,isMultiFunctional,functionCounts,ec_number,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
0,P02802,False,False,1,-,1986-07-21,1986-07-21,2021-04-07,MDPNCSCSTGGSCTCTSSCACKNCKCTSCKKSCCSCCPVGCSKCAQ...,61
3,P02734,False,False,1,-,1986-07-21,1987-08-13,2019-12-11,MRITEANPDPDAKAVPAAAAPSTASDAAAAAAATAATAAAAAAATA...,85
4,P00484,True,False,1,2.3.1.28,1986-07-21,1988-08-01,2021-04-07,MNYTKFDVKNWVRREHFEFYRHRLPCGFSLTSKIDITTLKKSLDDS...,213
5,P00486,True,False,1,2.3.1.28,1986-07-21,1986-07-21,2021-04-07,MTFNIIKLENWDRKEYFEHYFNQQTTYSITKEIDITLFKDMIKKKG...,215
6,P60175,True,True,2,"5.3.1.1, 4.2.3.3",1986-07-21,2007-01-23,2020-12-02,MAPSRKFFVGGNWKMNGRKQSLGELIGTLNAAKVPADTEVVCAPPT...,249
...,...,...,...,...,...,...,...,...,...,...
476659,A0A1X9WEP1,True,False,1,2.1.1.351,2021-04-07,2017-08-30,2021-04-07,MVFDRLAGIYDATGVEFFRPVARRLLDLVDPRPGVDLLDVGCGRGA...,280
476660,Q6GN86,False,False,1,-,2021-04-07,2004-07-19,2021-04-07,MIPPADSLLKHDNPVLISKNTERKSPKSRPLKVSSPQTVLTAPVPP...,254
476661,Q0JCC3,False,False,1,-,2021-04-07,2006-10-03,2021-04-07,MSPPVAGAASSGDGPPGRPPRELYTIPASSGWFQWDEIHETERRAL...,560
476662,P0DUI1,False,False,1,-,2021-04-07,2021-04-07,2021-04-07,MNYFILLFVATFLLLDVNCKKDGYPVDANNCKFECWKNEYCDELCK...,84


In [43]:
# Rows of `sprot` whose seqlength is at least 50.
base_data = sprot[sprot.seqlength >= 50]

# Subset of base_data selected by the `isemzyme` column used as a row mask.
# The former bare `base_data.head(5)` statement was removed: it was not the
# last expression of the cell, so its result was discarded and never shown.
sprot_with_ec = base_data[base_data.isemzyme]

# Preview the first three selected rows.
sprot_with_ec.head(3)

Unnamed: 0,id,isemzyme,isMultiFunctional,functionCounts,ec_number,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength
4,P00484,True,False,1,2.3.1.28,1986-07-21,1988-08-01,2021-04-07,MNYTKFDVKNWVRREHFEFYRHRLPCGFSLTSKIDITTLKKSLDDS...,213
5,P00486,True,False,1,2.3.1.28,1986-07-21,1986-07-21,2021-04-07,MTFNIIKLENWDRKEYFEHYFNQQTTYSITKEIDITLFKDMIKKKG...,215
6,P60175,True,True,2,"5.3.1.1, 4.2.3.3",1986-07-21,2007-01-23,2020-12-02,MAPSRKFFVGGNWKMNGRKQSLGELIGTLNAAKVPADTEVVCAPPT...,249


In [32]:
# Show every row of data_match whose sequence equals this 38-character string.
data_match.loc[data_match['seq'] == 'MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN']

Unnamed: 0,id,isemzyme,isMultiFunctional,functionCounts,ec_number,date_integraged,date_sequence_update,date_annotation_update,seq,seqlength,repeat
132962,P60148,False,False,1,-,2003-11-28,2003-11-28,2019-12-11,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
132991,P60144,False,False,1,-,2003-11-28,2003-11-28,2020-08-12,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
133001,P60138,False,False,1,-,2003-11-28,2003-11-28,2020-12-02,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
133008,P60145,False,False,1,-,2003-11-28,2003-11-28,2020-12-02,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
133019,P60147,False,False,1,-,2003-11-28,2003-11-28,2021-04-07,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
...,...,...,...,...,...,...,...,...,...,...,...
395861,B1VKE8,False,False,1,-,2008-11-04,2008-05-20,2020-02-26,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
395878,B1A950,False,False,1,-,2008-11-04,2008-04-08,2020-12-02,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
395890,B1NWG5,False,False,1,-,2008-11-04,2008-04-29,2020-12-02,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
395925,B3TN65,False,False,1,-,2008-11-04,2008-09-02,2020-12-02,MTQSNPNEQNVELNRTSLYWGLLLIFVLAVLFSNYFFN,38,114
