# TrEMBL 数据预处理
## 1. 导入必要的包

In [1]:
import numpy as np
import pandas as pd
import random
import time
import gzip
import re
from Bio import SeqIO
import datetime

## 2. 定义必要的函数

In [2]:
# Read the EC-annotated entries from a gzip-compressed flat file
def read_file_from_gzip(file_in_path, file_out_path):
    """Extract EC-annotated records from a gzipped flat file into a TSV.

    Every record yielded by ``SeqIO.parse(handle, 'swiss')`` is passed to
    ``process_record``; each non-empty result becomes one tab-separated
    line of the output file, with fields rendered via ``str``.

    Rows are written as soon as they are produced instead of being
    collected into a list and a NumPy object array first, so memory use
    does not grow with the number of matching records.

    Args:
        file_in_path: Path of the gzip-compressed input, opened in text mode.
        file_out_path: Path of the TSV file to write. A ``.gz`` suffix
            yields gzip-compressed output, as ``np.savetxt`` did for the
            previous implementation.
    """
    counter = 0  # records parsed so far
    saver = 0    # records written so far

    # Keep the compress-by-extension behaviour callers got from np.savetxt.
    open_out = gzip.open if str(file_out_path).endswith('.gz') else open

    with gzip.open(file_in_path, "rt") as handle:
        with open_out(file_out_path, "wt") as out:
            for record in SeqIO.parse(handle, 'swiss'):
                counter += 1
                res = process_record(record)

                if len(res) > 0:
                    out.write("\t".join(str(field) for field in res) + "\n")
                    saver += 1

                # Report after updating ``saver`` so that the printed
                # saved/parsed pair includes the current record.
                if counter % 10000 == 0:
                    print('lines:{0}/{1}'.format(saver, counter))
 
# Extract the fields of a single EC-annotated record
def process_record(record):
    """Return the fields of interest for a record whose description has an EC number.

    Args:
        record: Object exposing ``description``, ``id``, ``seq`` and an
            ``annotations`` mapping (presumably a Biopython ``SeqRecord``
            from the 'swiss' parser -- confirm against the caller).

    Returns:
        ``[id, ec, date, date_last_sequence_update,
        date_last_annotation_update, seq]`` when the description contains
        ``'EC='``; otherwise an empty list. ``ec`` holds every EC number
        found in the description joined by ``', '``. Missing annotation
        keys yield ``None``.
    """
    description = record.description
    if 'EC=' not in description:
        return []

    # 'n' is accepted in addition to digits so that preliminary EC numbers
    # such as 3.4.21.n1 are kept whole instead of being cut at the 'n'.
    matches = re.findall(r"EC=([0-9n,.\-;]*)", description)
    # The pattern may swallow the ';' that terminates the entry; drop it.
    ec = ', '.join(match.replace(';', '') for match in matches)

    annotations = record.annotations
    return [
        record.id,
        ec,
        annotations.get('date'),
        annotations.get('date_last_sequence_update'),
        annotations.get('date_last_annotation_update'),
        record.seq,
    ]
    
# Store a table as a FASTA file
def table_2_fasta(table, file_out):
    """Write the ``id`` and ``seq`` columns of *table* to a FASTA file.

    Each row becomes a ``>id`` header line followed by one sequence line.
    The file is opened in a ``with`` block so the handle is closed even if
    a write fails part-way through.

    Args:
        table: DataFrame-like object with ``'id'`` and ``'seq'`` columns.
        file_out: Path of the FASTA file to create (overwritten if present).
    """
    with open(file_out, 'w') as file:
        # Iterate the two needed columns directly; this avoids building a
        # Series for every row as ``iterrows`` does.
        for seq_id, seq in zip(table['id'], table['seq']):
            file.write('>{0}\n{1}\n'.format(seq_id, seq))
    print('Write finished')

# Randomly split the given data into two parts
def split_random(data, ref_path='./data/sprot_with_ec_ref.fasta',
                 query_path='./data/sprot_with_ec_query.fasta'):
    """Randomly split *data* into a reference half and a query half.

    ``len(data) // 2`` row positions are drawn at random for the reference
    set; all remaining rows form the query set. Both sets are written to
    FASTA files with ``table_2_fasta``.

    The complement is computed from row positions, so the split is correct
    whatever index labels *data* carries (the earlier label-based test only
    worked for a default 0..n-1 index).

    Args:
        data: DataFrame with the columns ``table_2_fasta`` needs.
        ref_path: Output FASTA path for the reference set.
        query_path: Output FASTA path for the query set.

    Returns:
        Tuple ``(query, ref)`` of DataFrames.
    """
    n_rows = len(data)
    index_ref = random.sample(range(n_rows), n_rows // 2)  # random row positions

    # Positional mask marking the rows chosen for the reference set.
    in_ref = np.zeros(n_rows, dtype=bool)
    in_ref[index_ref] = True

    ref = data.iloc[index_ref]   # reference rows, in sampled order
    query = data.iloc[~in_ref]   # everything else, in original order
    table_2_fasta(ref, ref_path)
    table_2_fasta(query, query_path)
    return query, ref

# Split into first half / second half following the row order
def split_time_half(data, ref_path='./data/sprot_with_ec_ref.fasta',
                    query_path='./data/sprot_with_ec_query.fasta'):
    """Split *data* into its first half (reference) and second half (query).

    The split follows the existing row order; no sorting is done here, so
    for a chronological split the caller presumably has to sort *data* by
    date beforehand -- confirm against the call site.

    Rows are selected by position, so the result does not depend on the
    index labels of *data*. Both halves are written to FASTA files with
    ``table_2_fasta``.

    Args:
        data: DataFrame with the columns ``table_2_fasta`` needs.
        ref_path: Output FASTA path for the reference (first) half.
        query_path: Output FASTA path for the query (second) half.

    Returns:
        Tuple ``(query, ref)`` of DataFrames, in the same order as
        ``split_random`` returns them.
    """
    half = len(data) // 2
    ref = data.iloc[:half]     # first half -> reference
    query = data.iloc[half:]   # remainder  -> query
    table_2_fasta(ref, ref_path)
    table_2_fasta(query, query_path)
    return query, ref

## 3. 提取含有EC号的数据，并保存到文件

In [3]:
# Extract the EC-annotated entries from the TrEMBL dump into a TSV file and
# report the CPU time (time.process_time) spent doing so.
in_filepath = r'./data/uniprot_trembl.dat.gz'
out_filepath = r'./data/trembl_with_ec.tsv'

start = time.process_time()
read_file_from_gzip(in_filepath, out_filepath)
end = time.process_time()

elapsed = end - start
print('finished use time %6.3f s' % elapsed)

KeyboardInterrupt: 

## 4. 加载处理完的数据

In [None]:
# Load the tab-separated table of EC-annotated entries, supplying the column
# names explicitly, then parse the three date columns into datetimes.
# NOTE(review): this reads sprot_with_ec.tsv, while the extraction step of
# this notebook writes trembl_with_ec.tsv -- confirm which file is intended.
column_names = [
    'id',
    'ec_number',
    'date_integraged',
    'date_sequence_update',
    'date_annotation_update',
    'seq',
]
sprot = pd.read_csv('./data/sprot_with_ec.tsv', sep='\t', names=column_names)
for date_column in ('date_integraged', 'date_sequence_update', 'date_annotation_update'):
    sprot[date_column] = pd.to_datetime(sprot[date_column])
sprot.head(3)