In [1]:
import re
import os
import numpy as np
import scipy.io as sio
from Bio import SeqIO
import tensorflow as tf
from time import time
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

## 氨基酸序列的双联体ONE-HOT编码

In [2]:
# One-letter codes of the 20 standard amino acids, each listed exactly once.
# A duplicated or missing letter here produces duplicate/missing pairs in
# `daa`, which makes the pair lookup during encoding fail for that residue.
alphabet = 'ACDEFGHIKLMNPQRSTVWY'
# Every ordered pair of residues (20 * 20 = 400 entries). A pair's position
# in this list is the column it occupies in the one-hot encoding.
daa = [x + y for x in alphabet for y in alphabet]

In [3]:
# Convert an amino-acid sequence of length L into a 2-D array in which every
# row is the one-hot encoding of one residue pair (one column per entry of daa).
# For a gap range r the number of rows is
#   (L-1) + (L-2) + ... + (L-r) = r*L - (r+1)*r/2
def seq2DaaOneHotArray(sequence, r, daa):
    """One-hot encode all residue pairs of ``sequence`` up to a distance of ``r``.

    For each distance ``g`` in ``1..r`` and each position ``j`` with
    ``j + g < len(sequence)``, the pair ``sequence[j] + sequence[j + g]`` yields
    one row containing a single 1 in the column given by the pair's first
    position in ``daa``. Rows are ordered by distance first, then by position.

    Parameters
    ----------
    sequence : indexable sequence of single-character residues
    r : int
        Largest distance between the two residues of a pair. Values larger
        than ``len(sequence) - 1`` contribute no extra pairs; values below 0
        are treated as 0.
    daa : list of str
        Ordered list of two-letter residue pairs defining the columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, len(daa))`` where ``N = r*L - r*(r+1)/2`` after
        ``r`` has been limited to the range ``[0, L-1]``.

    Raises
    ------
    ValueError
        If a pair taken from ``sequence`` does not occur in ``daa``.
    """
    L = len(sequence)
    # The closed-form row count is only valid for 0 <= r <= L-1. Without the
    # clamp, r > L under-allocates the array (IndexError while filling) and
    # very short sequences give a negative row count.
    gaps = max(0, min(r, L - 1))
    N = gaps * L - (gaps * (gaps + 1)) // 2

    # Map each pair to the index of its first occurrence (same answer as
    # daa.index, but without a linear scan for every pair).
    column = {}
    for k, pair in enumerate(daa):
        column.setdefault(pair, k)

    result = np.zeros(shape=(N, len(daa)))
    m = 0
    for gap in range(1, gaps + 1):
        for j in range(L - gap):
            aa = sequence[j] + sequence[j + gap]
            try:
                k = column[aa]
            except KeyError:
                raise ValueError(
                    "residue pair %r (positions %d and %d) is not in daa"
                    % (aa, j, j + gap)
                ) from None
            result[m, k] = 1
            m += 1
    return result

## 根据序列的fasta文件构建数据集

In [36]:
# Read the sequence (FASTA) file and the binding-site file.
def loadBindingsites(fastaFile, siteFile):
    """Load sequences and their binding-site positions.

    Parameters
    ----------
    fastaFile : path or handle accepted by ``SeqIO.parse``
        FASTA file with the sequences.
    siteFile : str
        Text file made of records of the form::

            >sequence_id
            12 15 27 ...

        i.e. a header line starting with ``>`` followed by whitespace-separated
        integer positions. Blank lines are ignored and the positions of one
        record may span several lines.

    Returns
    -------
    tuple
        ``(data, bindingsites)`` where ``data`` maps each record id to the
        record's ``.seq`` value as produced by ``SeqIO.parse`` and
        ``bindingsites`` maps each id found in ``siteFile`` to a list of ints.

    Raises
    ------
    ValueError
        If site positions appear before any header line, or a position is not
        an integer.
    """
    # One dictionary entry per sequence: key = record id, value = record.seq
    data = {}
    for seq_record in SeqIO.parse(fastaFile, 'fasta'):
        data[seq_record.id] = seq_record.seq

    # Parse the site file by header lines rather than by line-number parity,
    # so that a blank or wrapped line cannot shift records onto the wrong id.
    bindingsites = {}
    sid = None
    with open(siteFile, 'r') as pbsreader:
        for lineno, line in enumerate(pbsreader, start=1):
            line = line.strip()
            if line.startswith('>'):
                sid = line[1:]
                # A repeated id starts over, so the last record wins.
                bindingsites[sid] = []
            elif line:
                if sid is None:
                    raise ValueError(
                        "%s: line %d has site positions before any '>' header"
                        % (siteFile, lineno)
                    )
                bindingsites[sid].extend(int(site) for site in line.split())

    return (data, bindingsites)

In [None]:
# Build the sequence sample set.
def createDatasets(data, bindingsites):
    """Unfinished stub for building samples from sequences and binding sites.

    Currently this only looks up the binding-site entry of every id in
    ``data`` and discards it; nothing is built or returned (the result is
    ``None``). A ``KeyError`` is raised if an id from ``data`` has no entry in
    ``bindingsites``.

    TODO: implement the actual sample construction.
    """
    for seq_id in data:
        site_list = bindingsites[seq_id]  # fetched but not used yet
        

In [38]:
# Load the sequences and binding-site annotations from the two input files.
data, bindingsites = loadBindingsites(
    '../data/PDNA-224.fasta',
    '../data/PDNA-224-binding-sites.txt',
)

In [41]:
# Show the id of every sequence that was loaded, one per line.
for key in data.keys():
    print(key)

1A02_N
1A02_F
1A35_A
1A73_A
1B01_A
1B3T_A
1B72_B
1BDH_A
1BG1_A
1BPX_A
1BRN_L
1C9B_A
1CBV_L
1CMA_A
1CW0_A
1D02_A
1D2I_A
1D5Y_A
1DC1_A
1DDN_A
1DH3_A
1DMU_A
1DNK_A
1DP7_P
1EBM_A
1ECR_A
1EMH_A
1EOO_A
1ESG_A
1EYU_A
1F4K_A
1F4R_A
1FIU_A
1FJX_A
1FYM_B
1FZP_B
1GD2_E
1GDT_A
1H38_A
1H8A_C
1HCR_A
1HHT_P
1HJB_A
1HJB_C
1HLV_A
1HWT_C
1I3J_A
1I7D_A
1IAW_A
1IGN_A
1J1V_A
1JB7_A
1JE8_A
1JEY_B
1JEY_A
1JMC_A
1JNM_A
1K78_A
1KC6_A
1L3L_A
1LLI_A
1LQ1_A
1MA7_A
1MDY_A
1MM8_A
1N6J_A
1NKP_A
1NOP_A
1ODH_A
1OUP_A
1OUZ_A
1P4E_A
1P7D_A
1PAR_A
1Q9Y_A
1R71_A
1R7M_A
1RBJ_A
1REP_C
1RH6_A
1RRQ_A
1SA3_A
1SFU_A
1T2K_D
1T8E_B
1TAU_A
1TGH_A
1TRO_A
1TSR_A
1U1K_A
1U3E_M
1U78_A
1U8B_A
1UUT_A
1VAS_A
1WVL_A
1X9N_A
1XC8_A
1XF2_H
1XPX_A
1XSD_A
1YFJ_A
1Z63_A
1ZNS_A
1ZX4_A
1ZZI_A
2A0I_A
2A3V_A
2BOP_A
2BPA_1
2C5R_B
2C62_A
2C9L_Y
2CGP_A
2D7D_A
2DGC_A
2ES2_A
2ETW_A
2EX5_A
2EZV_A
2F8N_K
2FIO_A
2FKC_A
2FQZ_A
2H1O_E
2H1O_A
2H27_A
2H7F_X
2HAN_B
2HVS_A
2I0Q_B
2I13_A
2NNY_A
2NTC_A
2O49_A
2O61_A
2O8B_B
2OWO_A
2OXV_A
2P6R_A
2PY5_A
2Q2K_A
2Q2T_A

In [21]:
# Inspect the type of a stored sequence value. The previous expression used a
# variable (`site`) that no cell defines at notebook scope, so it failed on a
# fresh kernel; take a value from `data` instead (requires `data` to be non-empty).
type(next(iter(data.values())))

Bio.Seq.Seq