In [5]:
import re
import random
import numpy as np
import pandas as pd
from collections import Counter  

In [6]:
def random_pick_1(seq, probabilities):
    """Pick one item from ``seq`` at random, weighted by ``probabilities``.

    The weights are integer counts and do not need to sum to 1. A random
    integer is drawn from ``1..sum(probabilities)`` and the first item whose
    cumulative weight reaches that draw is returned, so (for non-negative
    weights) an item with weight 0 is never selected.

    Args:
        seq: Candidate items.
        probabilities: Integer weight of each item in ``seq``, in the same
            order.

    Returns:
        The selected item of ``seq``.

    Raises:
        ValueError: If the weights do not sum to a positive number, or if
            ``seq`` runs out before the drawn cumulative weight is reached
            (``seq`` shorter than ``probabilities``).
    """
    total_weight = sum(probabilities)
    if total_weight <= 0:
        # random.randint(1, 0) would fail with an unhelpful "empty range" error.
        raise ValueError(
            "random_pick_1 needs at least one positive weight; "
            "weights sum to %r" % (total_weight,)
        )

    # Draw an integer in [1, total_weight]; each unit of weight is equally likely.
    x = random.randint(1, total_weight)
    cumulative_weight = 0
    for item, item_weight in zip(seq, probabilities):
        cumulative_weight += item_weight
        # The first item whose running total reaches the draw is the pick.
        if x <= cumulative_weight:
            return item

    # Only reachable when zip() stopped early because seq is too short.
    raise ValueError("random_pick_1: seq has fewer items than probabilities")

In [7]:
def read_proteome_uniprot(path="uniprot_proteins.csv"):
    '''
    Read protein sequences from a CSV table and return them as a dictionary.

    Args:
        1. path: CSV file to read. Its first column is used as the row
            index, and it must contain an 'accession' column and a 'seq'
            column. Defaults to "uniprot_proteins.csv", resolved against
            the current working directory.
    Return values:
        1. proteome: A dictionary whose keys are the values of the
                    'accession' column and whose values are the matching
                    entries of the 'seq' column.
    '''
    reference_df = pd.read_csv(path, index_col=0)
    # Index by accession so the 'seq' Series converts straight to {accession: seq}.
    return reference_df.set_index('accession')['seq'].to_dict()

In [8]:
# Load the {accession: sequence} dictionary from the proteome CSV.
# NOTE(review): no later cell shown here reads `uniprot`; Random_peptides()
# calls read_proteome_uniprot() again on its own.
uniprot = read_proteome_uniprot()

In [10]:
    # Read the tab-separated IEDB table with every column kept as object
    # dtype, and hold it as a 2-D NumPy array of rows.
    iedb_csv = "iedb_data.txt"
    iedb_df = np.array(
        pd.read_csv(iedb_csv, sep='\t', skiprows=0, low_memory=False, dtype=object)
    )

    # Distinct values of the first column (the peptide sequences).
    all_positive_peptide = list(set(row[0] for row in iedb_df))

In [11]:
    # Group the IEDB rows by the value in column 1 (the allele); each group
    # keeps its rows as plain Python lists, in file order.
    data_dict = {}
    for row in iedb_df:
        allele = row[1]
        data_dict.setdefault(allele, []).append(row.tolist())

In [12]:
# `allele` is left over from the grouping loop in the previous cell, so this
# shows the allele of the last row of iedb_df.
allele

'HLA-A*02:01'

In [13]:
# Number of rows in the IEDB array.
len(iedb_df)

88110

In [14]:
# Exploratory version of the length-count step used inside Random_peptides().
# Every name assigned in the loop is overwritten on each pass, so after the
# cell runs they hold the values for the last allele in data_dict only.
all_neg = []
for allele in data_dict:
    traing_data = data_dict[allele]
    # Column 0 of each row is the peptide sequence; record its length.
    all_length = [len(record[0]) for record in traing_data]
    all_length_times = Counter(all_length)

    # Occurrences of each length from 8 to 15. A Counter returns 0 for a
    # missing key, so no try/except is needed around the lookup.
    all_probabilities = [all_length_times[kmer] for kmer in [8, 9, 10, 11, 12, 13, 14, 15]]

In [15]:
# `all_length` is reassigned for every allele in the loop above, so this shows
# the peptide lengths of the last allele processed.
all_length

[10, 10, 10, 9, 9, 10, 9, 9, 9, 10, 9]

In [16]:
def Random_peptides(proteome=None, iedb_csv="iedb_data.txt", neg_per_pos=10):
    '''
    Randomly sample negative peptides from the proteome for every allele
    found in the IEDB table.

    For each allele, peptide lengths between 8 and 15 are drawn in proportion
    to how often each length occurs among that allele's rows in the table,
    and neg_per_pos peptides are collected per row. A candidate is rejected
    and re-drawn when it contains a character outside the 20 standard
    amino-acid letters, when it equals any peptide in the first column of the
    table, or when it was already collected for the same allele.

    Args:
        1. proteome: A dictionary mapping protein ID to sequence. When None,
            it is loaded with read_proteome_uniprot().
        2. iedb_csv: Tab-separated file with a header row; column 0 is read
            as the peptide sequence and column 1 as the allele.
        3. neg_per_pos: Number of negatives to collect per table row of each
            allele.
    Return values:
        1. peptides: A list of [peptide, allele] pairs, grouped by allele in
            the order alleles first appear in the table.
    Notes:
        Sampling uses the module-level `random` generator; call random.seed()
        beforehand for a repeatable result. The loop for an allele ends only
        once enough distinct peptides have been collected, so the proteome
        must be large enough to supply them.
    '''
    if proteome is None:
        proteome = read_proteome_uniprot()

    iedb_df = pd.read_csv(iedb_csv, sep='\t', skiprows=0, low_memory=False, dtype=object)
    peptide_column = iedb_df.iloc[:, 0]
    allele_column = iedb_df.iloc[:, 1]

    # A set makes the "is this a known positive?" test O(1) per candidate.
    all_positive_peptide = set(peptide_column)

    # Positive peptides per allele, in file order.
    data_dict = {}
    for peptide, allele in zip(peptide_column, allele_column):
        data_dict.setdefault(allele, []).append(peptide)

    kmers = [8, 9, 10, 11, 12, 13, 14, 15]
    standard_residues = set('ACDEFGHIKLMNPQRSTVWY')
    # Built once: random.choice() needs a sequence, and the keys never change.
    accessions = list(proteome.keys())

    # Randomly generate peptides from the proteome.
    all_neg = []
    for allele, traing_data in data_dict.items():
        all_length_times = Counter(len(peptide) for peptide in traing_data)
        # Counter returns 0 for lengths that never occur.
        all_probabilities = [all_length_times[kmer] for kmer in kmers]

        if not any(all_probabilities):
            # No 8-15-mer for this allele: there is no length distribution to
            # copy, and random_pick_1 cannot draw from all-zero weights.
            print('No peptides of length 8-15 for allele %s; skipped' % allele)
            continue

        target = neg_per_pos * len(traing_data)
        pep_seq = []
        seen = set()  # peptides already collected for this allele
        while len(pep_seq) < target:
            length = random_pick_1(kmers, all_probabilities)
            accession = random.choice(accessions)
            protein = proteome[accession]
            if len(protein) < length:
                continue
            pep_start = random.randint(0, len(protein) - length)
            pep = protein[pep_start:pep_start + length]

            if set(pep) - standard_residues:
                print('No official peptide')
                continue

            if pep in all_positive_peptide:
                print('In positive peptide')
                continue

            if pep in seen:
                continue
            seen.add(pep)
            pep_seq.append(pep)

        all_neg.extend([pep, allele] for pep in pep_seq)

    return all_neg

In [17]:
# Build the list of sampled negatives; each entry is a [peptide, allele] pair.
neg = Random_peptides()

No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
In positive peptide
No official peptide
In positive peptide
In positive peptide
No official peptide
No official peptide
No official peptide
In positive peptide
No official peptide
No official peptide
No official peptide
In positive peptide
No official peptide
In positive peptide
No official peptide
No official peptide
No official peptide
In positive peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
No official peptide
In positive peptide
In positive peptide
No official peptide
No official peptide


In [18]:
# Total number of sampled negative pairs.
len(neg)

881100

In [19]:
# Check the container type returned by Random_peptides().
type(neg)

list

In [20]:
# NOTE(review): this displays the whole list; a slice such as neg[:10] would
# keep the cell output readable.
neg

[['PLQKQLPAFIS', 'HLA-A2'],
 ['SLAACTNQP', 'HLA-A2'],
 ['DDHVSQVQA', 'HLA-A2'],
 ['KEFSGENLGF', 'HLA-A2'],
 ['GTANGPVNTE', 'HLA-A2'],
 ['MWRSRWDAS', 'HLA-A2'],
 ['PEGPRGEKG', 'HLA-A2'],
 ['PMKMGYPPSPW', 'HLA-A2'],
 ['WKHQFAWPF', 'HLA-A2'],
 ['AGLVPGGPGF', 'HLA-A2'],
 ['SIVHLFEWRW', 'HLA-A2'],
 ['VISWGNSVTIW', 'HLA-A2'],
 ['VPLHSSWTFW', 'HLA-A2'],
 ['HIILVLNAA', 'HLA-A2'],
 ['FWLLIIISS', 'HLA-A2'],
 ['NISKTIATSQ', 'HLA-A2'],
 ['ETVISEEPP', 'HLA-A2'],
 ['GGAAAGDDT', 'HLA-A2'],
 ['RQPAEMAAEL', 'HLA-A2'],
 ['LVFALSIGA', 'HLA-A2'],
 ['IVHDLESPGID', 'HLA-A2'],
 ['EQLYGRLAA', 'HLA-A2'],
 ['YKSSMCVKHFK', 'HLA-A2'],
 ['ERKPGDIR', 'HLA-A2'],
 ['KTRSHVTHR', 'HLA-A2'],
 ['LATLIGWTV', 'HLA-A2'],
 ['SQVGMTAPG', 'HLA-A2'],
 ['SMSYCDESRL', 'HLA-A2'],
 ['SEGKVEEAQGM', 'HLA-A2'],
 ['SRRVETSQR', 'HLA-A2'],
 ['SSSPESASRR', 'HLA-A2'],
 ['AMLPYLINYY', 'HLA-A2'],
 ['GKDLYFMSV', 'HLA-A2'],
 ['DPPLKNVSS', 'HLA-A2'],
 ['KELREVLDQ', 'HLA-A2'],
 ['SYSYLGLGL', 'HLA-A2'],
 ['FASVYSRLV', 'HLA-A2'],
 ['ADSCAAPRA', 'H

In [21]:
# One row per [peptide, allele] pair; the columns get the default labels 0 and 1.
neg_affinity = pd.DataFrame(neg)

In [22]:
# Add a column labelled 2 holding 0 for every row (renamed to 'label' in the next cell).
neg_affinity[2] = 0

In [23]:
# Give the positional columns descriptive names.
column_names = {0: 'epitope', 1: 'allele', 2: 'label'}
neg_affinity = neg_affinity.rename(columns=column_names)

In [24]:
# Display the labelled negatives table.
neg_affinity

Unnamed: 0,epitope,allele,label
0,PLQKQLPAFIS,HLA-A2,0
1,SLAACTNQP,HLA-A2,0
2,DDHVSQVQA,HLA-A2,0
3,KEFSGENLGF,HLA-A2,0
4,GTANGPVNTE,HLA-A2,0
...,...,...,...
881095,TVTLSLCQL,HLA-A*68:12,0
881096,TSAPDTRPA,HLA-A*68:12,0
881097,PSPGLARWAE,HLA-A*68:12,0
881098,ELYQVAQEQV,HLA-A*68:12,0


In [25]:
# Number of distinct values in the 'allele' column.
len(neg_affinity['allele'].unique())

163

In [26]:
# Distinct values of the 'allele' column, in order of first appearance.
neg_affinity['allele'].unique()

array(['HLA-A2', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*02:01',
       'HLA-B*27:05', 'HLA-B*27:09', 'HLA-B*35:01', 'HLA-B7',
       'HLA-B*07:02', 'HLA-A*02:02', 'HLA-A*02:06', 'HLA-A*68:02',
       'HLA-A*02:03', 'HLA-A*68:01', 'HLA-A*31:01', 'HLA-A*33:01',
       'HLA-B37', 'HLA-A*01:01', 'HLA-B*58:01', 'HLA-A*24:02',
       'HLA-B*40:01', 'HLA-B*15:01', 'HLA-A68', 'HLA-A*02:05',
       'HLA-B*08:01', 'HLA-B*27:01', 'HLA-B*51:01', 'HLA-B*53:01',
       'HLA-B*54:01', 'HLA-A*02:07', 'HLA-A*23:01', 'HLA-A*30:02',
       'HLA-A*26:01', 'HLA-A*29:02', 'HLA-A11', 'HLA-B*15:16', 'HLA-B27',
       'HLA-A3', 'HLA-B*27:03', 'HLA-A1', 'HLA-B8', 'HLA-B44',
       'HLA-B*27:02', 'HLA-B*44:03', 'HLA-B*44:02', 'HLA-B*45:01',
       'HLA-B*40:02', 'HLA-B*18:01', 'HLA-B*57:01', 'HLA-A*69:01',
       'HLA-A*24:03', 'HLA-A*30:01', 'HLA-B*14:02', 'HLA class I',
       'HLA-B53', 'HLA-B51', 'HLA-A31', 'HLA-A26', 'HLA-B62', 'HLA-B58',
       'HLA-B39', 'HLA-B*35:08', 'HLA-B35', 'HLA-B*39:01', 'HLA-B*46:01

In [None]:
#c = ['epitope', 'allele', 'allele.2', 'affinity']
#neg_affinity_reordered = neg_affinity[c]

In [None]:
#neg_affinity_reordered

In [27]:
# Write the table as tab-separated text, with neither a header row nor the index column.
neg_affinity.to_csv('negative_epitope_seq.txt', sep = '\t', header = False, index = False)

In [None]:
#ofile = open("/home/jjia1/viralepitope/viralepitope/negative_epitope_seq.txt", "w")
#for line in neg:
#    ofile.write(line[2]+'\t'+line[0]+'\t'+line[1]+'\n')
#    i = i+1
#ofile.close() 