In [1]:
"""
Feature Encoding Script
@jjia
"""

'\nFeature Encoding Script\n@jjia\n'

In [2]:
import os, re, sys
import numpy as np
import pandas as pd
from math import log
import random
import scipy.io
from collections import defaultdict

In [3]:
def read_blosum(path, one_hot=0):
    '''
    Read a tab-delimited numeric matrix (e.g. blosum50.txt) from a file
    Args:
        1. path: path to the matrix file, one tab-separated row per line
        2. one_hot: 0 divides every entry by 10 (used for the BLOSUM50 scores);
           any other value keeps the entries exactly as read (one-hot matrix)
    Return values:
        1. The matrix as a list of rows, each row being a list of floats
    Raises:
        ValueError if a non-blank line contains a field that is not a number
    '''
    rescale = (one_hot == 0)
    blosum = []
    # "with" guarantees the handle is closed even when float() raises mid-file.
    with open(path, "r") as f:
        for line in f:
            # A blank line (typically a trailing newline at the end of the
            # file) carries no row; float('') would otherwise raise.
            if not line.strip():
                continue
            row = [float(field) for field in line.split("\t")]
            if rescale:
                row = [value / 10 for value in row]
            blosum.append(row)
    return blosum

# Base directory for the input files. This is an absolute, machine-specific
# path; it must end with "/" because file names are appended by string
# concatenation below.
path_dict = "/home/jjia1/viralepitope/"
# Second argument 0 selects the read_blosum branch that divides each score by 10.
blosum_matrix = read_blosum(path_dict + 'blosum50.txt', 0)

# The .npy file holds a pickled Python object; .item() unwraps it from the
# 0-d object array returned by np.load. It is used as a mapping keyed by
# allele name in mhc_peptide_pair (presumably allele -> pseudo-sequence;
# TODO confirm the value type).
# NOTE(review): allow_pickle=True can execute arbitrary code on load, so only
# use it with files from a trusted source.
pseq_dict = np.load(path_dict + 'pseq_dict_all.npy', allow_pickle = True).item()
#pseq_dict_blosum_matrix = pseudo_seq(pseq_dict, blosum_matrix)


In [4]:
# Persist the scaled matrix returned by read_blosum as a .npy file.
# NOTE(review): this path is relative to the working directory, whereas the
# inputs were read from the absolute path_dict — confirm both point to the
# intended location.
np.save('../viralepitope/blosum_matrix.npy', blosum_matrix)

In [10]:
# Both files are tab-separated with no header row, so pandas labels the
# columns 0, 1, 2. In the rows displayed below these hold the peptide
# sequence, the HLA allele name and the label (1 in pos_data, 0 in neg_data).
pos_data = pd.read_csv('positive_epitope_seq.txt', sep = '\t', header = None)
neg_data = pd.read_csv('negative_epitope_seq.txt', sep = '\t', header = None)

In [11]:
# Preview the positive examples (the notebook shows only the first and last rows).
pos_data

Unnamed: 0,0,1,2
0,KLEDLERDL,HLA-A*02:01,1
1,LITGRLQSL,HLA-A2,1
2,EVMPVSMAK,HLA-A*03:01,1
3,EVMPVSMAK,HLA-A*11:01,1
4,KTFPPTEPK,HLA-A*03:01,1
...,...,...,...
88046,ILLWQPIPV,HLA-A*02:01,1
88047,ALDVYNGLL,HLA-A*02:01,1
88048,LTDAVKVMDL,HLA-A*02:01,1
88049,KLQCVDLHV,HLA-A*02:01,1


In [12]:
# Preview the negative examples (the notebook shows only the first and last rows).
neg_data

Unnamed: 0,0,1,2
0,VLMNHKVFE,HLA-A2,0
1,DLQHLSREER,HLA-A2,0
2,CGFDVLYPP,HLA-A2,0
3,SINNDLNLR,HLA-A2,0
4,DKDTASNEN,HLA-A2,0
...,...,...,...
880495,AMHKAMLMA,HLA-A*68:12,0
880496,KRIFSFLDLF,HLA-A*68:12,0
880497,TSPCRSQVL,HLA-A*68:12,0
880498,VQSGNLALA,HLA-A*68:12,0


In [13]:
# Stack the positive rows on top of the negative rows. ignore_index is not
# set, so each frame keeps its own row labels and label values repeat in the
# combined frame.
data = pd.concat([pos_data, neg_data], axis = 0)

In [15]:
# Shuffle every row: frac=1 samples the whole frame without replacement.
# A fixed random_state makes the row order, and therefore the file written
# from data_shuffled, identical on every Restart & Run All.
data_shuffled = data.sample(frac = 1, random_state = 42)

In [18]:
# Display a copy with a fresh 0..n-1 index. The result is not assigned, so
# data_shuffled itself keeps the row labels it had after shuffling.
data_shuffled.reset_index(drop = True)

Unnamed: 0,0,1,2
0,SEDCSGEGK,HLA-A*24:02,0
1,KLNSQHKLNA,HLA-A*02:06,0
2,TTLSIYFLL,HLA-A*24:02,1
3,TTSNGAGTA,HLA-B*08:01,0
4,STRPRDTEE,HLA-A*01:01,0
...,...,...,...
968546,DSESGSPETK,HLA-A*68:01,0
968547,YMVTDKTAYI,HLA-A*02:03,1
968548,GNPSVRPGLA,HLA-B*58:01,0
968549,NYVLQTLGT,HLA-A*02:06,0


In [19]:
# Write the shuffled rows as a tab-separated file with no header line and no
# index column, i.e. the same three-column layout as the input files.
data_shuffled.to_csv('fulldata_withlabels_combined.txt', sep = '\t', header = False, index = False)

In [5]:
def mhc_peptide_pair(path, pseq_dict_matrix, blosum_matrix, min_len=8, max_len=15):
    '''
    Encode the (peptide, allele, label) records of a tab-delimited file
    Each non-blank line must hold the peptide in the first column, the allele
    name in the second column and an integer label in the last column.
    Args:
        1. path: path to the tab-delimited input file
        2. pseq_dict_matrix: mapping from allele name to the allele
           representation stored in each record; lines whose allele is not a
           key of this mapping are skipped silently
        3. blosum_matrix: 20 rows indexed in the residue order
           A R N D C Q E G H I L K M F P S T W Y V
        4. min_len, max_len: inclusive peptide length window (default 8-15);
           max_len is also the padded width of each half of the encoding
    Return values:
        1. dict mapping allele -> list of [peptide_encoding, allele_entry, label]
           where peptide_encoding is a list of 2 * max_len rows: the peptide
           left-aligned (zero-padded on the right) followed by the peptide
           right-aligned (zero-padded on the left)
    '''
    aa={"A":0,"R":1,"N":2,"D":3,"C":4,"Q":5,"E":6,"G":7,"H":8,"I":9,"L":10,"K":11,"M":12,"F":13,"P":14,"S":15,"T":16,"W":17,"Y":18,"V":19}
    data_dict = {}
    # "with" closes the file even if a malformed line raises part-way through.
    with open(path, "r") as f:
        for line in f:
            if not line.strip():
                continue  # blank line (e.g. trailing newline): nothing to parse
            info = line.split("\t")  # Retrieve information from a tab-delimited line
            allele = info[1].strip()
            # Filter on the mapping that was passed in, not on a notebook global,
            # so the function works with any allele dictionary.
            if allele not in pseq_dict_matrix:
                continue
            affinity = int(info[-1].strip())  # Retrieve label information
            pep = info[0].strip()  # Retrieve ligand information

            # Reject peptides containing anything but the 20 standard residues.
            if set(pep).difference(aa):
                print('Illegal peptides')
                continue
            # Reject peptides outside the supported length window.
            if not (min_len <= len(pep) <= max_len):
                print('Illegal peptides')
                continue

            residues = [blosum_matrix[aa[residue]] for residue in pep]
            n_pad = max_len - len(pep)
            # First max_len rows: N-terminal aligned to the left end,
            # remaining positions on the right are zero-padded.
            left_aligned = residues + [np.zeros(20) for _ in range(n_pad)]
            # Next max_len rows: C-terminal aligned to the right end,
            # remaining positions on the left are zero-padded.
            right_aligned = [np.zeros(20) for _ in range(n_pad)] + residues
            pep_blosum = left_aligned + right_aligned

            new_data = [pep_blosum, pseq_dict_matrix[allele], affinity]
            data_dict.setdefault(allele, []).append(new_data)

    return data_dict

In [6]:
# Encode the records of iedb_data.txt. pseq_dict is passed as loaded (the
# pseudo_seq encoding step is commented out earlier in the notebook), so each
# record stores the pseq_dict entry for its allele unchanged.
# Every "Illegal peptides" line printed below is one skipped input row.
pos_pseudo_seq = mhc_peptide_pair('/home/jjia1/viralepitope/viralepitope/iedb_data.txt', pseq_dict, blosum_matrix)

Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides
Illegal peptides


In [7]:
# Sanity check: mhc_peptide_pair returns a plain dict keyed by allele.
type(pos_pseudo_seq)

dict

In [8]:
# Encode the negative examples with the same allele dictionary and matrix as
# the positive set.
neg_pseudo_seq = mhc_peptide_pair('/home/jjia1/viralepitope/viralepitope/negative_epitope_seq_affinity.txt', pseq_dict, blosum_matrix)

In [9]:
# Inspect all encoded records for a single allele.
# NOTE(review): this prints every nested row for the allele and produces a
# very large output; consider showing only the first record.
neg_pseudo_seq['HLA-A*24:03']

[[[[-0.1,
    -0.4,
    -0.3,
    -0.4,
    -0.2,
    -0.3,
    -0.4,
    -0.4,
    -0.4,
    0.5,
    0.2,
    -0.3,
    0.2,
    0.0,
    -0.3,
    -0.3,
    -0.1,
    -0.3,
    -0.1,
    0.4],
   [-0.1,
    0.3,
    0.0,
    -0.1,
    -0.3,
    0.2,
    0.1,
    -0.2,
    0.0,
    -0.3,
    -0.3,
    0.6,
    -0.2,
    -0.4,
    -0.1,
    0.0,
    -0.1,
    -0.3,
    -0.2,
    -0.3],
   [0.0,
    -0.3,
    0.0,
    -0.1,
    -0.3,
    -0.2,
    -0.3,
    0.8,
    -0.2,
    -0.4,
    -0.4,
    -0.2,
    -0.3,
    -0.4,
    -0.2,
    0.0,
    -0.2,
    -0.3,
    -0.3,
    -0.4],
   [-0.2,
    0.0,
    0.1,
    -0.1,
    -0.3,
    0.1,
    0.0,
    -0.2,
    1.0,
    -0.4,
    -0.3,
    0.0,
    -0.1,
    -0.1,
    -0.2,
    -0.1,
    -0.2,
    -0.3,
    0.2,
    -0.4],
   [-0.1,
    0.1,
    0.0,
    0.0,
    -0.3,
    0.7,
    0.2,
    -0.2,
    0.1,
    -0.3,
    -0.2,
    0.2,
    0.0,
    -0.4,
    -0.1,
    0.0,
    -0.1,
    -0.1,
    -0.1,
    -0.3],
   [0.0,
    -0.3,
    -0.3

In [15]:
import pickle
# Serialise the encoded positive records so later steps can reload them
# without re-running the encoding.
# NOTE(review): this directory contains "jjia1" twice, unlike path_dict
# ("/home/jjia1/viralepitope/") — confirm the path is intended.
with open('/home/jjia1/jjia1/viralepitope/pos_pseudo_seq_dictionary.pkl', 'wb') as f:
    pickle.dump(pos_pseudo_seq, f)

In [14]:
# Serialise the encoded negative records. This cell has no import of its own
# and relies on pickle having been imported by another cell.
# NOTE(review): this directory contains "jjia1" twice, unlike path_dict
# ("/home/jjia1/viralepitope/") — confirm the path is intended.
with open('/home/jjia1/jjia1/viralepitope/neg_pseudo_seq_dictionary.pkl', 'wb') as f:
    pickle.dump(neg_pseudo_seq, f)

In [19]:
#import csv
# open file for writing, "w" is writing
#w = csv.writer(open("viralepitope/positive_pseudo_seq.csv", "w"))

# loop over dictionary keys and values
#for key, val in pos_pseudo_seq.items():
#    # write every key and value to file
#    w.writerow([key, val])

In [20]:
# open file for writing, "w" is writing
#w1 = csv.writer(open("viralepitope/negative_pseudo_seq.csv", "w"))

# loop over dictionary keys and values
#for key, val in neg_pseudo_seq.items():
    # write every key and value to file
#    w1.writerow([key, val])