In [1]:
import requests
import pandas as pd
import numpy as np
import os
import re
from io import StringIO

In [2]:
# Location of the test dataset CSV; the following cell opens this file and reads it line by line.
# NOTE(review): absolute path (presumably a Colab working directory) - adjust when running elsewhere.
Test_PATH = '/content/immuno_data_test_IEDB_A0201_HLAseq_2_csv.csv'

In [3]:
# Collect one peptide string per data row of the test CSV.
test_peptides = []
with open(Test_PATH) as f:
    for num, row in enumerate(f):
        # Line 0 is skipped (treated as the header row).
        if num == 0:
            continue
        # Only the first 11 characters of the row are inspected.
        peptide = row[:11]
        # Index of the last "J"/"j" inside that prefix, or -1 when there is none.
        last_j = max(peptide.rfind("J"), peptide.rfind("j"))
        # Keep everything after the last J (presumably J is left-padding - confirm
        # against the CSV). When the prefix holds no J the slice starts at 0 and the
        # whole prefix is kept; the earlier per-letter loop left `final_peptide`
        # untouched in that case, so the previous row's peptide was appended again
        # (or a NameError was raised on the first data row).
        final_peptide = peptide[last_j + 1:]
        test_peptides.append(final_peptide)

In [4]:
# Location of the training dataset CSV; the following cell opens this file and reads it line by line.
# NOTE(review): absolute path (presumably a Colab working directory) - adjust when running elsewhere.
Train_PATH = '/content/immuno_data_train_IEDB_A0201_HLAseq_2_csv.csv'

In [5]:
# Collect one peptide string per data row of the training CSV.
train_peptides = []
with open(Train_PATH) as f:
    for num, row in enumerate(f):
        # Line 0 is skipped (treated as the header row).
        if num == 0:
            continue
        # Only the first 11 characters of the row are inspected.
        peptide = row[:11]
        # Index of the last "J"/"j" inside that prefix, or -1 when there is none.
        last_j = max(peptide.rfind("J"), peptide.rfind("j"))
        # Keep everything after the last J (presumably J is left-padding - confirm
        # against the CSV). When the prefix holds no J the slice starts at 0 and the
        # whole prefix is kept; the earlier per-letter loop left `final_peptide`
        # untouched in that case, so the previous row's peptide was appended again
        # (or a NameError was raised on the first data row).
        final_peptide = peptide[last_j + 1:]
        train_peptides.append(final_peptide)

In [6]:
def get_immunogenicity_df(model, sequence_string, allele, length,
                          url='http://tools-cluster-interface.iedb.org/tools_api/mhci/'):
    """POST one prediction request to an IEDB tools API endpoint and return the raw reply.

    Despite the name, no DataFrame is built here: the function returns the
    response body as text and leaves parsing to the caller.

    Parameters
    ----------
    model : str
        Sent as the ``method`` form field.
    sequence_string : str
        Sent as the ``sequence_text`` form field. It is concatenated into the
        request body verbatim, so it must already be URL-encoded.
    allele : str
        Sent as the ``allele`` form field (also inserted verbatim).
    length : str or int
        Sent as the ``length`` form field; converted with ``str()``.
    url : str, optional
        Endpoint to POST to. Defaults to the MHC class I binding endpoint that
        was previously hard-coded, so existing four-argument calls behave as before.

    Returns
    -------
    str
        ``response.text`` of the HTTP reply. The HTTP status is not checked.
    """
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    # The body is assembled by hand (not via a dict) because sequence_string
    # arrives pre-encoded; letting requests encode it would encode it twice.
    data = ('method=' + model
            + '&sequence_text=' + sequence_string
            + '&allele=' + allele
            + '&length=' + str(length))
    # Echoes the complete request body to the cell output.
    print(data)
    response = requests.post(url, headers=headers, data=data)
    return response.text

In [7]:
# Turns a list of peptide strings into a URL-encoded FASTA payload for the API query.

def convert_to_iedb(list_of_strings):
    """Return the sequences as one URL-encoded FASTA string.

    Each sequence becomes a record named ``peptide<i>`` (``i`` counting from 0).
    ``%3E`` is the percent-encoded ``>`` that opens a FASTA header and ``%0A``
    is the percent-encoded newline, so the result has the form
    ``%3Epeptide0%0A<seq0>%0A%3Epeptide1%0A<seq1>...`` with no trailing
    separator. An empty input yields just ``'%3E'``.
    """
    record_separator = '%0A%3E'
    records = [
        'peptide' + str(index) + '%0A' + sequence
        for index, sequence in enumerate(list_of_strings)
    ]
    return '%3E' + record_separator.join(records)

In [8]:
# Encode both peptide lists once into the URL-encoded FASTA strings that the
# prediction cells below pass as the sequence argument of get_immunogenicity_df.
converted_test_seqs = convert_to_iedb(test_peptides)
converted_train_seqs = convert_to_iedb(train_peptides) 

In [52]:
### Test Dataset: Class-I Binding

col_names = ['allele', 'seq_num', 'start', 'end', 'length', 'peptide', 'ic50', 'percentile_rank', 'method']

alleles = ['HLA-A*01:01','HLA-A*02:01']

# Removed 'consensus' as it gives additional results
# Note: 'netmhcpan_ba', 'netmhcpan_el', 'recommended' provide additional 'core' and 'icore' fields
# that are removed below because they are the same as the peptide sequence at this sequence length
methods = ['ann', 'comblib_sidney2008', 'netmhccons', 'netmhcpan_ba', 'netmhcpan_el', 'netmhcstabpan', 'pickpocket', 'recommended', 'smm', 'smmpmbec']

# Not used by the calls below: get_immunogenicity_df is invoked without a URL.
url = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'

# Methods whose rows carry two extra fields (indices 6 and 7) that are dropped.
methods_with_core = ('netmhcpan_ba', 'netmhcpan_el', 'recommended')

# Rows are gathered in a plain list and converted to a DataFrame once at the end,
# instead of enlarging the frame with one .loc assignment per row. The list stays
# available if the loop is interrupted part-way.
test_mhci_rows = []

for method in methods:
    for allele in alleles:
        # HLA-A*01:01 does not work with comblib_sidney2008
        if allele == 'HLA-A*01:01' and method == 'comblib_sidney2008':
            continue
        res = get_immunogenicity_df(method, converted_test_seqs, allele, '9')
        skipped = 0
        # The first line is skipped (treated as the header). Blank lines are ignored
        # rather than unconditionally dropping the last line, so a reply without a
        # trailing newline no longer loses its final data row.
        for line in res.split('\n')[1:]:
            if not line.strip():
                continue
            part = line.split('\t')
            if method in methods_with_core:
                part = part[0:6] + part[8:]
            part.append(method)
            # Rows that do not fit col_names are left out, as before, but are now
            # counted and reported once per request instead of printing one
            # exception message per row.
            if len(part) != len(col_names):
                skipped += 1
                continue
            test_mhci_rows.append(part)
        if skipped:
            print(method + ' / ' + allele + ': skipped ' + str(skipped)
                  + ' row(s) whose field count does not match col_names')

final_test_results_mhci = pd.DataFrame(test_mhci_rows, columns=col_names)
print("Done!")

method=ann&sequence_text=%3Epeptide0%0AGMPPHMLPVL%0A%3Epeptide1%0AGLALLACAGL%0A%3Epeptide2%0ARIAQCFLRV%0A%3Epeptide3%0AALARWLPPV%0A%3Epeptide4%0ATHLMVLCCV%0A%3Epeptide5%0ALLIKKLPRV%0A%3Epeptide6%0ALLDQLIEEV%0A%3Epeptide7%0ALLDQLIEEV%0A%3Epeptide8%0AVLLNAPSEA%0A%3Epeptide9%0AYLLSGSDLFI%0A%3Epeptide10%0ALMIEYNLLT%0A%3Epeptide11%0AGLADGMEHL%0A%3Epeptide12%0AFLGGHVAVA%0A%3Epeptide13%0AFVVPILLKA%0A%3Epeptide14%0ATLACFVLAAV%0A%3Epeptide15%0AVLIAGYIIVF%0A%3Epeptide16%0ATLEDLLMGT%0A%3Epeptide17%0ATLEDLLMGT%0A%3Epeptide18%0ALMAVAILKEV%0A%3Epeptide19%0AGLGQVPLIV%0A%3Epeptide20%0AIMLEALERV%0A%3Epeptide21%0AYLLPEAEEI%0A%3Epeptide22%0AMLGIWFFTL%0A%3Epeptide23%0AGMVKAALEAI%0A%3Epeptide24%0AAAAWYLWEV%0A%3Epeptide25%0AKIRSDNIKKL%0A%3Epeptide26%0ALLIGICVAV%0A%3Epeptide27%0ANLDTLMTYV%0A%3Epeptide28%0ALLDTNYNLFY%0A%3Epeptide29%0AFLAADGHPA%0A%3Epeptide30%0ATLWYRAPEV%0A%3Epeptide31%0ATLWYRAPEV%0A%3Epeptide32%0AYLHPKEYEW%0A%3Epeptide33%0AVLWDYVYQL%0A%3Epeptide34%0AKLKKIKNSL%0A%3Epeptide35%0AKLIANNTRV%0A%3Ep

In [57]:
# Export into CSV: writes final_test_results_mhci to 'test_mhci_data.csv' (relative path).
# index=False is not passed, so pandas also writes the row index as the first column.
final_test_results_mhci.to_csv('test_mhci_data.csv')

In [58]:
### Train Dataset: Class-I Binding

col_names = ['allele', 'seq_num', 'start', 'end', 'length', 'peptide', 'ic50', 'percentile_rank', 'method']

alleles = ['HLA-A*01:01','HLA-A*02:01']

# Removed 'consensus' as it gives additional results
# Note: 'netmhcpan_ba', 'netmhcpan_el', 'recommended' provide additional 'core' and 'icore' fields
# that are removed below because they are the same as the peptide sequence at this sequence length
methods = ['ann', 'comblib_sidney2008', 'netmhccons', 'netmhcpan_ba', 'netmhcpan_el', 'netmhcstabpan', 'pickpocket', 'recommended', 'smm', 'smmpmbec']

# Not used by the calls below: get_immunogenicity_df is invoked without a URL.
url = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'

# Methods whose rows carry two extra fields (indices 6 and 7) that are dropped.
methods_with_core = ('netmhcpan_ba', 'netmhcpan_el', 'recommended')

# Rows are gathered in a plain list and converted to a DataFrame once at the end,
# instead of enlarging the frame with one .loc assignment per row. The list stays
# available if the loop is interrupted part-way.
train_mhci_rows = []

for method in methods:
    for allele in alleles:
        # HLA-A*01:01 does not work with comblib_sidney2008
        if allele == 'HLA-A*01:01' and method == 'comblib_sidney2008':
            continue
        res = get_immunogenicity_df(method, converted_train_seqs, allele, '9')
        skipped = 0
        # The first line is skipped (treated as the header). Blank lines are ignored
        # rather than unconditionally dropping the last line, so a reply without a
        # trailing newline no longer loses its final data row.
        for line in res.split('\n')[1:]:
            if not line.strip():
                continue
            part = line.split('\t')
            if method in methods_with_core:
                part = part[0:6] + part[8:]
            part.append(method)
            # Rows that do not fit col_names are left out, as before, but are now
            # counted and reported once per request instead of printing one
            # exception message per row.
            if len(part) != len(col_names):
                skipped += 1
                continue
            train_mhci_rows.append(part)
        if skipped:
            print(method + ' / ' + allele + ': skipped ' + str(skipped)
                  + ' row(s) whose field count does not match col_names')

final_train_results_mhci = pd.DataFrame(train_mhci_rows, columns=col_names)
print("Done!")

method=ann&sequence_text=%3Epeptide0%0ASLILVSQYT%0A%3Epeptide1%0ALMSTLLIYL%0A%3Epeptide2%0ALLHTDFEQV%0A%3Epeptide3%0ALLHTDFEQV%0A%3Epeptide4%0AMMIDDFGTA%0A%3Epeptide5%0ASLLSGDWVL%0A%3Epeptide6%0AKTLETPEFV%0A%3Epeptide7%0AGLYDGMEHC%0A%3Epeptide8%0ALIIPFIHLI%0A%3Epeptide9%0AVLAFGFALL%0A%3Epeptide10%0ALLVRNSFEV%0A%3Epeptide11%0AVDSIFEQWL%0A%3Epeptide12%0ANELFDSLFPV%0A%3Epeptide13%0AIIALLFALV%0A%3Epeptide14%0AFVLVILARL%0A%3Epeptide15%0ADQVILLNKH%0A%3Epeptide16%0AVLILLLLIYL%0A%3Epeptide17%0AFLSEHPNVTL%0A%3Epeptide18%0AYLESFCEDV%0A%3Epeptide19%0ALMIFISSFL%0A%3Epeptide20%0AFLLVIGACV%0A%3Epeptide21%0AAMAVLYLAL%0A%3Epeptide22%0AAMAGASTSA%0A%3Epeptide23%0AVLAGSVDEL%0A%3Epeptide24%0APGLSISGNL%0A%3Epeptide25%0AILDKVLVHL%0A%3Epeptide26%0AFYLTNDVSF%0A%3Epeptide27%0AFYLTNDVSF%0A%3Epeptide28%0ASLAVVSTQL%0A%3Epeptide29%0ALLAILPYYV%0A%3Epeptide30%0ASLLRSLENV%0A%3Epeptide31%0ALIIPCIHLI%0A%3Epeptide32%0AKLVGKTVKV%0A%3Epeptide33%0AHVLKAVFSR%0A%3Epeptide34%0AVLLSICYLL%0A%3Epeptide35%0AYLGGMSYYC%0A%3Epeptide

In [61]:
# Export into CSV: writes final_train_results_mhci to 'train_mhci_data.csv' (relative path).
# index=False is not passed, so pandas also writes the row index as the first column.
final_train_results_mhci.to_csv('train_mhci_data.csv')

In [62]:
### Test Dataset: MHC Processing Predictions

col_names = ['allele', 'seq_num', 'start', 'end', 'length', 'peptide', 'ic50', 'percentile_rank', 'method']

alleles = ['HLA-A*01:01','HLA-A*02:01']

# Removed 'consensus' as it gives additional results
# Note: 'netmhcpan_ba', 'netmhcpan_el', 'recommended' provide additional 'core' and 'icore' fields
# that are removed below because they are the same as the peptide sequence at this sequence length
methods = ['ann', 'comblib_sidney2008', 'netmhccons', 'netmhcpan_ba', 'netmhcpan_el', 'netmhcstabpan', 'pickpocket', 'recommended', 'smm', 'smmpmbec']

# NOTE(review): this processing URL is never handed to get_immunogenicity_df, which
# is called without a URL below and therefore posts to its own mhci endpoint. As
# written, this cell repeats the Class-I binding query. Confirm the intended
# endpoint, and the columns it returns, before relying on these results.
url = 'http://tools-cluster-interface.iedb.org/tools_api/processing/'

# Methods whose rows carry two extra fields (indices 6 and 7) that are dropped.
methods_with_core = ('netmhcpan_ba', 'netmhcpan_el', 'recommended')

# Rows are gathered in a plain list and converted to a DataFrame once at the end,
# instead of enlarging the frame with one .loc assignment per row. The list stays
# available if the loop is interrupted part-way.
test_mhcp_rows = []

for method in methods:
    for allele in alleles:
        # HLA-A*01:01 does not work with comblib_sidney2008
        if allele == 'HLA-A*01:01' and method == 'comblib_sidney2008':
            continue
        res = get_immunogenicity_df(method, converted_test_seqs, allele, '9')
        skipped = 0
        # The first line is skipped (treated as the header). Blank lines are ignored
        # rather than unconditionally dropping the last line, so a reply without a
        # trailing newline no longer loses its final data row.
        for line in res.split('\n')[1:]:
            if not line.strip():
                continue
            part = line.split('\t')
            if method in methods_with_core:
                part = part[0:6] + part[8:]
            part.append(method)
            # Rows that do not fit col_names are left out, as before, but are now
            # counted and reported once per request instead of printing one
            # exception message per row.
            if len(part) != len(col_names):
                skipped += 1
                continue
            test_mhcp_rows.append(part)
        if skipped:
            print(method + ' / ' + allele + ': skipped ' + str(skipped)
                  + ' row(s) whose field count does not match col_names')

final_test_results_mhcp = pd.DataFrame(test_mhcp_rows, columns=col_names)
print("Done!")

method=ann&sequence_text=%3Epeptide0%0AGMPPHMLPVL%0A%3Epeptide1%0AGLALLACAGL%0A%3Epeptide2%0ARIAQCFLRV%0A%3Epeptide3%0AALARWLPPV%0A%3Epeptide4%0ATHLMVLCCV%0A%3Epeptide5%0ALLIKKLPRV%0A%3Epeptide6%0ALLDQLIEEV%0A%3Epeptide7%0ALLDQLIEEV%0A%3Epeptide8%0AVLLNAPSEA%0A%3Epeptide9%0AYLLSGSDLFI%0A%3Epeptide10%0ALMIEYNLLT%0A%3Epeptide11%0AGLADGMEHL%0A%3Epeptide12%0AFLGGHVAVA%0A%3Epeptide13%0AFVVPILLKA%0A%3Epeptide14%0ATLACFVLAAV%0A%3Epeptide15%0AVLIAGYIIVF%0A%3Epeptide16%0ATLEDLLMGT%0A%3Epeptide17%0ATLEDLLMGT%0A%3Epeptide18%0ALMAVAILKEV%0A%3Epeptide19%0AGLGQVPLIV%0A%3Epeptide20%0AIMLEALERV%0A%3Epeptide21%0AYLLPEAEEI%0A%3Epeptide22%0AMLGIWFFTL%0A%3Epeptide23%0AGMVKAALEAI%0A%3Epeptide24%0AAAAWYLWEV%0A%3Epeptide25%0AKIRSDNIKKL%0A%3Epeptide26%0ALLIGICVAV%0A%3Epeptide27%0ANLDTLMTYV%0A%3Epeptide28%0ALLDTNYNLFY%0A%3Epeptide29%0AFLAADGHPA%0A%3Epeptide30%0ATLWYRAPEV%0A%3Epeptide31%0ATLWYRAPEV%0A%3Epeptide32%0AYLHPKEYEW%0A%3Epeptide33%0AVLWDYVYQL%0A%3Epeptide34%0AKLKKIKNSL%0A%3Epeptide35%0AKLIANNTRV%0A%3Ep

In [63]:
# Export into CSV: writes final_test_results_mhcp to 'test_mhcp_data.csv' (relative path).
# index=False is not passed, so pandas also writes the row index as the first column.
final_test_results_mhcp.to_csv('test_mhcp_data.csv')

In [64]:
### Train Dataset: MHC Processing Predictions

col_names = ['allele', 'seq_num', 'start', 'end', 'length', 'peptide', 'ic50', 'percentile_rank', 'method']

alleles = ['HLA-A*01:01','HLA-A*02:01']

# Removed 'consensus' as it gives additional results
# Note: 'netmhcpan_ba', 'netmhcpan_el', 'recommended' provide additional 'core' and 'icore' fields
# that are removed below because they are the same as the peptide sequence at this sequence length
methods = ['ann', 'comblib_sidney2008', 'netmhccons', 'netmhcpan_ba', 'netmhcpan_el', 'netmhcstabpan', 'pickpocket', 'recommended', 'smm', 'smmpmbec']

# NOTE(review): this processing URL is never handed to get_immunogenicity_df, which
# is called without a URL below and therefore posts to its own mhci endpoint. As
# written, this cell repeats the Class-I binding query. Confirm the intended
# endpoint, and the columns it returns, before relying on these results.
url = 'http://tools-cluster-interface.iedb.org/tools_api/processing/'

# Methods whose rows carry two extra fields (indices 6 and 7) that are dropped.
methods_with_core = ('netmhcpan_ba', 'netmhcpan_el', 'recommended')

# Rows are gathered in a plain list and converted to a DataFrame once at the end,
# instead of enlarging the frame with one .loc assignment per row. The list stays
# available if the loop is interrupted part-way.
train_mhcp_rows = []

for method in methods:
    for allele in alleles:
        # HLA-A*01:01 does not work with comblib_sidney2008
        if allele == 'HLA-A*01:01' and method == 'comblib_sidney2008':
            continue
        res = get_immunogenicity_df(method, converted_train_seqs, allele, '9')
        skipped = 0
        # The first line is skipped (treated as the header). Blank lines are ignored
        # rather than unconditionally dropping the last line, so a reply without a
        # trailing newline no longer loses its final data row.
        for line in res.split('\n')[1:]:
            if not line.strip():
                continue
            part = line.split('\t')
            if method in methods_with_core:
                part = part[0:6] + part[8:]
            part.append(method)
            # Rows that do not fit col_names are left out, as before, but are now
            # counted and reported once per request instead of printing one
            # exception message per row.
            if len(part) != len(col_names):
                skipped += 1
                continue
            train_mhcp_rows.append(part)
        if skipped:
            print(method + ' / ' + allele + ': skipped ' + str(skipped)
                  + ' row(s) whose field count does not match col_names')

final_train_results_mhcp = pd.DataFrame(train_mhcp_rows, columns=col_names)
print("Done!")

method=ann&sequence_text=%3Epeptide0%0ASLILVSQYT%0A%3Epeptide1%0ALMSTLLIYL%0A%3Epeptide2%0ALLHTDFEQV%0A%3Epeptide3%0ALLHTDFEQV%0A%3Epeptide4%0AMMIDDFGTA%0A%3Epeptide5%0ASLLSGDWVL%0A%3Epeptide6%0AKTLETPEFV%0A%3Epeptide7%0AGLYDGMEHC%0A%3Epeptide8%0ALIIPFIHLI%0A%3Epeptide9%0AVLAFGFALL%0A%3Epeptide10%0ALLVRNSFEV%0A%3Epeptide11%0AVDSIFEQWL%0A%3Epeptide12%0ANELFDSLFPV%0A%3Epeptide13%0AIIALLFALV%0A%3Epeptide14%0AFVLVILARL%0A%3Epeptide15%0ADQVILLNKH%0A%3Epeptide16%0AVLILLLLIYL%0A%3Epeptide17%0AFLSEHPNVTL%0A%3Epeptide18%0AYLESFCEDV%0A%3Epeptide19%0ALMIFISSFL%0A%3Epeptide20%0AFLLVIGACV%0A%3Epeptide21%0AAMAVLYLAL%0A%3Epeptide22%0AAMAGASTSA%0A%3Epeptide23%0AVLAGSVDEL%0A%3Epeptide24%0APGLSISGNL%0A%3Epeptide25%0AILDKVLVHL%0A%3Epeptide26%0AFYLTNDVSF%0A%3Epeptide27%0AFYLTNDVSF%0A%3Epeptide28%0ASLAVVSTQL%0A%3Epeptide29%0ALLAILPYYV%0A%3Epeptide30%0ASLLRSLENV%0A%3Epeptide31%0ALIIPCIHLI%0A%3Epeptide32%0AKLVGKTVKV%0A%3Epeptide33%0AHVLKAVFSR%0A%3Epeptide34%0AVLLSICYLL%0A%3Epeptide35%0AYLGGMSYYC%0A%3Epeptide

In [65]:
# Export into CSV: writes final_train_results_mhcp to 'train_mhcp_data.csv' (relative path).
# index=False is not passed, so pandas also writes the row index as the first column.
final_train_results_mhcp.to_csv('train_mhcp_data.csv')