In [18]:
import pandas as pd

In [19]:
# Path of the allele list file: one allele name per line. Names are expected to
# carry a "*" (e.g. "HLA-A*01:01"), which is removed for the netMHCpan spelling.
HLA_dir = "../sequences/HLA_I_set_unique.txt"
HLA_list = [] # for netMHCpan
HLA_list_stars = [] # for pop coverage analysis

# The handle is not called `input`, so the builtin of that name stays usable.
with open(HLA_dir, 'r') as hla_file:
    for line in hla_file:
        # strip() also removes "\r" and stray spaces, not just the newline.
        allele = line.strip()
        if not allele:
            # A blank (e.g. trailing) line would otherwise become an empty allele name.
            continue
        HLA_list_stars.append(allele)
        HLA_list.append(allele.replace("*", ""))

In [20]:
import os

# Protein name used by the cells below to build per-protein paths and to call fetch_files(prot).
prot = "NS1"

In [21]:
import glob

def fetch_files(protein):
    """Locate the netMHCpan input files of one protein.

    Globs ``../netmhc_binding_inputs/<protein>/`` for files whose names start
    with ``<protein>_conserved`` and ``<protein>_exp``. The peptide length of
    each file is the second-to-last ``_``-separated token of its path
    (``NS1_exp_10_mers.fa`` -> ``"10"``).

    Parameters
    ----------
    protein : str
        Protein name; used both as directory name and as file-name prefix.

    Returns
    -------
    tuple
        ``(conserved_epitopes, known_mers, exp_epitopes, exp_mers)``: two lists
        of file paths, each followed by the list of peptide lengths (as
        strings) in the same order as the paths.
    """
    # Use the argument, not a notebook-level variable, so the result always
    # matches the protein that was asked for.
    input_dir = "../netmhc_binding_inputs/{}".format(protein)

    # glob returns matches in arbitrary order; sorting makes runs reproducible.
    conserved_epitopes = sorted(glob.glob("{}/{}_conserved*".format(input_dir, protein)))
    exp_epitopes = sorted(glob.glob("{}/{}_exp*".format(input_dir, protein)))

    exp_mers = [path.split("_")[-2] for path in exp_epitopes]
    known_mers = [path.split("_")[-2] for path in conserved_epitopes]

    return conserved_epitopes, known_mers, exp_epitopes, exp_mers

# Show which peptide lengths have conserved / experimental input files for `prot`.
fetched = fetch_files(prot)
kmers, emers = fetched[1], fetched[3]

for mer_lengths in (kmers, emers):
    print(mer_lengths)

['9']
['10', '11', '9']


In [22]:
def run_mhcbinding(prot, file_path, epitopes, status, HLA, mers,
                   netMHCpan="/mnt/c/Users/jhlee/Desktop/Thesis/works/models/netMHCpan-4.1/netMHCpan"):
    """Run netMHCpan once per peptide length for a single HLA allele.

    For every ``(epitopes[i], mers[i])`` pair the tool is called with
    ``-f <input> -xls -a <HLA> -l <mer> -xlsfile <output>`` and writes
    ``<file_path>/<prot>_<status>_<mer>_mers_<HLA with ':' -> '_'>.xls``.

    Parameters
    ----------
    prot : str
        Protein name, used in the output file name.
    file_path : str
        Existing directory that receives the ``.xls`` dumps.
    epitopes : list of str
        Input files, aligned with ``mers``.
    status : str
        Label placed in the output file name (e.g. ``'experimental'``).
    HLA : str
        Allele name as passed to ``-a`` (e.g. ``"HLA-A01:01"``).
    mers : list
        Peptide lengths, one per entry of ``epitopes``.
    netMHCpan : str, optional
        Path of the netMHCpan executable; defaults to the original local install.

    Raises
    ------
    ValueError
        If there are fewer input files than peptide lengths.
    """
    import subprocess
    import sys

    if len(epitopes) < len(mers):
        raise ValueError(
            "got {} input file(s) for {} peptide length(s)".format(len(epitopes), len(mers))
        )

    hla_name = HLA.replace(":", "_")

    for epitope_file, mer in zip(epitopes, mers):
        xlsout = "{}/{}_{}_{}_mers_{}.xls".format(file_path, prot, status, mer, hla_name)

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact, unlike a formatted command string.
        command = [netMHCpan, "-f", epitope_file, "-xls", "-a", HLA,
                   "-l", str(mer), "-xlsfile", xlsout]

        try:
            completed = subprocess.run(command)
        except OSError as err:
            # Executable missing or not runnable: report once and stop, the
            # remaining lengths would fail the same way.
            print("could not start netMHCpan ({}): {}".format(netMHCpan, err), file=sys.stderr)
            return

        if completed.returncode != 0:
            # Keep going (best effort) but make the failure visible.
            print(
                "netMHCpan exited with status {} for {}".format(completed.returncode, xlsout),
                file=sys.stderr,
            )

In [38]:
# Run netMHCpan on the experimental epitope files of `prot` for every allele.
file_path = "../netmhcpan_binding_result/{}".format(prot)

# makedirs also creates a missing parent directory and tolerates an existing one.
os.makedirs(file_path, exist_ok=True)

_, _, exp_epitopes, exp_mers = fetch_files(prot)

for hla in HLA_list:
    run_mhcbinding(prot, file_path, exp_epitopes, 'experimental', hla, exp_mers)

# /mnt/c/Users/jhlee/Desktop/Thesis/works/models/netMHCpan-4.1/Linux_x86_64/bin/netMHCpan -f ../netmhc_binding_inputs/NS1/NS1_exp_10_mers.fa -xls -a HLA-A01:01 -l 10 -xlsfile ../netmhcpan_binding_result/NS1/NS1_experimental_10_mers_HLA-A01_01.xls
# Tue Nov 23 13:51:17 2021
# User: juhlee
# PWD : /mnt/c/Users/jhlee/Desktop/Thesis/works/src
# Host: Linux JuHyung 4.4.0-22000-Microsoft x86_64
# -f       ../netmhc_binding_inputs/NS1/NS1_exp_10_mers.fa File name with input
# -xls     1                    Save output to xls file
# -a       HLA-A01:01           MHC allele
# -l       10                   Peptide length [8-11] (multiple length with ,)
# -xlsfile ../netmhcpan_binding_result/NS1/NS1_experimental_10_mers_HLA-A01_01.xls Filename for xls dump
# Command line parameters set to:
#	[-rdir filename]     /mnt/c/Users/jhlee/Desktop/Thesis/works/models/netMHCpan-4.1/Linux_x86_64 Home directory for NetMHpan
#	[-syn filename]      /mnt/c/Users/jhlee/Desktop/Thesis/works/models/netMHCpan-4.1/Li

In [11]:
# Run netMHCpan on the conserved epitope files of `prot` for every allele.
# NOTE(review): netmhcpan_summarize reads its 'conserved' results from
# ../netmhcpan_binding_result/<protein>, not from this directory — confirm
# which location is intended.
file_path = "../netmhcpan_tool_final_result/{}".format(prot)

# makedirs also creates a missing parent directory and tolerates an existing one.
os.makedirs(file_path, exist_ok=True)

# Fetch the inputs here instead of relying on variables left over from another cell.
conserved_epitopes, known_mers, _, _ = fetch_files(prot)

for hla in HLA_list:
    run_mhcbinding(prot, file_path, conserved_epitopes, 'conserved', hla, known_mers)

In [6]:
# Display the last seven allele names (netMHCpan spelling, without "*").
HLA_list[-7:]

['HLA-B58:01',
 'HLA-C04:01',
 'HLA-C05:01',
 'HLA-C07:02',
 'HLA-C12:03',
 'HLA-C15:02',
 'HLA-C16:01']

In [23]:
import io
from xlwt import Workbook
from __future__ import unicode_literals

def recover_xls(filename):
    """Rewrite a tab-separated text dump as a real ``.xls`` workbook, in place.

    The file is read as text, every line is split on tabs, and the cells are
    written (as strings) to a sheet named ``"Sheet1"`` of a new workbook that
    is saved under the same ``filename``.

    A file that already starts with the OLE2 compound-document signature (the
    container format of binary ``.xls`` workbooks) is left untouched, so
    calling this twice on the same file cannot overwrite a converted workbook.

    Parameters
    ----------
    filename : str
        Path of the file to convert; it is overwritten.
    """
    ole2_magic = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
    with io.open(filename, "rb") as raw_file:
        if raw_file.read(len(ole2_magic)) == ole2_magic:
            # Already a binary workbook: nothing to recover.
            return

    # Read everything and close the handle before the file is overwritten below.
    with io.open(filename, "r") as text_file:
        rows = text_file.readlines()

    xldoc = Workbook()
    sheet = xldoc.add_sheet("Sheet1", cell_overwrite_ok=True)
    for i, row in enumerate(rows):
        # Drop the trailing newline kept by readlines(), then split into cells.
        for j, val in enumerate(row.rstrip('\n').split('\t')):
            sheet.write(i, j, val)

    xldoc.save(filename)

In [24]:
def netmhcpan_summarize(protein, status, mers, hla_list, sb_threshold=0.5):
    """Collect the binding rows of all netMHCpan result files of one protein.

    For every allele in ``hla_list`` and every length in ``mers`` the file
    ``../netmhcpan_binding_result/<protein>/<protein>_<status>_<mer>_mers_<allele>.xls``
    (``:`` in the allele replaced by ``_``) is read. Rows whose ``NB`` value is
    ``"1"`` are kept, an ``HLA`` column (allele with ``*`` re-inserted) and a
    ``Binding`` column (``"SB"`` / ``"WB"``) are added, and the ``Ave`` and
    ``NB`` columns are removed.

    Parameters
    ----------
    protein : str
        Protein name (directory and file-name prefix).
    status : str
        Label in the result file names (e.g. ``'conserved'``, ``'experimental'``).
    mers : list
        Peptide lengths to read.
    hla_list : list of str
        Allele names without ``*`` (e.g. ``"HLA-A01:01"``).
    sb_threshold : float, optional
        ``EL_Rank`` at or below which a binder is labelled ``"SB"``; others
        are labelled ``"WB"``.

    Returns
    -------
    pandas.DataFrame
        All binding rows sorted by ``Peptide``; an empty frame when there is
        nothing to read or nothing binds.
    """
    # Path to netMHCpan results
    results_path = "../netmhcpan_binding_result/{}".format(protein)

    all_results = []

    for hla in hla_list:

        hla_tables = []

        for mer in mers:
            xls_path = "{}/{}_{}_{}_mers_{}.xls".format(
                results_path, protein, status, mer, hla.replace(":", "_")
            )

            # The file written by netMHCpan is not readable as a workbook until
            # it has been converted. The conversion is expected to fail on files
            # that were already converted, so a failure here is deliberately
            # ignored and the read below decides whether the file is usable.
            try:
                recover_xls(xls_path)
            except Exception:
                pass

            table = pd.ExcelFile(xls_path).parse('Sheet1')

            # The real column names are in the first data row of the sheet.
            col_names = table.iloc[0, :]
            table = table.iloc[1:, :].copy()
            table.columns = col_names
            table.pop("Ave")

            hla_tables.append(table)

        if not hla_tables:
            # No peptide lengths requested: nothing to concatenate for this allele.
            continue

        hla_results = pd.concat(hla_tables)

        # NB == "1" marks a binder (weak or strong, decided by EL_Rank below).
        # Cells may carry surrounding whitespace, hence the strip.
        is_binder = (hla_results["NB"].astype(str).str.strip() == "1").values
        binding_results = hla_results.loc[is_binder].copy()

        # HLA-A02:01 -> HLA-A*02:01: re-insert the star to match the allele
        # format used for the population coverage analysis.
        hla_star = hla[:5] + "*" + hla[5:]
        binding_results.insert(5, "HLA", [hla_star] * len(binding_results))

        binding_strength = [
            "SB" if float(value) <= sb_threshold else "WB"
            for value in binding_results["EL_Rank"]
        ]
        binding_results.insert(6, "Binding", binding_strength)
        binding_results.pop("NB")

        all_results.append(binding_results)

    if not all_results:
        # pd.concat refuses an empty list; callers only check `.empty`.
        return pd.DataFrame()

    return pd.concat(all_results).sort_values('Peptide')

In [25]:
def immunogenicity_result(protein, status):
    """Rank the binding peptides of one protein by their neoIM score.

    Reads ``../MHC_neoIM_summary/<protein>_<status>_epitopes_summary.csv``
    (columns ``Peptide`` and ``ID`` are used) and
    ``../Ju_results2/<protein>_neoIM.csv`` (columns ``epitope`` and
    ``neoIM_score`` are used).

    Parameters
    ----------
    protein : str
        Protein name used in both file names.
    status : str
        Status label used in the summary file name (e.g. ``'cons'``, ``'exp'``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct peptide with columns ``epitope``, ``ID`` (taken
        from the first summary row of that peptide) and ``neoIM_score`` (first
        matching score), sorted by score in descending order. Peptides without
        a score get NaN and are placed last.
    """
    netmhc_summary = pd.read_csv("../MHC_neoIM_summary/{}_{}_epitopes_summary.csv".format(protein, status))

    # Immunogenicity result for corresponding proteins
    neoIM_file = '../Ju_results2/{}_neoIM.csv'.format(protein)
    neoIM = pd.read_csv(neoIM_file)

    # First occurrence per peptide, in file order (deterministic, unlike set()).
    peptides = netmhc_summary.drop_duplicates("Peptide")

    # Lookup table epitope -> first score listed for it.
    score_by_epitope = neoIM.drop_duplicates("epitope").set_index("epitope")["neoIM_score"]

    # Missing scores become NaN rather than a "-" string: a column mixing
    # strings and floats cannot be sorted.
    immuno_pd = pd.DataFrame({
        "epitope": peptides["Peptide"].tolist(),
        "ID": peptides["ID"].tolist(),
        "neoIM_score": peptides["Peptide"].map(score_by_epitope).tolist(),
    })

    # A stable sort keeps file order among equal scores.
    return immuno_pd.sort_values("neoIM_score", ascending=False, kind="mergesort")

In [26]:
# Spot-check: display the score-ranked peptide table for PB1 / 'exp'.
immunogenicity_result("PB1", 'exp')

Unnamed: 0,epitope,ID,neoIM_score
0,FNMLSTVLGV,cand_82_o,0.98385
37,GMFNMLSTVL,cand_99_o,0.98105
62,NMLSTVLGV,cand_104_o,0.97295
24,GMFNMLSTV,cand_107_o,0.97220
101,FVANFSMEL,cand_307_o,0.96910
...,...,...,...
32,NTMTKDAER,cand_8,0.32945
21,RNRSILNTS,cand_333,0.32895
8,TKDAERGKL,cand_11,0.32195
9,DAERGKLKR,cand_46,0.31680


In [27]:
proteins = ["HA", "NA", "M1", "M2", "PB1", "PB2", "PA", "NP", "NS1", "NS2"]

# Write one score-ranked CSV per protein and status.
neoim_dir = "../MHC_neoIM_summary/neoIM_summary"
os.makedirs(neoim_dir, exist_ok=True)

# A dedicated loop name keeps the notebook-level `prot` untouched.
for protein_name in proteins:
    for status in ("cons", "exp"):
        try:
            immuno = immunogenicity_result(protein_name, status)
            immuno.to_csv("{}/{}_neoim_{}.csv".format(neoim_dir, protein_name, status), index=False)
        except Exception as err:
            # Best effort: not every protein has both summaries. Report the
            # reason instead of hiding it, then carry on with the next one.
            print("skipped {} / {}: {!r}".format(protein_name, status, err))

In [28]:
def output_for_population_coverage(outfile_path, summary):
    """Write one ``<peptide>.txt`` file per distinct peptide of ``summary``.

    Each file is created in ``outfile_path`` and holds a single line without a
    trailing newline: the peptide followed by the ``HLA`` values of all its
    rows, everything separated by ``", "``. Peptides are processed in sorted
    order.
    """
    peptide_column = summary["Peptide"]

    for peptide in sorted(set(peptide_column.tolist())):
        matching_rows = summary.loc[peptide_column == peptide]
        joined_hlas = ", ".join(matching_rows['HLA'].tolist())

        target = "{folder}/{name}.txt".format(folder=outfile_path, name=peptide)

        with open(target, 'w') as handle:
            handle.write(peptide + ", " + joined_hlas)

In [29]:
def output_for_popcoverage(protein, status, threshold=0.64):
    """Write one file per peptide whose neoIM score reaches ``threshold``.

    Reads ``../MHC_neoIM_summary/<protein>_<status>_epitopes_summary.csv``
    (columns ``Peptide``, ``HLA``) and
    ``../MHC_neoIM_summary/neoIM_summary/<protein>_neoim_<status>.csv``
    (columns ``epitope``, ``neoIM_score``). For every selected peptide the file
    ``../MHC_neoIM_summary/immunogenic_epitopes/<protein>/<status>/<peptide>.txt``
    is written with the peptide, a tab, and its alleles joined by ``", "``.

    Parameters
    ----------
    protein : str
        Protein name used in the file names and the output directory.
    status : str
        Status label used in the file names and the output directory.
    threshold : float, optional
        Minimum ``neoIM_score`` for a peptide to be written.
    """
    netmhc_summary = pd.read_csv("../MHC_neoIM_summary/{}_{}_epitopes_summary.csv".format(protein, status))
    immuno_summary = pd.read_csv("../MHC_neoIM_summary/neoIM_summary/{}_neoim_{}.csv".format(protein, status))

    # One score per epitope: the first one listed.
    scored = immuno_summary.drop_duplicates("epitope")

    # Placeholders such as "-" or empty cells become NaN and never pass the threshold.
    scores = pd.to_numeric(scored["neoIM_score"], errors="coerce")
    immunogenic = scored.loc[(scores >= threshold).values, "epitope"].tolist()

    if not immunogenic:
        # Nothing to write; do not leave empty directories behind.
        return

    status_path = "../MHC_neoIM_summary/immunogenic_epitopes/{}/{}".format(protein, status)
    # Creates every missing level at once and tolerates existing directories.
    os.makedirs(status_path, exist_ok=True)

    for pep in immunogenic:
        hlas = netmhc_summary.loc[netmhc_summary['Peptide'] == pep, 'HLA']
        hlas = ", ".join(hlas.tolist())

        # NOTE(review): alleles are joined with ", " (comma + space) after a tab;
        # confirm this is the layout the population coverage tool expects.
        filename = "{}/{}.txt".format(status_path, pep)
        with open(filename, 'w') as output:
            output.write(pep + '\t' + hlas)

In [30]:
# Write the per-peptide population coverage inputs for every protein and status.
# A dedicated loop name keeps the notebook-level `prot` untouched.
for protein_name in proteins:
    for status in ('cons', 'exp'):
        try:
            output_for_popcoverage(protein_name, status)
        except Exception as err:
            # Best effort: not every protein has both summaries. Report the
            # reason instead of hiding it, then carry on with the next one.
            print("skipped {} / {}: {!r}".format(protein_name, status, err))

In [31]:
prots = ['HA', 'NA', 'M1', 'M2', "PB1", "PB2", "PA", "NP", "NS1"]

import pandas as pd

# The summary CSVs are written here; make sure the directory exists first.
summary_dir = "../MHC_neoIM_summary"
os.makedirs(summary_dir, exist_ok=True)

for prot in prots:
    print(prot)

    # Defining path for outputs
    outfile_prot = "../mhcbinding_text/" + prot
    outfile_cons = "{}/conserved".format(outfile_prot)
    outfile_exp = "{}/experimental".format(outfile_prot)

    # makedirs creates the protein directory (and any missing parent) on the
    # way and tolerates directories that already exist.
    os.makedirs(outfile_cons, exist_ok=True)
    os.makedirs(outfile_exp, exist_ok=True)

    conserved_epitopes, known_mers, exp_epitopes, exp_mers = fetch_files(prot)

    # Summarise only the epitope sets that actually have input files, and
    # write outputs only when at least one peptide binds.
    if len(known_mers) > 0:
        cons_summary = netmhcpan_summarize(prot, 'conserved', known_mers, HLA_list)
        if not cons_summary.empty:
            output_for_population_coverage(outfile_cons, cons_summary)
            cons_summary.to_csv(summary_dir + "/" + prot + "_cons_epitopes_summary.csv", index=False)

    if len(exp_mers) > 0:
        exp_summary = netmhcpan_summarize(prot, 'experimental', exp_mers, HLA_list)
        if not exp_summary.empty:
            output_for_population_coverage(outfile_exp, exp_summary)
            exp_summary.to_csv(summary_dir + "/" + prot + "_exp_epitopes_summary.csv", index=False)

HA
NA
M1
M2
PB1
PB2
PA
NP
NS1


In [35]:
# Run the population coverage script on a single epitope file
# (options as passed: -p Japan, -c I). The cell displays the exit status.
iedb_population = '../population_coverage/calculate_population_coverage.py'
files = "../MHC_neoIM_summary/immunogenic_epitopes/HA/exp/GMVDGWYGF.txt"

japan_command = 'python %s -p Japan -c I -f %s' % (iedb_population, files)
os.system(japan_command)

* No result found! *


0

In [None]:
# Run the population coverage script on the combined HA input file
# (options as passed: -p World, -c I, --plot ../). The cell displays the exit status.
population_coverage = '../population_coverage/calculate_population_coverage.py'

prot = 'HA'
results_path = "../netmhcpan_tool_result/" + prot
input_pop = results_path + "/../" + prot + "_population_input.txt"

pop_analysis = " ".join(
    ["python", population_coverage, "-p World -c I -f", input_pop, "--plot ../"]
)
os.system(pop_analysis)

class I
population/area	coverage	average_hit	pc90
World	93.16%	7.01	2.13
average	93.16%	7.01	2.13
standard_deviation	0.0%	0.0	0.0

population/area	epitope_hits	percent_individuals	cumulative_coverage
World	0	6.84	100.0
World	1	2.17	93.16
World	2	7.46	90.99
World	3	3.9	83.53
World	4	6.71	79.63
World	5	13.14	72.92
World	6	7.95	59.79
World	7	11.42	51.84
World	8	5.84	40.41
World	9	7.73	34.57
World	10	6.6	26.85
World	11	5.49	20.25
World	12	4.4	14.76
World	13	2.83	10.36
World	14	2.54	7.53
World	15	1.46	4.99
World	16	1.3	3.52
World	17	0.82	2.22
World	18	0.57	1.4
World	19	0.31	0.83
World	20	0.22	0.52
World	21	0.13	0.3
World	22	0.08	0.17
World	23	0.05	0.09
World	24	0.02	0.04
World	25	0.01	0.02
World	26	0.0	0.01
World	27	0.0	0.01
World	28	0.0	0.0
World	29	0.0	0.0
World	30	0.0	0.0
World	31	0.0	0.0


* A plot has been generated in '../' directory with <population>_<mhc_class> suffixed.


0

In [None]:
# Display the epitope -> allele table used as population coverage input.
# NOTE(review): `population_coverage_input` is not assigned in any cell visible
# here — confirm where it is defined before re-running on a fresh kernel.
population_coverage_input

Unnamed: 0,epitopes,HLA_alleles
0,ALQLLLEV,HLA-A*02:01
1,EEIRWLIEEV,HLA-B*44:03
2,GMITQFESL,"HLA-A*02:01,HLA-A*32:01"
3,GMITQFESLK,HLA-A*03:01
4,IRWLIEEV,HLA-C*06:02
5,KFEEIRWL,HLA-C*04:01
6,KFEEIRWLI,"HLA-A*32:01,HLA-C*04:01,HLA-A*23:01"
7,LLLEVEQEI,"HLA-C*04:01,HLA-A*32:01,HLA-C*05:01,HLA-A*02:01"
8,LQLLLEVEQEI,HLA-A*02:01
9,MITQFESLK,HLA-A*03:01
