In [None]:
import os, sys, random, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logomaker as lm
from collections import OrderedDict
from util import *
from tqdm.notebook import tqdm
from venn import venn, generate_petal_labels, draw_venn
from statannot import add_stat_annotation
from scipy.stats.mstats import ttest_rel, ttest_ind

# Input locations used by the cells below. All three are left empty here and
# need to be set to real paths before the notebook is run.
mhcfovea_train_file = '' # MHCfovea training table (train_hit); read as CSV with the first column as index
netmhcpan_dir = '' # directory holding the c00{0..4}_ba / c00{0..4}_el files; download from http://www.cbs.dtu.dk/suppl/immunology/NAR_NetMHCpan_NetMHCIIpan/NetMHCpan_train.tar.gz
mhcflurry_train_file = '' # MHCflurry training CSV; download from https://data.mendeley.com/datasets/zx3kjzc3yx , Data S4

## overlap of alleles

In [None]:
# Table indexed by allele name with one column per prediction tool; an allele
# is counted for a tool when that tool's column equals True.
allele_info_file = '../data/allele_info.csv'
allele_info = pd.read_csv(allele_info_file, index_col=0)

# Collect, per tool, the set of alleles flagged True (insertion order is the
# order the sets are handed to the venn plot).
keys = ['MHCfovea', 'NetMHCpan4.1', 'MHCflurry2.0', 'MixMHCpred2.1']
allele_set_dict = OrderedDict()
for tool in keys:
    supported = allele_info[tool].eq(True)
    allele_set_dict[tool] = set(allele_info.loc[supported].index)

# Four-way venn diagram of the allele sets.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4.5, 4.5), dpi=600)
venn(allele_set_dict, ax=ax, fontsize=8)
ax.set_title('Allele number')
fig.tight_layout()

## peptide count

In [None]:
# Count, for each tool's training set, the number of MHC alleles (#MHC),
# positive entries (#Pos) and negative entries (#Neg), and print the summary.
count_dict = dict()

# --- MHCfovea ---------------------------------------------------------------
mhcfovea_train_df = pd.read_csv(mhcfovea_train_file, index_col=0)

allele_num = len(mhcfovea_train_df['mhc'].unique())
# Positives are the rows whose 'source' is 'MS' or 'assay'.
positive_num = int(mhcfovea_train_df['source'].isin(['MS', 'assay']).sum())
# No negatives are counted for this file.
negative_num = 0
count_dict['MHCfovea'] = OrderedDict({'#MHC': allele_num, '#Pos': positive_num, '#Neg': negative_num})


# --- NetMHCpan4.1 -----------------------------------------------------------
# Read the five 'ba' partitions followed by the five 'el' partitions
# (c000_ba ... c004_ba, c000_el ... c004_el). The parts are collected in a
# list and concatenated once instead of growing a DataFrame inside the loop.
netmhcpan_parts = []
for source in ('ba', 'el'):
    for i in range(5):
        part = pd.read_csv(
            '{}/c00{}_{}'.format(netmhcpan_dir, i, source),
            sep=r'\s+',  # raw string: '\s' is an invalid escape in a normal string literal
            names=['sequence', 'score', 'mhc'],
        )
        part['source'] = source
        netmhcpan_parts.append(part)
netmhcpan_train_df = pd.concat(netmhcpan_parts, axis=0, ignore_index=True)

## remove non-human
# Rows are dropped when 'mhc' is exactly one of the listed cell names or
# contains one of the listed allele prefixes (matched literally, not as regex).
non_human_allele = ['BoLA-', 'Eqca-', 'Gogo-', 'H-2-', 'Mamu-', 'Patr-', 'SLA-', 'DLA-']
non_human_cell = ['A10', 'A11-A11', 'A12-A15', 'A14', 'A15-A15', 'A18', 'A19-A19', 'A20-A20', 'EBL']
non_human_mask = netmhcpan_train_df['mhc'].isin(non_human_cell)
for prefix in non_human_allele:
    non_human_mask |= netmhcpan_train_df['mhc'].str.contains(prefix, regex=False)
netmhcpan_train_df = netmhcpan_train_df[~non_human_mask]

## count
# Only names containing 'HLA-' are counted as alleles; other remaining names
# are kept in the frame but do not contribute to #MHC.
is_hla_name = netmhcpan_train_df['mhc'].str.contains('HLA-', regex=False)
allele_num = len(netmhcpan_train_df.loc[is_hla_name, 'mhc'].unique())
# Every 'ba' row counts as positive; an 'el' row counts only when score == 1.
is_positive = (netmhcpan_train_df['source'] == 'ba') | (
    (netmhcpan_train_df['source'] == 'el') & (netmhcpan_train_df['score'] == 1)
)
positive_num = int(is_positive.sum())
negative_num = netmhcpan_train_df.shape[0] - positive_num
count_dict['NetMHCpan4.1'] = OrderedDict({'#MHC': allele_num, '#Pos': positive_num, '#Neg': negative_num})


# --- MHCflurry2.0 -----------------------------------------------------------
mhcflurry_train_df = pd.read_csv(mhcflurry_train_file)
# Keep rows whose allele contains 'HLA-'. The explicit copy makes the column
# assignment below operate on an independent frame (no SettingWithCopyWarning).
mhcflurry_is_hla = mhcflurry_train_df['allele'].str.contains('HLA-', regex=False)
mhcflurry_train_df = mhcflurry_train_df[mhcflurry_is_hla].copy()
# Keep the part between the first and second '-' (the text right after 'HLA-').
mhcflurry_train_df['allele'] = mhcflurry_train_df['allele'].str.split('-').str[1]
allele_num = len(mhcflurry_train_df['allele'].unique())
positive_num = mhcflurry_train_df.shape[0]
# 99 negatives are counted per 'mass_spec' row.
negative_num = int((mhcflurry_train_df['measurement_kind'] == 'mass_spec').sum()) * 99 # 99n
count_dict['MHCflurry2.0'] = OrderedDict({'#MHC': allele_num, '#Pos': positive_num, '#Neg': negative_num})


# print count
for tool, counts in count_dict.items():
    for tag, value in counts.items():
        print('{} of {}: {}'.format(tag, tool, value))

## overlap of peptides

In [None]:
# Build the set of unique (allele, peptide) pairs of each training set and
# draw their overlap. All intermediate results get their own names and
# `netmhcpan_train_df` is left untouched, so re-running this cell gives the
# same result (the previous in-place rewrite of the 'mhc' column made a second
# run filter on 'HLA' against already-stripped names and return an empty set).

def _unique_pairs(df, columns):
    """Return the set of distinct value tuples found in `columns` of `df`.

    Rows with a missing value in any of the columns are skipped, which is
    what the group keys of a default `df.groupby(columns)` would contain.
    """
    return set(df[columns].dropna().itertuples(index=False, name=None))


# MHCfovea: (mhc, sequence)
mhcfovea_train_set = _unique_pairs(mhcfovea_train_df, ['mhc', 'sequence'])

# NetMHCpan4.1: drop 'el' rows with score 0, keep names containing 'HLA'.
netmhcpan_el_negative = (netmhcpan_train_df['source'] == 'el') & (netmhcpan_train_df['score'] == 0)
netmhcpan_is_hla = netmhcpan_train_df['mhc'].str.contains('HLA', regex=False)
netmhcpan_pair_df = netmhcpan_train_df.loc[~netmhcpan_el_negative & netmhcpan_is_hla, ['mhc', 'sequence']].copy()
# Take the text between the first and second '-' and insert '*' after its
# first character (presumably 'HLA-A02:01' -> 'A*02:01'; verify against the
# names used by the other two sets).
netmhcpan_allele = netmhcpan_pair_df['mhc'].str.split('-').str[1]
netmhcpan_pair_df['mhc'] = netmhcpan_allele.str[0] + '*' + netmhcpan_allele.str[1:]
netmhcpan_pair_set = _unique_pairs(netmhcpan_pair_df, ['mhc', 'sequence'])

# MHCflurry2.0: (allele, peptide); uses the 'allele' column as it is in
# `mhcflurry_train_df` when this cell runs.
mhcflurry_pair_set = _unique_pairs(mhcflurry_train_df, ['allele', 'peptide'])

# venn plot
venn_dict = OrderedDict({
    'MHCfovea': mhcfovea_train_set,
    'NetMHCpan4.1': netmhcpan_pair_set,
    'MHCflurry2.0': mhcflurry_pair_set
})

fig, ax = plt.subplots(1, 1, figsize=(4.5, 4.5), dpi=600)
venn(venn_dict, ax=ax, fontsize=8)
ax.set_title('Peptide number')
fig.tight_layout()