In [None]:
import os, sys, random, json
import re
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import logomaker as lm
from collections import OrderedDict
from util import *
from tqdm.notebook import tqdm
from venn import venn, generate_petal_labels, draw_venn
from statannot import add_stat_annotation
from scipy.stats.mstats import ttest_rel, ttest_ind
from IPython.display import display

data_dir = '' # MHCfovea's dataframe directory
main_dir = ''

# Dissimilar peptides

In [None]:
def BuildSubSeq(seqs, length):
    """Return every contiguous window of `length` characters from each sequence.

    Args:
        seqs: iterable of sliceable sequences (strings in this notebook).
        length: window size.

    Returns:
        A flat list of windows, in input order (sequence by sequence, left to
        right). Sequences shorter than `length` contribute nothing.
    """
    windows = []
    for sequence in seqs:
        last_start = len(sequence) - length
        windows.extend(sequence[start:start + length] for start in range(last_start + 1))
    return windows

In [None]:
# load data: training hits, validation set and the benchmark (first CSV column is the index)
train_hit_path = '{}/train_hit.csv'.format(data_dir)
valid_path = '{}/valid.csv'.format(data_dir)
benchmark_path = '{}/benchmark.csv'.format(data_dir)

train_hit_df = pd.read_csv(train_hit_path, index_col=0)
valid_df = pd.read_csv(valid_path, index_col=0)
test_df = pd.read_csv(benchmark_path, index_col=0)

# training and validation rows are analysed together as "train"
train_df = pd.concat([train_hit_df, valid_df])

# number of rows per MHC allele in each dataset
train_allele_counts = train_df['mhc'].value_counts()
test_allele_counts = test_df['mhc'].value_counts()

## analyzing similar peptides

In [None]:
# similar peptides in training
#
# A training peptide counts as "similar" when it occurs verbatim as a
# same-length window (substring) of any positive benchmark peptide.
# The remaining ("dissimilar") training peptides are written to JSON.

train_peptides = train_df[train_df['bind']==1]['sequence'].unique()
test_peptides = test_df[test_df['bind']==1]['sequence'].unique()

# group the unique training positives by peptide length
train_peptide_length_dict = dict()
for peptide in train_peptides:
    train_peptide_length_dict.setdefault(len(peptide), []).append(peptide)

leave_train_peptides = list()
for length, peptides in train_peptide_length_dict.items():
    # all length-`length` windows of the benchmark positives
    comp_peptides = set(BuildSubSeq(test_peptides, length))
    # list filtering (instead of set difference) keeps the output order
    # deterministic from run to run; `peptides` is already unique
    leave_train_peptides += [peptide for peptide in peptides if peptide not in comp_peptides]

outfile = '{}/dissimilar_train_peptides.json'.format(main_dir)
# context manager so the file is flushed and closed deterministically
with open(outfile, 'w') as f:
    json.dump(leave_train_peptides, f)

print("Peptide Number of Training Dataset")
# count positives only, consistent with the two numbers below and with the
# benchmark cell (train_df may also contain bind==0 rows)
print("Origin training: ", train_df[train_df['bind']==1].shape[0])
print("After removing duplicates: ", len(train_peptides))
print("After removing similar peptide set: ", len(leave_train_peptides))

In [None]:
# similar peptides in benchmark
#
# A positive benchmark peptide counts as "similar" when it occurs verbatim as
# a same-length window (substring) of any positive training peptide.
# The remaining ("dissimilar") benchmark peptides are written to JSON.

train_peptides = train_df[train_df['bind']==1]['sequence'].unique()
test_peptides = test_df[test_df['bind']==1]['sequence'].unique()

# group the unique benchmark positives by peptide length
test_peptide_length_dict = dict()
for peptide in test_peptides:
    test_peptide_length_dict.setdefault(len(peptide), []).append(peptide)

leave_test_peptides = list()
for length, peptides in test_peptide_length_dict.items():
    # all length-`length` windows of the training positives
    comp_peptides = set(BuildSubSeq(train_peptides, length))
    # list filtering (instead of set difference) keeps the output order
    # deterministic from run to run; `peptides` is already unique
    leave_test_peptides += [peptide for peptide in peptides if peptide not in comp_peptides]

outfile = '{}/dissimilar_test_peptides.json'.format(main_dir)
# context manager so the file is flushed and closed deterministically
with open(outfile, 'w') as f:
    json.dump(leave_test_peptides, f)

print("Peptide Number of Testing Dataset")
print("Origin testing: ", test_df[test_df['bind']==1].shape[0])
print("After removing duplicates: ", len(test_peptides))
print("After removing similar peptide set: ", len(leave_test_peptides))

In [None]:
# similar decoys in benchmark
#
# A benchmark decoy counts as "similar" when it occurs verbatim as a
# same-length window (substring) of any training decoy.
# The remaining ("dissimilar") benchmark decoys are written to JSON.

# train_decoy: files train_decoy_1.csv ... train_decoy_90.csv plus the
# bind==0 rows already present in train_df
train_decoy_df_list = [
    pd.read_csv("{}/train_decoy_{}.csv".format(data_dir, i+1), index_col=0)
    for i in range(90)
]
train_decoy_df_list.append(train_df[train_df['bind']==0])
train_decoy_df = pd.concat(train_decoy_df_list)

train_decoys = train_decoy_df['sequence'].unique()
test_decoys = test_df[test_df['bind']==0]['sequence'].unique()

# group the unique benchmark decoys by length
test_decoy_length_dict = dict()
for decoy in test_decoys:
    test_decoy_length_dict.setdefault(len(decoy), []).append(decoy)

leave_test_decoys = list()
for length, decoys in test_decoy_length_dict.items():
    # all length-`length` windows of the training decoys
    comp_decoys = set(BuildSubSeq(train_decoys, length))
    # list filtering (instead of set difference) keeps the output order
    # deterministic from run to run; `decoys` is already unique
    leave_test_decoys += [decoy for decoy in decoys if decoy not in comp_decoys]

outfile = '{}/dissimilar_test_decoys.json'.format(main_dir)
# context manager so the file is flushed and closed deterministically
with open(outfile, 'w') as f:
    json.dump(leave_test_decoys, f)

print("Decoy Number of Testing Dataset")
print("Origin testing: ", test_df[test_df['bind']==0].shape[0])
print("After removing duplicates: ", len(test_decoys))
print("After removing similar decoy set: ", len(leave_test_decoys))

## separating the benchmark into similar and dissimilar peptides

In [None]:
# add peptide tags
#
# Every benchmark row gets a `peptide_tag` ('similar' / 'dissimilar') and an
# `allele_tag` ('observed' / 'unobserved'); the tagged frame is then saved.
# NOTE(review): this writes test_df to benchmark_prediction.csv, which later
# cells read expecting predictor score columns - confirm test_df carries them.

save_file = '{}/benchmark_prediction.csv'.format(main_dir)

# context managers so the JSON files are closed right after reading
with open('{}/dissimilar_test_peptides.json'.format(main_dir)) as f:
    leave_test_peptides = json.load(f)
with open('{}/dissimilar_test_decoys.json'.format(main_dir)) as f:
    leave_test_decoys = json.load(f)

# default is 'similar'; rows whose sequence is in the matching dissimilar list
# (positives vs. decoys are looked up separately) are switched to 'dissimilar'
test_df['peptide_tag'] = 'similar'
test_df.loc[(test_df['bind']==1) & (test_df['sequence'].isin(leave_test_peptides)), 'peptide_tag'] = 'dissimilar'
test_df.loc[(test_df['bind']==0) & (test_df['sequence'].isin(leave_test_decoys)), 'peptide_tag'] = 'dissimilar'

# Alternative list kept for reference (previously a dead string literal):
# common unobserved alleles
# unobserved_alleles = ['C*03:02', 'A*24:07', 'A*36:01', 'B*38:02', 'C*04:03', 'A*34:02',
#                       'C*14:03', 'B*35:07', 'B*07:04', 'A*34:01', 'B*40:06']

# presumably alleles absent from the training data - verify against train_allele_counts
unobserved_alleles = ['A*24:07', 'A*33:03', 'A*34:01', 'A*34:02', 'A*36:01', 'B*07:04', 'B*15:10', 'B*35:07',
                      'B*38:02', 'B*40:06', 'B*55:01', 'B*55:02', 'C*03:02', 'C*04:03', 'C*08:01', 'C*14:03']

test_df['allele_tag'] = 'observed'
test_df.loc[test_df['mhc'].isin(unobserved_alleles), 'allele_tag'] = 'unobserved'

test_df.to_csv(save_file)

In [None]:
# load data

def FixPosNegRatio(df, ratio):
    """Down-sample `df` so that it holds `ratio` negatives per positive.

    Rows are split on the `bind` column (1 = positive, 0 = negative). If there
    are more than `pos * ratio` negatives, all positives are kept and
    `int(pos * ratio)` negatives are sampled; otherwise all negatives are kept
    and `int(neg / ratio)` positives are sampled. Sampling uses a fixed
    `random_state=0`, so the result is reproducible.

    Args:
        df: DataFrame with a `bind` column.
        ratio: desired negatives-per-positive ratio.

    Returns:
        A new DataFrame (sampled positives followed by sampled negatives) with
        a fresh 0..n-1 index.
    """
    positives = df[df['bind'] == 1]
    negatives = df[df['bind'] == 0]
    n_pos, n_neg = positives.shape[0], negatives.shape[0]

    if n_neg > n_pos * ratio:
        # negatives are in excess: trim them
        keep_pos, keep_neg = n_pos, int(n_pos * ratio)
    else:
        # positives are in excess (or exactly balanced): trim them
        keep_pos, keep_neg = int(n_neg / ratio), n_neg

    sampled = [
        positives.sample(n=keep_pos, random_state=0),
        negatives.sample(n=keep_neg, random_state=0),
    ]
    return pd.concat(sampled, ignore_index=True)


# tagged benchmark with the predictor scores
test_pred_file = '{}/benchmark_prediction.csv'.format(main_dir)
test_pred_df = pd.read_csv(test_pred_file, index_col=0)

# non-null counts per (bind, peptide_tag, allele_tag) group
tag_counts = test_pred_df.groupby(['bind', 'peptide_tag', 'allele_tag']).count()
display(tag_counts)

# predictors
tools = ['NetMHCpan4.1', 'MHCflurry2.0', 'MixMHCpred2.1', 'MHCfovea']

# keep only rows that have a MixMHCpred2.1 score
has_mixmhcpred = test_pred_df['MixMHCpred2.1'].notna()
test_pred_df = test_pred_df[has_mixmhcpred]

In [None]:
# performance
#
# Metrics for every predictor on each (peptide_tag, allele_tag) subset of the
# benchmark. Every subset except ('all', 'all') is re-sampled with
# FixPosNegRatio so that it has the same negative:positive ratio as the whole
# (filtered) benchmark.

ratio = test_pred_df[test_pred_df['bind']==0].shape[0] / test_pred_df[test_pred_df['bind']==1].shape[0]

metric_names = ['AUC', 'AUC0.1', 'AP', 'PPV']
perform_dict_list = list()

for peptide_tag in ['all', 'similar', 'dissimilar']:
    for allele_tag in ['all', 'observed', 'unobserved']:
        # 'all' means "do not filter on this tag"
        temp_df = test_pred_df
        if peptide_tag != 'all':
            temp_df = temp_df[temp_df['peptide_tag']==peptide_tag]
        if allele_tag != 'all':
            temp_df = temp_df[temp_df['allele_tag']==allele_tag]

        # fix ratio
        if not ((peptide_tag == 'all') and (allele_tag == 'all')):
            temp_df = FixPosNegRatio(temp_df, ratio)

        # class counts are the same for every predictor, so compute them once
        temp_y = temp_df['bind'].to_numpy()
        pos_num = temp_df[temp_df['bind']==1].shape[0]
        neg_num = temp_df[temp_df['bind']==0].shape[0]

        # performance
        for tool in tools:
            temp_metrics = CalculateMetrics(temp_y, temp_df[tool].to_numpy())

            # add to dict list
            perform_dict_list.append({
                'positive_num': pos_num,
                'negative_num': neg_num,
                'ratio': neg_num / pos_num,
                'peptide_tag': peptide_tag,
                'allele_tag': allele_tag,
                'predictor': tool,
                **{name: temp_metrics[name] for name in metric_names},
            })

perform_df = pd.DataFrame(perform_dict_list)
perform_df.to_csv('{}/test_perform_by_groups.csv'.format(main_dir))
display(perform_df)

In [None]:
# reload the per-group performance table saved by the previous cell
perform_file = '{}/test_perform_by_groups.csv'.format(main_dir)
perform_df = pd.read_csv(perform_file, index_col=0)
display(perform_df)

In [None]:
# distribution of peptides with different tags in benchmark
#
# 'Peptide Tag' combines the label (bind) and the similarity tag, e.g.
# 'Positive similar'. It is built as a string column in one step rather than
# by filling a float NaN column with strings, which newer pandas flags as an
# incompatible-dtype assignment. Rows with any other bind / peptide_tag value
# stay missing.

bind_label = test_pred_df['bind'].map({1: 'Positive', 0: 'Negative'})
tag_label = test_pred_df['peptide_tag'].where(test_pred_df['peptide_tag'].isin(['similar', 'dissimilar']))

# .assign returns a new frame, so no column is set on a filtered slice
test_pred_df = test_pred_df.assign(**{'Peptide Tag': bind_label + ' ' + tag_label})

fig, ax = plt.subplots(1, 1, figsize=(4, 3), dpi=600)
# common_norm=False: each tag's histogram is normalised on its own
sns.histplot(data=test_pred_df, hue='Peptide Tag', x='MHCfovea', ax=ax,
             stat='probability', binwidth=0.02, common_norm=False, element='step')

fig.tight_layout()
fig.savefig('{}/benchmark_peptide_dist.png'.format(main_dir))

In [None]:
# plot for comparison of performance

## add tags
# Only the four specific (allele_tag, peptide_tag) combinations get a tag;
# rows where either tag is 'all' stay missing and are dropped below.
# The column is built as strings in one step instead of filling a float NaN
# column with strings (an incompatible-dtype assignment in newer pandas).
tagged = (perform_df['peptide_tag'].isin(['similar', 'dissimilar'])
          & perform_df['allele_tag'].isin(['observed', 'unobserved']))
perform_df['tag'] = (perform_df['allele_tag'] + ' alleles - '
                     + perform_df['peptide_tag'] + ' peptides').where(tagged)

temp_df = perform_df[~perform_df['tag'].isna()]
temp_df = temp_df.sort_values(by='tag', ascending=False)

## plot by metrics
# metric -> baseline value at which the y axis starts
metric_to_value = {'AUC': 0.9, 'AP':0.7}
for metric, value in metric_to_value.items():
    # bars are drawn from 0, so shift the metric by its baseline and relabel
    # the ticks below with the true (unshifted) values
    temp_df[metric] = temp_df[metric] - value

    fig, ax = plt.subplots(1,1, figsize=(8, 4), dpi=600)
    order = ['MHCfovea', 'NetMHCpan4.1', 'MHCflurry2.0', 'MixMHCpred2.1']
    sns.barplot(data=temp_df, x='predictor', hue='tag', y=metric, ax=ax, order=order, palette='muted')

    # ten ticks spanning [value, 1), labelled with the original metric scale
    ax.set_yticks([i*(1-value)/10 for i in range(10)])
    ax.set_yticklabels([str(np.round(i*(1-value)/10 + value, 2)) for i in range(10)])
    ax.set_xlabel("")
    ax.legend(bbox_to_anchor=(0, 1), loc='lower left', ncol=2)

    fig.tight_layout()
    fig.savefig('{}/test_{}_by_group.png'.format(main_dir, metric.lower()))

# Negative prediction

In [None]:
# load data (note: this rebinds train_df / test_df used by the earlier section)
train_hit_file, test_file = (
    '{}/train_hit.csv'.format(data_dir),
    '{}/test.csv'.format(data_dir),
)

train_df, test_df = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_hit_file, test_file)
)

## Build negatives from training positives

In [None]:
# positive sequences
# Unique training positives, one row per sequence; later cells score them
# against other alleles, hence the "neg" naming.
neg_df = train_df[train_df['bind']==1]
neg_df = neg_df.drop_duplicates(subset='sequence')

# make sure the output directory exists before anything is written into it
os.makedirs('{}/neg'.format(main_dir), exist_ok=True)

# pytorch dataset
# BuildDataset comes from `util`; arguments are passed through unchanged
# (presumably one-hot encoding with a maximum peptide length of 15 - TODO confirm)
neg_dataset = BuildDataset(neg_df, 'onehot', 15, with_label=True)
torch.save(neg_dataset, '{}/neg/neg.pt'.format(main_dir))

neg_df.to_csv('{}/neg/neg.csv'.format(main_dir))

In [None]:
# shell script for prediction
# One bash script per batch of `split_num` alleles; each script runs
# predictor.py on the neg dataset for the alleles of its batch.

alleles = sorted(test_df['mhc'].unique())
split_num = 8

for batch_no, start in enumerate(range(0, len(alleles), split_num), start=1):
    batch_alleles = alleles[start:start + split_num]
    # fragments are concatenated without a separator, so each option carries
    # its own leading space
    script_parts = [
        "#! /bin/bash\n",
        "python3 predictor.py",
        " --mhc_file ../data/MHCI_res182_seq.json",
        " --rank_file ../data/score_rank.csv",
        " --peptide_dataframe {}/neg/neg.csv".format(main_dir),
        " --peptide_dataset {}/neg/neg.pt".format(main_dir),
        " --model_file model.py",
        # the environment variable is expanded by bash when the script runs
        " --model_state_dir ${TRAIN_RESUILT_DIR}/model_state",
        " --output_dir {}/neg/{}".format(main_dir, batch_no),
        " --alleles '{}'".format(','.join(batch_alleles)),
    ]

    with open('{}/neg/run_pred_{}.sh'.format(main_dir, batch_no), 'w') as f:
        f.write(''.join(script_parts))

## Evaluation

### functions

In [None]:
def move_legend(ax, new_loc, **kws):
    """Re-create the legend of `ax` at a new location.

    The handles, labels and title of the current legend (`ax.legend_`) are
    read and passed to a fresh `ax.legend(...)` call.

    Args:
        ax: axes object that already has a legend.
        new_loc: value forwarded as `loc` to `ax.legend`.
        **kws: extra keyword arguments forwarded to `ax.legend`.
    """
    old_legend = ax.legend_
    # Matplotlib 3.7 renamed `legendHandles` to `legend_handles` and later
    # removed the old name; support both so the helper works on either side.
    handles = getattr(old_legend, 'legend_handles', None)
    if handles is None:
        handles = old_legend.legendHandles
    labels = [t.get_text() for t in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
    
    
def get_motif_seqlogo(seqs, sub_motif_len=4):
    """Build a logo matrix from the two ends of each sequence.

    Each sequence is reduced to its first `sub_motif_len` and last
    `sub_motif_len` characters (concatenated), and the trimmed sequences are
    passed to `logomaker.alignment_to_matrix` with `to_type='information'`.

    Args:
        seqs: pandas Series of sequences (`.apply` is called on it).
        sub_motif_len: number of characters kept from each end.

    Returns:
        DataFrame with one row per matrix position and exactly the 20 columns
        'ACDEFGHIKLMNPQRSTVWY' in that order; letters missing from the matrix
        are filled with 0.0 and any other columns are dropped.
    """
    aa_columns = list('ACDEFGHIKLMNPQRSTVWY')
    seqs = seqs.apply(lambda x: x[:sub_motif_len] + x[-sub_motif_len:])
    seqlogo_df = lm.alignment_to_matrix(sequences=seqs, to_type='information', characters_to_ignore='XU.')
    # reindex replaces the former "concat with an empty frame, then select"
    # idiom, whose dtype handling for empty frames is deprecated in pandas
    return seqlogo_df.reindex(columns=aa_columns).fillna(0.0)

    
def motif_plot(seqlogo_df, side, ax, sub_motif_len=4, ylim=4, fontsize=10, title=None, turn_off_label=False):
    """Draw a sequence logo of `seqlogo_df` on `ax`.

    Args:
        seqlogo_df: logo matrix handed to `logomaker.Logo`.
        side: 'N' labels positions 1..sub_motif_len, 'C' labels them
            -sub_motif_len..-1, any other value uses both label runs.
        ax: matplotlib axes to draw on.
        sub_motif_len: number of positions per side used for the tick labels.
        ylim: upper y-axis limit (the lower limit is 0).
        fontsize: font size applied to the title, axis labels and tick labels.
        title: axes title.
        turn_off_label: if True, remove all ticks and tick labels afterwards.
    """
    n_side_labels = list(range(1, sub_motif_len + 1))
    c_side_labels = list(range(-sub_motif_len, 0))
    if side == 'N':
        xticklabels = n_side_labels
    elif side == 'C':
        xticklabels = c_side_labels
    else:  # both
        xticklabels = n_side_labels + c_side_labels

    lm.Logo(seqlogo_df, color_scheme='skylign_protein', ax=ax)

    ax.set_xticks(list(range(len(xticklabels))))
    ax.set_xticklabels(xticklabels)
    ax.set_ylim(0, ylim)
    ax.set_title(title)

    # one font size for every text element of the axes
    text_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
    text_items += ax.get_xticklabels()
    text_items += ax.get_yticklabels()
    for text_item in text_items:
        text_item.set_fontsize(fontsize)

    if turn_off_label:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xticklabels([])
        ax.set_yticklabels([])

### loading data

In [None]:
# benchmark predictions and the per-batch predictions for the neg dataset
pred_test_df = pd.read_csv('{}/benchmark_prediction.csv'.format(main_dir), index_col=0)

pred_neg_df = pd.read_csv('{}/neg/neg.csv'.format(main_dir), index_col=0)

# Every sub-directory of <main_dir>/neg is one prediction batch; its
# prediction.csv is left-joined onto the neg peptides by sequence.
# sorted() makes the merge (and therefore column) order deterministic.
neg_dir = '{}/neg'.format(main_dir)
for i in sorted(os.listdir(neg_dir)):
    batch_dir = '{}/{}'.format(neg_dir, i)
    # fixed: the path was previously built from an undefined name `c`
    if os.path.isdir(batch_dir):
        temp_df = pd.read_csv('{}/prediction.csv'.format(batch_dir), index_col=0)
        pred_neg_df = pred_neg_df.merge(temp_df, on='sequence', how='left')

# fixed: was `preg_neg_df`, which is not defined
display(pred_neg_df)

### the distribution of all peptides from different sources

In [None]:
# build df
#
# Long-format table (mhc, sequence, source, score) with three sources:
#   * every neg-dataset peptide scored against every allele column other
#     than the row's own `mhc`
#   * benchmark positives (MHCfovea score)
#   * benchmark negatives (MHCfovea score)
# Pieces are collected in a list and concatenated once, instead of growing a
# DataFrame inside the loop; .loc/.assign avoid setting columns on slices.

score_columns = ['mhc', 'sequence', 'source', 'score']
score_frames = list()

for allele in pred_neg_df.columns:
    # only allele-named columns (e.g. 'A*01:01') hold prediction scores
    if not re.match(r'[ABC]\*[0-9]+\:[0-9]+', allele):
        continue
    # drop rows whose own allele is this column's allele
    temp_df = pred_neg_df.loc[pred_neg_df['mhc'] != allele, ['mhc', 'sequence', allele]]
    temp_df = temp_df.rename(columns={allele: 'score'})
    # NOTE(review): 'aritificial' looks like a typo of 'artificial'; kept
    # unchanged because the label is displayed as-is in the plot legend
    temp_df = temp_df.assign(source='aritificial dataset')
    score_frames.append(temp_df[score_columns])

for bind_value, source_label in [(1, 'positives in the benchmark'),
                                 (0, 'negatives in the benchmark')]:
    temp_df = pred_test_df.loc[pred_test_df['bind']==bind_value, ['mhc', 'sequence', 'MHCfovea']]
    temp_df = temp_df.rename(columns={'MHCfovea': 'score'}).assign(source=source_label)
    score_frames.append(temp_df[score_columns])

score_df = pd.concat(score_frames, axis=0)

display(score_df)

In [None]:
# plot: score distribution per source, each source normalised on its own
hist_kwargs = dict(stat='probability', binwidth=0.02, common_norm=False, element='step')

fig, ax = plt.subplots(1, 1, figsize=(5, 3), dpi=600)
sns.histplot(data=score_df, x='score', hue='source', ax=ax, **hist_kwargs)

fig.tight_layout()
out_png = '{}/benchmark_peptide_dist_with_neg.png'.format(main_dir)
fig.savefig(out_png)