In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import tqdm
import collections
from pandas.plotting import parallel_coordinates
import json
import tqdm

In [3]:
base_df = pd.read_csv("../data/output_files/final_base_df.csv")

varient_list = ["WILDTYPE", "Alpha B.1.1.7",
   "Beta B.1.351",
   "Gamma P.1",
   "Kappa B.1.617.1",
   "Delta B.1.617.2",
   "Lambda C.37",
   "Mu B.1.621",
   "Omicron BA.1",
   "Omicron BA.2",
   "AY.4.2",
   "C.1.2",
   "Eta B.1.525",
   "Iota B.1.526",]
base_df["varient"].replace({"original":"WILDTYPE"}, inplace=True)

In [4]:
changed_peptides = []
for i, row in tqdm.tqdm(base_df.iterrows(), total=len(base_df)):
    try:
        if(row["varient"] == "Omicron BA.1" and row["start_pos"] >= 214):
            tmp = list(base_df[(base_df["varient"] == "original")&(base_df["start_pos"] == row["start_pos"] -3)&(base_df["end_pos"] == row["end_pos"]-3)]["peptide"])[0]
        else: 
            tmp = list(base_df[(base_df["varient"] == "original")&(base_df["start_pos"] == row["start_pos"])&(base_df["end_pos"] == row["end_pos"])]["peptide"])[0]
    except:
        tmp = ""
    if(row["peptide"] != tmp):
        changed_peptides.append(row["peptide"])
        changed_peptides.append(tmp)

100%|██████████| 53094/53094 [09:08<00:00, 96.73it/s] 


In [None]:

filtered_df = base_df[base_df.peptide.isin(changed_peptides)]
filtered_df.to_csv("../data/output_files/filtered_df.csv")

In [None]:
# filtered_df = pd.read_csv("../data/output_files/filtered_df.csv")

In [None]:
MHC_TYPES = [i for i in list(set(filtered_df.columns)) if i.startswith("H")]

In [None]:
def json_txt_to_dict(file_path):
    d = json.load(open(file_path))
    return d
mutation_dict = json_txt_to_dict("../data/vocs.txt")

# Insertions and Deletions

deletions

In [None]:
deletions_pos_list = []
for varient in mutation_dict.keys():
    for mutation in mutation_dict[varient]:
        if(mutation[-1] == "-"):
            deletions_pos_list.append(int(mutation[1:-1]))


In [None]:
deleted_peptides = []
for pos in deletions_pos_list:
    for i,row in base_df[base_df["varient"] == "WILDTYPE"].iterrows():
        start_pos = row["start_pos"]
        end_pos = row["end_pos"]
        if(pos>=start_pos and pos<=end_pos):
            deleted_peptides.append(row["peptide"])

insertions

In [None]:
insertion_pos = 214
insetted_peptides = []
for i,row in base_df[base_df["varient"] == "WILDTYPE"].iterrows():
    start_pos = row["start_pos"]
    end_pos = row["end_pos"]
    if(insertion_pos>=start_pos and insertion_pos<=end_pos):
        insetted_peptides.append(row["peptide"])

plot

In [None]:
binder_inserted_counter = 0
for i,row in base_df[base_df["varient"] == "WILDTYPE"].iterrows():
    start_pos = row["start_pos"]
    end_pos = row["end_pos"]
    if(insertion_pos>=start_pos and insertion_pos<=end_pos):
        for mhc_type in MHC_TYPES:
            if(row[mhc_type]<=2):
                binder_inserted_counter+=1
binder_inserted_counter

In [None]:
binder_deleted_counter = 0
for pos in deletions_pos_list:
    for i,row in base_df[base_df["varient"] == "WILDTYPE"].iterrows():
        start_pos = row["start_pos"]
        end_pos = row["end_pos"]
        if(pos>=start_pos and pos<=end_pos):
            for mhc_type in MHC_TYPES:
                if(row[mhc_type]<=2):
                    binder_deleted_counter+=1
binder_deleted_counter

In [None]:
added_delted_df = pd.DataFrame([["insertions", binder_inserted_counter], ["deletions", binder_deleted_counter]], columns=["type", "count"])

In [None]:
sns.set_theme(style="whitegrid")
sns.set(rc = {'figure.figsize':(25,8)})
sns.set(font_scale = 1.2)
ax = sns.barplot(x="type", y="count", data=added_delted_df)

ax.set_title('Number of binders')
plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plt.savefig("plots/chopped-varient.png", dpi=500, bbox_inches='tight')

# Parallel plot

In [None]:
binder_peptides = set()
for i, row in filtered_df[filtered_df["varient"] == "WILDTYPE"].iterrows():
    for mhc_type in MHC_TYPES:
        if(row[mhc_type] <= 2):
            binder_peptides.add(row["peptide"])

In [None]:
varients_dict = dict([(j,i ) for i, j in enumerate(varient_list)])

parallel_df = filtered_df[filtered_df.peptide.isin(binder_peptides)][MHC_TYPES + ["varient"]]
parallel_df["varient"] = parallel_df["varient"].apply(lambda x: varients_dict[x])

row90 = dict([(i,90) for i in MHC_TYPES] + [("varient", None)])
row0 = dict([(i,0) for i in MHC_TYPES] + [("varient", None)])
parallel_df = parallel_df.append(row90, ignore_index=True)
parallel_df = parallel_df.append(row0, ignore_index=True)

In [None]:
display(parallel_df)

In [None]:
import plotly.express as px

fig = px.parallel_coordinates(parallel_df, color="varient", 
                             color_continuous_scale=px.colors.diverging.Tealrose,
                             color_continuous_midpoint=2, width=1450)
fig.show()
plt.savefig("plots/parallel_plot", dpi=500, bbox_inches='tight')

# Rom

In [None]:
wt_binders_peptides = []
for i, row in base_df[base_df["varient"] == "WILDTYPE"].iterrows():
    for mhc_type in MHC_TYPES:
        if(row[mhc_type] <=2):
           wt_binders_peptides.append({"start_pos":row["start_pos"], "end_pos":row["end_pos"], "mhc_type":mhc_type, "varient":row["varient"], "peptide":row["peptide"]})

In [None]:
new_list = []
for i, row in base_df.iterrows():
    tmp_list = []
    for mhc_type in MHC_TYPES:
        #tmp_list.append(row[mhc_type])    
        new_list.append([row[mhc_type], mhc_type, row["varient"], row["peptide"], row["start_pos"], row["end_pos"]])


cols = ["rank", "mhc_type", "varient", "peptide", "start_pos", "end_pos"]
box_df = pd.DataFrame(new_list, columns=cols)
box_df_omicron = box_df[(box_df["varient"] == "Omicron BA.1") & (box_df["start_pos"] >= 214)]
box_df_without_omicron = box_df[box_df["varient"] != "Omicron BA.1"]

In [None]:
binders_df = pd.DataFrame([], columns = ["rank", "mhc_type", "varient", "peptide"])

for peptide_dict in tqdm.tqdm(wt_binders_peptides):
    for mhc_type in MHC_TYPES:
        interesting_rows_omicron = box_df_omicron[(box_df_omicron["mhc_type"] == peptide_dict["mhc_type"]) & (box_df_omicron["start_pos"] == peptide_dict["start_pos"]+3)& (box_df_omicron["end_pos"] == peptide_dict["end_pos"]+3)]
        interesting_rows = box_df_without_omicron[(box_df_without_omicron["mhc_type"] == peptide_dict["mhc_type"]) & (box_df_without_omicron["start_pos"] == peptide_dict["start_pos"])& (box_df_without_omicron["end_pos"] == peptide_dict["end_pos"])]
    binders_df = pd.concat([binders_df, interesting_rows])
    binders_df = pd.concat([binders_df, interesting_rows_omicron])
    
#binders_df.drop_duplicates(inplace = True)
display(binders_df)

In [None]:
binders_df[binders_df["start_pos"] >= 214]


In [None]:
#{'start_pos': 311, 'end_pos': 319, 'mhc_type': 'HLA-A*03:01_rank', 'varient': 'WILDTYPE', 'peptide': 'GIYQTSNFR'}
box_df_omicron[(box_df["peptide"] == "GIYQTSNFR")] 

In [None]:
labels = varient_list

sns.set_theme(style="whitegrid")

sns.set(font_scale = 1.3)
plt.figure(figsize = (30,10))

ax = sns.boxplot(x="varient", y="rank",
                 data=binders_df, palette="Set3", showfliers=False)

ax.set_yscale("log")
ax = sns.stripplot(x="varient", y="rank",  data=binders_df, ax=ax, palette="Set2", split=True, jitter=0.2, alpha=0.6)

plt.savefig("plots/chopped-rank-boxplot_unique_hla", dpi=500, bbox_inches='tight')