# Setup

In [1]:
import os
import random
import re
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2
from tqdm.auto import tqdm

In [2]:
# Config

# --- Main directories -------------------------------------------------------

project_dir = Path("/path/to/carmen-analysis")

data_dir = project_dir / "data"
figures_dir = project_dir / "figures"

# data_pub_dir = data_dir.joinpath("to-be-published")

data_subs_dir = data_dir / "subsidiary-files"

figures_gen_dir = figures_dir / "script-generated"
figures_main_dir = figures_gen_dir / "main-panels"

# Location handed to the ranking script as its output directory and read back
# later when the per-allele peptide sets are built.
random_pep_dir = data_subs_dir / "random-peptides"

# --- Core source files ------------------------------------------------------

# Haplotype frequencies from NMDP Registry Haplotype Frequencies database
freqs_dir = data_dir / "nmdp-hla-frequencies"
freqs_pop_desc_file = freqs_dir / "populations-description.csv"
# Column names as they appear in the populations description file
freqs_pop_desc_col_pop_code = "Population_code"
freqs_pop_desc_col_pop_group = "Population_group"
freqs_pop_desc_col_desc = "Description"
freqs_pop_desc_col_a_b_drb1 = "A_B_DRB1_count"
freqs_abc_org_file = freqs_dir / "A~C~B.xlsx"

# --- Other ------------------------------------------------------------------

# A4 page size in inches, used to scale figures
A4_width = 8.27
A4_height = 11.69

# Font handling for vector output: Type 42 (TrueType) in PDF/PS, and SVG text
# kept as text instead of being converted to paths.
plt.rcParams.update(
    {
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "svg.fonttype": "none",
    }
)

# Seed both random number generators used by the notebook
random_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)

# Load Populations

In [3]:
# Alias of `freqs_dir`; `load_haplotype_freqs` joins it with the workbook name
# it reads when `load_mhc_2` is set.
NMDP_FOLDER = freqs_dir


def load_populations():
    """Read the populations description CSV and index it by population code.

    The columns named by the ``freqs_pop_desc_col_*`` settings are renamed to
    ``Short``, ``Broad``, ``Full`` and ``Cnt``; all other columns keep their
    original names.

    Returns:
        pandas.DataFrame: the contents of ``freqs_pop_desc_file`` with the
        ``Short`` column as index.
    """
    short_column_names = {
        freqs_pop_desc_col_pop_code: "Short",
        freqs_pop_desc_col_pop_group: "Broad",
        freqs_pop_desc_col_desc: "Full",
        freqs_pop_desc_col_a_b_drb1: "Cnt",
    }
    populations = pd.read_csv(freqs_pop_desc_file).rename(columns=short_column_names)
    return populations.set_index("Short")


def load_haplotype_freqs(load_mhc_2=False):
    """Load a haplotype frequency workbook and index it by haplotype name.

    The haplotype name is built by passing each locus column through
    ``trim_hla_name`` and joining the results with ``-``.

    Args:
        load_mhc_2: when true, read ``A~C~B~DRB3-4-5~DRB1~DQB1.xlsx`` from
            ``NMDP_FOLDER`` and include the ``DRB3-4-5``, ``DRB1`` and ``DQB1``
            columns in the name; otherwise read ``freqs_abc_org_file`` and use
            the ``A``, ``B`` and ``C`` columns only.

    Returns:
        pandas.DataFrame: the workbook contents with the ``haplotype`` column
        as index.
    """
    if load_mhc_2:
        filename = os.path.join(NMDP_FOLDER, "A~C~B~DRB3-4-5~DRB1~DQB1.xlsx")
        loci = ["A", "B", "C", "DRB3-4-5", "DRB1", "DQB1"]
    else:
        filename = freqs_abc_org_file
        loci = ["A", "B", "C"]

    freqs = pd.read_excel(filename)

    def haplotype_name(row):
        # Name order is A-B-C(-...) even though the workbook is called A~C~B.
        return "-".join(trim_hla_name(row[locus]) for locus in loci)

    freqs["haplotype"] = freqs.apply(haplotype_name, axis=1)
    return freqs.set_index("haplotype")


def trim_hla_name(mhc):
    """Reduce an HLA allele name to ``<locus>*<field1>:<field2>``.

    Any trailing run of the characters ``g``, ``Q``, ``N`` and ``L`` is
    dropped, e.g. ``"A*01:01g"`` becomes ``"A*01:01"``. The placeholder
    ``"DRBX*NNNN"`` is mapped to the string ``"None"``.

    Args:
        mhc: allele name with one of the loci A, B, C, DRB1, DRB3, DRB4, DRB5
            or DQB1, or the placeholder ``"DRBX*NNNN"``.

    Returns:
        str: the trimmed allele name, or ``"None"`` for the placeholder.

    Raises:
        ValueError: if ``mhc`` is neither the placeholder nor a recognised
            allele name.
    """
    # Exact comparison on purpose: `mhc in ("DRBX*NNNN")` is a substring test
    # against a plain string, so inputs such as "" or "N" were silently
    # treated as the placeholder.
    if mhc == "DRBX*NNNN":
        return "None"

    match = re.match(
        r"^(A|B|C|DRB1|DRB3|DRB4|DRB5|DQB1)\*(\d+):(\d+)[gQNL]*$", mhc
    )
    if match is None:
        raise ValueError(f"Unrecognised HLA allele name: {mhc!r}")

    # The suffix characters sit outside the capture groups, so the groups
    # already hold the trimmed locus and the two numeric fields.
    locus, field_1, field_2 = match.groups()
    return f"{locus}*{field_1}:{field_2}"

In [4]:
# Population metadata (indexed by population code) and the A~C~B haplotype
# frequency table (indexed by haplotype name).
df_populations = load_populations()
df_haplotype_freqs = load_haplotype_freqs()

# Sum of "Cnt" per broad population group.
df_populations_broad = df_populations.groupby("Broad")[["Cnt"]].sum()

# Generate Results For Haplotypes

## Select Most Common Haplotypes

In [None]:
# Population groups whose `<group>_freq` columns are scanned.
groups = ["AFA", "API", "CAU", "HIS", "NAM"]
# Cumulative frequency that each group's selection has to reach.
coverage = 0.95

dfs = {}  # group -> selected haplotype rows (most frequent first)

alleles = set()  # "HLA-<allele>" names over all selected haplotypes

for group in tqdm(groups):
    freq_col = f"{group}_freq"
    df = df_haplotype_freqs.sort_values(freq_col, ascending=False)
    df["cumulative"] = df[freq_col].cumsum()

    # Keep every haplotype up to the coverage threshold plus the first one
    # that crosses it. A boolean mask is used instead of `.iloc[0]` + concat
    # so that the cell does not fail when no row exceeds the threshold, and
    # so that the selected rows keep their column dtypes.
    cumulative = df["cumulative"].to_numpy()
    below = cumulative <= coverage
    above = cumulative > coverage
    first_above = above & (np.cumsum(above) == 1)
    dfs[group] = df[below | first_above]

    # Only the "g" marker is removed here (same as before); other suffix
    # letters are left in the allele name.
    for locus in ("A", "B", "C"):
        alleles.update("HLA-" + dfs[group][locus].str.replace("g", "", regex=False))

## Run The Ranking Pipeline

Here, we utilize part of the [CAPE](https://github.com/hcgasser/CAPE) framework created by [Gasser et al.](https://www.immunoinformaticsjournal.com/article/S2667-1190(24)00005-3/fulltext), specifically the `tools/MHC-I_rank_peptides.py` script described in [this section](https://github.com/hcgasser/CAPE#prepare-the-mhc-class-1-position-weight-matrix-predictor). The script predicts antigenicity for every generated peptide and selected allele.

First, generate the command lines to be used with that framework by running the cell below and saving its output.

In [None]:
# One command line per selected allele. The alleles are sorted because the
# iteration order of a set of strings differs between kernel sessions, which
# made the printed list change from run to run.
text = "".join(
    f"MHC-I_rank_peptides.py --peptides_per_length 1000000 --alleles {_MHC} --tasks rank --output {random_pep_dir}\n"
    for _MHC in sorted(alleles)
)

print(text)

Now, create a separate programming environment for the [CAPE](https://github.com/hcgasser/CAPE) framework by following its [installation instructions](https://github.com/hcgasser/CAPE?tab=readme-ov-file#installation).

After setting up the container, run the above command lines within it.

## Prepare Binding Peptides For Each Allele

In [None]:
# Alleles that are skipped even when a ranking file exists for them.
ignore = ["HLA-A*15:150"]

path = random_pep_dir

# Ranking files are named "HLA-<locus>_<field1>:<field2>.csv". The pattern is
# matched against the whole file name, the dot is escaped, and the locus is
# restricted to the keys of `MHCs`, so unrelated files can neither match nor
# trigger a KeyError below.
rank_file_pattern = re.compile(r"HLA-([ABC])_(\d+):(\d+)\.csv")

MHCs = {"A": [], "B": [], "C": []}
for filename in sorted(os.listdir(path)):
    file_match = rank_file_pattern.fullmatch(filename)
    if file_match is None:
        continue
    locus, field_1, field_2 = file_match.groups()
    mhc = f"HLA-{locus}*{field_1}:{field_2}"
    if mhc not in ignore:
        MHCs[locus].append(mhc)

# A peptide is kept for an allele when its value in the second CSV column is
# at most this limit.
limit = 0.02

sets = {}  # allele name -> set of peptides kept for that allele

for mhc in tqdm(MHCs["A"] + MHCs["B"] + MHCs["C"]):
    if mhc in sets:
        continue

    peptides = set()
    # `with` closes the file; the bare open() used before leaked one handle
    # per allele.
    with open(os.path.join(path, f'{mhc.replace("*", "_")}.csv')) as rank_file:
        next(rank_file, None)  # first line is skipped (presumably a header — confirm)
        for line in rank_file:
            line = line.strip()
            if not line:
                # tolerate blank lines, e.g. at the end of the file
                continue
            peptide, el_rank = line.split(",")
            if float(el_rank) <= limit:
                peptides.add(peptide)

    sets[mhc] = peptides

# Figure 5

## Population Antigenicity Comparison

In [26]:
def get_immuno_peptidome_overlaps(
    population_1, population_2, N, df_haplotype_freqs, peptide_sets=None
):
    """Sample pairs of individuals and measure how much their peptide sets overlap.

    For each of the ``N`` pairs, one individual is drawn per population. An
    individual consists of two haplotypes sampled (with replacement) from
    ``df_haplotype_freqs`` using the population's ``<population>_freq`` column
    as weights. The peptide set of an individual is the union of the peptide
    sets of its six alleles.

    Pairs in which any allele of either individual has no entry in
    ``peptide_sets`` are skipped, so the returned arrays can be shorter than
    ``N``.

    Args:
        population_1: name of the first population; ``f"{population_1}_freq"``
            must be a column of ``df_haplotype_freqs``.
        population_2: name of the second population, same requirement.
        N: number of pairs to sample.
        df_haplotype_freqs: frame whose index holds haplotype names of the
            form ``A*dd:dd-B*dd:dd-C*dd:dd``.
        peptide_sets: mapping from ``"HLA-<allele>"`` to a set of peptides.
            Defaults to the notebook-level ``sets`` mapping.

    Returns:
        tuple of three ``numpy.ndarray``: per evaluated pair, the fraction of
        the union of both peptide sets that is found only in individual 1,
        only in individual 2, and in both.
    """
    if peptide_sets is None:
        # Keeps existing four-argument calls working: fall back to the
        # mapping built by the peptide preparation cell.
        peptide_sets = sets

    population = [population_1, population_2]
    haplotype_pattern = re.compile(
        r"A\*(\d+)\:(\d+)-B\*(\d+)\:(\d+)-C\*(\d+)\:(\d+)"
    )

    # Haplotype names per person and chromosome. The draw order (person 0
    # first, two chromosomes each) is unchanged, so a seeded run yields the
    # same samples. Only the index is kept: looking up a name there avoids
    # building a row Series for every pair.
    sampled_haplotypes = [
        [
            df_haplotype_freqs.sample(
                weights=f"{population[person]}_freq",
                n=N,
                replace=True,
            ).index
            for chromosome in range(2)
        ]
        for person in range(2)
    ]

    only_1 = []
    only_2 = []
    intersection = []

    for pair in tqdm(range(N), leave=False, desc=str(population)):
        genotypes = []
        for person in range(2):
            _alleles = []
            for chromosome in range(2):
                match = haplotype_pattern.findall(
                    sampled_haplotypes[person][chromosome][pair]
                )[0]
                _alleles += [
                    f"HLA-A*{match[0]}:{match[1]}",
                    f"HLA-B*{match[2]}:{match[3]}",
                    f"HLA-C*{match[4]}:{match[5]}",
                ]
            genotypes.append(_alleles)

        # Skip pairs for which at least one allele has no peptide set.
        if not all(mhc in peptide_sets for _alleles in genotypes for mhc in _alleles):
            continue

        peptides_1 = set().union(*(peptide_sets[mhc] for mhc in genotypes[0]))
        peptides_2 = set().union(*(peptide_sets[mhc] for mhc in genotypes[1]))

        _only_1 = len(peptides_1 - peptides_2)
        _only_2 = len(peptides_2 - peptides_1)
        _int = len(peptides_1 & peptides_2)

        # Size of the union of both peptide sets
        _total = _only_1 + _only_2 + _int

        only_1.append(_only_1 / _total)
        only_2.append(_only_2 / _total)
        intersection.append(_int / _total)

    return np.array(only_1), np.array(only_2), np.array(intersection)

In [None]:
# Number of sampled pairs per population combination
N = 10000

population_names = list(df_populations_broad.index)

# Square table of population x population; only the upper triangle
# (including the diagonal) is filled, each cell holding the three arrays
# returned by get_immuno_peptidome_overlaps.
df_results = pd.DataFrame(
    index=df_populations_broad.index, columns=df_populations_broad.index
)

for i, population_1 in enumerate(population_names):
    for population_2 in population_names[i:]:
        overlaps = get_immuno_peptidome_overlaps(
            population_1, population_2, N, df_haplotype_freqs
        )
        df_results.at[population_1, population_2] = overlaps

In [None]:
fig_name = "fig-5-population-antigenicity-comparison"

fig = plt.figure(figsize=(1.5 * A4_width, 1.5 * A4_width))

n_grps = len(df_results.index)

all_axes = []  # not filled in this cell; left in place for any later use

# (n_grps + 1) x (n_grps + 1) grid: Venn diagrams go to cell (i, j + 1), the
# matching density plots to the mirrored cell (j + 1, i).
gs = mpl.gridspec.GridSpec(
    n_grps + 1,
    n_grps + 1,
    height_ratios=[1] * (n_grps + 1),
    width_ratios=[1] * (n_grps + 1),
    wspace=0.0,
    hspace=0.0,
)

for i, population_1 in enumerate(df_results.index):
    for j, population_2 in enumerate(df_results.columns):
        if i <= j:
            r_1, c_1 = i, j + 1
            r_2, c_2 = j + 1, i

            # Venn diagram of the mean fractions
            ax = fig.add_subplot(gs[r_1, c_1])
            only_1, only_2, intersection = df_results.at[population_1, population_2]

            o1 = np.mean(only_1)
            o2 = np.mean(only_2)
            ints = np.mean(intersection)

            venn = venn2((o1, o2, ints), set_labels=[population_1, population_2], ax=ax)
            for region_id, share in (("10", o1), ("01", o2), ("11", ints)):
                region_label = venn.get_label_by_id(region_id)
                # Guard against a missing label (matplotlib-venn may return
                # None for an empty region — see its documentation).
                if region_label is not None:
                    region_label.set_text(f"{share*100:.1f}%")

            # Distribution of the per-pair intersection fraction
            ax = fig.add_subplot(gs[r_2, c_2])
            sns.kdeplot(intersection, ax=ax, color="#AAAA00")

            ax.text(x=0.5, y=0.4, s=f"{population_1} {population_2}")

            ax.spines["top"].set_visible(False)
            ax.spines["right"].set_visible(False)
            ax.spines["left"].set_visible(False)
            ax.get_yaxis().set_visible(False)
            ax.set_xticks([0, 0.25, 0.5, 0.75, 1.0])
            ax.set_xlim((0.0, 1.0))
            ax.set_xticklabels([None, "25%", "50%", "75%", None])
            # Dashed line marks the mean intersection fraction
            ax.axvline(ints, linestyle="dashed", color="#AAAA00")

plt.tight_layout()

# Create the output directory first so that saving works on a fresh checkout.
figures_main_dir.mkdir(parents=True, exist_ok=True)
file_name = figures_main_dir.joinpath(f"{fig_name}.pdf")
fig.savefig(file_name)