# Read & parse SnpEff data

In [None]:
# This cell runs before the notebook's main import cell, so pandas is imported
# here as well; otherwise "Restart Kernel -> Run All" fails with a NameError.
import pandas as pd

# NOTE(review): "<sampletable" looks like a redacted placeholder - point it at
# the real sample spreadsheet before running.
_SAMPLE_TABLE = pd.read_excel("<sampletable")

In [None]:
import pandas as pd
import seaborn as sns
import pathlib
import tqdm.auto as tqdm

tqdm.tqdm.pandas()

In [None]:
# Recursively collect every snpEff-annotated VCF below the annotation folder.
annotation_root = pathlib.Path("variantcalling/annotation/")
files = pd.Series(annotation_root.rglob("*snpEff.ann.vcf.gz"))

In [None]:
# Label each VCF path by the name of the directory that contains it.
files.index = [vcf_path.parent.name for vcf_path in files]

In [None]:
# Gene panel consulted by read_and_filter(): a HIGH/MODERATE-impact variant is
# kept only when one of its gene names appears in this list.
# Several entries look like alternative symbols for the same gene (e.g.
# MLL2/KMT2D, H2AFX/H2AX, MRE11A/MRE11) - presumably so that either spelling in
# the annotation matches; confirm before de-duplicating.
# NOTE: the plotting section later rebinds `genes` to a much shorter list, so
# re-running read_and_filter() after that cell would silently use the short list.
genes = [
    "LZTR1",
    "SMARCA4",
    "PDGFRA",
    "LDB1",
    "EIF1AX",
    "MSH6",
    "HDAC2",
    "PTPMT1",
    "TRAF7",
    "ALK",
    "PTPN11",
    "NF1",
    "SMARCE1",
    "HRAS",
    "PTCH1",
    "KIT",
    "ATRX",
    "AIFM3",
    "PRKAR1A",
    "FLT3",
    "NDUFS3",
    "SPRED1",
    "ACVR1",
    "BRPF1",
    "D2HGDH",
    "MYB",
    "PALB2",
    "CHD7",
    "MET",
    "DGCR8",
    "KLF4",
    "YAP1",
    "CHEK2",
    "PHOX2B",
    "MLL2",
    "PPL",
    "RAF1",
    "SMARCD1",
    "MRE11A",
    "ASXL1",
    "NF2",
    "NBN",
    "CIC",
    "ATXN1L",
    "SDCCAG8",
    "SUZ12",
    "HTRA1",
    "CDKN2C",
    "MRE11",
    "DROSHA",
    "BRCA2",
    "CDKN2A",
    "PIK3C2B",
    "CDK6",
    "STAT6",
    "NRAS",
    "ZMYM3",
    "IST1",
    "EPAS1",
    "PIK3CA",
    "KDM6A",
    "PAX6",
    "GNAQ",
    "HIST1H3B",
    "IKBKAP",
    "MYC",
    "BRCA1",
    "PCDH8",
    "TP53",
    "DAXX",
    "PIK3C2G",
    "H3F3A",
    "HIST1H3C",
    "COL2A1",
    "PIK3R2",
    "AIP",
    "SH3PXD2A",
    "ARID1B",
    "PLCH1",
    "MSH2",
    "MN1",
    "POLE",
    "GNA11",
    "PIK3CG",
    "SMARCD2",
    "AKT2",
    "KMT2B",
    "BCOR",
    "GNAS",
    "KMT2C",
    "IDH2",
    "CDH1",
    "PLXNB1",
    "STAG2",
    "FGFR2",
    "FGFR1",
    "FOXO3",
    "KBTBD4",
    "MYCN",
    "MGMT",
    "SMAD4",
    "PIK3R1",
    "ZIC1",
    "H3F3B",
    "H2AX",
    "PTCH2",
    "APC",
    "FGFR3",
    "MDM2",
    "MEN1",
    "ATM",
    "TSPAN31",
    "TERT",
    "FGFR4",
    "ARID2",
    "ARID1A",
    "SMARCB1",
    "NAB2",
    "KMT2D",
    "H3P6",
    "LRRFIP2",
    "MYBL1",
    "RET",
    "PMS2",
    "JAK3",
    "CCND2",
    "CSF1R",
    "H3C3",
    "DICER1",
    "MLH1",
    "KLK1",
    "IDH1",
    "ZIC4",
    "CTNNB1",
    "RAD50",
    "TBR1",
    "BRAF",
    "KCNK12",
    "SMARCD3",
    "ERBB2",
    "ROS1",
    "FOXR2",
    "IDO2",
    "EZH2",
    "JAK2",
    "MDM4",
    "EZHIP",
    "MPL",
    "MYL1",
    "NOTCH1",
    "ATXN1",
    "C11ORF95",
    "GSE1",
    "CDK4",
    "GABRA6",
    "BRPF3",
    "BAP1",
    "EED",
    "EGFR",
    "HNF1A",
    "DDX3X",
    "RBPJ",
    "TH2LCRR",
    "CREBBP",
    "MYCNOS",
    "ABL1",
    "CCND1",
    "SMARCA2",
    "SUFU",
    "TSC1",
    "TSC2",
    "SMIM4",
    "SETD2",
    "H2AFX",
    "PBRM1",
    "SF3B1",
    "DDR1",
    "FBXW7",
    "AKT3",
    "TCF4",
    "PRKCA",
    "KDR",
    "H3C2",
    "SMO",
    "KRAS",
    "FUBP1",
    "NTRK2",
    "RB1",
    "AKT1",
    "NDRG2",
    "PTEN",
    "ESR1",
    "ATR",
    "GPR161",
    "SRP19",
    "VHL",
    "PPM1D",
    "RELA",
    "CTDNEP1",
    "MTOR",
    "POLD1",
]

In [None]:
def read_and_filter(file, gene_panel=None):
    """Load one snpEff-annotated VCF and keep only the variants of interest.

    Only rows whose column 6 equals ``"PASS"`` are considered. Column 7 is split
    on ``"|"``; because only fields 1-3 of that split are inspected, the decision
    is based on the first annotation listed for each variant. A row is kept when

    * field 3 (gene) is ``TERT`` and field 1 (effect) is
      ``upstream_gene_variant`` - kept whatever its impact; or
    * field 2 (impact) is ``HIGH`` or ``MODERATE`` and at least one of the
      ``&``-separated gene names in field 3 is in the gene panel.

    Parameters
    ----------
    file : str or path-like
        Tab-separated VCF; lines starting with ``#`` are skipped.
    gene_panel : iterable of str, optional
        Gene symbols to keep. Defaults to the module-level ``genes`` list, so
        existing single-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Matching rows without columns 2, 6 and 8, TERT promoter rows first,
        duplicates removed. Empty when the file has no data lines, no ``PASS``
        rows, or no annotation carrying a gene field.
    """
    panel = list(genes if gene_panel is None else gene_panel)

    try:
        calls = pd.read_csv(file, comment="#", sep="\t", header=None)
    except pd.errors.EmptyDataError:
        # Header-only VCF (no variant records): nothing to filter.
        return pd.DataFrame()

    calls = calls[calls[6] == "PASS"].drop(columns=[2, 6, 8])
    if calls.empty:
        return calls.copy()

    # Split the annotation string once instead of once per criterion.
    ann = calls[7].str.split("|", expand=True)
    if ann.shape[1] < 4:
        # No row carries effect/impact/gene fields, so nothing can match.
        return calls.iloc[0:0].copy()

    tert_promoter = calls[(ann[3] == "TERT") & (ann[1] == "upstream_gene_variant")]

    damaging = ann[2].isin(["HIGH", "MODERATE"])
    calls = calls[damaging]
    if not calls.empty:
        # A gene field may list several symbols joined by "&"; any hit counts.
        gene_names = ann.loc[damaging, 3].str.split("&", expand=True)
        calls = calls[gene_names.isin(panel).any(axis=1)]

    # TERT promoter rows go first; a row selected by both rules is kept once.
    return pd.concat((tert_promoter, calls)).drop_duplicates().copy()

In [None]:
# Parse every VCF with a progress bar (progress_apply is registered by
# tqdm.tqdm.pandas() in the import cell). The result is a Series holding one
# filtered DataFrame per entry of `files`, under the same labels.
data = files.progress_apply(read_and_filter)

In [None]:
# Stack the per-file frames into one table; the dict keys (the labels of
# `files`) become the outer level of the resulting row MultiIndex.
data = pd.concat(data.to_dict())

In [None]:
# Split the annotation column once and pull the gene (field 3) and the impact
# (field 2) out of the same split.
ann_fields = data[7].str.split("|", expand=True)
data["Gene"] = ann_fields[3]
data["Impact"] = ann_fields[2]

In [None]:
# Drop the inner index level (the row label inside each file) so that only the
# per-file label remains as index; the index is therefore not unique.
data = data.reset_index(level=1, drop=True)

In [None]:
# Field 9 of the first annotation - presumably the HGVS.c coding change; the
# plotting section searches it for "c.-", "dup", "del" and "ins".
data["variant"] = data[7].str.split("|", expand=True)[9]

In [None]:
# Persist the filtered calls. The integer column labels are written as text,
# which is why the plotting section addresses them as "0", "5", "7", "9".
data.to_csv("mutation_status_snpEff.csv")

# Read & parse VEP data

In [None]:
import pandas as pd
import pathlib

In [None]:
# Collect the VEP result files directly inside the filtered folder
# (glob, not rglob: sub-directories are not searched).
# NOTE: this rebinds `files`, which earlier held the snpEff VCF paths.
files = pd.Series(pathlib.Path("variantcalling/filtered/").glob("*VEP*"))

In [None]:
# Label each file by the part of its file name before the first "."
# (presumably the sample identifier - confirm against the file naming scheme).
files.index = files.map(lambda x: x.name).str.split(".", n=1, expand=True)[0]

In [None]:
# "|"-separated names assigned, in order, to the fields of each CSQ annotation
# in read(). Presumably copied from the CSQ "Format:" line of the VCF header -
# it must match the VEP run that produced the files.
cols = "Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|GENE_PHENO|SIFT|PolyPhen|DOMAINS|miRNA|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_NFE_AF|gnomADe_OTH_AF|gnomADe_SAS_AF|gnomADg_AF|gnomADg_AFR_AF|gnomADg_AMI_AF|gnomADg_AMR_AF|gnomADg_ASJ_AF|gnomADg_EAS_AF|gnomADg_FIN_AF|gnomADg_MID_AF|gnomADg_NFE_AF|gnomADg_OTH_AF|gnomADg_SAS_AF|MAX_AF|MAX_AF_POPS|FREQS|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS"

In [None]:
# Turn the "|"-separated header string into a list of field names.
# Guarded so the cell can be re-run: once `cols` is already a list, calling
# .split() on it again would raise AttributeError.
if isinstance(cols, str):
    cols = cols.split("|")

In [None]:
def read(file):
    """Parse the CSQ annotation of one VEP-annotated, tab-separated VCF.

    Columns 0-4 of the file become the row index. From column 7 the text
    following ``CSQ=`` (up to the next ``;``) is split on ``"|"`` into at most
    ``len(cols) + 1`` fields, named after the module-level ``cols``; a surplus
    trailing field is named ``"other"``.

    Returns the rows whose ``IMPACT`` is ``HIGH`` or ``MODERATE``. An empty
    DataFrame is returned when the file has no data lines or when the split
    yields a single column.
    """
    try:
        info = pd.read_csv(
            file,
            comment="#",
            sep="\t",
            header=None,
            index_col=[0, 1, 2, 3, 4],
            usecols=[0, 1, 2, 3, 4, 7],
        )[7]
    except pd.errors.EmptyDataError:
        return pd.DataFrame()

    csq = info.str.extract("CSQ=([^;]+)")[0]
    fields = csq.str.split("|", expand=True, n=len(cols))
    n_fields = len(fields.columns)
    if n_fields == 1:
        return pd.DataFrame()

    # n=len(cols) caps the split, so anything beyond the known names ends up
    # in one extra column.
    if n_fields > len(cols):
        fields.columns = cols + ["other"]
    else:
        fields.columns = cols

    keep = fields.IMPACT.isin(["HIGH", "MODERATE"])
    return fields[keep].copy()

In [None]:
# Parse every VEP file; the result is a Series holding one DataFrame per file.
data = files.map(read)

In [None]:
# Stack the per-file frames; the file label becomes the outermost index level,
# in front of the five index columns set up by read().
data = pd.concat(data.to_dict())

In [None]:
# NOTE(review): this writes into the working directory, whereas the plotting
# section reads "variantcalling/vep_annotation.csv" - confirm which is intended.
data.to_csv("vep_annotation.csv")

# Plotting

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np

In [None]:
# Work on a copy so the loaded sample table stays untouched.
# NOTE: `data` is reused throughout the notebook for unrelated tables.
data = _SAMPLE_TABLE.copy()

In [None]:
# Index the sample table by supplier number and keep only the columns used for
# the clinical annotation tracks and the sub-group selection.
clinical_columns = [
    "Age",
    "Neural Subgroup",
    "Sex",
    "DKFZ methylation subclass",
    "MGMT",
    "Location",
    "Idat",
]
data = data.set_index("Supplier_Nummer")[clinical_columns]

In [None]:
# Colour per category value, grouped by the clinical annotation it belongs to.
# The two seaborn palettes are looked up once and indexed below.
_tab20 = sns.color_palette("tab20")
_greys = sns.color_palette("Greys")

palette_group = {
    "Sex": {
        "male": _tab20[0],
        "female": _tab20[2],
    },
    "DKFZ methylation subclass": {
        "RTK 1": _tab20[8],
        "RTK 2": _tab20[9],
        "MES": _tab20[7],
    },
    "MGMT": {
        "methylated": _greys[5],
        "non-methylated": _greys[0],
    },
    "Location": {
        "temporal": _tab20[10],
        "parietal": _tab20[12],
        "occipital": _tab20[14],
        "frontal": _tab20[16],
        "brain, nos": _tab20[18],
        "cerebrum": _tab20[19],
    },
}

In [None]:
# Flatten the grouped palette into a single value -> colour lookup. If two
# groups shared a category label, the later group would overwrite the earlier.
palette = {k: v for x in palette_group.values() for k, v in x.items()}

In [None]:
# NOTE: a later cell calls sns.set_context("paper") before any figure is drawn,
# so this setting does not affect the figures produced below.
sns.set_context("talk")

In [None]:
# Keep the clinical annotation under its own name; `data` is rebound to the
# mutation table in the next cell.
anno = data.copy()

In [None]:
# Load the filtered snpEff calls written by the parsing section. The former
# integer column labels come back as the strings "0", "1", ...; the index is
# the per-sample label and is NOT unique (one row per variant).
data = pd.read_csv("mutation_status_snpEff.csv", index_col=0)

# Keep calls with MQ > 30. Records without an MQ tag become NaN, fail the
# comparison and are dropped instead of aborting the cell in astype(int).
mapping_quality = pd.to_numeric(
    data["7"].str.extract(r"MQ=(\d+)")[0], errors="coerce"
)
data = data[mapping_quality > 30]

# Keep calls whose column "5" exceeds 20 and whose 4th ":"-separated field of
# column "9" exceeds 15.
# NOTE(review): presumably QUAL and the per-sample read depth - the FORMAT
# column was dropped during parsing, so confirm the field order.
data = data[data["5"] > 20]
data = data[pd.to_numeric(data["9"].str.split(":", expand=True)[3]) > 15].copy()

# Effect of the first annotation (e.g. missense_variant, stop_gained).
data["type"] = data["7"].str.split("|", expand=True)[1]

# For TERT only upstream (promoter) variants are kept. The rows are masked
# directly: the index is the non-unique sample label, so selecting through
# index.isin() would discard every variant of any sample that carries a
# non-promoter TERT call.
non_promoter_tert = (data["Gene"] == "TERT") & (
    data["type"] != "upstream_gene_variant"
)
data = data[~non_promoter_tert].copy()

# Assign one consensus alteration class per variant; the first matching rule
# wins, in this order.
is_promoter = data["variant"].str.contains("c.-", regex=False, na=False)
is_missense = data["type"].str.contains("missense_variant", regex=False, na=False)
is_truncation = data["type"].str.contains("stop_gained", regex=False, na=False)
is_intronic = data["type"].str.contains("intron_variant", regex=False, na=False)
is_indel = data["variant"].str.contains("dup|del|ins", na=False) & ~is_intronic

# Everything not matched above (including stop_lost) is labelled "Missense".
# NOTE(review): this default also covers other effects such as splice variants.
data["consensus_type"] = np.select(
    [
        mask.to_numpy(dtype=bool)
        for mask in (is_promoter, is_missense, is_truncation, is_indel)
    ],
    ["Promoter", "Missense", "Truncation", "InDel"],
    default="Missense",
)

In [None]:
# Discard every call whose effect mentions intron_variant.
data = data[~data.type.str.contains("intron_variant")]

In [None]:
# Extend the index with columns "0", "1", "3", "4" so each row is keyed the same
# way as the VEP table loaded next (presumably chromosome, position, ref, alt).
data = data.set_index(["0", "1", "3", "4"], append=True)

In [None]:
# Load the VEP table with a five-level index; CSV column 3 is skipped so the
# levels line up with the index built for `data` in the previous cell.
# NOTE(review): the VEP section writes "vep_annotation.csv" into the working
# directory, not into "variantcalling/" - confirm the path.
annot = pd.read_csv("variantcalling/vep_annotation.csv", index_col=[0, 1, 2, 4, 5])

In [None]:
# Look up the VEP "AF" value for every call; calls absent from the VEP table
# (or without an AF value) get 0, i.e. they are treated as not observed.
# reindex() raises if `annot` has duplicate index entries.
data["1000G_AF"] = annot["AF"].reindex(data.index).fillna(0)

In [None]:
# Hard-coded exclusion of two specific coding changes.
# NOTE(review): the reason is not recorded here - presumably known artefacts or
# polymorphisms; the match is on the change string alone, whatever the gene.
data = data[~data.variant.isin(["c.5885C>T", "c.2447dupA"])]

In [None]:
# Move the four variant-key levels back into columns, leaving only the sample
# label as index.
data = data.reset_index(level=[1, 2, 3, 4])

In [None]:
# Keep only variants with an allele frequency below 0.1 % in the AF column
# looked up from the VEP table.
data = data.query("`1000G_AF` < 0.001")

In [None]:
# NOTE: redundant - "c.5885C>T" is already removed by the isin() filter a few
# cells above; kept as is because it is harmless.
data = data[~(data.variant == "c.5885C>T")]

In [None]:
# Translate the sample label in the index into the supplier number used by the
# clinical annotation. "Supplier_Nummer.1" is presumably the name pandas gave to
# a second column with the same header in ID_mapping.csv.
# The former .squeeze() was dropped: it is a no-op for a Series, except that a
# one-row mapping would collapse to a scalar and break .reindex().
id_mapping = pd.read_csv("ID_mapping.csv").set_index("Supplier_Nummer")[
    "Supplier_Nummer.1"
]
# assign() builds a new frame, so the column is not written onto the slice
# produced by the preceding filters.
data = data.assign(Supplier_Nummer=id_mapping.reindex(data.index).to_numpy())

In [None]:
# Keep only calls from samples that are present in the clinical annotation.
data = data[data.Supplier_Nummer.isin(anno.index)]

In [None]:
# Sample x gene matrix; each cell holds the list of consensus types of all
# calls of that gene in that sample (missing where the sample has none).
mapped = data.pivot_table(
    index="Supplier_Nummer", columns="Gene", values="consensus_type", aggfunc=list
)

In [None]:
def assign_new(x):
    """Collapse the alteration types of one sample/gene cell into a single label.

    Parameters
    ----------
    x : iterable of str, or a missing value
        The consensus types collected for one cell of the pivot table.

    Returns
    -------
    str or pandas.NA
        The type itself when all entries agree, ``"Multiple"`` when they
        differ, and ``pd.NA`` when ``x`` is not iterable (e.g. the NaN of an
        empty cell), contains unhashable items, or is empty.
    """
    try:
        kinds = set(x)
    except TypeError:
        # Only the expected failure is caught; a bare except would also hide
        # unrelated errors and KeyboardInterrupt.
        return pd.NA
    if not kinds:
        return pd.NA
    if len(kinds) == 1:
        return next(iter(kinds))
    return "Multiple"

In [None]:
# Reduce every cell to a single label. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; use whichever this pandas provides.
if hasattr(pd.DataFrame, "map"):
    mutations = mapped.map(assign_new)
else:
    mutations = mapped.applymap(assign_new)

In [None]:
# Long format: one row per sample/gene pair, with the label in column 0.
mutations = mutations.stack().reset_index()

In [None]:
# Give the label column produced by stack() a readable name.
mutations = mutations.rename(columns={0: "type"})

In [None]:
# Genes shown in the figure; the list order is used as the category order of
# the Gene column in a following cell.
# NOTE: this rebinds `genes`, the name read_and_filter() uses as its gene panel
# in the parsing section - re-running the parsing cells afterwards would filter
# against this shorter list.
genes = [
    "PDGFRA",
    "PIK3CA",
    "MDM2",
    "PTPN11",
    "NF1",
    "ATM",
    "TERT",
    "TP53",
    "ATRX",
    "KIT",
    "CDK4",
    "EGFR",
    "TSC2",
    "MET",
    "SETD2",
    "CCND2",
    "KDR",
    "IDH1",
    "BCOR",
    "KMT2C",
    "RB1",
    "PTEN",
    "BRAF",
    "CDKN2A",
    "FGFR1",
    "MYCN",
    "PIK3R1",
]

In [None]:
# Restrict to the genes shown in the figure. The explicit copy makes
# `mut_select` an independent frame, so the column assignment in the next cell
# does not write onto a slice of `mutations` (SettingWithCopyWarning).
mut_select = mutations[mutations.Gene.isin(genes)].copy()

In [None]:
# Make Gene categorical with the order of `genes`, so listed genes without any
# alteration still exist as categories.
mut_select.Gene = pd.Categorical(mut_select.Gene, categories=genes)

In [None]:
# Rows per sample in `mutations` (all genes, not only the plotted ones). Each
# row is one sample/gene pair, so this counts altered genes per sample.
mutfraq = mutations.Supplier_Nummer.value_counts()

In [None]:
# One colour of the current default seaborn palette per alteration label,
# paired in the order in which the labels first occur in `mutations`.
mut_palette = sns.color_palette(n_colors=mutations.type.nunique())
mut_palette = dict(zip(mutations.type.unique(), mut_palette))

In [None]:
# Plotting context in effect for the figures drawn below.
sns.set_context("paper")

In [None]:
def compute_plot(data, mutations, mutfreq, pos=((0, -9), (0, -1.5))):
    """Draw the oncoprint-style overview figure for one group of samples.

    The figure is a 4 x 2 grid sharing x per column and y per row:

    * row 0: clinical annotation tiles (Sex, DKFZ methylation subclass, MGMT,
      Location), coloured with the module-level ``palette``;
    * row 1: gene x sample alteration tiles coloured with the module-level
      ``mut_palette``, and on the right the percentage of samples per gene;
    * row 2: age per sample;
    * row 3: the per-sample counts given in ``mutfreq``.

    Parameters
    ----------
    data : pandas.DataFrame
        Clinical annotation with an index named ``Supplier_Nummer`` and the
        columns ``Age``, ``Sex``, ``DKFZ methylation subclass``, ``MGMT`` and
        ``Location``.
    mutations : pandas.DataFrame
        Long table with columns ``Supplier_Nummer``, ``Gene`` and ``type``.
    mutfreq : pandas.Series
        Count per sample; its index must become a ``Supplier_Nummer`` column on
        ``reset_index()``. The Series is not modified.
    pos : tuple of two (x, y) pairs
        Legend anchors for the clinical annotation legend and the alteration
        legend, passed as ``loc`` to ``sns.move_legend``.

    Returns
    -------
    matplotlib.figure.Figure
    """
    fig, ax = plt.subplots(
        4,
        2,
        sharex="col",
        sharey="row",
        gridspec_kw=dict(
            width_ratios=[1, 0.1],
            height_ratios=[0.5, 2, 0.5, 0.5],
            hspace=0.03,
            wspace=0.01,
        ),
        figsize=(11, 9),
    )
    axes_dict = {
        "anno": ax[0, 0],
        "mutation": ax[1, 0],
        "freq": ax[1, 1],
        "age": ax[2, 0],
        "patfreq": ax[3, 0],
    }
    # Only the gene-frequency bar chart uses the narrow right-hand column.
    ax[0, 1].axis("off")
    ax[2, 1].axis("off")
    ax[3, 1].axis("off")

    # --- age track ---------------------------------------------------------
    sns.scatterplot(x=data.index, y=data.Age, ax=axes_dict["age"], color="lightgray")
    axes_dict["age"].set_xticklabels("")
    axes_dict["age"].set_xlabel("")
    axes_dict["age"].tick_params(bottom=False)
    axes_dict["age"].set_ylabel("Age\n(years)")
    axes_dict["age"].set_ylim(15, 86)  # fixed range keeps the panels comparable

    # --- clinical annotation tiles ------------------------------------------
    anno_long = (
        data[["Sex", "DKFZ methylation subclass", "MGMT", "Location"]]
        .reset_index()
        .melt(id_vars=["Supplier_Nummer"])
    )
    sns.scatterplot(
        x="Supplier_Nummer",
        y="variable",
        ax=axes_dict["anno"],
        data=anno_long,
        hue="value",
        marker="s",
        palette=palette,
        legend=True,
        s=50,
    )
    sns.move_legend(axes_dict["anno"], loc=pos[0], ncol=5, title="Clinical annotation")
    axes_dict["anno"].set_xticklabels("")
    axes_dict["anno"].set_xlabel("")
    axes_dict["anno"].set_ylabel("")
    axes_dict["anno"].set_ylim(-1, 4)
    axes_dict["anno"].tick_params(bottom=False)

    # --- gene x sample alteration tiles -------------------------------------
    sns.scatterplot(
        x="Supplier_Nummer",
        y="Gene",
        ax=axes_dict["mutation"],
        data=mutations,
        hue="type",
        marker="s",
        legend=True,
        palette=mut_palette,
        s=50,
    )
    axes_dict["mutation"].tick_params(bottom=False)
    axes_dict["mutation"].set_ylabel("")
    sns.move_legend(axes_dict["mutation"], loc=pos[1], ncol=5, title="Alteration")

    # --- percentage of samples altered per gene -----------------------------
    counts = (
        mutations.Gene.value_counts(sort=False) / data.index.size * 100
    ).reset_index()
    counts.columns = ["Gene", "count"]
    axes_dict["freq"].barh(counts["Gene"], counts["count"], color="black")
    axes_dict["freq"].tick_params(left=False)
    axes_dict["freq"].set_xticks([0, 10, 20, 30, 40, 50])
    # Raw strings: same label text as before, without the invalid "\%" escape
    # sequence that newer Python versions warn about.
    axes_dict["freq"].set_xticklabels(
        [r"0\%", r"10\%", r"20\%", r"30\%", r"40\%", r"50\%"], rotation=-90
    )
    axes_dict["freq"].tick_params(labelbottom=True)

    # --- per-sample counts --------------------------------------------------
    # rename() returns a new Series, so the caller's object keeps its name.
    sns.barplot(
        x="Supplier_Nummer",
        y="count",
        data=mutfreq.rename("count").reset_index(),
        color="black",
        ax=axes_dict["patfreq"],
    )
    axes_dict["patfreq"].set_xticklabels(
        axes_dict["patfreq"].get_xticklabels(), rotation=90
    )
    axes_dict["patfreq"].set_xlabel("")
    axes_dict["patfreq"].set_ylabel("N. of\nMutations")
    axes_dict["patfreq"].set_ylim(-0, 73)  # fixed range keeps the panels comparable
    return fig

In [None]:
# Figure for the samples of the "low" neural sub-group.
subset = anno[anno["Neural Subgroup"] == "low"]
in_subset = mut_select["Supplier_Nummer"].isin(subset.index)
mut_subset = mut_select[in_subset]
mutfraq_subset = mutfraq.reindex(subset.index)
fig = compute_plot(
    data=subset,
    mutations=mut_subset,
    mutfreq=mutfraq_subset,
    pos=((0, -10.2), (0, -1.8)),
)
fig.suptitle("Neural low")
fig.savefig("oncopanel_neural_low.svg", dpi=200, bbox_inches="tight")

In [None]:
# Figure for the samples of the "high" neural sub-group.
subset = anno[anno["Neural Subgroup"] == "high"]
in_subset = mut_select["Supplier_Nummer"].isin(subset.index)
mut_subset = mut_select[in_subset]
mutfraq_subset = mutfraq.reindex(subset.index)
fig = compute_plot(
    data=subset,
    mutations=mut_subset,
    mutfreq=mutfraq_subset,
    pos=((0, -10.2), (0, -1.8)),
)
fig.suptitle("Neural high")
fig.savefig("oncopanel_neural_high.svg", dpi=200, bbox_inches="tight")