In [24]:
import pandas as pd
# Variant table: tab-separated text, first row is the header and the first
# column is used as the initial row index.
mutations = pd.read_csv(
    'merged_variants_anno_vaf_filtered.txt',
    sep="\t",
    header=0,
    index_col=0,
    low_memory=False,
)
# Per-sample metadata sheet.
metasamples = pd.read_excel('../Metadata2.xlsx')

In [25]:
# Keep only samples that are flagged as included, are not tumor-only (i.e. part
# of a tumor-normal pair), are not gDNA, and carry a VAR1 label.
is_selected = (
    (metasamples['INCLUDED'] == "Y")
    & (metasamples['TUMOR_ONLY'] == "No")
    & (metasamples['SAMPLE_TYPE'] != "gDNA")
    & metasamples['VAR1'].notnull()
)
df = metasamples.loc[is_selected, ['PATIENT_ID', 'SAMPLE_ID']]

# Re-key the metadata by SAMPLE_ID and pull out the selected rows, in the
# same order as `df`.
metasamples.index = metasamples['SAMPLE_ID']
selected_ids = df['SAMPLE_ID'].tolist()
meta_filtered = metasamples.reindex(selected_ids)

# Column labels used in the mutation table: 'SAMPLE_ID-PATIENT_ID', with
# surrounding whitespace removed from the combined label.
combined_labels = df['SAMPLE_ID'] + '-' + df['PATIENT_ID']
SAMPLES = [label.strip() for label in combined_labels]
meta_filtered.index = SAMPLES

# Index the variant table by VARIANT_KEY (later cells look rows up by this
# key) and keep the gene column followed by the selected sample columns.
mutations.index = mutations['VARIANT_KEY']
mutations_filtered = mutations[['GENE', *SAMPLES]]

In [26]:
# Collect variants that are private to one VAR1 group: present (value at or
# above the threshold) in at least one sample of that group and in no sample
# of the other group.
#
# Each collected record is a tuple:
#   (variant key, gene, number of samples of the group carrying the variant,
#    number of distinct patients those carrying samples belong to)
# Later cells filter on the last element (index 3).

# Minimum per-sample value for a variant to count as present in that sample.
# NOTE(review): presumably a VAF in percent, judging by the input file name -
# confirm.
MIN_SAMPLE_VALUE = 5.0

# Column labels (built as 'SAMPLE_ID-PATIENT_ID') of the samples in each group.
SAMPLES_BEFORE = meta_filtered[meta_filtered['VAR1'] == 'BEFORE'].index.to_list()
SAMPLES_AFTER = meta_filtered[meta_filtered['VAR1'] == 'AFTER'].index.to_list()
GENES = mutations_filtered['GENE']


def _group_record(key, gene, carriers):
    """Build the (variant key, gene, n samples, n patients) tuple for one variant.

    `carriers` holds the column labels of the samples in which the variant
    passed the threshold. The patient is taken as the text after the last '-'
    of each label, so the patient count is specific to this variant.
    NOTE(review): assumes PATIENT_ID itself never contains '-' - confirm.
    """
    patients = {label.split('-')[-1] for label in carriers}
    return (key, gene, len(carriers), len(patients))


mutations_seen_BEFORE = list()
mutations_seen_AFTER = list()

# Sets give constant-time membership tests inside the per-variant loop.
before_columns = set(SAMPLES_BEFORE)
after_columns = set(SAMPLES_AFTER)

# The first column of mutations_filtered is GENE; the rest are sample columns.
sample_values = mutations_filtered.iloc[:, 1:]

# Genes are paired with rows by position rather than looked up by label, so a
# duplicated VARIANT_KEY cannot turn the gene into a Series.
for gene, (index, row) in zip(GENES.tolist(), sample_values.iterrows()):
    # NaN compares False against the threshold, so missing values count as absent.
    passing = [column for column, value in row.items() if value >= MIN_SAMPLE_VALUE]
    samples_before = [column for column in passing if column in before_columns]
    samples_after = [column for column in passing if column in after_columns]

    if samples_before and not samples_after:
        mutations_seen_BEFORE.append(_group_record(index, gene, samples_before))
    elif samples_after and not samples_before:
        mutations_seen_AFTER.append(_group_record(index, gene, samples_after))

In [27]:
import gseapy as gp
import matplotlib.pyplot as plt

# Keep the BEFORE records whose fourth field (index 3) is greater than 1, then
# run Enrichr on their genes and export the matching rows of the variant table.
selected_before = [record for record in mutations_seen_BEFORE if record[3] > 1]

genes = [record[1] for record in selected_before]
enr = gp.enrichr(
    gene_list=genes,
    gene_sets=['KEGG_2019_Human', 'GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
    organism='Human',
    description='Group BEFORE',
    outdir='analysis_before_after/BEFORE',
    # no_plot=True,
    cutoff=1.0,
)

# Field 0 of each record is the variant key used as the row index of `mutations`.
before_keys = [record[0] for record in selected_before]
mutations.loc[before_keys, :].to_csv(
    'analysis_before_after/BEFORE/mutations_table.txt',
    sep="\t",
    header=True,
)

2021-02-02 15:31:07,684 Error fetching enrichment results: KEGG_2019_Human
2021-02-02 15:31:10,679 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-02 15:31:13,861 Error fetching enrichment results: GO_Molecular_Function_2018


In [28]:
# Keep the AFTER records whose fourth field (index 3) is greater than 1, then
# run Enrichr on their genes and export the matching rows of the variant table.
selected_after = [record for record in mutations_seen_AFTER if record[3] > 1]

genes = [record[1] for record in selected_after]
enr = gp.enrichr(
    gene_list=genes,
    gene_sets=['KEGG_2019_Human', 'GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
    organism='Human',
    description='Group AFTER',
    outdir='analysis_before_after/AFTER',
    # no_plot=True,
    cutoff=1.0,
)

# Field 0 of each record is the variant key used as the row index of `mutations`.
after_keys = [record[0] for record in selected_after]
mutations.loc[after_keys, :].to_csv(
    'analysis_before_after/AFTER/mutations_table.txt',
    sep="\t",
    header=True,
)

2021-02-02 15:31:20,837 Error fetching enrichment results: KEGG_2019_Human
2021-02-02 15:31:24,354 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-02 15:31:28,342 Error fetching enrichment results: GO_Molecular_Function_2018
