In [14]:
import os

import pandas as pd
# Input locations, relative to the notebook's working directory.
MUTATIONS_PATH = '../analysis/merged_variants_anno_vaf_filtered.txt'
METADATA_PATH = '../Metadata2.xlsx'

# Variant table: tab-separated, first row is the header, first column is
# used as the row index.
mutations = pd.read_csv(
    MUTATIONS_PATH,
    sep="\t",
    header=0,
    index_col=0,
    low_memory=False,
)

# Sample metadata sheet (one row per sample).
metasamples = pd.read_excel(METADATA_PATH)

In [15]:
# Select the samples to analyse: flagged as included, not tumor-only
# (i.e. part of a tumor-normal pair), not gDNA, and carrying a VAR1 label.
is_selected = (
    metasamples['INCLUDED'].eq("Y")
    & metasamples['TUMOR_ONLY'].eq("No")
    & metasamples['SAMPLE_TYPE'].ne("gDNA")
    & metasamples['VAR1'].notnull()
)
df = metasamples.loc[is_selected, ['PATIENT_ID', 'SAMPLE_ID']]

# Re-key the metadata by sample id (this re-indexes `metasamples` itself) and
# pull out the rows of the selected samples, in selection order.
metasamples.index = metasamples['SAMPLE_ID']
selected_sample_ids = df['SAMPLE_ID'].tolist()
meta_filtered = metasamples.reindex(selected_sample_ids)

# Sample labels have the form "<SAMPLE_ID>-<PATIENT_ID>"; they are used both
# as the new metadata index and to pick the sample columns of `mutations`.
sample_labels = df['SAMPLE_ID'] + '-' + df['PATIENT_ID']
SAMPLES = [label.strip() for label in sample_labels]
meta_filtered.index = SAMPLES

# Keep the first 10 columns of the variant table plus the selected samples.
leading_columns = mutations.columns[0:10].to_list()
mutations_filtered = mutations.loc[:, leading_columns + SAMPLES]

In [16]:
# Distinct patients among the selected samples, followed by how many there are.
patient_ids = set(meta_filtered['PATIENT_ID'])
print(patient_ids)
print(len(patient_ids))

{'VHIO_01', 'VHIO_21', 'VHIO_10', 'VHIO_40'}
4


In [17]:
# Thresholds for the BEFORE/AFTER exclusivity scan.
VAF_FILTER = 5.0         # a sample carries a variant when its column value is >= this
NUM_PATIENTS_FILTER = 1  # keep a variant only if carried by MORE than this many patients
# mutations_filtered was built as "first 10 columns + sample columns", so the
# per-sample values start at this position in every row.
FIRST_SAMPLE_COLUMN = 10

# Sample labels (the index of meta_filtered) split by their VAR1 group.
SAMPLES_BEFORE = meta_filtered[meta_filtered['VAR1'] == 'BEFORE'].index.to_list()
SAMPLES_AFTER = meta_filtered[meta_filtered['VAR1'] == 'AFTER'].index.to_list()

# Sets give constant-time membership tests inside the per-variant loop.
before_lookup = set(SAMPLES_BEFORE)
after_lookup = set(SAMPLES_AFTER)

# Patient of each sample label, read from the metadata instead of being parsed
# back out of the "<SAMPLE_ID>-<PATIENT_ID>" label: splitting the label on '-'
# would truncate any patient id that itself contains a '-'.
patient_of = meta_filtered['PATIENT_ID'].str.strip().to_dict()


def _exclusive_record(index, gene, carriers):
    """Build the result tuple for a variant carried by one group only.

    Parameters
    ----------
    index : label of the variant row in ``mutations_filtered``.
    gene : value of the row's ``GENE`` column.
    carriers : list of sample labels whose value passed ``VAF_FILTER``.

    Returns
    -------
    tuple or None
        ``(index, gene, carriers, n_samples, n_patients)`` when the carriers
        span more than ``NUM_PATIENTS_FILTER`` distinct patients, else None.
    """
    patients = {patient_of[sample] for sample in carriers}
    if len(patients) > NUM_PATIENTS_FILTER:
        return (index, gene, carriers, len(carriers), len(patients))
    return None


mutations_seen_BEFORE = []
mutations_seen_AFTER = []

for index, row in mutations_filtered.iterrows():
    gene = row['GENE']
    samples_before = []
    samples_after = []
    # Collect, per group, the samples in which this variant passes the threshold
    # (NaN values never pass because the comparison is False).
    for column, value in row.iloc[FIRST_SAMPLE_COLUMN:].items():
        if float(value) >= VAF_FILTER:
            if column in before_lookup:
                samples_before.append(column)
            if column in after_lookup:
                samples_after.append(column)

    # Only variants private to a single group are recorded.
    if samples_before and not samples_after:
        record = _exclusive_record(index, gene, samples_before)
        if record is not None:
            mutations_seen_BEFORE.append(record)
    elif samples_after and not samples_before:
        record = _exclusive_record(index, gene, samples_after)
        if record is not None:
            mutations_seen_AFTER.append(record)

In [18]:
# Display the variants recorded as passing the VAF threshold only in BEFORE
# samples. Each entry is a tuple:
# (variant index, gene, carrier sample labels, number of samples, number of patients).
mutations_seen_BEFORE

[('chr7:140753336 A>T',
  'BRAF',
  ['1123_3_cfDNA-VHIO_01',
   'DAS_529_93_cfDNA-VHIO_40',
   'CTAX033_PDX-VHIO_40',
   'DAS_16_658947_MPP216_L18_4333_89_tumor-VHIO_40'],
  4,
  2)]

In [19]:
# Display the variants recorded as passing the VAF threshold only in AFTER
# samples. Each entry is a tuple:
# (variant index, gene, carrier sample labels, number of samples, number of patients).
mutations_seen_AFTER

[]

In [21]:
import gseapy as gp
import matplotlib.pyplot as plt

# Enrichment analysis and table export for the variants seen only in BEFORE samples.
# Each entry of mutations_seen_BEFORE is a tuple:
#   (variant index, gene, carrier sample labels, n carrier samples, n patients)

# Folder receiving the Enrichr reports and the annotated table.
out_dir_before = '../analysis/analysis_before_after/BEFORE'
# Folder of the older, un-annotated export (relative to the working directory).
legacy_dir_before = 'analysis_before_after/BEFORE'
# Create both folders up front so the exports below also work on a fresh
# checkout, where neither directory exists yet.
for folder in (out_dir_before, legacy_dir_before):
    os.makedirs(folder, exist_ok=True)

genes = [x[1] for x in mutations_seen_BEFORE]
enr = gp.enrichr(gene_list = genes,
                 gene_sets = ['KEGG_2019_Human','GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
                 organism = 'Human',
                 description = 'Group BEFORE',
                 outdir = out_dir_before,
                 # no_plot=True,
                 cutoff = 1.0)

# Annotated table: the selected variant rows plus three summary columns
# inserted right after the first 10 columns.
# .copy() makes sure insert() works on an independent frame.
variant_ids = [x[0] for x in mutations_seen_BEFORE]
mutations_BEFORE = mutations_filtered.loc[variant_ids, :].copy()
mutations_BEFORE.insert(10, 'SAMPLES', [';'.join(x[2]) for x in mutations_seen_BEFORE])
mutations_BEFORE.insert(11, 'NUM_SAMPLES', [x[3] for x in mutations_seen_BEFORE])
mutations_BEFORE.insert(12, 'NUM_PATIENTS', [x[4] for x in mutations_seen_BEFORE])
mutations_BEFORE.to_csv(f'{out_dir_before}/mutations_table.txt', sep="\t", header=True)

# Legacy export: rows of the unfiltered `mutations` table for variants carried
# by more than one sample.
# NOTE(review): this looks superseded by the annotated table above and writes
# to a different folder - confirm whether it is still needed.
recurrent_ids = [x[0] for x in mutations_seen_BEFORE if x[3] > 1]
mutations.loc[recurrent_ids, :].to_csv(f'{legacy_dir_before}/mutations_table.txt', sep="\t", header=True)

2021-02-03 19:02:43,889 Error fetching enrichment results: KEGG_2019_Human
2021-02-03 19:02:46,612 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-03 19:02:49,339 Error fetching enrichment results: GO_Molecular_Function_2018


In [28]:
# Enrichment analysis and table export for the variants seen only in AFTER samples.
# Each entry of mutations_seen_AFTER is a tuple:
#   (variant index, gene, carrier sample labels, n carrier samples, n patients)

# Folder receiving the Enrichr reports and the annotated table. The Enrichr
# reports previously went to the cwd-relative 'analysis_before_after/AFTER';
# they now land next to the table, mirroring the BEFORE cell.
out_dir_after = '../analysis/analysis_before_after/AFTER'
# Folder of the older, un-annotated export (relative to the working directory).
legacy_dir_after = 'analysis_before_after/AFTER'
# Create both folders up front so the exports below also work on a fresh
# checkout, where neither directory exists yet.
for folder in (out_dir_after, legacy_dir_after):
    os.makedirs(folder, exist_ok=True)

genes = [x[1] for x in mutations_seen_AFTER]

# NOTE(review): mutations_seen_AFTER can be empty, in which case Enrichr is
# queried with an empty gene list - confirm that this is acceptable.
enr = gp.enrichr(gene_list = genes,
                 gene_sets = ['KEGG_2019_Human','GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
                 organism = 'Human',
                 description = 'Group AFTER',
                 outdir = out_dir_after,
                 # no_plot=True,
                 cutoff = 1.0)

# Annotated table: the selected variant rows plus three summary columns
# inserted right after the first 10 columns.
# .copy() makes sure insert() works on an independent frame.
variant_ids = [x[0] for x in mutations_seen_AFTER]
mutations_AFTER = mutations_filtered.loc[variant_ids, :].copy()
mutations_AFTER.insert(10, 'SAMPLES', [';'.join(x[2]) for x in mutations_seen_AFTER])
mutations_AFTER.insert(11, 'NUM_SAMPLES', [x[3] for x in mutations_seen_AFTER])
mutations_AFTER.insert(12, 'NUM_PATIENTS', [x[4] for x in mutations_seen_AFTER])
mutations_AFTER.to_csv(f'{out_dir_after}/mutations_table.txt', sep="\t", header=True)

# Legacy export: rows of the unfiltered `mutations` table for variants carried
# by more than one sample. It used to be built from mutations_seen_BEFORE
# (copy-paste slip), which put BEFORE variants into the AFTER table.
# NOTE(review): this looks superseded by the annotated table above and writes
# to a different folder - confirm whether it is still needed.
recurrent_ids = [x[0] for x in mutations_seen_AFTER if x[3] > 1]
mutations.loc[recurrent_ids, :].to_csv(f'{legacy_dir_after}/mutations_table.txt', sep="\t", header=True)


2021-02-02 15:31:20,837 Error fetching enrichment results: KEGG_2019_Human
2021-02-02 15:31:24,354 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-02 15:31:28,342 Error fetching enrichment results: GO_Molecular_Function_2018
