In [30]:
import pandas as pd
# Input locations, relative to the notebook's working directory.
variants_path = '../analysis/merged_variants_anno_vaf_filtered.txt'
metadata_path = '../Metadata2.xlsx'

# Tab-separated variant table: first row is the header, first column becomes
# the row index. low_memory=False makes pandas read the file in one pass
# instead of chunk-wise dtype inference.
mutations = pd.read_csv(
    variants_path,
    sep="\t",
    header=0,
    index_col=0,
    low_memory=False,
)

# Per-sample metadata sheet (first sheet of the workbook).
metasamples = pd.read_excel(metadata_path)

In [31]:
# Keep only samples that are tumor-normal pairs and labeled as included.
# Each condition is named so the combined mask reads as a sentence.
is_included = metasamples['INCLUDED'] == "Y"
is_paired = metasamples['TUMOR_ONLY'] == "No"
is_not_gdna = metasamples['SAMPLE_TYPE'] != "gDNA"
has_var1 = metasamples['VAR1'].notnull()
df = metasamples.loc[is_included & is_paired & is_not_gdna & has_var1,
                     ['PATIENT_ID', 'SAMPLE_ID']]

# Re-key the full metadata by SAMPLE_ID (the column itself is kept), then
# pull the rows of the selected samples in the same order as `df`.
metasamples.index = metasamples['SAMPLE_ID']
selected_sample_ids = df['SAMPLE_ID'].tolist()
meta_filtered = metasamples.reindex(selected_sample_ids)

# Sample labels have the form "<SAMPLE_ID>-<PATIENT_ID>"; they are used both
# as the index of the filtered metadata and as column names in the variant
# table.
SAMPLES = [label.strip() for label in df['SAMPLE_ID'] + '-' + df['PATIENT_ID']]
meta_filtered.index = SAMPLES

# Keep the first ten columns of the variant table (presumably the variant
# annotation fields -- TODO confirm) followed by the selected sample columns.
leading_columns = mutations.columns[0:10].to_list()
mutations_filtered = mutations.loc[:, leading_columns + SAMPLES]

In [42]:
# Report the number of distinct patients and samples in each VAR1 group.
for group in ('BEFORE', 'AFTER'):
    group_meta = meta_filtered[meta_filtered['VAR1'] == group]
    n_patients = len(set(group_meta['PATIENT_ID']))
    n_samples = len(set(group_meta['SAMPLE_ID']))
    print('Total patients ({}) {}'.format(group, n_patients))
    print('Total samples ({}) {}'.format(group, n_samples))

Total patients (BEFORE) 4
Total samples (BEFORE) 6
Total patients (AFTER) 4
Total samples (AFTER) 4


In [35]:
from collections import defaultdict

# A sample counts as carrying a variant when its value in the variant table
# is >= VAF_FILTER (same units as the table; presumably percent -- TODO confirm).
VAF_FILTER = 5.0
# Minimum number of distinct patients required for a variant to be reported
# in the "seen only in one group" lists.
NUM_PATIENTS_FILTER = 1

# Variants seen in exactly one group:
#   (variant index, GENE value, samples, number of samples, number of patients)
mutations_seen_only_BEFORE = list()
mutations_seen_only_AFTER = list()
# gene -> list of (variant index, summed VAF, samples, patients) for every
# variant seen in that group, whether or not it is also seen in the other one.
mutations_seen_BEFORE = defaultdict(list)
mutations_seen_AFTER = defaultdict(list)

SAMPLES_BEFORE = meta_filtered[meta_filtered['VAR1'] == 'BEFORE'].index.to_list()
SAMPLES_AFTER = meta_filtered[meta_filtered['VAR1'] == 'AFTER'].index.to_list()

# Set views of the two groups: membership is tested once per (variant, sample)
# cell, so avoid scanning a list every time.
before_lookup = set(SAMPLES_BEFORE)
after_lookup = set(SAMPLES_AFTER)

# Sample label -> patient ID, taken from the metadata rather than parsed back
# out of the "<SAMPLE_ID>-<PATIENT_ID>" label: splitting the label on '-'
# would truncate any patient ID that itself contains a hyphen and could merge
# distinct patients.
patient_of_sample = {
    sample: str(patient).strip()
    for sample, patient in zip(meta_filtered.index, meta_filtered['PATIENT_ID'])
}

for index, row in mutations_filtered.iterrows():
    samples_before = list()
    samples_after = list()
    gene = row['GENE']
    acc_vaf_BEFORE = 0
    acc_vaf_AFTER = 0

    # Columns from position 10 onwards are the per-sample values.
    for column, value in row.iloc[10:].items():
        vaf = float(value)
        # Written as a positive test so NaN (never >= anything) is skipped.
        if vaf >= VAF_FILTER:
            if column in before_lookup:
                acc_vaf_BEFORE += vaf
                samples_before.append(column)
            if column in after_lookup:
                acc_vaf_AFTER += vaf
                samples_after.append(column)

    patients_before = {patient_of_sample[s] for s in samples_before}
    patients_after = {patient_of_sample[s] for s in samples_after}

    # The GENE value is split on '-' and the variant is credited to every
    # resulting token.
    # NOTE(review): a gene symbol that itself contains a hyphen would also be
    # split here -- confirm against the annotation format.
    if samples_before:
        for g in gene.split('-'):
            mutations_seen_BEFORE[g].append(
                (index, acc_vaf_BEFORE, samples_before, patients_before))
        if not samples_after and len(patients_before) >= NUM_PATIENTS_FILTER:
            mutations_seen_only_BEFORE.append(
                (index, gene, samples_before, len(samples_before), len(patients_before)))

    if samples_after:
        for g in gene.split('-'):
            mutations_seen_AFTER[g].append(
                (index, acc_vaf_AFTER, samples_after, patients_after))
        if not samples_before and len(patients_after) >= NUM_PATIENTS_FILTER:
            mutations_seen_only_AFTER.append(
                (index, gene, samples_after, len(samples_after), len(patients_after)))

In [38]:
# Write one row per gene for the BEFORE group: summed VAF plus the distinct
# variants, samples and patients that contributed to it.
with open('../analysis/analysis_before_after/BEFORE/genes_table.txt', 'w') as f:
    f.write('GENE\tCUM_VAF\tNUM_MUTATIONS\tMUTATIONS\tNUM_SAMPLES\tSAMPLES\tNUM_PATIENTS\tPATIENTS\n')
    for gene, records in mutations_seen_BEFORE.items():
        acc_vaf = 0
        patients = set()
        samples = set()
        # Deliberately NOT named `mutations`: that name holds the variant
        # DataFrame loaded at the top of the notebook, and rebinding it here
        # breaks any re-run of the cells that read it.
        mutation_ids = set()
        # Each record is (variant index, summed VAF, samples, patients).
        for mut_index, mut_vaf, mut_samples, mut_patients in records:
            acc_vaf += mut_vaf
            mutation_ids.add(mut_index)
            samples.update(mut_samples)
            patients.update(mut_patients)
        # Sets have no stable iteration order; sort so the file is identical
        # from one run to the next.
        f.write('\t'.join([gene,
                           str(acc_vaf),
                           str(len(mutation_ids)),
                           ';'.join(sorted(mutation_ids)),
                           str(len(samples)),
                           ';'.join(sorted(samples)),
                           str(len(patients)),
                           ';'.join(sorted(patients))]) + '\n')

In [39]:
# Write one row per gene for the AFTER group: summed VAF plus the distinct
# variants, samples and patients that contributed to it.
with open('../analysis/analysis_before_after/AFTER/genes_table.txt', 'w') as f:
    f.write('GENE\tCUM_VAF\tNUM_MUTATIONS\tMUTATIONS\tNUM_SAMPLES\tSAMPLES\tNUM_PATIENTS\tPATIENTS\n')
    for gene, records in mutations_seen_AFTER.items():
        acc_vaf = 0
        patients = set()
        samples = set()
        # Deliberately NOT named `mutations`: that name holds the variant
        # DataFrame loaded at the top of the notebook, and rebinding it here
        # breaks any re-run of the cells that read it.
        mutation_ids = set()
        # Each record is (variant index, summed VAF, samples, patients).
        for mut_index, mut_vaf, mut_samples, mut_patients in records:
            acc_vaf += mut_vaf
            mutation_ids.add(mut_index)
            samples.update(mut_samples)
            patients.update(mut_patients)
        # Sets have no stable iteration order; sort so the file is identical
        # from one run to the next.
        f.write('\t'.join([gene,
                           str(acc_vaf),
                           str(len(mutation_ids)),
                           ';'.join(sorted(mutation_ids)),
                           str(len(samples)),
                           ';'.join(sorted(samples)),
                           str(len(patients)),
                           ';'.join(sorted(patients))]) + '\n')

In [40]:
import gseapy as gp
import matplotlib.pyplot as plt

#genes = [x[1] for x in mutations_seen_only_BEFORE]
#enr = gp.enrichr(gene_list = genes,
#                 gene_sets = ['KEGG_2019_Human','GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
#                 organism = 'Human',
#                 description = 'Group BEFORE',
#                 outdir = '../analysis/analysis_before_after/BEFORE',
#                 # no_plot=True,
#                 cutoff = 1.0)

# Rows of the variant table for variants seen only in the BEFORE group, in
# the order they were collected.
only_before_index = [rec[0] for rec in mutations_seen_only_BEFORE]
mutations_BEFORE = mutations_filtered.loc[only_before_index, :]

# Summary columns placed at positions 10, 11 and 12, i.e. right after the
# ten leading columns and ahead of the per-sample columns.
# Each record is (index, gene, samples, number of samples, number of patients).
summary_columns = (
    ('SAMPLES', [';'.join(rec[2]) for rec in mutations_seen_only_BEFORE]),
    ('NUM_SAMPLES', [rec[3] for rec in mutations_seen_only_BEFORE]),
    ('NUM_PATIENTS', [rec[4] for rec in mutations_seen_only_BEFORE]),
)
for position, (label, values) in enumerate(summary_columns, start=10):
    mutations_BEFORE.insert(position, label, values)

mutations_BEFORE.to_csv(
    '../analysis/analysis_before_after/BEFORE/mutations_table.txt',
    sep="\t",
    header=True,
)

In [41]:
# GENE value of every variant seen only in the AFTER group; only consumed by
# the disabled enrichment call below.
genes = [rec[1] for rec in mutations_seen_only_AFTER]

# NOTE(review): the disabled call below writes to 'analysis_before_after/AFTER',
# without the '../analysis/' prefix used for the tables -- confirm before
# re-enabling.
#enr = gp.enrichr(gene_list = genes,
#                 gene_sets = ['KEGG_2019_Human','GO_Biological_Process_2018', 'GO_Molecular_Function_2018'],
#                 organism = 'Human',
#                 description = 'Group AFTER',
#                 outdir = 'analysis_before_after/AFTER',
#                 # no_plot=True,
#                 cutoff = 1.0)

# Rows of the variant table for variants seen only in the AFTER group, in
# the order they were collected.
only_after_index = [rec[0] for rec in mutations_seen_only_AFTER]
mutations_AFTER = mutations_filtered.loc[only_after_index, :]

# Summary columns placed at positions 10, 11 and 12, i.e. right after the
# ten leading columns and ahead of the per-sample columns.
# Each record is (index, gene, samples, number of samples, number of patients).
summary_columns = (
    ('SAMPLES', [';'.join(rec[2]) for rec in mutations_seen_only_AFTER]),
    ('NUM_SAMPLES', [rec[3] for rec in mutations_seen_only_AFTER]),
    ('NUM_PATIENTS', [rec[4] for rec in mutations_seen_only_AFTER]),
)
for position, (label, values) in enumerate(summary_columns, start=10):
    mutations_AFTER.insert(position, label, values)

mutations_AFTER.to_csv(
    '../analysis/analysis_before_after/AFTER/mutations_table.txt',
    sep="\t",
    header=True,
)