In [80]:
import pandas as pd
# Tab-separated variant table: the first line is the header and the first
# column becomes the row index. low_memory=False makes pandas parse the file
# in one pass instead of in chunks.
mutations = pd.read_csv(
    '../analysis/merged_variants_anno_vaf_filtered.txt',
    sep="\t",
    header=0,
    index_col=0,
    low_memory=False,
)

# Sample metadata spreadsheet (read with pandas' default sheet and header settings).
metasamples = pd.read_excel('../Metadata2.xlsx')

In [81]:
# Restrict the analysis to samples flagged as included, that are not
# tumor-only (i.e. tumor-normal pairs), and whose sample type is not gDNA.
keep_mask = (
    (metasamples['INCLUDED'] == "Y")
    & (metasamples['TUMOR_ONLY'] == "No")
    & (metasamples['SAMPLE_TYPE'] != "gDNA")
)
df = metasamples.loc[keep_mask, ['PATIENT_ID', 'SAMPLE_ID']]

# Index the full metadata by sample ID, then pull out the selected samples
# by label, in the same order as `df`.
metasamples.index = metasamples['SAMPLE_ID']
selected_sample_ids = df['SAMPLE_ID'].tolist()
meta_filtered = metasamples.reindex(selected_sample_ids)

# Build "<SAMPLE_ID>-<PATIENT_ID>" labels (outer whitespace stripped); they are
# used both as the metadata index and as column labels of the mutation table.
sample_labels = df['SAMPLE_ID'] + '-' + df['PATIENT_ID']
SAMPLES = [label.strip() for label in sample_labels]
meta_filtered.index = SAMPLES

# Keep the first 10 columns of the mutation table plus one column per selected sample.
leading_columns = mutations.columns[0:10].to_list()
mutations_filtered = mutations.loc[:, leading_columns + SAMPLES]

In [82]:
# Cohort overview: the distinct patient IDs, then the number of distinct
# patients and samples overall.
all_patient_ids = set(meta_filtered['PATIENT_ID'])
print(all_patient_ids)

print(len(all_patient_ids))
print(len(set(meta_filtered['SAMPLE_ID'])))

# Same two counts (patients, then samples) for each VAR2 group, E first and F second.
for var2_group in ('E', 'F'):
    group_meta = meta_filtered[meta_filtered['VAR2'] == var2_group]
    print(len(set(group_meta['PATIENT_ID'])))
    print(len(set(group_meta['SAMPLE_ID'])))

{'VHIO_14', 'VHIO_29', 'VHIO_27', 'VHIO_35', 'VHIO_06', 'VHIO_34', 'VHIO_38', 'VHIO_17', 'VHIO_39', 'VHIO_16', 'VHIO_09', 'VHIO_28', 'VHIO_30', 'VHIO_15', 'VHIO_33', 'VHIO_01', 'VHIO_37', 'VHIO_13', 'VHIO_21', 'VHIO_10', 'VHIO_08', 'VHIO_11', 'VHIO_32', 'VHIO_40', 'VHIO_07'}
25
74
19
46
12
28


In [85]:
# A sample counts as carrying a mutation only when its value in the mutation
# table is STRICTLY greater than this threshold.
VAF_FILTER = 5.0
# A mutation is reported only when it is seen in STRICTLY more than this many
# distinct patients (so a value of 1 means "at least 2 patients").
NUM_PATIENTS_FILTER = 1

# Each entry: (row index, gene, list of sample labels, number of samples, number of patients).
mutations_seen_E = list()
mutations_seen_F = list()

# Sample labels (index of `meta_filtered`) belonging to each VAR2 group.
SAMPLES_E = meta_filtered[meta_filtered['VAR2'] == 'E'].index.to_list()
SAMPLES_F = meta_filtered[meta_filtered['VAR2'] == 'F'].index.to_list()
# Hash-based lookups for the inner loop; the lists above are kept unchanged
# for any later use.
SAMPLES_E_SET = frozenset(SAMPLES_E)
SAMPLES_F_SET = frozenset(SAMPLES_F)

for index, row in mutations_filtered.iterrows():
    samples_E = list()
    samples_F = list()
    gene = row['GENE']
    # Skip the first 10 columns by position; the remaining columns hold one
    # value per sample.
    for column, value in row.iloc[10:].items():
        if float(value) > VAF_FILTER:
            if column in SAMPLES_E_SET:
                samples_E.append(column)
            if column in SAMPLES_F_SET:
                samples_F.append(column)
    # Keep only mutations exclusive to one group: present in E and absent
    # from every F sample, or the other way round.
    if samples_E and not samples_F:
        # The patient ID is the text after the last '-' of the sample label.
        # NOTE(review): this assumes patient IDs contain no '-' themselves.
        patients = {x.split('-')[-1] for x in samples_E}
        if len(patients) > NUM_PATIENTS_FILTER:
            mutations_seen_E.append((index, gene, samples_E, len(samples_E), len(patients)))
    if not samples_E and samples_F:
        patients = {x.split('-')[-1] for x in samples_F}
        if len(patients) > NUM_PATIENTS_FILTER:
            mutations_seen_F.append((index, gene, samples_F, len(samples_F), len(patients)))

In [95]:
import gseapy as gp
import matplotlib.pyplot as plt

# Each entry of `mutations_seen_E` is
# (row index, gene, sample labels, number of samples, number of patients).
# One gene entry per mutation, so a gene can appear more than once.
genes_E = [gene for _row_id, gene, _labels, _n_samples, _n_patients in mutations_seen_E]

# Enrichr query for the group E gene list against three libraries.
# NOTE(review): the output recorded for this cell shows
# "Error fetching enrichment results" for all three libraries, so `enr` may
# hold no usable results -- confirm before relying on it.
enrichr_libraries = ['KEGG_2019_Human', 'GO_Biological_Process_2018', 'GO_Molecular_Function_2018']
enr = gp.enrichr(
    gene_list=genes_E,
    gene_sets=enrichr_libraries,
    organism='Human',
    description='Group E',
    outdir='../analysis/analysis_responders/E',
    cutoff=1.0,
)

# Rows of the filtered mutation table for the group E mutations, with three
# summary columns inserted at positions 10, 11 and 12.
row_ids_E = [row_id for row_id, _gene, _labels, _n_samples, _n_patients in mutations_seen_E]
mutations_E = mutations_filtered.loc[row_ids_E, :]
summary_columns_E = (
    ('SAMPLES', [';'.join(labels) for _row_id, _gene, labels, _n_samples, _n_patients in mutations_seen_E]),
    ('NUM_SAMPLES', [n_samples for _row_id, _gene, _labels, n_samples, _n_patients in mutations_seen_E]),
    ('NUM_PATIENTS', [n_patients for _row_id, _gene, _labels, _n_samples, n_patients in mutations_seen_E]),
)
for summary_offset, (summary_name, summary_values) in enumerate(summary_columns_E):
    mutations_E.insert(10 + summary_offset, summary_name, summary_values)

mutations_E.to_csv('../analysis/analysis_responders/E/mutations_table.txt', sep="\t", header=True)

2021-02-03 19:03:04,248 Error fetching enrichment results: KEGG_2019_Human
2021-02-03 19:03:07,043 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-03 19:03:09,940 Error fetching enrichment results: GO_Molecular_Function_2018


In [96]:
import gseapy as gp
import matplotlib.pyplot as plt

# Each entry of `mutations_seen_F` is
# (row index, gene, sample labels, number of samples, number of patients).
# One gene entry per mutation, so a gene can appear more than once.
genes_F = [gene for _row_id, gene, _labels, _n_samples, _n_patients in mutations_seen_F]

# Enrichr query for the group F gene list against three libraries.
# NOTE(review): the output recorded for this cell shows
# "Error fetching enrichment results" for all three libraries, so `enr` may
# hold no usable results -- confirm before relying on it.
enrichr_libraries = ['KEGG_2019_Human', 'GO_Biological_Process_2018', 'GO_Molecular_Function_2018']
enr = gp.enrichr(
    gene_list=genes_F,
    gene_sets=enrichr_libraries,
    organism='Human',
    description='Group F',
    outdir='../analysis/analysis_responders/F',
    cutoff=1.0,
)

# Rows of the filtered mutation table for the group F mutations, with three
# summary columns inserted at positions 10, 11 and 12.
row_ids_F = [row_id for row_id, _gene, _labels, _n_samples, _n_patients in mutations_seen_F]
mutations_F = mutations_filtered.loc[row_ids_F, :]
summary_columns_F = (
    ('SAMPLES', [';'.join(labels) for _row_id, _gene, labels, _n_samples, _n_patients in mutations_seen_F]),
    ('NUM_SAMPLES', [n_samples for _row_id, _gene, _labels, n_samples, _n_patients in mutations_seen_F]),
    ('NUM_PATIENTS', [n_patients for _row_id, _gene, _labels, _n_samples, n_patients in mutations_seen_F]),
)
for summary_offset, (summary_name, summary_values) in enumerate(summary_columns_F):
    mutations_F.insert(10 + summary_offset, summary_name, summary_values)

mutations_F.to_csv('../analysis/analysis_responders/F/mutations_table.txt', sep="\t", header=True)

2021-02-03 19:03:15,597 Error fetching enrichment results: KEGG_2019_Human
2021-02-03 19:03:18,262 Error fetching enrichment results: GO_Biological_Process_2018
2021-02-03 19:03:21,031 Error fetching enrichment results: GO_Molecular_Function_2018
