# Data Pipeline - IDH1 mutations

## Required Python Libraries

In [None]:
import glob
import numpy as np
import pandas as pd

## Code

List of mutations

In [None]:
# IDH1 protein changes to look for, written without the 'p.' prefix
mutations = [
    'H133Q', 'A134D', 'R100Q',
    'R132H', 'R132C', 'R132G', 'R132W',
    'R132A', 'R132Q', 'R132K', 'R132N',
]

# one independent sample list per mutation; patients[i] belongs to mutations[i]
patients = [[] for _ in mutations]

TCGA samples

In [None]:
# visit every TCGA MAF file in the input directory
for maf_path in glob.glob('../mutation/_data_/input/TCGA/*.maf'):

    # the first five lines of the file are skipped; the next line supplies
    # the column names
    df_maf = pd.read_table(maf_path, skiprows=[0, 1, 2, 3, 4], header=0)

    # keep only the rows that describe IDH1
    df_maf = df_maf.loc[df_maf['Hugo_Symbol'] == 'IDH1']

    # walk the mutations together with their (index-aligned) sample lists
    for mutation, known in zip(mutations, patients):
        hits = df_maf.loc[df_maf['HGVSp_Short'] == 'p.%s' % mutation]

        # record each barcode once, truncated to its first 16 characters
        # NOTE(review): presumably the sample-level part of the TCGA barcode - confirm
        for barcode in hits['Tumor_Sample_Barcode'].values.tolist():
            short_barcode = barcode[:16]
            if short_barcode in known:
                continue
            known.append(short_barcode)

CCLE samples

In [None]:
# read the CCLE mutation table
df_maf = pd.read_table('../mutation/_data_/input/CCLE/CCLE_DepMap_18q3_maf_20180718.txt')

# keep only the rows that describe IDH1
df_maf = df_maf.loc[df_maf['Hugo_Symbol'] == 'IDH1']

# walk the mutations together with their (index-aligned) sample lists
for mutation, known in zip(mutations, patients):
    hits = df_maf.loc[df_maf['Protein_Change'] == 'p.%s' % mutation]

    # a sample is identified by the barcode text before the first underscore
    barcodes = hits['Tumor_Sample_Barcode'].values.tolist()
    names = [barcode.split('_')[0] for barcode in barcodes]

    # record each name once, preserving first-seen order
    for name in names:
        if name in known:
            continue
        known.append(name)

Output sample list

In [None]:
# write one '<mutation>.txt' file per mutation into the working directory,
# listing the collected samples one per line
for mutation, members in zip(mutations, patients):
    with open('%s.txt' % mutation, 'w') as out:
        out.writelines('%s\n' % member for member in members)