# Data Pipeline - Mutation data

## Required Python Libraries

In [None]:
import glob
import os
import pickle

import numpy as np
import pandas as pd

## Envision data

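Download the file https://envision.gs.washington.edu/shiny/downloads/human_predicted_combined_20170925.csv.bz2. Decompress it, and run the following shell commands to split the large file into multiple smaller files, one for each Uniprot ID. The code below expects one file named `<Uniprot ID>.csv` per protein in the Envision input directory, each keeping the header row with at least the columns `id2` and `Envision_predictions`: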

## TCGA Samples

### Input Files

Use the GDC Data Transfer Tool (https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) to download TCGA files listed in one of the manifest files. 

You can use data from any of the four available somatic variant callers. The TCGA output in this repo was generated using Mutect.<br>
MuSE: manifest_muse.txt<br>
Mutect: manifest_mutect.txt<br>
Somatic Sniper: manifest_somaticsniper.txt<br>
VarScan: manifest_varscan.txt<br>
<br>
Rename each .maf file to the associated cohort abbreviation in capital letters. You should have files named 'ACC.maf', 'BLCA.maf', etc.

### Code

Load list of model genes

In [None]:
# tab-separated table of model genes (the GENEID and SYMBOL columns are used below)
model_genes = pd.read_csv('../recon/genes.tsv', sep='\t')

Get missense mutations in model genes for every TCGA patient

In [None]:
# Build the per-mutation results table for all TCGA cohorts.
# One row per missense SNP in a model gene that has a Swiss-Prot accession;
# ENVISION is left as NaN here and filled in by a later cell.
mutation_columns = ['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION']

# list of enzymes with envision data available (file name up to the first '.').
# os.path.basename copes with whichever path separator glob returns, so this
# no longer fails on platforms that do not use '\\'.
envision_list = [os.path.basename(x).split('.')[0] for x in glob.glob('input_data/envision/*.csv')]

# Entrez gene ids of the model genes, converted to strings once instead of once per row
model_gene_ids = set(model_genes['GENEID'].astype(str))

# one filtered, reshaped table per maf file
frames = []

# iterate over maf files
for fn in glob.glob('input_data/TCGA/*.maf'):

    # load file (the first five lines are skipped; the sixth is the header)
    df_maf = pd.read_table(fn,skiprows=[0,1,2,3,4],header=0)

    # keep missense SNPs in model genes that carry a Swiss-Prot accession (a string value)
    keep = (
        (df_maf['Variant_Classification'] == 'Missense_Mutation')
        & (df_maf['Variant_Type'] == 'SNP')
        & df_maf['Entrez_Gene_Id'].astype(str).isin(model_gene_ids)
        & df_maf['SWISSPROT'].map(lambda x: isinstance(x, str)).astype(bool)
    )
    df_maf = df_maf[keep].reset_index(drop=True)

    # nothing to add for this cohort
    if df_maf.empty:
        continue

    # 'a/b' style fields: the part before '/' is kept for the position,
    # both parts are kept for amino acids and codons
    amino_acids = df_maf['Amino_acids'].str.split('/')
    codon_pairs = df_maf['Codons'].str.split('/')

    # build all rows of this file at once instead of appending one row at a time
    frames.append(pd.DataFrame({
        'SAMPLE': df_maf['Tumor_Sample_Barcode'],
        'GENE SYMBOL': df_maf['Hugo_Symbol'],
        'UNIPROT': df_maf['SWISSPROT'],
        'POSITION': df_maf['Protein_position'].str.split('/').str[0],
        'AA1': amino_acids.str[0],
        'AA2': amino_acids.str[1],
        'CODON1': codon_pairs.str[0],
        'CODON2': codon_pairs.str[1],
        'ENVISION': np.nan,
    }, columns=mutation_columns))

# single table with a fresh 0..n-1 index (later cells address rows by that index)
if frames:
    mutations = pd.concat(frames, ignore_index=True)
else:
    mutations = pd.DataFrame(columns=mutation_columns)

Fix mutation information for genes with multiple mutations at the same position

In [None]:
# Merge several mutation records that hit the same sample / gene / protein position
# into a single amino-acid change.
# Relies on `mutations` having the default 0..n-1 index: row numbers are used both
# as labels (.at) and as positions (mutations.index[...]).

# amino acid codons (DNA codon -> one-letter amino acid, 'stop' for stop codons)
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}

# sample -> gene -> position -> {'id': [row numbers], 'codon': [mutant codons]}
data = {}

# read each needed column once rather than materialising a row per lookup
row_fields = zip(
    mutations['SAMPLE'].tolist(),
    mutations['GENE SYMBOL'].tolist(),
    mutations['POSITION'].tolist(),
    mutations['CODON2'].tolist(),
)
for row_id, (sample, gene, position, mutant_codon) in enumerate(row_fields):
    entry = data.setdefault(sample, {}).setdefault(gene, {}).setdefault(position, {'id': [], 'codon': []})
    entry['id'].append(row_id)
    entry['codon'].append(mutant_codon)

# remove codon columns
mutations = mutations.drop(['CODON1', 'CODON2'], axis=1)

# find any positions with two or more mutations
to_remove = []
for sample in data:
    for gene in data[sample]:
        for position in data[sample][gene]:
            row_ids = data[sample][gene][position]['id']
            mutant_codons = data[sample][gene][position]['codon']
            if len(mutant_codons) < 2:
                continue

            # build true codon: start from the first record, upper-cased ...
            codon = list(mutant_codons[0].upper())

            # ... then let every further record overwrite the slot of its first
            # upper-case base (assumed to mark the substituted nucleotide - TODO confirm)
            for other in mutant_codons[1:]:
                for slot, base in enumerate(other[:3]):
                    if base in ('A', 'G', 'C', 'T'):
                        codon[slot] = base
                        break

            # KeyError here means the combined codon is not in the table above
            amino_acid = codons[''.join(codon)]

            # if stop codon, just remove every record from the data frame
            if amino_acid == 'stop':
                to_remove.extend(row_ids)

            # if not stop codon, keep the first record with the combined amino acid
            else:
                mutations.at[row_ids[0], 'AA2'] = amino_acid
                to_remove.extend(row_ids[1:])

mutations = mutations.drop(mutations.index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [None]:
# Look up the Envision score of every mutation.
# Rows are grouped by protein first so that each Envision file is read once,
# instead of once per mutation row.

# proteins with an Envision file; a set makes the membership test O(1)
envision_available = set(envision_list)

# columns needed to build the lookup key, read once
uniprots = mutations['UNIPROT'].tolist()
aa1s = mutations['AA1'].tolist()
positions = mutations['POSITION'].tolist()
aa2s = mutations['AA2'].tolist()

# protein -> row numbers of its mutations (only proteins with Envision data)
rows_by_uniprot = {}
for i, uniprot in enumerate(uniprots):
    if uniprot in envision_available:
        rows_by_uniprot.setdefault(uniprot, []).append(i)

for uniprot, rows in rows_by_uniprot.items():

    # load envision data for that protein
    df_envision = pd.read_csv('input_data/envision/%s.csv' % uniprot)

    # id2 -> score; if an id2 occurs more than once, the first row wins
    df_envision = df_envision.drop_duplicates(subset='id2', keep='first')
    scores = dict(zip(df_envision['id2'].tolist(), df_envision['Envision_predictions'].tolist()))

    for i in rows:

        # mutation string, e.g. '<uniprot>_<AA1><position><AA2>'
        mutation = '%s_%s%s%s' % (uniprot, aa1s[i], positions[i], aa2s[i])

        # if envision score available for this mutation
        if mutation in scores:
            mutations.at[i,'ENVISION'] = scores[mutation]

Combine Envision scores within individual sample and gene, produce output file for each sample

In [None]:
# Combine Envision scores per sample and gene (product of all scored mutations)
# and write one CSV per sample.

# sample -> gene -> product of Envision scores
data = {}

# read each needed column once rather than materialising a row per lookup
score_rows = zip(
    mutations['SAMPLE'].tolist(),
    mutations['GENE SYMBOL'].tolist(),
    mutations['ENVISION'].tolist(),
)
for sample, gene, score in score_rows:

    # skip mutations without an envision score
    if pd.isna(score):
        continue

    gene_scores = data.setdefault(sample, {})
    if gene in gene_scores:
        # multiply original value by current value
        gene_scores[gene] *= score
    else:
        gene_scores[gene] = score

# make sure the output directory exists before writing into it
if not os.path.isdir('TCGA'):
    os.makedirs('TCGA')

# create file for each sample
# NOTE(review): the file name uses only the first 16 characters of the sample id,
# so ids sharing that prefix write to the same file and the later one overwrites.
for sample in data:
    with open('TCGA/%s.csv' % sample[:16],'w') as f:
        f.write('GENE,ENVISION\n')

        # write envision scores
        for gene in data[sample]:
            f.write('%s,%f\n' % (gene,data[sample][gene]))

Save associations between gene symbols and Swissprot accessions

In [None]:
# initialize dictionary
swissprot = {}

# load maf files, reading only the two columns that are used below
# (the full files have many more columns and are large)
# NOTE(review): this cell reads from '_data_/input/TCGA/' while the extraction
# cell above reads from 'input_data/TCGA/' - confirm which location is intended.
mafs = []
for fn in glob.glob('_data_/input/TCGA/*.maf'):
    mafs.append(pd.read_table(fn,skiprows=[0,1,2,3,4],header=0,usecols=['Hugo_Symbol','SWISSPROT']))

# one table of (gene symbol, Swiss-Prot accession) pairs across all cohorts
df_maf = pd.concat(mafs, ignore_index=True, sort=False)
df_maf = df_maf[['Hugo_Symbol','SWISSPROT']]

In [None]:
# swissprot: gene symbol -> Swiss-Prot accession for every model gene.
# A gene gets an accession only when the maf rows for that symbol show exactly one
# distinct SWISSPROT value (a missing value counts as a value); otherwise NaN.

# distinct SWISSPROT values per gene symbol, computed in one pass over the table
# instead of one full scan per model gene
accessions_by_gene = df_maf.groupby('Hugo_Symbol')['SWISSPROT'].unique().to_dict()

swissprot = {}
for gene in model_genes['SYMBOL'].tolist():
    founds = accessions_by_gene.get(gene, ())
    if len(founds) == 1:
        swissprot[gene] = founds[0]
    else:
        swissprot[gene] = np.nan

In [None]:
# store the gene symbol -> Swiss-Prot mapping on disk (binary pickle, highest protocol)
with open('_data_/processing/swissprot.pickle', 'wb') as handle:
    pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(swissprot)

## CCLE Samples

### Input Files

'CCLE_DepMap_18q3_maf_20180718.txt' - Go to https://portals.broadinstitute.org/ccle/data, download 'CCLE_DepMap_18q3_maf_20180718.txt', and place it in `_data_/input/CCLE/` (the location the code below reads it from).

### Code

Load list of model genes

In [None]:
# tab-separated table of model genes (the GENEID column is used below)
model_genes = pd.read_csv('../recon/genes.tsv', sep='\t')

Load Swissprot conversion

In [None]:
# Load the gene symbol -> Swiss-Prot mapping written by the TCGA section.
# The file is a binary pickle (written with 'wb'), so it must be opened with 'rb';
# text mode fails under Python 3.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('_data_/processing/swissprot.pickle', 'rb') as f:
    swissprot = pickle.load(f)

Load CCLE mutation data

In [None]:
# list of enzymes with envision data available (file name up to the first '.').
# os.path.basename copes with whichever path separator glob returns, so the
# result no longer depends on the platform using '/'.
envision_list = [os.path.basename(x).split('.')[0] for x in glob.glob('_data_/input/envision/*.csv')]

In [None]:
# initialize results data frame
mutations = pd.DataFrame(columns=['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION'])

# load file
df_maf = pd.read_table('_data_/input/CCLE/CCLE_DepMap_18q3_maf_20180718.txt')

# Entrez gene ids of the model genes, converted to strings once instead of once per row
model_gene_ids = set(model_genes['GENEID'].astype(str))

# only keep missense SNPs in model genes
keep = (
    (df_maf['Variant_Classification'] == 'Missense_Mutation')
    & (df_maf['Variant_Type'] == 'SNP')
    & df_maf['Entrez_Gene_Id'].astype(str).isin(model_gene_ids)
)

# fresh 0..n-1 index; the following cells address rows by it
df_maf = df_maf[keep].reset_index(drop=True)

In [None]:
# keep the rows whose gene symbol is a key of the Swiss-Prot mapping
has_mapping = np.array([symbol in swissprot for symbol in df_maf['Hugo_Symbol']], dtype=bool)
df_maf = df_maf[has_mapping].reset_index(drop=True)

In [None]:
# derive the mutation table columns from the raw maf fields
barcodes = df_maf['Tumor_Sample_Barcode'].tolist()
symbols = df_maf['Hugo_Symbol'].tolist()
protein_changes = df_maf['Protein_Change'].tolist()
codon_changes = df_maf['Codon_Change'].tolist()

# sample id is the barcode up to the first underscore
df_maf['SAMPLE'] = [barcode.split('_')[0] for barcode in barcodes]
df_maf['GENE SYMBOL'] = symbols
df_maf['UNIPROT'] = [swissprot[symbol] for symbol in symbols]

# Protein_Change is sliced by position: character 2 is the original residue,
# the last character the new residue, and everything in between the position
df_maf['POSITION'] = [change[3:-1] for change in protein_changes]
df_maf['AA1'] = [change[2] for change in protein_changes]
df_maf['AA2'] = [change[-1] for change in protein_changes]

# Codon_Change ends with two three-letter codons separated by one character
df_maf['CODON1'] = [change[-7:-4] for change in codon_changes]
df_maf['CODON2'] = [change[-3:] for change in codon_changes]

# scores are filled in by a later cell
df_maf['ENVISION'] = np.nan

result_columns = ['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION']
mutations = df_maf[result_columns]

Fix mutation information for genes with multiple mutations at the same position

In [None]:
# Merge several mutation records that hit the same sample / gene / protein position
# into a single amino-acid change.
# Relies on `mutations` having the default 0..n-1 index: row numbers are used both
# as labels (.at) and as positions (mutations.index[...]).

# amino acid codons (DNA codon -> one-letter amino acid, 'stop' for stop codons)
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}

# sample -> gene -> position -> {'id': [row numbers], 'codon': [mutant codons]}
data = {}

# read each needed column once rather than materialising a row per lookup
row_fields = zip(
    mutations['SAMPLE'].tolist(),
    mutations['GENE SYMBOL'].tolist(),
    mutations['POSITION'].tolist(),
    mutations['CODON2'].tolist(),
)
for row_id, (sample, gene, position, mutant_codon) in enumerate(row_fields):
    entry = data.setdefault(sample, {}).setdefault(gene, {}).setdefault(position, {'id': [], 'codon': []})
    entry['id'].append(row_id)
    entry['codon'].append(mutant_codon)

# remove codon columns
mutations = mutations.drop(['CODON1', 'CODON2'], axis=1)

# find any positions with two or more mutations
to_remove = []
for sample in data:
    for gene in data[sample]:
        for position in data[sample][gene]:
            row_ids = data[sample][gene][position]['id']
            mutant_codons = data[sample][gene][position]['codon']
            if len(mutant_codons) < 2:
                continue

            # build true codon: start from the first record, upper-cased ...
            codon = list(mutant_codons[0].upper())

            # ... then let every further record overwrite the slot of its first
            # upper-case base (assumed to mark the substituted nucleotide - TODO confirm)
            for other in mutant_codons[1:]:
                for slot, base in enumerate(other[:3]):
                    if base in ('A', 'G', 'C', 'T'):
                        codon[slot] = base
                        break

            # KeyError here means the combined codon is not in the table above
            amino_acid = codons[''.join(codon)]

            # if stop codon, just remove every record from the data frame
            if amino_acid == 'stop':
                to_remove.extend(row_ids)

            # if not stop codon, keep the first record with the combined amino acid
            else:
                mutations.at[row_ids[0], 'AA2'] = amino_acid
                to_remove.extend(row_ids[1:])

mutations = mutations.drop(mutations.index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [None]:
# accessions that occur in the mutation table as strings and have an Envision file
uniprot_list = [
    accession
    for accession in list(set(mutations['UNIPROT'].tolist()))
    if type(accession) == str and accession in envision_list
]

# the matching Envision tables, in the same order as uniprot_list
df_uniprot_list = [
    pd.read_csv('_data_/input/envision/%s.csv' % accession)
    for accession in uniprot_list
]

In [None]:
# Look up the Envision score of every mutation.
# A score is assigned only when exactly one Envision row has the matching id2.
# Rows are grouped by protein so each protein's table is indexed once, instead of
# being scanned in full for every mutation.

# protein -> loaded Envision table (uniprot_list and df_uniprot_list are parallel)
frames_by_uniprot = dict(zip(uniprot_list, df_uniprot_list))

# columns needed to build the lookup key, read once
uniprots = mutations['UNIPROT'].tolist()
aa1s = mutations['AA1'].tolist()
positions = mutations['POSITION'].tolist()
aa2s = mutations['AA2'].tolist()

# protein -> row numbers of its mutations (only proteins with uniprot data available)
rows_by_uniprot = {}
for i, uniprot in enumerate(uniprots):
    if uniprot in frames_by_uniprot:
        rows_by_uniprot.setdefault(uniprot, []).append(i)

for uniprot, rows in rows_by_uniprot.items():

    # id2 -> score for ids that occur exactly once in this protein's table
    df_subset = frames_by_uniprot[uniprot].drop_duplicates(subset='id2', keep=False)
    scores = dict(zip(df_subset['id2'].tolist(), df_subset['Envision_predictions'].tolist()))

    for i in rows:

        # mutation string, e.g. '<uniprot>_<AA1><position><AA2>'
        mutation = '%s_%s%s%s' % (uniprot, aa1s[i], positions[i], aa2s[i])

        # if value, get envision score
        if mutation in scores:
            mutations.at[i,'ENVISION'] = scores[mutation]

Combine Envision scores within individual sample and gene, produce output file for each sample

In [None]:
# Combine Envision scores per sample and gene: the stored value is the product
# of the scores of all scored mutations of that gene in that sample.

# sample -> gene -> product of Envision scores
data = {}

# read each needed column once rather than materialising a row per lookup
score_rows = zip(
    mutations['SAMPLE'].tolist(),
    mutations['GENE SYMBOL'].tolist(),
    mutations['ENVISION'].tolist(),
)
for sample, gene, score in score_rows:

    # skip mutations without an envision score
    if pd.isna(score):
        continue

    gene_scores = data.setdefault(sample, {})
    if gene in gene_scores:
        # multiply original value by current value
        gene_scores[gene] *= score
    else:
        gene_scores[gene] = score

In [None]:
# Write one 'GENE,ENVISION' CSV per sample into the CCLE output directory.

# make sure the output directory exists before writing into it
if not os.path.isdir('CCLE'):
    os.makedirs('CCLE')

# create file for each sample
for sample in data:

    # NOTE(review): the sample named 'TT' is skipped on purpose by the original code;
    # presumably the name is ambiguous once the barcode is cut at the first
    # underscore - confirm.
    if sample == 'TT':
        continue

    # initialize file
    with open('CCLE/%s.csv' % sample,'w') as f:
        f.write('GENE,ENVISION\n')

        # write envision scores
        for gene in data[sample]:
            f.write('%s,%f\n' % (gene,data[sample][gene]))