# Data Pipeline - Mutations

## Python 2.7 Required Modules

In [1]:
import glob
import os
import pickle

import numpy as np
import pandas as pd

## Input Data

Download the 'mutation/input_data' folder from https://www.dropbox.com/sh/ix63l8zb1kpcdj4/AABpiNF10TFdjO35vy1AjkYDa?dl=0 and place the 'input_data' folder in 'qFlux/data/mutation/'. The individual files are described below.

### Envision data

Original data file: https://envision.gs.washington.edu/shiny/downloads/human_predicted_combined_20170925.csv.bz2. This file was unzipped, and the following shell commands were run to split the large file into multiple smaller files, one for each Uniprot ID:

### NCI-60 Samples

'WES_variants.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Sequencing - Preprocessed - Cell lines - Cell-line sequence variants'

### HNSCC Samples

'WES_variants.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Sequencing - Preprocessed - Cell lines - Cell-line sequence variants'

### TCGA Samples

'*COHORT*.maf' - Use the GDC Data Transfer Tool (https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) to download TCGA files listed in one of the manifest files. 

You can use data from any of the four somatic variant callers available. Uploaded results are those using Mutect.<br>
MuSE: manifest_muse.txt<br>
Mutect: manifest_mutect.txt<br>
Somatic Sniper: manifest_somaticsniper.txt<br>
VarScan: manifest_varscan.txt<br>
<br>
Rename each .maf file to the associated cohort abbreviation in capital letters. You should have files named 'ACC.maf', 'BLCA.maf', etc.

## Extract Envision scores

Load list of model genes

In [2]:
# tab-separated table of the model genes
model_genes = pd.read_csv('../recon/genes.tsv', sep='\t')

### TCGA Samples

Get missense mutations in model genes for every TCGA patient

In [None]:
# column layout of the per-mutation results data frame
mutation_columns = ['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION']

# list of enzymes with envision data available
# (file name without directory and extension; basename/splitext works with both
#  '/' and '\\' separators, whereas splitting on '\\' only worked on Windows)
envision_list = [os.path.splitext(os.path.basename(x))[0] for x in glob.glob('input_data/envision/*.csv')]

# gene IDs of the model as strings, built once so the per-row membership test is cheap
model_gene_ids = set(str(a) for a in model_genes['GENEID'].values.tolist())

# rows are collected in a plain list and turned into a data frame once at the end;
# appending to a data frame row by row copies the whole frame on every insert
records = []

# iterate over maf files
for fn in glob.glob('input_data/TCGA/*.maf'):

    # load file
    df_maf = pd.read_table(fn,skiprows=[0,1,2,3,4],header=0)

    # only keep missense mutations
    df_maf = df_maf[(df_maf['Variant_Classification'] == 'Missense_Mutation') & (df_maf['Variant_Type'] == 'SNP')]

    # only keep genes in Recon3D that have an associated Uniprot ID (a string, i.e. not NaN)
    # explicit boolean arrays keep the selection well-defined even when no rows are left
    in_model = np.array([str(x) in model_gene_ids for x in df_maf['Entrez_Gene_Id']], dtype=bool)
    has_uniprot = np.array([type(x) == str for x in df_maf['SWISSPROT']], dtype=bool)
    df_maf = df_maf[in_model & has_uniprot].reset_index(drop=True)

    # iterate over mutations
    for barcode, symbol, uniprot, position, amino_acids, codon_change in zip(
            df_maf['Tumor_Sample_Barcode'], df_maf['Hugo_Symbol'], df_maf['SWISSPROT'],
            df_maf['Protein_position'], df_maf['Amino_acids'], df_maf['Codons']):

        # 'before/after' fields are split on '/'; the envision score is filled in later
        aa_pair = amino_acids.split('/')
        codon_pair = codon_change.split('/')
        records.append([barcode, symbol, uniprot, position.split('/')[0],
                        aa_pair[0], aa_pair[1], codon_pair[0], codon_pair[1], np.nan])

# build results data frame
mutations = pd.DataFrame(records, columns=mutation_columns)

Fix mutation information for genes with multiple mutations at the same position

In [None]:
# amino acid codons
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}

# sample -> gene -> position -> {'id': row numbers, 'codon': mutated codons}
data = {}
for row_id, (sample_id, symbol, pos, alt_codon) in enumerate(zip(
        mutations['SAMPLE'], mutations['GENE SYMBOL'], mutations['POSITION'], mutations['CODON2'])):

    # create the nested entries on first sight, then record this row
    hits = data.setdefault(sample_id, {}).setdefault(symbol, {}).setdefault(pos, {'id': [], 'codon': []})
    hits['id'].append(row_id)
    hits['codon'].append(alt_codon)

# remove codon columns
mutations = mutations.drop(['CODON1', 'CODON2'], axis=1)

# find any positions with two mutations
to_remove = []
for sample in data:
    for gene in data[sample]:
        for position in data[sample][gene]:
            hits = data[sample][gene][position]
            if len(hits['codon']) <= 1:
                continue

            # build true codon: start from the first mutated codon and, for each further
            # record, copy over its first capitalised base (capital marks the changed base)
            codon = list(hits['codon'][0].upper())
            for extra in hits['codon'][1:]:
                for k in range(3):
                    if extra[k] in ['A','G','C','T']:
                        codon[k] = extra[k]
                        break
            codon = ''.join(codon)

            if codons[codon] == 'stop':
                # if stop codon, just remove all records from data frame
                to_remove.extend(hits['id'])
            else:
                # if not stop codon, keep the first record with the combined amino acid
                mutations.at[hits['id'][0], 'AA2'] = codons[codon]
                to_remove.extend(hits['id'][1:])

mutations = mutations.drop(mutations.index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [None]:
# proteins with envision data, as a set for fast membership tests
envision_available = set(envision_list)

# group the row labels by protein so that each per-protein envision file is
# parsed once instead of once per mutation
rows_by_uniprot = {}
for i, uniprot in zip(mutations.index, mutations['UNIPROT']):

    # if envision data available for that protein
    if uniprot in envision_available:
        rows_by_uniprot.setdefault(uniprot, []).append(i)

# iterate over proteins
for uniprot in rows_by_uniprot:

    # load envision data for that protein
    df_envision = pd.read_csv('input_data/envision/%s.csv' % uniprot)

    # mutation id -> score; setdefault keeps the first occurrence of a repeated id
    scores = {}
    for key, value in zip(df_envision['id2'], df_envision['Envision_predictions']):
        scores.setdefault(key, value)

    # iterate over mutations of this protein
    for i in rows_by_uniprot[uniprot]:

        # mutation string
        mutation = '%s_%s%s%s' % (uniprot,mutations.at[i,'AA1'],mutations.at[i,'POSITION'],mutations.at[i,'AA2'])

        # if envision score available for this mutation
        if mutation in scores:

            # get envision score
            mutations.at[i,'ENVISION'] = scores[mutation]

Combine Envision scores within individual sample and gene, produce output file for each sample

In [None]:
# sample -> gene -> product of the envision scores of all scored mutations
data = {}
for sample_id, symbol, score in zip(mutations['SAMPLE'], mutations['GENE SYMBOL'], mutations['ENVISION']):

    # skip mutations without envision score
    if np.isnan(score):
        continue

    gene_scores = data.setdefault(sample_id, {})
    if symbol in gene_scores:
        # multiply original value by current value
        gene_scores[symbol] *= score
    else:
        # first scored mutation of this gene in this sample
        gene_scores[symbol] = score

# create file for each sample (named after the first 16 characters of the sample name)
for sample in data:
    with open('TCGA/%s.csv' % sample[:16],'w') as out:
        out.write('GENE,ENVISION\n')

        # write envision scores
        for gene in data[sample]:
            out.write('%s,%f\n' % (gene,data[sample][gene]))

Save associations between gene symbols and Swissprot accessions

In [None]:
# initialize dictionary
swissprot = {}

# locate maf files
# NOTE(review): other cells in this notebook read 'input_data/TCGA/*.maf' - confirm
# that '_data_/input/TCGA/' is the intended location here
maf_files = glob.glob('_data_/input/TCGA/*.maf')
if len(maf_files) == 0:
    # fail with a clear message instead of the opaque error pd.concat gives for an empty list
    raise IOError("no .maf files found matching '_data_/input/TCGA/*.maf'")

# load maf files; only the two columns used below are parsed to keep memory usage low
mafs = []
for fn in maf_files:
    mafs.append(pd.read_table(fn,skiprows=[0,1,2,3,4],header=0,usecols=['Hugo_Symbol','SWISSPROT']))
df_maf = pd.concat(mafs, ignore_index=True, sort=False)
df_maf = df_maf[['Hugo_Symbol','SWISSPROT']]

In [21]:
# swissprot: gene symbol -> accession, NaN unless the maf rows of that gene
# carry exactly one distinct SWISSPROT value
swissprot = {}
maf_symbols = df_maf['Hugo_Symbol']
for gene in model_genes['SYMBOL'].tolist():
    accessions = set(df_maf.loc[maf_symbols == gene, 'SWISSPROT'].tolist())
    swissprot[gene] = next(iter(accessions)) if len(accessions) == 1 else np.nan

In [23]:
# store the gene symbol -> Swissprot dictionary on disk
with open('_data_/processing/swissprot.pickle', 'wb') as handle:
    pickle.dump(swissprot, handle, pickle.HIGHEST_PROTOCOL)

### NCI-60 Samples

Load NCI-60 mutation data

In [None]:
# load data
wb = pd.ExcelFile('input_data/NCI60/WES_variants.xlsx')
mutation_data = wb.parse('WES_variants')

# load clinical information
clinical = pd.read_csv('../clinical/input_data/NCI60/NCI60.csv',index_col=0)

# amino acid codons
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}


def _resolve_uniprot(gene, accession_map):
    """Return the accession stored for `gene` in `accession_map`, or None if there is none.

    The stored value may be a plain string or a list/tuple whose first item is the
    accession. A missing key, a float (NaN), an empty value or the placeholder '-'
    all yield None. Indexing the stored value with [0] unconditionally would return
    only the first character of a plain string and fail on NaN.
    """
    accession = accession_map.get(gene)
    if isinstance(accession, (list, tuple)):
        accession = accession[0] if len(accession) > 0 else None
    if accession is None or isinstance(accession, float):
        return None
    if accession in ('', '-'):
        return None
    return accession


def _infer_codon_change(aa_change, cdna_change, codon_table):
    """Infer the codon before and after a single-base amino acid change.

    aa_change   -- protein change string; character 2 is read as the original amino
                   acid and the last character as the new amino acid
    cdna_change -- cDNA change string; character -3 is read as the original base and
                   the last character as the new base
    codon_table -- dict mapping codon -> amino acid

    Returns (codon1, codon2). Every codon pair that encodes the two amino acids,
    differs in exactly one base and matches the base change is considered; a codon
    position is filled in only when all such pairs agree on it, otherwise it stays
    'X'. Positions that are equal in both codons are lowercased.
    """
    candidates_1 = [c for c in codon_table if codon_table[c] == aa_change[2]]
    candidates_2 = [c for c in codon_table if codon_table[c] == aa_change[-1]]

    # collect codon pairs consistent with the observed base change
    possibilities_1 = []
    possibilities_2 = []
    for cand_1 in candidates_1:
        for cand_2 in candidates_2:
            changed = [k for k in range(3) if cand_1[k] != cand_2[k]]
            if len(changed) != 1:
                continue
            pos = changed[0]
            if (cand_1[pos] == cdna_change[-3]) and (cand_2[pos] == cdna_change[-1]):
                possibilities_1.append(cand_1)
                possibilities_2.append(cand_2)

    # build consensus codon
    codon1 = ['X', 'X', 'X']
    codon2 = ['X', 'X', 'X']
    for pos in range(3):
        letters_1 = set(c[pos] for c in possibilities_1)
        if len(letters_1) == 1:
            codon1[pos] = letters_1.pop()
        letters_2 = set(c[pos] for c in possibilities_2)
        if len(letters_2) == 1:
            codon2[pos] = letters_2.pop()

    # lowercase non-changing letters
    for pos in range(3):
        if codon1[pos] == codon2[pos]:
            codon1[pos] = codon1[pos].lower()
            codon2[pos] = codon2[pos].lower()

    return ''.join(codon1), ''.join(codon2)


# gene symbols of the model ('-' marks a missing symbol), built once for fast lookups
model_symbols = set(x for x in model_genes['SYMBOL'].values.tolist() if x != '-')

# extract mutation data for each cell line
mutations = []
celllines = []
for cell in clinical.index.tolist():

    # skip cell lines without COSMIC ID
    if np.isnan(clinical.loc[cell]['COSMIC']):
        continue

    # subset data
    celllines.append(cell)
    df_mutations_subset = mutation_data.loc[mutation_data['COSMIC_ID'] == int(clinical.loc[cell,'COSMIC'])]

    # iterate over mutations
    records = []
    for classification, gene, aa_change, cdna_change in zip(
            df_mutations_subset['Classification'], df_mutations_subset['Gene'],
            df_mutations_subset['AA'], df_mutations_subset['cDNA']):

        # only missense mutations in model genes
        if (classification != 'missense') or (gene not in model_symbols):
            continue

        # only genes with known uniprot
        uniprot = _resolve_uniprot(gene, swissprot)
        if uniprot is None:
            continue

        # figure out codons
        codon1, codon2 = _infer_codon_change(aa_change, cdna_change, codons)

        # keep data
        records.append([gene, uniprot, aa_change[3:-1], aa_change[2], aa_change[-1], codon1, codon2])

    # save data (only the necessary information)
    mutations.append(pd.DataFrame(records, columns=['GENE','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2']))

Fix mutation information for genes with multiple mutations at the same position

In [None]:
# iterate over celllines
for a in range(len(mutations)):

    # gene -> position -> {'id': row numbers, 'codon1': original codons, 'codon2': mutated codons}
    data = {}
    for row_id, (symbol, pos, ref_codon, alt_codon) in enumerate(zip(
            mutations[a]['GENE'], mutations[a]['POSITION'], mutations[a]['CODON1'], mutations[a]['CODON2'])):

        # create the nested entries on first sight, then record this row
        hits = data.setdefault(symbol, {}).setdefault(pos, {'id': [], 'codon1': [], 'codon2': []})
        hits['id'].append(row_id)
        hits['codon1'].append(ref_codon)
        hits['codon2'].append(alt_codon)

    # remove codon columns
    mutations[a] = mutations[a].drop(['CODON1', 'CODON2'], axis=1)

    # find any positions with two mutations
    to_remove = []
    for gene in data:
        for position in data[gene]:
            hits = data[gene][position]
            if len(hits['id']) <= 1:
                continue

            # build true codon: start from the first original codon without unknown
            # letters (falling back to the very first one)
            codon = list(hits['codon1'][0].upper())
            for ref_codon in hits['codon1']:
                if 'X' not in ref_codon.upper():
                    codon = list(ref_codon.upper())
                    break

            # apply the first capitalised base of every mutated codon
            for alt_codon in hits['codon2']:
                for k in range(3):
                    if alt_codon[k] in ['A','G','C','T']:
                        codon[k] = alt_codon[k]
                        break

            codon = ''.join(codon)

            if codon not in codons:
                # if 'X' in codon, just remove from data frame
                to_remove.extend(hits['id'])
            elif codons[codon] == 'stop':
                # if stop codon, just remove from data frame
                to_remove.extend(hits['id'])
            else:
                # if not stop codon, change in data frame
                mutations[a].at[hits['id'][0],'AA2'] = codons[codon]
                to_remove.extend(hits['id'][1:])

    mutations[a] = mutations[a].drop(mutations[a].index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [None]:
# list of enzymes with envision data available
# (file name without directory and extension; basename/splitext works with both
#  '/' and '\\' separators, whereas splitting on '\\' only worked on Windows)
envision_list = [os.path.splitext(os.path.basename(x))[0] for x in glob.glob('input_data/envision/*.csv')]
envision_available = set(envision_list)

# protein -> list of (cell line number, row label) of its mutations, so that each
# per-protein envision file is parsed once instead of once per mutation
rows_by_uniprot = {}

# iterate over celllines
for a in range(len(mutations)):

    # make sure the score column exists even if no mutation of this cell line gets a
    # score; the cell combining the scores reads 'ENVISION' for every row
    mutations[a]['ENVISION'] = np.nan

    # iterate over mutations
    for i, uniprot in zip(mutations[a].index, mutations[a]['UNIPROT']):

        # if envision data available for that protein
        if uniprot in envision_available:
            rows_by_uniprot.setdefault(uniprot, []).append((a, i))

# iterate over proteins
for uniprot in rows_by_uniprot:

    # load envision data for that protein
    df_envision = pd.read_csv('input_data/envision/%s.csv' % uniprot)

    # mutation id -> score; setdefault keeps the first occurrence of a repeated id
    scores = {}
    for key, value in zip(df_envision['id2'], df_envision['Envision_predictions']):
        scores.setdefault(key, value)

    # iterate over mutations of this protein
    for a, i in rows_by_uniprot[uniprot]:

        # mutation string
        mutation = '%s_%s%s%s' % (uniprot,mutations[a].at[i,'AA1'],mutations[a].at[i,'POSITION'],mutations[a].at[i,'AA2'])

        # if envision score available for this mutation
        if mutation in scores:

            # get envision score
            mutations[a].at[i,'ENVISION'] = scores[mutation]

Combine Envision scores within individual sample and gene, produce output file for each sample

In [None]:
# sample -> gene -> product of the envision scores of all scored mutations
data = {}

# iterate over cell lines
for a, cell_mutations in enumerate(mutations):

    # every cell line gets an entry, even without any scored mutation
    sample = celllines[a]
    gene_scores = data[sample] = {}

    # iterate over mutations
    for _, row in cell_mutations.iterrows():

        # skip mutations without envision score
        score = row['ENVISION']
        if np.isnan(score):
            continue

        symbol = row['GENE']
        if symbol in gene_scores:
            # multiply original value by current value
            gene_scores[symbol] *= score
        else:
            # first scored mutation of this gene
            gene_scores[symbol] = score

# create file for each sample ('/' in the sample name is replaced by '-')
for sample in data:
    with open('NCI60/%s.csv' % sample.replace('/','-'),'w') as out:
        out.write('GENE,ENVISION\n')

        # write envision scores
        for gene in data[sample]:
            out.write('%s,%f\n' % (gene,data[sample][gene]))

### HNSCC Samples

Load HNSCC mutation data

In [None]:
# load data
wb = pd.ExcelFile('input_data/HNSCC/WES_variants.xlsx')
mutation_data = wb.parse('WES_variants')

# load clinical information
clinical = pd.read_csv('../clinical/input_data/HNSCC/HNSCC.csv',index_col=0)

# amino acid codons
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}


def _resolve_uniprot(gene, accession_map):
    """Return the accession stored for `gene` in `accession_map`, or None if there is none.

    The stored value may be a plain string or a list/tuple whose first item is the
    accession. A missing key, a float (NaN), an empty value or the placeholder '-'
    all yield None. Indexing the stored value with [0] unconditionally would return
    only the first character of a plain string and fail on NaN.
    """
    accession = accession_map.get(gene)
    if isinstance(accession, (list, tuple)):
        accession = accession[0] if len(accession) > 0 else None
    if accession is None or isinstance(accession, float):
        return None
    if accession in ('', '-'):
        return None
    return accession


def _infer_codon_change(aa_change, cdna_change, codon_table):
    """Infer the codon before and after a single-base amino acid change.

    aa_change   -- protein change string; character 2 is read as the original amino
                   acid and the last character as the new amino acid
    cdna_change -- cDNA change string; character -3 is read as the original base and
                   the last character as the new base
    codon_table -- dict mapping codon -> amino acid

    Returns (codon1, codon2). Every codon pair that encodes the two amino acids,
    differs in exactly one base and matches the base change is considered; a codon
    position is filled in only when all such pairs agree on it, otherwise it stays
    'X'. Positions that are equal in both codons are lowercased.
    """
    candidates_1 = [c for c in codon_table if codon_table[c] == aa_change[2]]
    candidates_2 = [c for c in codon_table if codon_table[c] == aa_change[-1]]

    # collect codon pairs consistent with the observed base change
    possibilities_1 = []
    possibilities_2 = []
    for cand_1 in candidates_1:
        for cand_2 in candidates_2:
            changed = [k for k in range(3) if cand_1[k] != cand_2[k]]
            if len(changed) != 1:
                continue
            pos = changed[0]
            if (cand_1[pos] == cdna_change[-3]) and (cand_2[pos] == cdna_change[-1]):
                possibilities_1.append(cand_1)
                possibilities_2.append(cand_2)

    # build consensus codon
    codon1 = ['X', 'X', 'X']
    codon2 = ['X', 'X', 'X']
    for pos in range(3):
        letters_1 = set(c[pos] for c in possibilities_1)
        if len(letters_1) == 1:
            codon1[pos] = letters_1.pop()
        letters_2 = set(c[pos] for c in possibilities_2)
        if len(letters_2) == 1:
            codon2[pos] = letters_2.pop()

    # lowercase non-changing letters
    for pos in range(3):
        if codon1[pos] == codon2[pos]:
            codon1[pos] = codon1[pos].lower()
            codon2[pos] = codon2[pos].lower()

    return ''.join(codon1), ''.join(codon2)


# gene symbols of the model ('-' marks a missing symbol), built once for fast lookups
model_symbols = set(x for x in model_genes['SYMBOL'].values.tolist() if x != '-')

# extract mutation data for each cell line
mutations = []
celllines = []
for cell in clinical.index.tolist():

    # skip cell lines without COSMIC ID
    if np.isnan(clinical.loc[cell]['COSMIC']):
        continue

    # subset data
    celllines.append(cell)
    df_mutations_subset = mutation_data.loc[mutation_data['COSMIC_ID'] == int(clinical.loc[cell,'COSMIC'])]

    # iterate over mutations
    records = []
    for classification, gene, aa_change, cdna_change in zip(
            df_mutations_subset['Classification'], df_mutations_subset['Gene'],
            df_mutations_subset['AA'], df_mutations_subset['cDNA']):

        # only missense mutations in model genes
        if (classification != 'missense') or (gene not in model_symbols):
            continue

        # only genes with known uniprot
        uniprot = _resolve_uniprot(gene, swissprot)
        if uniprot is None:
            continue

        # figure out codons
        codon1, codon2 = _infer_codon_change(aa_change, cdna_change, codons)

        # keep data
        records.append([gene, uniprot, aa_change[3:-1], aa_change[2], aa_change[-1], codon1, codon2])

    # save data (only the necessary information)
    mutations.append(pd.DataFrame(records, columns=['GENE','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2']))

Fix mutation information for genes with multiple mutations at the same position

In [None]:
# Merge mutations that hit the same codon (same gene and position) of a cell line
# into a single amino-acid change, or drop them when the merged codon is unusable.
for a in range(len(mutations)):

    cell_line = mutations[a]

    # collect row ids and codons per gene, then per amino-acid position
    data = {}
    for row_id in range(cell_line.shape[0]):
        row = cell_line.loc[row_id]
        by_position = data.setdefault(row['GENE'], {})
        record = by_position.setdefault(row['POSITION'], {'id': [], 'codon1': [], 'codon2': []})
        record['id'].append(row_id)
        record['codon1'].append(row['CODON1'])
        record['codon2'].append(row['CODON2'])

    # the codon columns have been collected above and are not kept in the frame
    mutations[a] = cell_line = cell_line.drop(['CODON1', 'CODON2'], axis=1)

    # rows to delete once every position has been examined
    to_remove = []
    for by_position in data.values():
        for record in by_position.values():

            # positions hit by a single mutation need no correction
            row_ids = record['id']
            if len(row_ids) < 2:
                continue

            # reference codon: the first original codon without an 'X',
            # falling back to the very first one when all of them contain 'X'
            reference = record['codon1'][0].upper()
            for candidate in record['codon1']:
                if 'X' not in candidate.upper():
                    reference = candidate.upper()
                    break
            codon = list(reference)

            # transfer the first upper-case base of every mutated codon
            # onto the reference codon (only one base per mutation)
            for mutated in record['codon2']:
                for k in range(3):
                    if mutated[k] in ('A', 'G', 'C', 'T'):
                        codon[k] = mutated[k]
                        break
            codon = ''.join(codon)

            if codon in codons and codons[codon] != 'stop':
                # keep the first row with the corrected amino acid, drop the others
                cell_line.at[row_ids[0], 'AA2'] = codons[codon]
                to_remove.extend(row_ids[1:])
            else:
                # stop codon, or codon not in the table (e.g. contains 'X'): drop all rows
                to_remove.extend(row_ids)

    mutations[a] = cell_line.drop(cell_line.index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [None]:
# list of enzymes (Uniprot IDs) with Envision data available.
# Separators are normalised to '/' before the file name is isolated, so the ID is
# extracted correctly whether glob returns '\' (Windows) or '/' (POSIX) paths;
# splitting on '\' alone raises IndexError when no backslash is present.
envision_list = [x.replace('\\', '/').split('/')[-1].split('.')[0] for x in glob.glob('input_data/envision/*.csv')]
envision_available = set(envision_list)

# Envision tables already read from disk: Uniprot ID -> {mutation id: score},
# so each protein file is parsed once instead of once per mutation
envision_scores = {}

# iterate over cell lines
for a in range(len(mutations)):

    # make sure the score column exists even when no mutation of this cell line
    # receives a score; otherwise later reads of 'ENVISION' fail with KeyError
    if 'ENVISION' not in mutations[a].columns:
        mutations[a]['ENVISION'] = np.nan

    # iterate over mutations
    for i in range(mutations[a].shape[0]):
        row = mutations[a].loc[i]
        uniprot = row['UNIPROT']

        # skip proteins without Envision data
        if uniprot not in envision_available:
            continue

        # load envision data for that protein on first use
        if uniprot not in envision_scores:
            df_envision = pd.read_csv('input_data/envision/%s.csv' % uniprot)
            scores = {}
            for key, value in zip(df_envision['id2'].values.tolist(), df_envision['Envision_predictions'].values.tolist()):
                # keep the first occurrence of an id, as a list.index() lookup would
                scores.setdefault(key, value)
            envision_scores[uniprot] = scores

        # mutation string, e.g. '<UNIPROT>_<AA1><POSITION><AA2>'
        mutation = '%s_%s%s%s' % (uniprot, row['AA1'], row['POSITION'], row['AA2'])

        # store the envision score if one is available for this mutation
        if mutation in envision_scores[uniprot]:
            mutations[a].at[i, 'ENVISION'] = envision_scores[uniprot][mutation]

Combine Envision scores within each sample and gene (the scores of multiple mutations in the same gene are multiplied), then write one output file per sample

In [None]:
# sample -> {gene: product of the Envision scores of its mutations}
data = {}

# iterate over cell lines
for a in range(len(mutations)):

    # every cell line gets an entry, even if none of its mutations has a score
    sample = celllines[a]
    gene_scores = {}
    data[sample] = gene_scores

    # iterate over mutations
    for i in range(mutations[a].shape[0]):
        row = mutations[a].loc[i]
        score = row['ENVISION']

        # mutations without an envision score do not contribute
        if np.isnan(score):
            continue

        gene = row['GENE']
        if gene in gene_scores:
            # multiply the running value by the current score
            gene_scores[gene] *= score
        else:
            # first scored mutation of this gene
            gene_scores[gene] = score

# create one csv file per sample ('/' in the sample name is replaced by '-')
for sample in data:
    with open('HNSCC/%s.csv' % sample.replace('/','-'),'w') as f:
        f.write('GENE,ENVISION\n')
        f.writelines('%s,%f\n' % (gene, score) for gene, score in data[sample].items())

### CCLE Samples

Load Swissprot conversion

In [3]:
# read the pickled lookup that is later indexed by gene symbol to obtain Uniprot IDs
# NOTE(review): pickle.load can execute arbitrary code -- only load a trusted file.
# NOTE(review): the file is opened in text mode; a pickle written with a binary
# protocol must be opened with 'rb' on Windows -- confirm how this file was written.
with open('_data_/processing/swissprot.pickle', 'r') as pickle_file:
    swissprot = pickle.load(pickle_file)

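List the proteins with Envision data available, then load the CCLE mutation data (keeping only missense SNPs in model genes)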

In [6]:
def envision_id_from_path(path):
    """Return the Uniprot ID encoded in the path of an Envision csv file.

    The ID is the file name up to its first '.'. Both the forward slash and
    the backslash are treated as directory separators, so paths returned by
    glob on Windows and on POSIX systems give the same ID.
    """
    return path.replace('\\', '/').split('/')[-1].split('.')[0]

# list of enzymes with envision data available
envision_list = [envision_id_from_path(x) for x in glob.glob('_data_/input/envision/*.csv')]

In [4]:
# initialize results data frame
mutations = pd.DataFrame(columns=['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION'])

# load file
df_maf = pd.read_table('_data_/input/CCLE/CCLE_DepMap_18q3_maf_20180718.txt')

# only keep missense mutations that are single-nucleotide changes
is_missense_snp = (df_maf['Variant_Classification'] == 'Missense_Mutation') & (df_maf['Variant_Type'] == 'SNP')
df_maf = df_maf[is_missense_snp].reset_index(drop=True)

# only keep genes in Recon3D; IDs are compared as strings.
# The set of model gene IDs is built once instead of once per row.
model_gene_ids = set(str(gene_id) for gene_id in model_genes['GENEID'].values.tolist())
in_model = np.array([str(x) in model_gene_ids for x in df_maf['Entrez_Gene_Id']], dtype=bool)
df_maf = df_maf[in_model].reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# drop every mutation whose gene symbol has no entry in the swissprot lookup
# (row labels equal row positions here because the index was reset beforehand)
hugo_symbols = df_maf['Hugo_Symbol'].tolist()
rows_with_uniprot = [pos for pos in range(len(hugo_symbols)) if hugo_symbols[pos] in swissprot]
df_maf = df_maf.loc[rows_with_uniprot].reset_index(drop=True)

In [6]:
# sample name: part of the tumor sample barcode before the first '_'
barcodes = df_maf['Tumor_Sample_Barcode'].tolist()
df_maf['SAMPLE'] = [barcode.split('_')[0] for barcode in barcodes]

# gene symbol and the Uniprot entry looked up from it
hugo_symbols = df_maf['Hugo_Symbol'].tolist()
df_maf['GENE SYMBOL'] = hugo_symbols
df_maf['UNIPROT'] = [swissprot[symbol] for symbol in hugo_symbols]

# protein change is sliced as: 2 leading characters, original amino acid,
# position, new amino acid (presumably of the form 'p.A123T' -- verify against the file)
protein_changes = df_maf['Protein_Change'].tolist()
df_maf['POSITION'] = [change[3:-1] for change in protein_changes]
df_maf['AA1'] = [change[2] for change in protein_changes]
df_maf['AA2'] = [change[-1] for change in protein_changes]

# codon change ends with '<codon1>?<codon2>': two 3-letter codons around one separator character
codon_changes = df_maf['Codon_Change'].tolist()
df_maf['CODON1'] = [change[-7:-4] for change in codon_changes]
df_maf['CODON2'] = [change[-3:] for change in codon_changes]

# envision score is filled in later
df_maf['ENVISION'] = np.nan

# only keep necessary information
output_columns = ['SAMPLE','GENE SYMBOL','UNIPROT','POSITION','AA1','AA2','CODON1','CODON2','ENVISION']
mutations = df_maf[output_columns]

Fix mutation information for genes with multiple mutations at the same position

In [7]:
# amino acid codons: DNA codon -> one-letter amino acid ('stop' for stop codons)
codons = {'TTT':'F','TTC':'F','TTA':'L','TTG':'L','CTT':'L','CTC':'L','CTA':'L','CTG':'L','ATT':'I','ATC':'I','ATA':'I','ATG':'M','GTT':'V','GTC':'V','GTA':'V','GTG':'V','TCT':'S','TCC':'S','TCA':'S','TCG':'S','CCT':'P','CCC':'P','CCA':'P','CCG':'P','ACT':'T','ACC':'T','ACA':'T','ACG':'T','GCT':'A','GCC':'A','GCA':'A','GCG':'A','TAT':'Y','TAC':'Y','TAA':'stop','TAG':'stop','CAT':'H','CAC':'H','CAA':'Q','CAG':'Q','AAT':'N','AAC':'N','AAA':'K','AAG':'K','GAT':'D','GAC':'D','GAA':'E','GAG':'E','TGT':'C','TGC':'C','TGA':'stop','TGG':'W','CGT':'R','CGC':'R','CGA':'R','CGG':'R','AGT':'S','AGC':'S','AGA':'R','AGG':'R','GGT':'G','GGC':'G','GGA':'G','GGG':'G'}

# upper-case DNA bases
# NOTE(review): presumably an upper-case base in CODON2 marks the changed nucleotide -- verify against the Codon_Change format
bases = ('A', 'G', 'C', 'T')

# group row ids and mutated codons by sample -> gene -> position;
# each row is fetched once instead of once per dictionary access
data = {}
for i in range(mutations.shape[0]):
    row = mutations.loc[i]
    by_gene = data.setdefault(row['SAMPLE'], {})
    by_position = by_gene.setdefault(row['GENE SYMBOL'], {})
    record = by_position.setdefault(row['POSITION'], {'id': [], 'codon': []})
    record['id'].append(i)
    record['codon'].append(row['CODON2'])

# remove codon columns
mutations = mutations.drop(['CODON1', 'CODON2'], axis=1)

# find any positions with two (or more) mutations
to_remove = []
for sample in data:
    for gene in data[sample]:
        for position in data[sample][gene]:
            record = data[sample][gene][position]
            if len(record['codon']) < 2:
                continue

            # build true codon: start from the first mutated codon and add the
            # first upper-case base of every further mutation at this position
            codon = list(record['codon'][0].upper())
            for other in record['codon'][1:]:
                for k in range(3):
                    if other[k] in bases:
                        codon[k] = other[k]
                        break
            codon = ''.join(codon)

            # codons missing from the table are dropped like stop codons
            # instead of raising KeyError
            amino_acid = codons.get(codon)
            if amino_acid is None or amino_acid == 'stop':
                to_remove.extend(record['id'])

            # otherwise keep the first row with the corrected amino acid
            else:
                mutations.at[record['id'][0], 'AA2'] = amino_acid
                to_remove.extend(record['id'][1:])

mutations = mutations.drop(mutations.index[to_remove]).reset_index(drop=True)

Get Envision score for each mutation

In [24]:
# Envision tables, kept as two parallel lists:
# uniprot_list[k] is the Uniprot ID whose table is df_uniprot_list[k]
df_uniprot_list = []
uniprot_list = []

for uniprot in set(mutations['UNIPROT'].tolist()):

    # skip entries that are not plain strings and proteins without an envision file
    if type(uniprot) is not str or uniprot not in envision_list:
        continue

    df_uniprot_list.append(pd.read_csv('_data_/input/envision/%s.csv' % uniprot))
    uniprot_list.append(uniprot)

In [50]:
# look up the envision score of every mutation
for i in range(mutations.shape[0]):
    row = mutations.loc[i]
    uniprot = row['UNIPROT']

    # nothing to do when no envision table was loaded for this protein
    if uniprot not in uniprot_list:
        continue

    # table of this protein, restricted to the rows describing this exact mutation
    envision_table = df_uniprot_list[uniprot_list.index(uniprot)]
    mutation_id = '%s_%s%s%s' % (uniprot, row['AA1'], row['POSITION'], row['AA2'])
    matches = envision_table[envision_table['id2'] == mutation_id]

    # a score is stored only when exactly one row matches
    if matches.shape[0] == 1:
        mutations.at[i,'ENVISION'] = matches['Envision_predictions'].tolist()[0]

Combine Envision scores within each sample and gene (the scores of multiple mutations in the same gene are multiplied), then write one output file per sample

In [52]:
# sample -> {gene symbol: product of the Envision scores of its mutations}
data = {}

for i in range(mutations.shape[0]):
    row = mutations.loc[i]
    score = row['ENVISION']

    # mutations without an envision score do not contribute
    if np.isnan(score):
        continue

    # a sample gets an entry only once it has a scored mutation
    gene_scores = data.setdefault(row['SAMPLE'], {})
    gene = row['GENE SYMBOL']

    if gene in gene_scores:
        # multiply the running value by the current score
        gene_scores[gene] *= score
    else:
        # first scored mutation of this gene
        gene_scores[gene] = score

In [54]:
# write one csv file per sample
for sample in data:

    # the sample named 'TT' is left out
    # NOTE(review): presumably because more than one cell line shares this barcode prefix -- confirm
    if sample == 'TT':
        continue

    with open('CCLE/%s.csv' % sample,'w') as f:

        # header, then one line per gene with its combined envision score
        f.write('GENE,ENVISION\n')
        for gene, score in data[sample].items():
            f.write('%s,%f\n' % (gene, score))

Gray VE, Hause RJ, Luebeck J, Shendure J, Fowler DM. Quantitative missense variant effect prediction using large-scale mutagenesis data. Cell Systems. 2018;6(1):116-124. doi: 10.1016/j.cels.2017.11.003. PubMed PMID: 29226803. 