In [7]:
# Packages
import os
import sys
import math
import re
import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Check working directory
print('Current working dir:', os.getcwd())

Current working dir: /Users/jakeharris/Documents/GitHub/df-assignment


In [65]:
# Preview sample-info.tsv file
sample_info_path = os.path.join('vanallen-assessment', 'sample-information.tsv')
sample_info_df = pd.read_csv(sample_info_path, sep='\t')
print(sample_info_df.columns)
print('Number of patient entries:', len(sample_info_df))
sample_info_df.head()

Index(['Patient_ID', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode',
       'Response', 'Silent_mutations_per_Mb', 'Nonsynonymous_mutations_per_Mb',
       'Mutations_per_Mb'],
      dtype='object')
Number of patient entries: 50


Unnamed: 0,Patient_ID,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Response,Silent_mutations_per_Mb,Nonsynonymous_mutations_per_Mb,Mutations_per_Mb
0,Patient-0,Patient-0-Tumor,Patient-0-Normal,Non-Responder,2.87,6.77,9.64
1,Patient-1,Patient-1-Tumor,Patient-1-Normal,Responder,1.92,6.14,8.06
2,Patient-2,Patient-2-Tumor,Patient-2-Normal,Responder,1.32,2.84,4.16
3,Patient-3,Patient-3-Tumor,Patient-3-Normal,Non-Responder,1.78,5.0,6.78
4,Patient-4,Patient-4-Tumor,Patient-4-Normal,Responder,4.93,10.5,15.43


In [66]:
# Preview Patient 0 mutation .maf file
patient0_path = os.path.join('vanallen-assessment', 'mafs', 'Patient-0.somatic.snvs.maf')
patient0_df = pd.read_csv(patient0_path, sep='\t')
print(patient0_df.columns)
print('Number of annotation rows for Patient 0:', len(patient0_df))
patient0_df.head()

Index(['Hugo_Symbol', 'Chromosome', 'Start_position', 'End_position',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode',
       'Matched_Norm_Sample_Barcode', 'Protein_Change', 't_alt_count',
       't_ref_count'],
      dtype='object')
Number of annotation rows for Patient 0: 334


Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Protein_Change,t_alt_count,t_ref_count
0,AMOT,X,112035152,112035152,Missense_Mutation,SNP,G,G,C,Patient-0-Tumor,Patient-0-Normal,p.Q612E,8,31
1,SEMA6D,15,48062786,48062786,Missense_Mutation,SNP,G,G,A,Patient-0-Tumor,Patient-0-Normal,p.A676T,4,87
2,PRR12,19,50100969,50100969,Missense_Mutation,SNP,C,C,G,Patient-0-Tumor,Patient-0-Normal,p.S1126C,5,25
3,TNR,1,175372529,175372529,Silent,SNP,G,G,A,Patient-0-Tumor,Patient-0-Normal,p.S241S,30,155
4,CPA4,7,129944344,129944344,Silent,SNP,C,C,G,Patient-0-Tumor,Patient-0-Normal,p.A137A,12,100


In [67]:
# Load in all patient .maf data as single dataframe
maf_dir = os.listdir(os.path.join('vanallen-assessment', 'mafs'))  # patient .maf data folder
mutations_df = pd.DataFrame()  # blank DF for later concatenation
for item in maf_dir:
    patient_df = pd.read_csv(os.path.join('vanallen-assessment', 'mafs', item), sep='\t')
    mutations_df = pd.concat([mutations_df, patient_df], ignore_index=True)
print('Number of annotation rows for all patients:', len(mutations_df))
mutations_df.head()

Number of annotation rows for all patients: 15673


Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Protein_Change,t_alt_count,t_ref_count
0,CEP350,1,180063656,180063656,Missense_Mutation,SNP,G,G,A,Patient-36-Tumor,Patient-36-Normal,p.E2806K,12,28
1,CCDC88C,14,91739009,91739009,Missense_Mutation,SNP,C,C,T,Patient-36-Tumor,Patient-36-Normal,p.G2016E,36,57
2,KDM6B,17,7749509,7749509,Missense_Mutation,SNP,A,A,T,Patient-36-Tumor,Patient-36-Normal,p.Y117F,4,25
3,PGAP1,2,197781268,197781268,Missense_Mutation,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,p.F117L,11,25
4,PARD6B,20,49366765,49366765,Missense_Mutation,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,p.E287Q,21,88


In [75]:
# Merge mutations_df and sample_info_df
merged_df = mutations_df.merge(sample_info_df, how='left', on=['Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode'])
print('Number of entries in merged_df:', len(merged_df))

# Filter out silent mutations from variant_classification column
print('Mutation variant types:', merged_df['Variant_Classification'].unique())
nonsynon_df = merged_df[merged_df['Variant_Classification'] != 'Silent']  # nonsynonymous mutations df
print('Number of nonsynonymous mutations:', len(nonsynon_df))
nonsynon_df.head()

Number of entries in merged_df: 15673
Mutation variant types: ['Missense_Mutation' 'Silent' 'Nonsense_Mutation' 'Splice_Site']
Number of nonsynonymous mutations: 11247


Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Protein_Change,t_alt_count,t_ref_count,Patient_ID,Response,Silent_mutations_per_Mb,Nonsynonymous_mutations_per_Mb,Mutations_per_Mb
0,CEP350,1,180063656,180063656,Missense_Mutation,SNP,G,G,A,Patient-36-Tumor,Patient-36-Normal,p.E2806K,12,28,Patient-36,Responder,2.47,6.0,8.47
1,CCDC88C,14,91739009,91739009,Missense_Mutation,SNP,C,C,T,Patient-36-Tumor,Patient-36-Normal,p.G2016E,36,57,Patient-36,Responder,2.47,6.0,8.47
2,KDM6B,17,7749509,7749509,Missense_Mutation,SNP,A,A,T,Patient-36-Tumor,Patient-36-Normal,p.Y117F,4,25,Patient-36,Responder,2.47,6.0,8.47
3,PGAP1,2,197781268,197781268,Missense_Mutation,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,p.F117L,11,25,Patient-36,Responder,2.47,6.0,8.47
4,PARD6B,20,49366765,49366765,Missense_Mutation,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,p.E287Q,21,88,Patient-36,Responder,2.47,6.0,8.47


In [97]:
nonsynon_df[nonsynon_df['Variant_Classification']=='Splice_Site'].head()

Unnamed: 0,Hugo_Symbol,Chromosome,Start_position,End_position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Protein_Change,t_alt_count,t_ref_count,Patient_ID,Response,Silent_mutations_per_Mb,Nonsynonymous_mutations_per_Mb,Mutations_per_Mb
27,STXBP3,1,109294878,109294878,Splice_Site,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,,32,61,Patient-36,Responder,2.47,6.0,8.47
35,SFXN5,2,73198699,73198699,Splice_Site,SNP,G,G,A,Patient-36-Tumor,Patient-36-Normal,p.H247H,8,12,Patient-36,Responder,2.47,6.0,8.47
65,SPG7,16,89590413,89590413,Splice_Site,SNP,G,G,A,Patient-36-Tumor,Patient-36-Normal,,5,10,Patient-36,Responder,2.47,6.0,8.47
128,NEO1,15,73590688,73590688,Splice_Site,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,,11,16,Patient-36,Responder,2.47,6.0,8.47
160,ABCA7,19,1057312,1057312,Splice_Site,SNP,G,G,C,Patient-36-Tumor,Patient-36-Normal,,32,66,Patient-36,Responder,2.47,6.0,8.47


In [150]:
# Find the 15 most common mutations
### NOTE: accounting for NaN values unique to splice_site variants yields different results
for variant in nonsynon_df['Variant_Classification'].unique():
    nan_check = nonsynon_df[nonsynon_df['Variant_Classification']==variant].isnull().values.any()
    print(variant, nan_check)  # check for NaN values in variant_classifications
print('Number of splice_site mutations:', len(nonsynon_df[nonsynon_df['Variant_Classification']=='Splice_Site']))  # 392 splice_sites
print('Number of splice_site NaN values:', nonsynon_df['Protein_Change'].isnull().sum())  # 209 splice_site NaNs

# Not accounting for splice_site variant classification (drop NaN protein_change values)
# top15_noNan = nonsynon_df[['Hugo_Symbol', 'Variant_Classification', 'Protein_Change']].value_counts().to_frame('Count').reset_index()
# print('Without NaN Variant_Classification values: \n', top15_noNan.head(15))

# Accounting for splice_site variant classification (don't drop NaN protein_change values)
top15_wSplice = nonsynon_df[['Hugo_Symbol', 'Variant_Classification', 'Protein_Change']].value_counts(dropna=False).to_frame('Count').reset_index()
print('With NaN Variant_Classification values:')
top15_wSplice.head(15)

Missense_Mutation False
Nonsense_Mutation False
Splice_Site True
Number of splice_site mutations: 392
Number of splice_site NaN values: 209
With NaN Variant_Classification values:


Unnamed: 0,Hugo_Symbol,Variant_Classification,Protein_Change,Count
0,ERBB4,Missense_Mutation,p.S1289A,14
1,ERBB3,Missense_Mutation,p.H228Q,5
2,ERBB4,Missense_Mutation,p.Q707E,5
3,PIK3CA,Missense_Mutation,p.E545K,5
4,TYRO3,Missense_Mutation,p.L819M,4
5,ERBB3,Missense_Mutation,p.M91I,4
6,ERBB4,Missense_Mutation,p.E317K,4
7,RXRA,Missense_Mutation,p.S330F,4
8,MAP2K1,Missense_Mutation,p.F53L,4
9,FAM47C,Missense_Mutation,p.Q225E,4
