In [1]:
# This notebook takes as input GOI_out_AA.csv files (from getMutationCounts_overall_and_GOI.py),
# patient metadata, seurat metadata, fusionsDF, and creates a BY CELL 
# summaryTable. The goal with this table is to provide an answer to questions like
# 'which patients have which mutations?', and 'how many cells have clinically relevant
# mutations?' 

In [2]:
import summarizeLib
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # want to disable this SettingWithCopyWarning

In [3]:
# READ IN ALL OF THESE BY-GENE AMINO-ACID LEVEL MUTATION COUNTS OBJECTS
mutsPATH = '/Users/lincoln.harris/code/SNP_calling_pipeline/getMutationCounts/'
egfrPATH = mutsPATH + 'egfr_germline_out_AA.csv'
brafPATH = mutsPATH + 'braf_germline_out_AA.csv'
krasPATH = mutsPATH + 'kras_germline_out_AA.csv'

egfr_df = pd.read_csv(egfrPATH, header=None, names=['cell', 'mutations'])
braf_df = pd.read_csv(brafPATH, header=None, names=['cell', 'mutations'])
kras_df = pd.read_csv(krasPATH, header=None, names=['cell', 'mutations'])
egfr_df

Unnamed: 0,cell,mutations
0,K21_B003995,[]
1,L22_1001000408,['Q787Q']
2,A8_B001557,[]
3,L22_B001016,[]
4,C20_B002073,[]
5,D8_B001474,[]
6,C5_B002572,[]
7,E17_B003116,[]
8,A15_B000420,[]
9,H10_B002573,[]


In [4]:
# FIRST STEP IS TO GENERATE THE mutationsDF
mutationsDF = pd.DataFrame(columns=['cell', 'brafMut', 'egfrMut', 'krasMut'])
mutationsDF['cell'] = egfr_df['cell']
mutationsDF['egfrMut'] = egfr_df['mutations'] # fill in EGFR first -- this is ok bc the cell order is based on egfr_df
summarizeLib.mutationsDF_fillIn('braf', braf_df, mutationsDF) 
summarizeLib.mutationsDF_fillIn('kras', kras_df, mutationsDF)
mutationsDF

Unnamed: 0,cell,brafMut,egfrMut,krasMut
0,K21_B003995,[],[],[]
1,L22_1001000408,[],['Q787Q'],[]
2,A8_B001557,[],[],[]
3,L22_B001016,[],[],[]
4,C20_B002073,[],[],[]
5,D8_B001474,[],[],[]
6,C5_B002572,[],[],[]
7,E17_B003116,[],[],[]
8,A15_B000420,[],[],[]
9,H10_B002573,[],[],[]


In [5]:
# CONVERTING LISTS INTO STRS. MAKES DOWNSTEAM ANALYSIS EASIER
summarizeLib.removeExtraCharacters_mutationsDF('egfr', mutationsDF)
summarizeLib.removeExtraCharacters_mutationsDF('braf', mutationsDF)
summarizeLib.removeExtraCharacters_mutationsDF('kras', mutationsDF)
mutationsDF

Unnamed: 0,cell,brafMut,egfrMut,krasMut
0,K21_B003995,,,
1,L22_1001000408,,Q787Q,
2,A8_B001557,,,
3,L22_B001016,,,
4,C20_B002073,,,
5,D8_B001474,,,
6,C5_B002572,,,
7,E17_B003116,,,
8,A15_B000420,,,
9,H10_B002573,,,


In [6]:
# READ IN patientMetadata
patientMetadata = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/cDNA_plate_metadata.csv')
patientMetadata = patientMetadata.drop([0,1]) # first two rows are wierd
patientMetadata

Unnamed: 0,plate,sample_type,patient_id,DOB,gender,race,smokingHx,histolgy,driver_gene,driver_mutation,...,sample_name,processing_status,physical_description,sort_data_exported,cell_density,cDNA_cells,sequenced_cells_passQC,Sequence_Run1,Sequence_Run2,Sequence_Run3
2,1001000332,cell_line,H1975,,,,,,,,...,CL_S1,Sequenced,,,,38.0,,170504_NS500126_0691_AHC22JBGX2,,
3,1001000330,cell_line,TPH1,,,,,,,,...,CL_S1,Sequenced,,,,37.0,,170504_NS500126_0691_AHC22JBGX2,,
4,1001000302,Lung_tumor,TH158,1959-11-23,Female,Native Hawaiian or Other Pacific Island,Never,Adenocarcinoma,EGFR,del19,...,LT_S01,Sequenced,,,,,44,170215_NS500126_0658_AH7TLYBGX2,,
5,1001000301,Lung_tumor,TH157,2016-11-25,Male,Asian,Never,Adenocarcinoma,ALK,fusion,...,LT_S02,Sequenced,,,,,27,170215_NS500126_0658_AH7TLYBGX2,,
6,1001000292,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,63,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
7,1001000293,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,64,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
8,1001000294,Lung_tumor,TH156,1973-03-17,Male,White or Caucasian,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,59,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
9,1001000295,Lung_tumor,TH156,1973-03-17,Male,B003109,Former,Adenocarcinoma,ALK,fusion,...,LT_S03,Sequenced,,,,,89,170125_NS500126_0647_AHVHJ2BGXY,170202_NS500126_0653_AHVL5NBGXY,
10,1001000296,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,71,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,
11,1001000297,Lung_tumor,TH166,1974-12-15,Male,Hispanic or Latino,<5py,Adenocarcinoma,BRAF,V600E,...,LT_S04,Sequenced,,y,,,79,170129_NS500126_0650_AHVM75BGXY,170202_NS500126_0653_AHVL5NBGXY,


In [7]:
# INIT THE SUMMARY TABLE
cols = ['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clin_mut_found_bool', 'mutations_found_EGFR', 'mutations_found_BRAF', 'mutations_found_KRAS', 'fusions_found', 'tumorCell_bool']
summaryTable = pd.DataFrame(columns=cols)
summaryTable['cell'] = mutationsDF['cell']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,,,,,,,,,,
1,L22_1001000408,,,,,,,,,,
2,A8_B001557,,,,,,,,,,
3,L22_B001016,,,,,,,,,,
4,C20_B002073,,,,,,,,,,
5,D8_B001474,,,,,,,,,,
6,C5_B002572,,,,,,,,,,
7,E17_B003116,,,,,,,,,,
8,A15_B000420,,,,,,,,,,
9,H10_B002573,,,,,,,,,,


In [8]:
# FILL IN VARIOUS METADATA COLS
summarizeLib.genericSummaryTableFillIn('patient_id', 'patient', summaryTable, patientMetadata)
summarizeLib.genericSummaryTableFillIn('driver_gene', 'clinical_driver_gene', summaryTable, patientMetadata)
summarizeLib.genericSummaryTableFillIn('driver_mutation', 'clinical_mutation', summaryTable, patientMetadata)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,,,,,,,,,,
4,C20_B002073,TH238_Normal,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [9]:
# FILL IN MUTATIONS FOUND COL 
summaryTable['mutations_found_EGFR'] = mutationsDF['egfrMut']
summaryTable['mutations_found_KRAS'] = mutationsDF['krasMut']
summaryTable['mutations_found_BRAF'] = mutationsDF['brafMut']
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,,,,,,,,,,
4,C20_B002073,TH238_Normal,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [10]:
# READ IN FUSIONS DATAFRAME, THEN FILL IN summaryTable
fusionsDF = pd.read_csv('/Users/lincoln.harris/code/SNP_calling_pipeline/summaryTable/fusion_dataframe.csv')
fusionsDF

Unnamed: 0,ALK--EML4,ALK_any,EML4_any,NTRK_any,RET_any,ROS1_any
0,C2_B000862,C2_B000862,L18_B003120,,,D10_B003523
1,P1_B001464,P1_B001464,D10_B003523,,,G5_1001000327
2,M11_B003522,M11_B003522,I22_B000276,,,O24_1001000377
3,G8_1001000317,G8_1001000317,A4_B001607,,,O23_1001000377
4,A7_10001000325,A7_10001000325,I6_B003642,,,A6_B003132
5,M12_B003522,M12_B003522,I4_B001607,,,H7_1001000377
6,B11_10001000325,B11_10001000325,P20_B002571,,,E10_B003528
7,G2_B000862,G2_B000862,E5_B001545,,,M19_B003777
8,J15_B000862,J15_B000862,E1_B003117,,,B3_B003187
9,E7_1001000317,E7_1001000317,O2_B003067,,,H2_B003126


In [11]:
summarizeLib.fusionsFillIn(fusionsDF, summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,
2,A8_B001557,TH179,BRAF,V600E,,,,,,,
3,L22_B001016,,,,,,,,,,
4,C20_B002073,TH238_Normal,BRAF,V600E,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,


In [12]:
# SET UP A COL TO TRANSLATE 'RAW' MUTATION CALLS TO 'CLINICAL'
summaryTable['mutations_found_translated'] = ""
summarizeLib.translatedMutsFillIn_EGFR(summaryTable)
summarizeLib.translatedMutsFillIn_nonEGFR('KRAS', summaryTable)
summarizeLib.translatedMutsFillIn_nonEGFR('BRAF', summaryTable)
summarizeLib.translatedMutsFillIn_fusions(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,,,,,,,[]
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,,[EGFR Q787Q]
2,A8_B001557,TH179,BRAF,V600E,,,,,,,,[]
3,L22_B001016,,,,,,,,,,,[]
4,C20_B002073,TH238_Normal,BRAF,V600E,,,,,,,,[]
5,D8_B001474,TH248,EGFR,del19,,,,,,,,[]
6,C5_B002572,TH266,ALK,fusion,,,,,,,,[]
7,E17_B003116,TH231,ALK,fusion,,,,,,,,[]
8,A15_B000420,TH238,BRAF,V600E,,,,,,,,[]
9,H10_B002573,TH266,ALK,fusion,,,,,,,,[]


In [13]:
# CONVERT LISTS TO STRING, SO I CAN GET SET -- probably not necessary, actually 
summarizeLib.convertToString(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,,Q787Q,,,,,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,,,,,,,
3,L22_B001016,,,,,,,,,,,
4,C20_B002073,TH238_Normal,BRAF,V600E,,,,,,,,
5,D8_B001474,TH248,EGFR,del19,,,,,,,,
6,C5_B002572,TH266,ALK,fusion,,,,,,,,
7,E17_B003116,TH231,ALK,fusion,,,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,,,,,,,
9,H10_B002573,TH266,ALK,fusion,,,,,,,,


In [14]:
# FILL IN clin_mut_found_bool
summarizeLib.clinMutFound_fillIn(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,0,,,,,,
1,L22_1001000408,TH185,EGFR,L858R,,0,Q787Q,,,,,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,0,,,,,,
3,L22_B001016,,,,,0,,,,,,
4,C20_B002073,TH238_Normal,BRAF,V600E,,0,,,,,,
5,D8_B001474,TH248,EGFR,del19,,0,,,,,,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,,
8,A15_B000420,TH238,BRAF,V600E,,0,,,,,,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,,


In [15]:
# FILL IN  tumorCellBool
summarizeLib.tumorCellBoolFillIn(summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,,0,,,,,0,
1,L22_1001000408,TH185,EGFR,L858R,,0,Q787Q,,,,1,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,,0,,,,,0,
3,L22_B001016,,,,,0,,,,,0,
4,C20_B002073,TH238_Normal,BRAF,V600E,,0,,,,,0,
5,D8_B001474,TH248,EGFR,del19,,0,,,,,1,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,0,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,1,
8,A15_B000420,TH238,BRAF,V600E,,0,,,,,0,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,0,


In [16]:
# GET PER-CELL ROI COVERAGE DFs
braf_V600E_cov_nonZero = summarizeLib.getNonZeroCovROI('braf', 'V600E')
egfr_L858R_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'L858R')
egfr_exon19del_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'exon19del')
egfr_exon20ins_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'exon20ins') # this guy is totally empty...
egfr_G719X_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'G719X')
egfr_L861Q_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'L861Q')
egfr_S768I_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'S768I')
egfr_T790M_cov_nonZero = summarizeLib.getNonZeroCovROI('egfr', 'T790M')
kras_G12C_cov_nonZero = summarizeLib.getNonZeroCovROI('kras', 'G12C')
kras_G13X_cov_nonZero = summarizeLib.getNonZeroCovROI('kras', 'G13X')
kras_Q61X_cov_nonZero = summarizeLib.getNonZeroCovROI('kras', 'Q61X')
egfr_L858R_cov_nonZero

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
66,A13_1001000407,1,12,1,23
67,A13_1001000408,0,0,1,2
121,A15_1001000408,0,0,1,3
153,A16_1001000407,1,6,1,15
155,A16_1001000412,1,16,1,29
185,A17_1001000407,0,0,1,5
212,A18_1001000408,0,0,1,2
285,A20_1001000408,1,6,1,14
384,A3_1001000412,0,0,1,3
542,B10_1001000412,1,3,1,7


In [17]:
# FIX UP SOME OF THE WEIRD ONES
kras_G13X_cov_nonZero['depth_gvcf'][4202] = 34	
kras_Q61X_cov_nonZero['depth_gvcf'][6431] = 92
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip('[')
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip(']')
egfr_exon19del_cov_nonZero['depth_gvcf'] = egfr_exon19del_cov_nonZero['depth_gvcf'].str.strip("'")
egfr_exon19del_cov_nonZero

Unnamed: 0,cellName,coverage_bool_vcf,depth_vcf,coverage_bool_gvcf,depth_gvcf
0,A10_1001000407,0,0,0,0
1,A10_1001000408,0,0,0,0
2,A10_1001000412,0,0,0,0
3,A10_B000863,0,0,0,0
4,A10_B001007,0,0,0,0
5,A10_B001470,0,0,0,0
6,A10_B001474,0,0,0,0
7,A10_B001545,0,0,0,0
8,A10_B001548,0,0,0,0
9,A10_B001554,0,0,0,0


In [18]:
# FILL IN ROI COVERAGE TO SUMMARY TABLE
summarizeLib.ROI_coverage_fillIn(braf_V600E_cov_nonZero, 'BRAF', 'V600E', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_G719X_cov_nonZero, 'EGFR', 'G719X', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_L858R_cov_nonZero, 'EGFR', 'L858R', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_L861Q_cov_nonZero, 'EGFR', 'L861Q', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_S768I_cov_nonZero, 'EGFR', 'S768I', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_T790M_cov_nonZero, 'EGFR', 'T790M', summaryTable)
summarizeLib.ROI_coverage_fillIn(kras_G12C_cov_nonZero, 'KRAS', 'G12C', summaryTable)
summarizeLib.ROI_coverage_fillIn(kras_G13X_cov_nonZero, 'KRAS', 'G13X', summaryTable)
summarizeLib.ROI_coverage_fillIn(kras_Q61X_cov_nonZero, 'KRAS', 'Q61X', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_exon19del_cov_nonZero, 'EGFR', 'del19', summaryTable)
summarizeLib.ROI_coverage_fillIn(egfr_exon20ins_cov_nonZero, 'EGFR', 'ins20', summaryTable)
summaryTable

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,coverage_to_ROI,clin_mut_found_bool,mutations_found_EGFR,mutations_found_BRAF,mutations_found_KRAS,fusions_found,tumorCell_bool,mutations_found_translated
0,K21_B003995,TH236,EGFR,del19,0,0,,,,,0,
1,L22_1001000408,TH185,EGFR,L858R,0,0,Q787Q,,,,1,EGFR Q787Q
2,A8_B001557,TH179,BRAF,V600E,0,0,,,,,0,
3,L22_B001016,,,,,0,,,,,0,
4,C20_B002073,TH238_Normal,BRAF,V600E,0,0,,,,,0,
5,D8_B001474,TH248,EGFR,del19,0,0,,,,,1,
6,C5_B002572,TH266,ALK,fusion,,0,,,,,0,
7,E17_B003116,TH231,ALK,fusion,,0,,,,,1,
8,A15_B000420,TH238,BRAF,V600E,0,0,,,,,0,
9,H10_B002573,TH266,ALK,fusion,,0,,,,,0,


In [19]:
# TRIM IT DOWN
summaryTable_trimmed = summaryTable[['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clin_mut_found_bool', 'tumorCell_bool', 'mutations_found_translated']]
summaryTable_trimmed.columns = ['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'coverage_to_ROI', 'clinical_mutation_found_bool', 'tumorCell_bool', 'mutations_found']
summaryTable_trimmed = summaryTable_trimmed[['cell', 'patient', 'clinical_driver_gene', 'clinical_mutation', 'mutations_found', 'coverage_to_ROI', 'clinical_mutation_found_bool', 'tumorCell_bool']]
summaryTable_trimmed

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool
0,K21_B003995,TH236,EGFR,del19,,0,0,0
1,L22_1001000408,TH185,EGFR,L858R,EGFR Q787Q,0,0,1
2,A8_B001557,TH179,BRAF,V600E,,0,0,0
3,L22_B001016,,,,,,0,0
4,C20_B002073,TH238_Normal,BRAF,V600E,,0,0,0
5,D8_B001474,TH248,EGFR,del19,,0,0,1
6,C5_B002572,TH266,ALK,fusion,,,0,0
7,E17_B003116,TH231,ALK,fusion,,,0,1
8,A15_B000420,TH238,BRAF,V600E,,0,0,0
9,H10_B002573,TH266,ALK,fusion,,,0,0


In [20]:
# WRITE TO FILE
summaryTable_trimmed.to_csv('/Users/lincoln.harris/Desktop/validationTable_TEST.csv', index=False)

In [29]:
# ADD SAMPLE_NAME COL TO SUMMARYTABLE
summaryTable_trimmed['sample_name'] = ''
summarizeLib.genericSummaryTableFillIn('sample_name', 'sample_name', summaryTable_trimmed, patientMetadata)
summaryTable_trimmed

Unnamed: 0,cell,patient,clinical_driver_gene,clinical_mutation,mutations_found,coverage_to_ROI,clinical_mutation_found_bool,tumorCell_bool,sample_name
0,K21_B003995,TH236,EGFR,del19,,0,0,0,LT_S71
1,L22_1001000408,TH185,EGFR,L858R,EGFR Q787Q,0,0,1,LT_S21
2,A8_B001557,TH179,BRAF,V600E,,0,0,0,LT_S80
3,L22_B001016,,,,,,0,0,
4,C20_B002073,TH238_Normal,BRAF,V600E,,0,0,0,LT_S65
5,D8_B001474,TH248,EGFR,del19,,0,0,1,LT_S74
6,C5_B002572,TH266,ALK,fusion,,,0,0,LT_S81
7,E17_B003116,TH231,ALK,fusion,,,0,1,LT_S56
8,A15_B000420,TH238,BRAF,V600E,,0,0,0,LT_S66
9,H10_B002573,TH266,ALK,fusion,,,0,0,LT_S81


In [30]:
# GET MIN SET OF SAMPLE NAMES
relevantSamplesSet = set(summaryTable_trimmed['sample_name'])
relevantSamplesList = list(relevantSamplesSet)
relevantSamplesSeries = pd.Series(relevantSamplesList)
relevantSamplesSeries

0           
1     LT_S09
2     LT_S48
3     LT_S41
4     LT_S80
5     LT_S13
6     LT_S16
7     LT_S79
8     LT_S81
9     LT_S66
10    LT_S49
11    LT_S03
12    LT_S69
13    LT_S17
14    LT_S05
15    LT_S63
16    LT_S55
17    LT_S35
18    LT_S12
19    LT_S67
20    LT_S74
21    LT_S51
22    LT_S23
23    LT_S78
24    LT_S50
25    LT_S72
26    LT_S42
27    LT_S21
28    LT_S07
29    LT_S22
30    LT_S65
31    LT_S01
32    LT_S02
33    LT_S11
34    LT_S53
35    LT_S57
36    LT_S19
37    LT_S34
38    LT_S44
39    LT_S75
40    LT_S58
41    LT_S08
42    LT_S54
43    LT_S28
44    LT_S71
45    LT_S47
46    LT_S52
47    LT_S43
48    LT_S56
49    LT_S45
50    LT_S37
51    LT_S38
52    LT_S29
dtype: object

In [31]:
# INIT VALIDATIONTABLE_SAMPLES
cols = ['sample', 'patient', 'driver_gene', 'driver_mutation', 'mutations_found', 'numCells', 'numTumorCells', 'numTumorCells_w_coverage_to_ROI', 'numTumorCells_clinMut_found']
validationTable_samples = pd.DataFrame(columns=cols)
validationTable_samples['sample'] = relevantSamplesSeries
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,,,,,
1,LT_S09,,,,,,,,
2,LT_S48,,,,,,,,
3,LT_S41,,,,,,,,
4,LT_S80,,,,,,,,
5,LT_S13,,,,,,,,
6,LT_S16,,,,,,,,
7,LT_S79,,,,,,,,
8,LT_S81,,,,,,,,
9,LT_S66,,,,,,,,


In [32]:
# FILL IN METADATA FIELDS
summarizeLib.validationTable_metadata_fillIn('patient_id', 'patient', validationTable_samples, patientMetadata)
summarizeLib.validationTable_metadata_fillIn('driver_gene', 'driver_gene', validationTable_samples, patientMetadata)
summarizeLib.validationTable_metadata_fillIn('driver_mutation', 'driver_mutation', validationTable_samples, patientMetadata)
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,,,,,
1,LT_S09,TH067,EGFR,del19,,,,,
2,LT_S48,TH155,EGFR,del19,,,,,
3,LT_S41,TH210,ALK,fusion,,,,,
4,LT_S80,TH179,BRAF,V600E,,,,,
5,LT_S13,TH169,EGFR,L858R,,,,,
6,LT_S16,TH146,ROS1,ROS1-CD74,,,,,
7,LT_S79,TH179,BRAF,V600E,,,,,
8,LT_S81,TH266,ALK,fusion,,,,,
9,LT_S66,TH238,BRAF,V600E,,,,,


In [33]:
# FILL IN MUTATIONS FOUND
muts_dict = summarizeLib.validationTable_dict_muts(validationTable_samples, summaryTable_trimmed)
validationTable_samples['mutations_found'] = muts_dict.values()
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,"EGFR Q787Q, KRAS G13D, EGFR R521K, BRAF V600E...",,,,
1,LT_S09,TH067,EGFR,del19,,,,,
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",,,,
3,LT_S41,TH210,ALK,fusion,,,,,
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",,,,
5,LT_S13,TH169,EGFR,L858R,,,,,
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",,,,
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",,,,
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",,,,
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",,,,


In [34]:
# FILL IN NUMTUMORCELLS
tc_dict = summarizeLib.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'tumorCell_bool')
tc_cov_dict = summarizeLib.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'coverage_to_ROI')
clinMut_dict = summarizeLib.validationTable_dict_generic(validationTable_samples, summaryTable_trimmed, 'clinical_mutation_found_bool')

validationTable_samples['numTumorCells'] = tc_dict.values()
validationTable_samples['numTumorCells_w_coverage_to_ROI'] = tc_cov_dict.values()
validationTable_samples['numTumorCells_clinMut_found'] = clinMut_dict.values()
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numCells,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
0,,,,,"EGFR Q787Q, KRAS G13D, EGFR R521K, BRAF V600E...",,6,0,0
1,LT_S09,TH067,EGFR,del19,,,0,0,0
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",,0,0,0
3,LT_S41,TH210,ALK,fusion,,,15,0,0
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",,13,0,0
5,LT_S13,TH169,EGFR,L858R,,,1,0,0
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",,0,0,0
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",,60,3,3
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",,0,0,0
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",,260,2,2


In [35]:
# CLEAN UP A BIT
validationTable_samples = validationTable_samples.drop([0]) # this can change
cols = ['sample', 'patient', 'driver_gene', 'driver_mutation', 'mutations_found', 'numTumorCells', 'numTumorCells_w_coverage_to_ROI', 'numTumorCells_clinMut_found']
validationTable_samples = validationTable_samples[cols]
validationTable_samples

Unnamed: 0,sample,patient,driver_gene,driver_mutation,mutations_found,numTumorCells,numTumorCells_w_coverage_to_ROI,numTumorCells_clinMut_found
1,LT_S09,TH067,EGFR,del19,,0,0,0
2,LT_S48,TH155,EGFR,del19,"EML4 fusion, BRAF G469R,",0,0,0
3,LT_S41,TH210,ALK,fusion,,15,0,0
4,LT_S80,TH179,BRAF,V600E,"KRAS A146V,",13,0,0
5,LT_S13,TH169,EGFR,L858R,,1,0,0
6,LT_S16,TH146,ROS1,ROS1-CD74,"ROS1 fusion,",0,0,0
7,LT_S79,TH179,BRAF,V600E,"EML4 fusion, BRAF V600E, KRAS A146P,",60,3,3
8,LT_S81,TH266,ALK,fusion,"BRAF G643G, KRAS G13C, BRAF Q609H, EML4 fusion...",0,0,0
9,LT_S66,TH238,BRAF,V600E,"KRAS G12C, EGFR F856L, EGFR G598V, EML4 fusion...",260,2,2
10,LT_S49,TH223,EGFR,del19,"EGFR T903T,",7,0,0


In [36]:
# write this bitch
validationTable_samples.to_csv('./validationTable_samples_4.1.19.csv', index=False)