# Parsing data down to usable data

#### In this notebook, we take the output of biopsyAnnotation and apply a series of removals to it:
1. Create a template for investigating data
2. Investigate all data, remove those that may be repeated (look for spaces in the biopsy name) [1stParse]
3. Use template to look at all data 
4. Remove biopsies w/ missing tissue or consent [2ndParse]
5. Remove those w/ no FFPE tissue [3rdParse]
6. Remove those w/ missing screenshots from BrainLab Neuronav software [4thParse]
7. Remove those w/o pathological outcome assigned in multnom_out (missing pathology) [5thParse]
8. Remove those whose imaging failed to be quantified for some reason [6thParse]
9. Remove ependymomas and other extraneous histologies [7thParse]
10. Remove necrotic samples [8thParse]
11. Only include rHGG or TxE [9thParse]

## 1. Create template: 

In [86]:
# Let the notebook display wide/long tables in full.
options(repr.matrix.max.rows = 600, repr.matrix.max.cols = 200)

# Blank 3 x 4 summary template: one row per data source plus a TOTAL row.
# Only the "data" column is filled in; the count columns stay NA until a
# parsing step copies the template and writes its own counts.
temp <- matrix(
  NA_character_,
  nrow = 3, ncol = 4,
  dimnames = list(NULL, c("data", "patients", "scans", "samples"))
)
temp[, "data"] <- c("old_po1", "REC_HGG", "TOTAL")

## 2. Investigate data, remove those that look repeated, look for those w/ spaces in biopsy names (when merged w/ igt_stats data) 

In [87]:
# Load the annotated biopsy table produced by the annotation step.
recgli <- read.csv(
  file = "../AnnotateData/REC_HGG_Annotation_Oct2018/10thAnnot_researchPath_withInVivo.csv"
)

In [88]:
# Rows (biopsies) and columns in the freshly loaded table.
c(nrow(recgli), ncol(recgli))

In [89]:
## Count repeated ROI labels, then display the rows flagged as repeats:
table(duplicated(recgli[["roi.label"]]))
recgli[which(duplicated(recgli[["roi.label"]])), ]


FALSE  TRUE 
  299     4 

Unnamed: 0,b_number,t_number,sf_number,current_scan_date,current_surgery_date,current_hist_grade,current_hist_type,roi.label,tumor_cell_evaluation,necrosis,mv_hyperplasia,f8_delicate,f8_simple,f8_complex,mib_1,avgerage_cells,bx_pure_treatment_effect,percent_tumor_nuclei,nfse,nfl,nt1c,nt1v,nt1d,nadc.1,nfa.1,nadc.2,nfa.2,phn_nlin,cbvn_nlin,recov_nlin,phn_npar,recov_npar,recovn_npar,cni,ccri,crni,ncho,ncre,nnaa,nlip,nlac,laclip,comments,notes.,imaging_code,perf_quant,spec_quant,include_anat,include_diffu1000,include_diffu2000,include_diffu_all,include_perf,include_spec,include_one_advanced_mod,include_all_advanced_mods,X.CEL,X.NEL,X.NEC,sum,in_CEL,in_T2all,in_NEL,in_NEC,in_ROI,desired_hist,multnom_out,no_ffpe,waiting_on_path,rhgg_txe_analysis
65,3645,9213,10138,2014-10-22,2014-10-23,Grade IV,Glioblastoma,49B74,3,0,2,,,,40.81,,f,80,1.65,1.55,1.32,0.66,-6.92,1.7,0.5,1.74,0.61,1.29,1.62,100,1.24,100,100,3.51,2.14,0.94,0.69,0.26,0.07,0.12,0.28,0.4,,,,,0,1,1,1,1,1,1,1,1,95.74,4.26,,100,1,1,0,0,1,1,rHGG,0,0,1
72,3670,9288,10199,2014-12-16,2014-12-17,Grade IV,Glioblastoma,49B75,2,0,2,,,,5.92,261.0,,70,1.74,1.82,1.35,0.92,-3.35,1.64,0.72,1.71,0.72,1.12,1.09,100,1.04,100,100,,,,,,,,,,"""foci of macrophages""",,,,1,1,1,1,1,1,0,1,0,58.33,41.67,0.0,100,1,1,0,0,1,1,rHGG,0,0,1
105,3791,9970,10507,2015-08-25,2015-08-26,Treatment Effect,Treatment Effect,49B78,3,0,1,,,,10.53,339.0,f,85,2.11,1.83,0.96,0.68,-2.03,2.077567,0.6070571,1.99,0.61,0.57,0.59,100,0.55,100,100,1.54,0.51,1.02,0.68,0.51,0.27,0.86,0.86,0.86,,,,,0,1,0,1,1,1,1,1,1,20.0,80.0,0.0,100,0,1,1,0,1,1,rHGG,0,0,1
208,3783,11192,11150,2017-01-29,2017-01-30,Grade III,Astrocytoma,36B97,3,0,1,2.0,1.0,0.0,9.56,207.2,f,75,1.8,1.86,1.39,1.26,-1.17,1.67,0.73,1.69,1.06,0.79,0.67,100,0.75,100,100,5.83,4.55,0.31,1.97,0.9,0.81,0.05,0.05,0.05,,,,,0,1,1,1,1,1,1,1,1,33.33,66.67,,100,1,1,1,0,1,1,rHGG,0,0,1


In [90]:
## The repeated labels look like exact copies of an earlier row (same values
## in every column). Spot-check one label before dropping them:
recgli[grepl("49B74", recgli$roi.label), ]
## The copies match, so keep only the first row seen for each roi.label:
recgli <- recgli[!duplicated(recgli[["roi.label"]]), ]
dim(recgli)

Unnamed: 0,b_number,t_number,sf_number,current_scan_date,current_surgery_date,current_hist_grade,current_hist_type,roi.label,tumor_cell_evaluation,necrosis,mv_hyperplasia,f8_delicate,f8_simple,f8_complex,mib_1,avgerage_cells,bx_pure_treatment_effect,percent_tumor_nuclei,nfse,nfl,nt1c,nt1v,nt1d,nadc.1,nfa.1,nadc.2,nfa.2,phn_nlin,cbvn_nlin,recov_nlin,phn_npar,recov_npar,recovn_npar,cni,ccri,crni,ncho,ncre,nnaa,nlip,nlac,laclip,comments,notes.,imaging_code,perf_quant,spec_quant,include_anat,include_diffu1000,include_diffu2000,include_diffu_all,include_perf,include_spec,include_one_advanced_mod,include_all_advanced_mods,X.CEL,X.NEL,X.NEC,sum,in_CEL,in_T2all,in_NEL,in_NEC,in_ROI,desired_hist,multnom_out,no_ffpe,waiting_on_path,rhgg_txe_analysis
64,3645,9213,10138,2014-10-22,2014-10-23,Grade IV,Glioblastoma,49B74,3,0,2,,,,40.81,,f,80,1.65,1.55,1.32,0.66,-6.92,1.7,0.5,1.74,0.61,1.29,1.62,100,1.24,100,100,3.51,2.14,0.94,0.69,0.26,0.07,0.12,0.28,0.4,,,,,0,1,1,1,1,1,1,1,1,95.74,4.26,,100,1,1,0,0,1,1,rHGG,0,0,1
65,3645,9213,10138,2014-10-22,2014-10-23,Grade IV,Glioblastoma,49B74,3,0,2,,,,40.81,,f,80,1.65,1.55,1.32,0.66,-6.92,1.7,0.5,1.74,0.61,1.29,1.62,100,1.24,100,100,3.51,2.14,0.94,0.69,0.26,0.07,0.12,0.28,0.4,,,,,0,1,1,1,1,1,1,1,1,95.74,4.26,,100,1,1,0,0,1,1,rHGG,0,0,1


In [91]:
## Show any biopsies whose ROI label contains a space:
recgli[grepl(" ", recgli$roi.label), ]

b_number,t_number,sf_number,current_scan_date,current_surgery_date,current_hist_grade,current_hist_type,roi.label,tumor_cell_evaluation,necrosis,mv_hyperplasia,f8_delicate,f8_simple,f8_complex,mib_1,avgerage_cells,bx_pure_treatment_effect,percent_tumor_nuclei,nfse,nfl,nt1c,nt1v,nt1d,nadc.1,nfa.1,nadc.2,nfa.2,phn_nlin,cbvn_nlin,recov_nlin,phn_npar,recov_npar,recovn_npar,cni,ccri,crni,ncho,ncre,nnaa,nlip,nlac,laclip,comments,notes.,imaging_code,perf_quant,spec_quant,include_anat,include_diffu1000,include_diffu2000,include_diffu_all,include_perf,include_spec,include_one_advanced_mod,include_all_advanced_mods,X.CEL,X.NEL,X.NEC,sum,in_CEL,in_T2all,in_NEL,in_NEC,in_ROI,desired_hist,multnom_out,no_ffpe,waiting_on_path,rhgg_txe_analysis


In [92]:
## Strip every space from the ROI labels so the duplicate check can be re-run
## on the cleaned labels:
recgli$roi.label <- gsub(" ", "", recgli$roi.label, fixed = TRUE)

In [93]:
# Re-count repeated ROI labels now that spaces have been removed.
table(duplicated(recgli[["roi.label"]]))


FALSE 
  299 

In [94]:
## great, so the spaces don't really matter all that much it turns out. Now we have 299 biopsies to start with. 

In [95]:
# Save the de-duplicated table as the 1st parse. `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
write.csv(recgli, "Parsing_REC_HGG_Oct2018/1stParse_researchpath_withInVivo.csv", row.names = FALSE)

## 3. Use template to look at all data


In [96]:
# Reload the 1st-parse table and report its rows and columns.
recgli <- read.csv(file = "Parsing_REC_HGG_Oct2018/1stParse_researchpath_withInVivo.csv")
c(nrow(recgli), ncol(recgli))

In [97]:
# Tag every row with its data source: olddata = 0 and newdata = 1 throughout,
# so all rows are tallied under the second (REC_HGG) row of the summaries.
recgli <- cbind(recgli, olddata = 0, newdata = 1)

In [98]:
# Summary of everything available before any exclusion.
# Row 1 counts rows flagged olddata, row 2 rows flagged newdata, row 3 is
# their sum. A patient (b_number) or scan (t_number) is counted on the first
# row where it appears in the table.
all_possible <- temp
all_possible[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli$b_number)
  first_scan <- !duplicated(recgli$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(tally(recgli$olddata == 1), tally(recgli$newdata == 1))
})
# The matrix is character, so convert back to numbers before adding.
all_possible[3, 2:4] <- as.numeric(all_possible[1, 2:4]) + as.numeric(all_possible[2, 2:4])
all_possible

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,91,95,299
TOTAL,91,95,299


## 4. Remove samples without tissue/consent: 

In [99]:
# Drop biopsies whose imaging_code is "tiss_cons" (missing tissue or consent).
# A logical mask is used instead of `-which(...)`: when no row matches,
# `-which(...)` is `-integer(0)` and would silently select ZERO rows.
# `%in%` is never NA, so rows with a missing imaging_code are kept, exactly
# as the original `which()`-based filter kept them.
recgli.noconsent <- recgli[!(recgli$imaging_code %in% "tiss_cons"), ]
dim(recgli.noconsent)

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
noconsent <- temp
noconsent[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.noconsent$b_number)
  first_scan <- !duplicated(recgli.noconsent$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(tally(recgli.noconsent$olddata == 1), tally(recgli.noconsent$newdata == 1))
})
# The matrix is character, so convert back to numbers before adding.
noconsent[3, 2:4] <- as.numeric(noconsent[1, 2:4]) + as.numeric(noconsent[2, 2:4])
noconsent

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,89,93,294
TOTAL,89,93,294


In [100]:
# Save the 2nd parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.noconsent, "Parsing_REC_HGG_Oct2018/2ndparse_researchPath_withInVivo.csv", row.names = FALSE)

## 5. Remove those w/ no FFPE tissue

In [101]:
# Reload the 2nd-parse table (tissue/consent exclusions already applied).
recgli.noconsent <- read.csv(
  file = "Parsing_REC_HGG_Oct2018/2ndparse_researchPath_withInVivo.csv"
)

In [102]:
# Drop biopsies flagged no_ffpe == 1 (no FFPE tissue).
# A logical mask is used instead of `-which(...)`: when no row matches,
# `-which(...)` is `-integer(0)` and would silently select ZERO rows.
# `%in%` is never NA, so rows with a missing no_ffpe flag are kept, exactly
# as the original `which()`-based filter kept them.
recgli.noffpe <- recgli.noconsent[!(recgli.noconsent$no_ffpe %in% 1), ]
dim(recgli.noffpe) # row count matches the "samples" total printed below

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
noffpe <- temp
noffpe[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.noffpe$b_number)
  first_scan <- !duplicated(recgli.noffpe$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(tally(recgli.noffpe$olddata == 1), tally(recgli.noffpe$newdata == 1))
})
# The matrix is character, so convert back to numbers before adding.
noffpe[3, 2:4] <- as.numeric(noffpe[1, 2:4]) + as.numeric(noffpe[2, 2:4])
noffpe

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,89,92,272
TOTAL,89,92,272


In [103]:
# Save the 3rd parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.noffpe, "Parsing_REC_HGG_Oct2018/3rdParse_researchPath_withInVivo.csv", row.names = FALSE)

## 6. Remove those w/ missing screenshots from BrainLab Neuronav software


In [104]:
# Reload the 3rd-parse table and report its rows and columns.
recgli.noffpe <- read.csv(file = "Parsing_REC_HGG_Oct2018/3rdParse_researchPath_withInVivo.csv")
c(nrow(recgli.noffpe), ncol(recgli.noffpe))

In [105]:
# Drop biopsies whose imaging_code is "scrn" (missing BrainLab screenshots).
# A logical mask is used instead of `-which(...)`: when no row matches,
# `-which(...)` is `-integer(0)` and would silently select ZERO rows.
# `%in%` is never NA, so rows with a missing imaging_code are kept, exactly
# as the original `which()`-based filter kept them.
recgli.noscreenshots <- recgli.noffpe[!(recgli.noffpe$imaging_code %in% "scrn"), ]
dim(recgli.noscreenshots) # row count matches the "samples" total printed below

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
noscreenshots <- temp
noscreenshots[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.noscreenshots$b_number)
  first_scan <- !duplicated(recgli.noscreenshots$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(
    tally(recgli.noscreenshots$olddata == 1),
    tally(recgli.noscreenshots$newdata == 1)
  )
})
# The matrix is character, so convert back to numbers before adding.
noscreenshots[3, 2:4] <- as.numeric(noscreenshots[1, 2:4]) + as.numeric(noscreenshots[2, 2:4])
noscreenshots

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,86,89,261
TOTAL,86,89,261


In [106]:
# Save the 4th parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.noscreenshots, "Parsing_REC_HGG_Oct2018/4thParse_researchPath_withInVivo.csv", row.names = FALSE)

## 7. Remove those w/o pathological outcome assigned in multnom_out (missing pathology)


In [107]:
# Reload the 4th-parse table (screenshot exclusions already applied).
recgli.noscreenshots <- read.csv(
  file = "Parsing_REC_HGG_Oct2018/4thParse_researchPath_withInVivo.csv"
)

In [108]:
# Positions of biopsies whose multnom_out contains "dneDNI", i.e. no
# pathological outcome was assigned.
nopath_index <- grep("dneDNI", recgli.noscreenshots$multnom_out)
# Keep the complement by position. `df[-nopath_index, ]` is avoided because
# an empty `nopath_index` would make it select ZERO rows instead of all rows.
recgli.nopath <- recgli.noscreenshots[
  setdiff(seq_len(nrow(recgli.noscreenshots)), nopath_index),
]
dim(recgli.nopath) # row count matches the "samples" total printed below

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
nopath <- temp
nopath[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.nopath$b_number)
  first_scan <- !duplicated(recgli.nopath$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(tally(recgli.nopath$olddata == 1), tally(recgli.nopath$newdata == 1))
})
# The matrix is character, so convert back to numbers before adding.
nopath[3, 2:4] <- as.numeric(nopath[1, 2:4]) + as.numeric(nopath[2, 2:4])
nopath

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,69,71,205
TOTAL,69,71,205


In [109]:
# Save the 5th parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.nopath, "Parsing_REC_HGG_Oct2018/5thParse_researchPath_withInVivo.csv", row.names = FALSE)

In [110]:
# Look up the rows for scan t_number 5934 in the pre-pathology-filter table.
# Exact matching is used: `grep(5934, ...)` is a substring regex match and
# would also return IDs that merely contain "5934" (e.g. 15934 or 59340).
recgli.noscreenshots[recgli.noscreenshots$t_number %in% 5934, ]

b_number,t_number,sf_number,current_scan_date,current_surgery_date,current_hist_grade,current_hist_type,roi.label,tumor_cell_evaluation,necrosis,mv_hyperplasia,f8_delicate,f8_simple,f8_complex,mib_1,avgerage_cells,bx_pure_treatment_effect,percent_tumor_nuclei,nfse,nfl,nt1c,nt1v,nt1d,nadc.1,nfa.1,nadc.2,nfa.2,phn_nlin,cbvn_nlin,recov_nlin,phn_npar,recov_npar,recovn_npar,cni,ccri,crni,ncho,ncre,nnaa,nlip,nlac,laclip,comments,notes.,imaging_code,perf_quant,spec_quant,include_anat,include_diffu1000,include_diffu2000,include_diffu_all,include_perf,include_spec,include_one_advanced_mod,include_all_advanced_mods,X.CEL,X.NEL,X.NEC,sum,in_CEL,in_T2all,in_NEL,in_NEC,in_ROI,desired_hist,multnom_out,no_ffpe,waiting_on_path,rhgg_txe_analysis,olddata,newdata


## 8. Remove those whose imaging failed to be quantified for some reason 


In [111]:
# Reload the 5th-parse table and report its rows and columns.
recgli.nopath <- read.csv(file = "Parsing_REC_HGG_Oct2018/5thParse_researchPath_withInVivo.csv")
c(nrow(recgli.nopath), ncol(recgli.nopath))

In [112]:
# Keep only biopsies that have a value in nfse; a missing nfse is used here
# as the marker that imaging could not be quantified. Despite its name,
# `recgli.imagingfailed` holds the rows that REMAIN after this exclusion.
recgli.imagingfailed <- recgli.nopath[which(!is.na(recgli.nopath$nfse)), ]
dim(recgli.imagingfailed) # row count matches the "samples" total printed below

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
imagingfailed <- temp
imagingfailed[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.imagingfailed$b_number)
  first_scan <- !duplicated(recgli.imagingfailed$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(
    tally(recgli.imagingfailed$olddata == 1),
    tally(recgli.imagingfailed$newdata == 1)
  )
})
# The matrix is character, so convert back to numbers before adding.
imagingfailed[3, 2:4] <- as.numeric(imagingfailed[1, 2:4]) + as.numeric(imagingfailed[2, 2:4])
imagingfailed

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,68,70,195
TOTAL,68,70,195


In [113]:
# Save the 6th parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.imagingfailed, "Parsing_REC_HGG_Oct2018/6thParse_researchPath_withInVivo.csv", row.names = FALSE)

## 9. Remove ependymomas and other extraneous histologies


In [114]:
# Reload the 6th-parse table (unquantified-imaging exclusions already applied).
recgli.imagingfailed <- read.csv(
  file = "Parsing_REC_HGG_Oct2018/6thParse_researchPath_withInVivo.csv"
)

In [115]:
# Rows and columns of the reloaded table, followed by its column names.
c(nrow(recgli.imagingfailed), ncol(recgli.imagingfailed))
names(recgli.imagingfailed)

In [116]:
# Keep only biopsies flagged desired_hist == 1 (drops ependymomas and other
# extraneous histologies). `%in%` is used instead of `==` because subsetting
# a data frame with an NA condition inserts an all-NA row for every missing
# desired_hist value; with `%in%` such rows are simply excluded.
recgli.desiredhist <- recgli.imagingfailed[recgli.imagingfailed$desired_hist %in% 1, ]
dim(recgli.desiredhist)

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
desiredhist <- temp
desiredhist[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.desiredhist$b_number)
  first_scan <- !duplicated(recgli.desiredhist$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(
    tally(recgli.desiredhist$olddata == 1),
    tally(recgli.desiredhist$newdata == 1)
  )
})
# The matrix is character, so convert back to numbers before adding.
desiredhist[3, 2:4] <- as.numeric(desiredhist[1, 2:4]) + as.numeric(desiredhist[2, 2:4])
desiredhist

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,67,69,192
TOTAL,67,69,192


In [117]:
# Save the 7th parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.desiredhist, file = "Parsing_REC_HGG_Oct2018/7thParse_researchPath_withInVivo.csv", row.names = FALSE)

## 10. Removing Necrotic samples

There are two definitions of necrosis: a sample is in the NEC as defined by X.NEC, or it is necrotic based on its pathology (with a "necrosis" score of 2).

In [118]:
# Reload the 7th-parse table (histology exclusions already applied).
recgli.desiredhist <- read.csv(
  file = "Parsing_REC_HGG_Oct2018/7thParse_researchPath_withInVivo.csv"
)

In [119]:
# Rows and columns of the reloaded 7th-parse table.
c(nrow(recgli.desiredhist), ncol(recgli.desiredhist))

In [120]:
# Drop necrotic biopsies under either definition used here: a pathology
# necrosis score of 2, or in_NEC == 1.
# A single logical mask replaces the two chained `-which(...)` filters: when
# a filter matches no row, `-which(...)` is `-integer(0)` and would silently
# select ZERO rows. `%in%` is never NA, so rows with a missing necrosis or
# in_NEC value are kept, as the original `which()`-based filters kept them.
# The stray `desiredhist = temp` that used to open this cell is removed; it
# overwrote the previous step's summary table with the blank template.
recgli.nonecrosis <- recgli.desiredhist[
  !(recgli.desiredhist$necrosis %in% 2 | recgli.desiredhist$in_NEC %in% 1),
]
dim(recgli.nonecrosis)

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
nonecrosis <- temp
nonecrosis[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.nonecrosis$b_number)
  first_scan <- !duplicated(recgli.nonecrosis$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(
    tally(recgli.nonecrosis$olddata == 1),
    tally(recgli.nonecrosis$newdata == 1)
  )
})
# The matrix is character, so convert back to numbers before adding.
nonecrosis[3, 2:4] <- as.numeric(nonecrosis[1, 2:4]) + as.numeric(nonecrosis[2, 2:4])
nonecrosis

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,62,64,173
TOTAL,62,64,173


In [121]:
# Save the 8th parse. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
write.csv(recgli.nonecrosis, file = "Parsing_REC_HGG_Oct2018/8thParse_researchPath_withInVivo.csv", row.names = FALSE)

## 11. Only including rHGG or TxE designation: 

In [122]:
# Reload the 8th-parse table (necrosis exclusions already applied).
recgli.nonecrosis <- read.csv(
  file = "Parsing_REC_HGG_Oct2018/8thParse_researchPath_withInVivo.csv"
)

In [123]:
# Keep only biopsies whose multnom_out is "TxE" or "rHGG".
# `%in%` is used instead of `== ... | == ...` because subsetting a data frame
# with an NA condition inserts an all-NA row for every missing multnom_out
# value; with `%in%` such rows are simply excluded.
recgli.desiredpath <- recgli.nonecrosis[recgli.nonecrosis$multnom_out %in% c("TxE", "rHGG"), ]
dim(recgli.desiredpath) # row count matches the "samples" total printed below
table(recgli.desiredpath$multnom_out)

# Summary after this exclusion. Row 1 = olddata rows, row 2 = newdata rows,
# row 3 = their sum. A patient (b_number) or scan (t_number) is counted on the
# first row where it appears.
desiredpath <- temp
desiredpath[1:2, 2:4] <- local({
  first_patient <- !duplicated(recgli.desiredpath$b_number)
  first_scan <- !duplicated(recgli.desiredpath$t_number)
  tally <- function(in_cohort) {
    as.numeric(c(
      sum(first_patient & in_cohort),
      sum(first_scan & in_cohort),
      sum(in_cohort)
    ))
  }
  rbind(
    tally(recgli.desiredpath$olddata == 1),
    tally(recgli.desiredpath$newdata == 1)
  )
})
# The matrix is character, so convert back to numbers before adding.
desiredpath[3, 2:4] <- as.numeric(desiredpath[1, 2:4]) + as.numeric(desiredpath[2, 2:4])
desiredpath


    PN   rHGG Ts1DNI    TxE 
     0    123      0     24 

data,patients,scans,samples
old_po1,0,0,0
REC_HGG,60,62,147
TOTAL,60,62,147


In [124]:
# Save the 9th (final) parse. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
write.csv(recgli.desiredpath, file = "Parsing_REC_HGG_Oct2018/9thParse_researchPath_withInVivo.csv", row.names = FALSE)