In [1]:
import os
import glob
import argparse 
import subprocess as sub
import shlex 
from subprocess import PIPE, Popen
import pandas as pd
import logging
import re
import datetime
import sys

In [2]:
def getting_bnum_tnum_list(bnum_tnum_csv): 
    ########### Get bnum_tnum list: 
    savedir = os.getcwd()
    os.chdir(csv_dir)
    bnum_tnum_df = pd.read_csv(bnum_tnum_csv, header = None)
    bnum_tnum_df.columns = ['bnum', 'tnum', 'DUMMY']
    os.chdir(savedir)
    return bnum_tnum_df

In [3]:
def get_diffu_data(measure = "median"):
    os.chdir(recgli_path_root+"/"+bnum+"/"+tnum)
    if len(glob.glob('svk_roi_analysis/*adcfa1000.csv'))>0:
        ## read in data
        diffu_data_file = pd.read_csv(glob.glob('svk_roi_analysis/*adcfa1000.csv')[0])
        if len(vial_files)>0: 
            ## shorten vial names
            vial_names = [vial.replace(".", "_").split("_")[2] for vial in vial_files]
            ## shorten the diffuomic data down to only the biopsies of interest: 
            diffu_data_biopsy_file = diffu_data_file[diffu_data_file['roi-label'].isin(vial_names)]
            ## find only the measure of interest: 
            diffu_data_biopsy_file_measure = diffu_data_biopsy_file[diffu_data_biopsy_file['measure']==measure]
        else: 
            vial_names = []
            print("ERROR! No biopsies.")
    else: 
        print('ERROR! No diffuomic data present.')
    return diffu_data_biopsy_file_measure

In [4]:
def get_ev_data(measure = "median"):
    os.chdir(recgli_path_root+"/"+bnum+"/"+tnum)
    if len(glob.glob('svk_roi_analysis/*ev1ev2ev31000.csv'))>0:
        ## read in data
        ev_data_file = pd.read_csv(glob.glob('svk_roi_analysis/*ev1ev2ev31000.csv')[0])
        if len(vial_files)>0: 
            ## shorten vial names
            vial_names = [vial.replace(".", "_").split("_")[2] for vial in vial_files]
            ## shorten the diffusion data down to only the biopsies of interest: 
            ev_data_biopsy_file = ev_data_file[ev_data_file['roi-label'].isin(vial_names)]
            ## find only the measure of interest: 
            ev_data_biopsy_file_measure = ev_data_biopsy_file[ev_data_biopsy_file['measure']==measure]
        else: 
            vial_names = []
            print("ERROR! No biopsies.")
    else: 
        print('ERROR! No EV data present.')
    return ev_data_biopsy_file_measure

In [5]:

#####################################
# EXAMPLE: get_diffu_biopsy_data.py --csv_name /home/sf6735452/DataWrangling/GetMergeData/Oct2018/ajnr_
#            --cohort_name REC_HGG
#            --output_file REC_HGG_diffu_biopsies.csv --output_dir ./
#####################################

parser = argparse.ArgumentParser(description='Create a systematic logger of whether biopsies are valid or not through visual inspection')
parser.add_argument("--csv_name",        required=True,    help='Precise name of the csv file that contains the perfusion files of interest.')
parser.add_argument("--csv_dir",         required=True,    help='Precise path of the csv file that contains the perfusion files of interest.')
parser.add_argument("--cohort_name",     required=True,    help='Precise cohort name of the scans of interest (e.g. "po1_preop_recur" or "REC_HGG")')
parser.add_argument("--output_file",     required=True,    help="Name of the output file csv")
parser.add_argument("--output_dir",      required=True,    help="Path where output files get written ")
parser.add_argument("--measure",         required=False,   help='Measurement that you desire for the diffuomical imaging (e.g. median, mean, max, etc.)', default = "median")
parser.add_argument("-v", "--verbose",                     help = "verbose output", action='store_true', default=False,   required=False)


_StoreTrueAction(option_strings=['-v', '--verbose'], dest='verbose', nargs=0, const=True, default=False, type=None, choices=None, help='verbose output', metavar=None)

In [25]:
args = parser.parse_args("--csv_dir /home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/ --csv_name ajnr_bnum_tnum.95.csv --cohort_name REC_HGG --output_file REC_HGG_diffu1000_biopsies.csv --output_dir /home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/Get_REC_HGG_Data/ --measure median".split())

In [26]:

#####################################
#   Create strings of the arguments for 
#   navigating to correct directory
#####################################
cohort_name     = args.cohort_name
csv_name        = args.csv_name
csv_dir         = args.csv_dir
output_file     = args.output_file
output_dir      = args.output_dir
measure         = args.measure

print("===============================================")
print("scan list dir:     ", csv_dir)
print("scan list name:    ", csv_name)
print("cohort name:       ", cohort_name) 
print("output file name:  ", output_file) 
print("output dir:        ", output_dir)
print("===============================================")


scan list dir:      /home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/
scan list name:     ajnr_bnum_tnum.95.csv
cohort name:        REC_HGG
output file name:   REC_HGG_diffu1000_biopsies.csv
output dir:         /home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/Get_REC_HGG_Data


In [27]:

#####################################
#   Reading in the csv_name
#   as the scan listls
#####################################
bnum_tnum_df = getting_bnum_tnum_list(csv_name)

In [28]:


#####################################
#   Setting the roots of the data 
#   based on the cohort name
#####################################

if cohort_name == 'po1_preop_recur': 
    recgli_path_root = '/data/RECglioma/archived/'
elif cohort_name == 'REC_HGG': 
    recgli_path_root = '/data/RECglioma/'
else: 
    print('Please use a valid cohort name, REC_HGG or po1_preop_recur.')
    exit(1)


In [29]:

#####################################
#   Instantiate the total diffuomical dataframe
#####################################

diffu_total_df = pd.DataFrame()
ev_total_df  = pd.DataFrame()


In [30]:
error_log = pd.DataFrame()


In [12]:

#####################################
#   Iterating through scans, grabbing the data of interest: 
#####################################

for index, row in bnum_tnum_df.iterrows():
    bnum = row['bnum']
    tnum = row['tnum']
    error_log_line = {'bnum': bnum, 'tnum': tnum}
    
    print(bnum)
    print(tnum)

    #####################################
    #   Change into the correct directory
    #####################################
    os.chdir(recgli_path_root+"/"+bnum+"/"+tnum)

    #####################################
    #   Gather the vialIDs 
    #####################################
    if "roi_analysis" in os.listdir(): 
        os.chdir('roi_analysis')
        vial_files = glob.glob('*_t1ca_*.idf')
    else: 
        vial_files = []
        print('ERROR! No biopsies.')

    #####################################
    #   Change back into the correct directory
    #####################################
    os.chdir(recgli_path_root+"/"+bnum+"/"+tnum)

    #####################################
    #   Gather the data if available: 
    #####################################

    ## If there exists an diffuomical data file: 
    if len(glob.glob('svk_roi_analysis/*adcfa1000.csv'))>0: 
        print('diffusion csv file found')         
        ## If there exists biopsy masks in roi_analysis: 
        if len(vial_files)>0: 
            print('vialIDs found in roi_analysis')
            ## Set the vial_file_status = 1 so that we know they exist 
            vial_files_status = 1

            ## Instanstiate the tnum_diffuomical_df
            tnum_diffuomical_df  = pd.DataFrame()

            ## Get the data: 
            tnum_diffuomical_df = get_diffu_data()

            ## Add these data to the overall dataFrame: 
            diffu_total_df = diffu_total_df.append(tnum_diffuomical_df, ignore_index= True)\

        else: 
            print('No vial IDs found! No biopsies.')
            error_log_line['biopsies']='error' 
            
    else: 
        print("------------------------------------------")
        print('No diffu1000.csv file! run svk_roi_analysis')
        print('------------------------------------------')
        error_log_line['data_adcfa_file']='error'
        
    ## If there exists an ev data file: 
    if len(glob.glob('svk_roi_analysis/*ev31000.csv'))>0: 
        print('ev 1000 csv file found')         
        ## If there exists biopsy masks in roi_analysis: 
        if len(vial_files)>0: 
            print('vialIDs found in roi_analysis')
            ## Set the vial_file_status = 1 so that we know they exist 
            vial_files_status = 1

            ## Instanstiate the tnum_diffusion_df
            tnum_ev_df  = pd.DataFrame()

            ## Get the data: 
            tnum_ev_df = get_ev_data()

            ## Add these data to the overall dataFrame: 
            ev_total_df = ev_total_df.append(tnum_ev_df, ignore_index= True)\

        else: 
            print('No vial IDs found! No biopsies.')

    else: 
        print("------------------------------------------")
        print('ERROR! No ev1000.csv file! run svk_roi_analysis')
        print('------------------------------------------')
        error_log_line['data_ev1000_file']='error' 
        
    error_log = error_log.append(error_log_line, ignore_index = True)

b1338
t8773
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b2947
t8775
diffusion csv file found
vialIDs found in roi_analysis
------------------------------------------
ERROR! No ev1000.csv file! run svk_roi_analysis
------------------------------------------
b3472
t8783
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b3499
t8869
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b3125
t8878
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b3526
t8943
------------------------------------------
No diffu1000.csv file! run svk_roi_analysis
------------------------------------------
------------------------------------------
ERROR! No ev1000.csv file! run svk_roi_analysis
------------------------------------------
b3527
t8944
diffusion csv file found
vialID

b4203
t10748
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b4205
t10756
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b4208
t10763
------------------------------------------
No diffu1000.csv file! run svk_roi_analysis
------------------------------------------
------------------------------------------
ERROR! No ev1000.csv file! run svk_roi_analysis
------------------------------------------
b4210
t10768
------------------------------------------
No diffu1000.csv file! run svk_roi_analysis
------------------------------------------
------------------------------------------
ERROR! No ev1000.csv file! run svk_roi_analysis
------------------------------------------
b4211
t10770
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv file found
vialIDs found in roi_analysis
b3776
t10777
diffusion csv file found
vialIDs found in roi_analysis
ev 1000 csv fil

In [13]:
error_log.head(30)

Unnamed: 0,bnum,tnum,data_ev1000_file,data_adcfa_file,biopsies
0,b1338,t8773,,,
1,b2947,t8775,error,,
2,b3472,t8783,,,
3,b3499,t8869,,,
4,b3125,t8878,,,
5,b3526,t8943,error,error,
6,b3527,t8944,,,
7,b3533,t8956,,,
8,b1837,t8960,,,
9,b3541,t8977,,,


In [14]:
print(diffu_total_df.columns)
print(ev_total_df.columns)

Index(['adc', 'aficvf', 'afiso', 'aodi', 'fa', 'fecvf', 'l1', 'l2', 'l3',
       'measure', 'nadc', 'naficv', 'nafiso', 'naodi', 'nfa', 'nfecvf', 'nl1',
       'nl2', 'nl3', 'roi-label', 't-num', 'tab-file', 'vol(cc)'],
      dtype='object')
Index(['tab-file', 't-num', 'measure', 'roi-label', 'vol(cc)', 'ev1', 'ev2',
       'ev3', 'evrad', 'nev1', 'nev2', 'nev3', 'nevrad'],
      dtype='object')


In [15]:
#####################################
#   Rearrange the data: 
#####################################

cols = ['t-num', 'roi-label', 'adc', 'fa', 'nadc', 'nfa']
diffu_total_df = diffu_total_df[cols]
diffu_total_df.columns = ['tnum', 'roi.label', 'adc.1', 'fa.1', "nadc.1", 'nfa.1']

cols_ev = ['t-num', 'roi-label', 'nev1', 'nev2', 'nev3', 'nevrad']
ev_total_df=ev_total_df[cols_ev]
ev_total_df.columns = ['tnum', 'roi.label','nev1.1', 'nev2.1', 'nev3.1', 'nevrad.1']


In [16]:
diffu_total_df.shape


(218, 6)

In [17]:
ev_total_df.shape


(211, 6)

In [18]:
new_diff_df = pd.merge(left = diffu_total_df, right = ev_total_df, how = "outer", on = ['roi.label', 'tnum'])

In [19]:
new_diff_df.columns

Index(['tnum', 'roi.label', 'adc.1', 'fa.1', 'nadc.1', 'nfa.1', 'nev1.1',
       'nev2.1', 'nev3.1', 'nevrad.1'],
      dtype='object')

In [20]:
new_diff_df.shape
new_diff_df.head()

Unnamed: 0,tnum,roi.label,adc.1,fa.1,nadc.1,nfa.1,nev1.1,nev2.1,nev3.1,nevrad.1
0,t8773,61B60,1077.5,150.0,1.42,0.54,1.21,1.44,1.74,1.63
1,t8773,62B60,1162.5,210.0,1.53,0.75,1.39,1.52,1.83,1.69
2,t8773,63B60,860.0,150.0,1.13,0.54,1.0,1.12,1.42,1.29
3,t8775,2B61,1145.0,270.0,0.0,0.0,,,,
4,t8775,1B61,1325.0,150.0,0.0,0.0,,,,


In [31]:
out_file = output_dir+output_file

out_file

'/home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/Get_REC_HGG_DataREC_HGG_diffu1000_biopsies.csv'

In [None]:
new_diff_df.to_csv(out_file, index=False)

In [23]:
error_log.to_csv('/home/sf673542/MultiParametricMRI/MP_MRI_Oct2018/GetMergeData/Get_REC_HGG_Data/REC_HGG_diffu1000_error_log.csv')