# Data Pipeline - Clinical Data

## Python 2.7 Required Modules

In [2]:
import glob
import numpy as np
import pandas as pd
import re
import collections

## Input Data

### NCI-60 Cell Lines

'NCI60.csv' - List of samples with experimental SF2, proliferation rate, and COSMIC ID

'Screened_Compounds.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Annotated - Reference - Drugs - Screened compounds'

'v17.3_fitted_dose_response.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Drug - Preprocessed - Cell lines/Drugs - log(IC50) and AUC values'

'DrugCorrection1.csv' - Go to https://gdisc.bme.gatech.edu/cgi-bin/gdisc/tap5.cgi, download 'DrugCorrection1.csv'. Note that additional changes have been made to this original file.

### HNSCC Cell Lines

'HNSCC.csv' - List of samples with experimental SF2, proliferation rate, and COSMIC ID

'Screened_Compounds.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Annotated - Reference - Drugs - Screened compounds'

'v17.3_fitted_dose_response.xlsx' - Go to https://www.cancerrxgene.org/downloads, choose 'Drug - Preprocessed - Cell lines/Drugs - log(IC50) and AUC values'

'DrugCorrection1.csv' - Go to https://gdisc.bme.gatech.edu/cgi-bin/gdisc/tap5.cgi, download 'DrugCorrection1.csv'. Note that additional changes have been made to this original file.

### TCGA Samples

'pred_rates.csv' - Go to https://github.com/cdiener/proliferation/blob/master/results/pred_rates.csv, download csv file

'*ORG*_*COHORT*_bio.sample.tsv' - Go to https://www.synapse.org/#!Synapse:syn2812961, choose '*COHORT*/bio/' folder, download '*ORG*_*COHORT*_bio.sample.tsv'

'nationwidechildrens.org_clinical_*drug/patient/radiation*_*COHORT*.txt' - Use the GDC Data Transfer Tool (https://gdc.cancer.gov/access-data/gdc-data-transfer-tool) to download the TCGA files listed in the manifest 'manifest.txt'.

'DrugCorrection1.csv' - Go to https://gdisc.bme.gatech.edu/cgi-bin/gdisc/tap5.cgi, download 'DrugCorrection1.csv'. Note that additional changes have been made to this original file.

## NCI-60 Cell Lines

Retrieve cell line information and incorporate measured drug IC50 values

In [None]:
# inputs: cell lines, drug dose responses, compound list, drug-name standardization table
df = pd.read_csv('input_data/NCI60/NCI60.csv',index_col=0)
df_data = pd.read_excel('input_data/NCI60/v17.3_fitted_dose_response.xlsx',sheet_name='Sheet1')
df_drugs = pd.read_excel('input_data/NCI60/Screened_Compounds.xlsx',sheet_name='Sheet1')
df_conversion = pd.read_csv('input_data/NCI60/DrugCorrection1.csv',header=None,index_col=0)

# standardized drug names, in order of first appearance
druglist = []

# 'Drug ID' values in row order, used to find the row position of a compound
drug_ids = df_drugs['Drug ID'].values.tolist()

for cellline in df.index.tolist():

    # dose-response rows whose COSMIC ID matches this cell line
    cosmic_id = df.loc[cellline]['COSMIC']
    responses = df_data.loc[df_data['COSMIC_ID'] == cosmic_id].reset_index(drop=True)

    for row in range(responses.shape[0]):
        response = responses.loc[row]

        # compound name -> standardized name (column 1 of the conversion table)
        drug_name = df_drugs.loc[drug_ids.index(int(response['DRUG_ID']))]['Drug Name']
        standard_name = df_conversion.loc[drug_name][1]

        # drugs standardized to 'NOS' are not recorded
        if standard_name == 'NOS':
            continue

        # first occurrence of a drug: register it and create an empty column
        column = 'IC50 DRUG %s [uM]' % standard_name
        if standard_name not in druglist:
            druglist.append(standard_name)
            df[column] = ''

        # store exp(LN_IC50) for this cell line
        df.at[cellline,column] = np.exp(response['LN_IC50'])

# write drug list, one standardized name per line
with open('processing/NCI60/druglist.txt','w') as f:
    f.write(''.join(['%s\n' % drug for drug in druglist]))

Save NCI-60 samples

In [None]:
# doubling time and COSMIC ID are not part of the saved records
df = df.drop(labels=['DOUBLING [hr]', 'COSMIC'], axis=1)

# one csv per cell line; '/' in a name is replaced so it is a valid file name
for cellline in df.index.tolist():
    out_path = 'NCI60/%s.csv' % cellline.replace('/','-')
    df.loc[cellline].to_csv(out_path)

## HNSCC Cell Lines

Retrieve cell line information and incorporate measured drug IC50 values

In [None]:
# load HNSCC data
df = pd.read_csv('input_data/HNSCC/HNSCC.csv',index_col=0)

# load drug data
# (Excel workbook: must be read with read_excel; read_csv does not accept sheet_name)
df_data = pd.read_excel('input_data/HNSCC/v17.3_fitted_dose_response.xlsx',sheet_name='Sheet1')

# load drug list (Excel workbook as well)
df_drugs = pd.read_excel('input_data/HNSCC/Screened_Compounds.xlsx',sheet_name='Sheet1')

# load TCGA standardization
df_conversion = pd.read_csv('input_data/HNSCC/DrugCorrection1.csv',header=None,index_col=0)

# standardized drug names, in order of first appearance
druglist = []

# 'Drug ID' values in row order, used to find the row position of a compound
drug_ids = df_drugs['Drug ID'].values.tolist()

# iterate over cell lines
for cellline in df.index.tolist():

    # dose-response rows whose COSMIC ID matches this cell line
    cosmic_id = df.loc[cellline]['COSMIC']
    df_data_subset = df_data.loc[df_data['COSMIC_ID'] == cosmic_id].reset_index(drop=True)

    for i in range(df_data_subset.shape[0]):
        response = df_data_subset.loc[i]

        # compound name -> standardized name (column 1 of the conversion table)
        drug_name = df_drugs.loc[drug_ids.index(int(response['DRUG_ID']))]['Drug Name']
        standard_name = df_conversion.loc[drug_name][1]

        # drugs standardized to 'NOS' are not recorded
        if standard_name == 'NOS':
            continue

        # first occurrence of a drug: register it and create an empty column
        column = 'IC50 DRUG %s [uM]' % standard_name
        if standard_name not in druglist:
            druglist.append(standard_name)
            df[column] = ''

        # store exp(LN_IC50) for this cell line
        df.at[cellline,column] = np.exp(response['LN_IC50'])

# write drug list, one standardized name per line
with open('processing/HNSCC/druglist.txt','w') as f:
    for drug in druglist:
        f.write('%s\n' % drug)

Save HNSCC samples

In [None]:
# doubling time and COSMIC ID are not part of the saved records
df = df.drop(labels=['DOUBLING [hr]', 'COSMIC'], axis=1)

# one csv per cell line; '/' in a name is replaced so it is a valid file name
for cellline in df.index.tolist():
    out_path = 'HNSCC/%s.csv' % cellline.replace('/','-')
    df.loc[cellline].to_csv(out_path)

## TCGA Samples

Find clinical attributes

In [4]:
# find clinical attributes
#
# The commented-out code below concatenates every TCGA patient biotab file into
# one table `df` (tagging each row with its cohort) and prints, for each column,
# the number of non-missing entries in descending order.
#
# NOTE(review): the live statement at the end of this cell needs that
# concatenated `df` (it reads the 'cohort' and 'pathologic_M' columns). With the
# code commented out, `df` is whatever an earlier cell assigned, so re-enable the
# block before re-running this cell.
#df = pd.read_table('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_clinical_patient_acc.txt',skiprows=[0,2],header=0)
#df['cohort'] = 'acc'
#patient_files = glob.glob('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_clinical_patient_*.txt')
#for fn in patient_files[1:]:  
#    df_new = pd.read_table(fn,skiprows=[0,2],header=0)
#    df_new['cohort'] = fn.split('_')[-1].split('.')[0]
#    df = pd.concat([df,df_new])
#    
#cat = []
#val = []
#for col in df.columns.tolist():
#    cat.append(col)
#    val.append(np.sum(df[col].value_counts().values))
#ind = np.argsort(val)[::-1]
#cat = [cat[i] for i in ind]
#val = [val[i] for i in ind]
#for i in range(len(cat)):
#    print val[i], cat[i]

# distribution of pathologic M values within the BRCA cohort
df[df['cohort'] == 'brca']['pathologic_M'].value_counts()

M0          906
MX          163
M1           22
cM0 (i+)      6
Name: pathologic_M, dtype: int64

Load data from patient and sample biotab files

In [13]:
# initialize sample dataframe
data = pd.DataFrame(columns=['SAMPLE','PATIENT','COHORT','TYPE','AGE','GENDER','RACE','ETHNICITY','HISTOLOGIC','LOCATION','GRADE','CLINICAL STAGE','CLINICAL T','CLINICAL N','CLINICAL M','PATHOLOGIC STAGE','PATHOLOGIC T','PATHOLOGIC N','PATHOLOGIC M','SMOKING HISTORY','PACK YEARS','ALCOHOL DAYS PER WEEK','ALCOHOL PER DAY','HPV P16','HPV ISH','KARNOFSKY','DAY_COLLECTION'])

# accepted categories; any other entry (e.g. '[Not Available]') is stored as None
RACES = ['WHITE','BLACK OR AFRICAN AMERICAN','ASIAN','AMERICAN INDIAN OR ALASKA NATIVE','NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER']
ETHNICITIES = ['NOT HISPANIC OR LATINO','HISPANIC OR LATINO']

# labels assigned to the integer codes of 'tobacco_smoking_history'
SMOKING_HISTORY = {
    1: 'LIFELONG NON-SMOKER',
    2: 'CURRENT SMOKER',
    3: 'REFORMED > 15 YRS',
    4: 'REFORMED <= 15 YRS',
    5: 'REFORMED DURATION UNKNOWN',
}

# patient-file columns copied as text, in the order of the HISTOLOGIC ... PATHOLOGIC M columns of `data`
TEXT_FIELDS = ['histological_type','anatomic_neoplasm_subdivision','neoplasm_histologic_grade','clinical_stage','clinical_T','clinical_N','clinical_M','pathologic_stage','pathologic_T','pathologic_N','pathologic_M']


def is_missing(value):
    """Return True for None/NaN, empty strings and bracketed placeholders such as '[Not Available]'."""
    if hasattr(value, 'startswith'):
        return value == '' or value.startswith('[')
    return bool(pd.isnull(value))


def clean_field(record, column, convert=None):
    """Return the value of `column` in a patient record, or None when unavailable.

    record  -- one row of a patient file (pandas Series)
    column  -- column name; a column absent from the file yields None
    convert -- optional callable applied to an available value (e.g. int, float)
    """
    if column not in record.index:
        return None
    value = record[column]
    if is_missing(value):
        return None
    if convert is None:
        return value
    return convert(value)


def allowed_value(record, column, allowed):
    """Return the value of `column` when it is one of `allowed`, otherwise None."""
    value = record[column]
    return value if value in allowed else None


def patient_attributes(record):
    """Return the clinical attributes of one patient as a list ordered like the
    AGE ... KARNOFSKY columns of `data`."""
    attributes = [
        clean_field(record, 'age_at_initial_pathologic_diagnosis', int),
        record['gender'],
        allowed_value(record, 'race', RACES),
        allowed_value(record, 'ethnicity', ETHNICITIES),
    ]
    attributes += [clean_field(record, column) for column in TEXT_FIELDS]
    attributes += [
        # a code outside 1-5 gives None instead of silently reusing the previous patient's label
        clean_field(record, 'tobacco_smoking_history', lambda code: SMOKING_HISTORY.get(int(code))),
        clean_field(record, 'number_pack_years_smoked', float),
        clean_field(record, 'frequency_of_alcohol_consumption', float),
        clean_field(record, 'amount_of_alcohol_consumption_per_day', float),
        clean_field(record, 'hpv_status_by_p16_testing', lambda text: text.upper()),
        clean_field(record, 'hpv_status_by_ish_testing', lambda text: text.upper()),
        clean_field(record, 'karnofsky_performance_score', int),
    ]
    return attributes


# iterate over patient files
patient_files = glob.glob('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_clinical_patient_*.txt')
for fn in patient_files:

    # load file
    df_patient = pd.read_table(fn,skiprows=[0,2],header=0)

    # cohort (last '_'-separated token of the file name, without extension)
    cohort = fn.split('_')[-1].split('.')[0]

    # load associated sample file
    if cohort == 'laml':
        df_sample = pd.read_table('../../../qFlux-files/clinical/input_data/TCGA/genome.wustl.edu_LAML_bio.sample.tsv')
    else:
        df_sample = pd.read_table('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_%s_bio.sample.tsv' % cohort.upper())

    # the first 12 characters of a sample barcode are matched against the patient barcode
    sample_patient = df_sample['sample'].astype(str).str[:12]

    # iterate over patients
    for patient in df_patient['bcr_patient_barcode'].values.tolist():

        # first row recorded for this patient
        record = df_patient[df_patient['bcr_patient_barcode'] == patient].iloc[0]
        attributes = patient_attributes(record)

        # find associated rows in sample file
        for _, sample in df_sample[sample_patient == patient].iterrows():

            # day of collection (None when absent or not an integer)
            try:
                day_collection = int(sample['days_to_collection'])
            except (KeyError, TypeError, ValueError, OverflowError):
                day_collection = None

            # only primary tumors and normal tissue are kept
            if sample['sample_type'] == 'Primary Tumor':
                tissue = 'TUMOR'
            elif sample['sample_type'] in ['Blood Derived Normal','Solid Tissue Normal']:
                tissue = 'NORMAL'
            else:
                continue

            # rows are appended under labels 0..n-1; later cells address them with data.loc[a]
            data.loc[data.shape[0]] = [sample['sample'],patient,cohort.upper(),tissue] + attributes + [day_collection]

Incorporate estimated proliferation rates from Diener et al.

In [14]:
# new column, NaN until a usable estimate is found
data['PROLIFERATION [1/hr]'] = np.nan

# estimated proliferation rates
df_proliferation = pd.read_csv('../../../qFlux-files/clinical/input_data/TCGA/pred_rates.csv')

# patients that have at least one estimate
known_patients = set(df_proliferation['patient_barcode'].values.tolist())

for a in range(data.shape[0]):
    sample = data.loc[a]

    # nothing to do when the patient has no estimate
    if sample['PATIENT'] not in known_patients:
        continue

    is_tumor = sample['TYPE'] == 'TUMOR'
    is_normal = sample['TYPE'] == 'NORMAL'

    # all estimates of this patient
    estimates = df_proliferation.loc[df_proliferation['patient_barcode'] == sample['PATIENT']].reset_index(drop=True)

    # keep the rates whose tumor flag agrees with the sample type
    values = []
    for i in range(estimates.shape[0]):
        tumor_flag = estimates.loc[i]['tumor']
        if (is_tumor and (tumor_flag == True)) or (is_normal and (tumor_flag == False)):
            values.append(estimates.loc[i]['rates'])

    # the mean is stored only when there is at least one rate and all are positive
    if len(values) > 0 and np.min(values) > 0:
        data.at[a,'PROLIFERATION [1/hr]'] = np.mean(values)

Incorporate measured radiation response

In [15]:
# add radiation response column to data
data['RESPONSE RADIATION'] = ''
data['COLLECTION BEFORE OR AFTER RADIATION'] = ''

# placeholders used in the biotab files for unavailable entries
MISSING = ['[Not Available]','[Unknown]','[Not Applicable]']


def to_int(value):
    """Return int(value); a value that cannot be converted is returned unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value


def first_radiation_course(course, start, response):
    """Collect the usable responses of a patient's first radiation treatment.

    course, start, response -- parallel lists, one entry per treatment
    Returns (responses, starts): the distinct non-placeholder responses of the
    first treatment and the start value recorded with each of them. The first
    treatment is the only treatment, else the entries with course number 1,
    else (when no course number is available) the entries with the earliest
    integer start day. Callers use the result only when exactly one response
    is returned.
    """
    if len(response) == 1:
        candidates = [0]
    elif len(response) > 1 and 1 in course:
        candidates = [i for i in range(len(course)) if course[i] == 1]
    elif len(response) > 1 and all(c in MISSING for c in course):
        # explicit integer filter instead of min() over a mixed int/str list
        days = [s for s in start if isinstance(s, int)]
        earliest = min(days) if days else None
        candidates = [i for i in range(len(start)) if days and start[i] == earliest]
    else:
        candidates = []

    responses = []
    starts = []
    for i in candidates:
        if (response[i] not in MISSING) and (response[i] not in responses):
            responses.append(response[i])
            starts.append(start[i])
    return responses, starts


# iterate over radiation files
biotabs = glob.glob('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_clinical_radiation_*.txt')
for fn in biotabs:

    # load biotab file
    df_biotab = pd.read_table(fn,skiprows=[0,2],header=0)

    # only treatments on the primary tumor field are considered
    df_primary = df_biotab[df_biotab['anatomic_treatment_site'] == 'Primary Tumor Field']

    # iterate over patients
    for barcode in set(df_biotab['bcr_patient_barcode'].values.tolist()):

        # gather radiation data from all patient treatments on primary tumor field
        course = []
        start = []
        response = []
        for _, treatment in df_primary[df_primary['bcr_patient_barcode'] == barcode].iterrows():
            course.append(to_int(treatment['course_number']))
            start.append(to_int(treatment['days_to_radiation_therapy_start']))
            response.append(treatment['measure_of_response'])

        # a response is recorded only when the first treatment has exactly one
        responses, starts = first_radiation_course(course, start, response)
        if len(responses) != 1:
            continue

        # tumor samples of this patient
        tumor_rows = data.index[(data['PATIENT'] == barcode) & (data['TYPE'] == 'TUMOR')]
        for a in tumor_rows:
            data.at[a,'RESPONSE RADIATION'] = responses[0]

            # timing needs both an integer start day and a known collection day;
            # under Python 2 a missing day (None) would otherwise compare as
            # smaller than any integer and be labelled 'BEFORE'
            day = data.at[a,'DAY_COLLECTION']
            if (not isinstance(starts[0], int)) or pd.isnull(day):
                continue
            if day < starts[0]:
                data.at[a,'COLLECTION BEFORE OR AFTER RADIATION'] = 'BEFORE'
            elif day > starts[0]:
                data.at[a,'COLLECTION BEFORE OR AFTER RADIATION'] = 'AFTER'

Incorporate measured drug responses

In [16]:
# load TCGA drug name standardization
df_conversion = pd.read_csv('../../../qFlux-files/clinical/input_data/TCGA/DrugCorrection1.csv',header=None,index_col=0)

# placeholders used in the biotab files for unavailable entries
MISSING = ['[Not Available]','[Unknown]','[Not Applicable]']


def to_int(value):
    """Return int(value); a value that cannot be converted is returned unchanged."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return value


def first_regimen_responses(course, start, response):
    """Collect the usable responses of a patient's first treatment with one drug.

    course, start, response -- parallel lists, one entry per treatment
    Returns the distinct non-placeholder responses of the first treatment: the
    only treatment, else the entries with regimen number 1, else (when no
    regimen number is available) the entries with the earliest integer start
    day. Callers use the result only when exactly one response is returned.
    """
    if len(response) == 1:
        candidates = [0]
    elif len(response) > 1 and 1 in course:
        candidates = [i for i in range(len(course)) if course[i] == 1]
    elif len(response) > 1 and all(c in MISSING for c in course):
        # explicit integer filter instead of min() over a mixed int/str list
        days = [s for s in start if isinstance(s, int)]
        earliest = min(days) if days else None
        candidates = [i for i in range(len(start)) if days and start[i] == earliest]
    else:
        candidates = []

    responses = []
    for i in candidates:
        if (response[i] not in MISSING) and (response[i] not in responses):
            responses.append(response[i])
    return responses


# initialize data dictionary: standardized drug -> parallel lists of barcode, cohort, response
drug_data = {}

# iterate over drug files
biotabs = glob.glob('../../../qFlux-files/clinical/input_data/TCGA/nationwidechildrens.org_clinical_drug_*.txt')
for fn in biotabs:

    # cohort
    cohort = fn.split('_')[-1].split('.')[0]

    # load biotab file
    df_biotab = pd.read_table(fn,skiprows=[0,2],header=0)

    # files without "measure_of_response" carry no usable information
    if 'measure_of_response' not in df_biotab.columns.tolist():
        print('%s does not have drug measure of response' % cohort.upper())
        continue

    # determine if "regimen_number" is in biotab file
    has_regimen = 'regimen_number' in df_biotab.columns.tolist()
    if not has_regimen:
        print('%s does not have drug regimen number' % cohort.upper())

    # iterate over patients
    for barcode in set(df_biotab['bcr_patient_barcode'].values.tolist()):

        # standardized drug -> treatments of this patient
        patient_data = {}
        for _, treatment in df_biotab[df_biotab['bcr_patient_barcode'] == barcode].iterrows():

            # standardized drug name (column 1 of the conversion table); 'NOS' is skipped
            drug = df_conversion.loc[treatment['drug_name']][1]
            if drug == 'NOS':
                continue

            if drug not in patient_data:
                patient_data[drug] = {'course':[],'start':[],'response':[]}
            entry = patient_data[drug]

            if has_regimen:
                entry['course'].append(to_int(treatment['regimen_number']))
            else:
                entry['course'].append('[Not Available]')
            entry['start'].append(to_int(treatment['days_to_drug_therapy_start']))
            entry['response'].append(treatment['measure_of_response'])

        # iterate over drugs in patient dictionary
        for drug in patient_data:
            entry = patient_data[drug]
            responses = first_regimen_responses(entry['course'], entry['start'], entry['response'])

            # a response is recorded only when the first treatment has exactly one
            if len(responses) != 1:
                continue
            if drug not in drug_data:
                drug_data[drug] = {'barcode':[],'cohort':[],'response':[]}
            drug_data[drug]['barcode'].append(barcode)
            drug_data[drug]['cohort'].append(cohort)
            drug_data[drug]['response'].append(responses[0])

# write TCGA drug list
with open('../../../qFlux-files/clinical/processing/TCGA/druglist.txt','w') as f:
    for drug in drug_data:
        f.write('%s\n' % drug)

# responses are attached to tumor samples only
is_tumor = data['TYPE'] == 'TUMOR'

# iterate over drugs
for drug in drug_data:

    # add drug response column to data
    column = 'RESPONSE DRUG %s' % drug
    data[column] = ''

    # fill in the response for every tumor sample of each recorded patient
    for barcode, response in zip(drug_data[drug]['barcode'], drug_data[drug]['response']):
        data.loc[is_tumor & (data['PATIENT'] == barcode), column] = response

DLBC does not have drug measure of response
KICH does not have drug regimen number


Save TCGA samples

In [21]:
# the collection day is not part of the saved records
data = data.drop(labels=['DAY_COLLECTION'], axis=1)

# use the sample name as row label and remove it from the columns
sample_names = data['SAMPLE']
data.index = sample_names
data = data.drop(labels=['SAMPLE'], axis=1)

# one csv per sample
for sample in data.index.tolist():
    out_path = 'TCGA/%s.csv' % sample
    data.loc[sample].to_csv(out_path)

## CCLE Cell Lines

Load data

In [10]:
# CCLE sample annotations: the two 'TT_*' entries are removed, then every
# identifier is cut at its first underscore
df = pd.read_table('_data_/input/CCLE/CCLE_sample_info_file_2012-10-18.txt',index_col=0)
df = df.drop(labels=['TT_OESOPHAGUS','TT_THYROID'], axis=0)
df.index = [name.split('_')[0] for name in df.index.tolist()]

# dose-response data: both 15Oct19 workbooks stacked into one table
# (replaces the single v17.3 workbook read by the earlier sections)
df_data_1 = pd.read_excel('_data_/input/CCLE/GDSC1_fitted_dose_response_15Oct19.xlsx',sheet_name='Sheet 1')
df_data_2 = pd.read_excel('_data_/input/CCLE/GDSC2_fitted_dose_response_15Oct19.xlsx',sheet_name='Sheet 1')
df_data = pd.concat([df_data_1,df_data_2])
# rows are relabelled 0..n-1; drop=False keeps the old labels in an 'index' column
df_data = df_data.reset_index(drop=False)

# compound list (csv release 8.1, replaces Screened_Compounds.xlsx)
df_drugs = pd.read_csv('_data_/input/CCLE/screened_compounds_rel_8.1.csv')

# drug-name standardization table
df_conversion = pd.read_csv('_data_/input/CCLE/DrugCorrection1.csv',header=None,index_col=0)

# radiation response table (second spreadsheet row is the header, second column the row labels)
df_radiation = pd.read_excel('_data_/input/CCLE/ncomms11428-s2.xlsx', header=1, index_col=1)

  


Get COSMIC IDs for each cell line

In [11]:
# Map each GDSC cell line name (as a string) to its COSMIC ID.
# The two columns are walked once instead of building a row Series for every
# record of the dose-response table.
cell_to_cosmic = {}
for line_name, cosmic_id in zip(df_data['CELL_LINE_NAME'].values, df_data['COSMIC_ID'].values):
    # setdefault keeps the ID from the first row in which a name appears
    cell_to_cosmic.setdefault(str(line_name), cosmic_id)

In [12]:
# Attach a COSMIC ID to every CCLE cell line that can be matched unambiguously
# to a GDSC cell line through its index name, primary name, or aliases.
# Cell lines with no match, or with names pointing at different IDs, keep NaN.
df['COSMIC'] = np.nan
for cell in df.index.tolist():

    record = df.loc[cell]

    # every name this cell line is known by
    candidate_names = {cell, record['Cell line primary name']}
    aliases = record['Cell line aliases']
    # the aliases field is only split when it holds a string (' | '-separated)
    if isinstance(aliases, str):
        candidate_names.update(aliases.split(' | '))

    # collect distinct IDs: several names resolving to the SAME COSMIC ID is
    # still an unambiguous match and must not be discarded
    matched_ids = {cell_to_cosmic[name] for name in candidate_names if name in cell_to_cosmic}

    if len(matched_ids) == 1:
        df.at[cell,'COSMIC'] = int(matched_ids.pop())

Edit CCLE clinical variables

In [13]:
def _tidy_label(raw):
    """Replace underscores with spaces and capitalize, e.g. 'SOFT_TISSUE' -> 'Soft tissue'."""
    return raw.replace('_',' ').capitalize()


# gender: 'M'/'F' become 'MALE'/'FEMALE', anything else becomes NaN
gender_labels = {'M': 'MALE', 'F': 'FEMALE'}
df['GENDER'] = [gender_labels.get(code, np.nan) for code in df['Gender']]

# primary site
df['PRIMARY SITE'] = [_tidy_label(site) for site in df['Site Primary']]

# histology
df['HISTOLOGY'] = [_tidy_label(hist) for hist in df['Histology']]

# histologic subtype: the placeholder 'NS' is stored as NaN
df['HISTOLOGIC SUBTYPE'] = [
    np.nan if subtype == 'NS' else _tidy_label(subtype)
    for subtype in df['Hist Subtype1']
]

# drop unnecessary features (the raw columns replaced above plus unused annotations)
unused_columns = [
    'Cell line primary name',
    'Cell line aliases',
    'Gender',
    'Site Primary',
    'Histology',
    'Hist Subtype1',
    'Notes',
    'Source',
    'Expression arrays',
    'SNP arrays',
    'Oncomap',
    'Hybrid Capture Sequencing',
]
df = df.drop(unused_columns, axis=1)

Radiation Response

In [14]:
# Copy the radiation AUC onto every cell line that appears in the radiation
# table; cell lines without a measurement keep NaN.
df['RADIATION AUC'] = np.nan

# build the lookup structures once instead of re-listing the index per cell line
radiation_auc = df_radiation['AUC']
measured_cells = set(df_radiation.index)

for cell in df.index.tolist():
    if cell in measured_cells:
        df.at[cell,'RADIATION AUC'] = radiation_auc.loc[cell]

Drug Response

In [314]:
# Fill one 'IC50 DRUG <name> [uM]' column per standardized drug name with
# exp(LN_IC50) for every cell line that has GDSC dose-response records.
#
# Lookups that do not change between rows (DRUG_ID -> DRUG_NAME, and the
# standardized-name column of the conversion table) are prepared once instead
# of being rebuilt for every response row.

# DRUG_ID -> DRUG_NAME; the first row listing an ID wins.
# Names are stripped at use so only names that are actually needed are touched.
drug_id_to_name = {}
for drug_id, raw_name in zip(df_drugs['DRUG_ID'].values.tolist(), df_drugs['DRUG_NAME'].values.tolist()):
    drug_id_to_name.setdefault(drug_id, raw_name)

# column 1 of the conversion table holds the standardized drug name
standard_names = df_conversion[1]

# initialize drug list (standardized names, in order of first appearance)
druglist = []

# iterate over cell lines
for cellline in df.index.tolist():

    # dose-response records of this cell line (none when COSMIC is NaN)
    responses = df_data.loc[df_data['COSMIC_ID'] == df.loc[cellline]['COSMIC']]

    for drug_id, ln_ic50 in zip(responses['DRUG_ID'], responses['LN_IC50']):

        # GDSC drug name -> standardized name; a name missing from either
        # table raises KeyError rather than being skipped silently
        drug_name = drug_id_to_name[int(drug_id)].strip()
        standard_name = standard_names.loc[drug_name]

        # drugs standardized to 'NOS' are left out
        if standard_name == 'NOS':
            continue

        column = 'IC50 DRUG %s [uM]' % standard_name
        if standard_name not in druglist:
            druglist.append(standard_name)
            df[column] = ''

        # IC50: LN_IC50 is a natural log, so exponentiate; a later record for
        # the same cell line and drug overwrites an earlier one
        df.at[cellline, column] = np.exp(ln_ic50)

# write drug list
with open('_data_/processing/CCLE/druglist.txt','w') as f:
    for drug in druglist:
        f.write('%s\n' % drug)

Save CCLE samples

In [317]:
# the COSMIC helper column is not written to the per-cell-line files
df = df.drop('COSMIC', axis=1)

# write one CSV per cell line, holding that cell line's row
for cellline in df.index:
    out_path = 'CCLE/%s.csv' % cellline
    df.loc[cellline].to_csv(out_path)

  


## GTEx Tissues

Load data

In [3]:
# load GTEx sample attributes (tab-separated), keyed by the first column
df = pd.read_csv('_data_/input/GTEx/GTEx_v7_Annotations_SampleAttributesDS.txt',sep='\t',index_col=0)

Save GTEx samples

In [5]:
# write one small CSV per sample containing only its SMTSD value
tissue_by_sample = df['SMTSD']
for sample in df.index:
    line = 'TISSUE TYPE,%s\n' % tissue_by_sample.loc[sample]
    with open('GTEx/%s.csv' % sample, 'w') as out:
        out.write(line)

## Citations

Diener C, Resendis-Antonio O. Personalized prediction of proliferation rates and metabolic liabilities in cancer biopsies. Frontiers in Physiology. 2016;7(644). doi: 10.3389/fphys.2016.00644. PubMed PMID: 28082911. PubMed Central PMCID: PMC5186797.

Spainhour JCG, Lim J, Qiu P. GDISC: a web portal for integrative analysis of gene-drug interaction for survival in cancer. Bioinformatics. 2017;33(9):1426-1428. doi: 10.1093/bioinformatics/btw830. PubMed PMID: 28453687.

Yang W, Soares J, Greninger P, Edelman EJ, Lightfoot H, et al. Genomics of Drug Sensitivity in Cancer (GDSC): a resource for therapeutic biomarker discovery in cancer cells. Nucleic Acids Research. 2013;41(Database issue):D955-D961. doi: 10.1093/nar/gks1111. PubMed PMID: 23180760. PubMed Central PMCID: PMC3531057.