In [157]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder

In [158]:
# Read the tab-delimited clinical patient table. The context manager closes
# the file handle as soon as parsing finishes instead of leaving it open for
# the lifetime of the kernel.
with open('../data/nationwidechildrens.org_clinical_patient_brca.txt') as clin_fp:
    # Line 0 supplies the column names; lines 1 and 2 are skipped
    # (presumably secondary header rows -- confirm against the raw file).
    clin_data = pd.read_csv(clin_fp, skiprows=[1, 2], header=[0], delimiter="\t")

# Columns carried forward into the encoding cells below.
keep_cols = ['bcr_patient_uuid','tumor_status','surgical_procedure_first','margin_status',
             'lymph_nodes_examined_he_count','ajcc_tumor_pathologic_pt','ajcc_nodes_pathologic_pn',
             'ajcc_metastasis_pathologic_pm','ajcc_pathologic_tumor_stage','er_status_by_ihc',
             'pr_status_by_ihc','her2_status_by_ihc','histological_type', 'vital_status']
# Take an explicit copy: later cells assign into columns of this frame, and
# assigning into a column subset of another frame triggers pandas'
# SettingWithCopyWarning.
clin_data = clin_data[keep_cols].copy()

In [159]:
# --- surgical_procedure_first: one-hot encoding ---------------------------
# Collapse the three placeholder labels into a single 'NA_proc' category so
# they share one indicator column.
clin_data['surgical_procedure_first'] = clin_data['surgical_procedure_first'].replace(['[Discrepancy]','[Unknown]', '[Not Available]'], 'NA_proc')
print(clin_data['surgical_procedure_first'].unique())
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
first_surg_hot = OneHotEncoder(dtype=int, categories='auto')
first_surg_out = first_surg_hot.fit_transform(clin_data[['surgical_procedure_first']]).toarray()
print(first_surg_hot.categories_)
# One indicator column per category. The indicator frame is built on
# clin_data's own index so the index-on-index merge pairs every row with its
# own encoding even if the index is not 0..n-1.
first_surg_cols = pd.DataFrame(
    first_surg_out,
    columns=np.array(first_surg_hot.categories_).ravel(),
    index=clin_data.index,
)
clin_data = clin_data.merge(first_surg_cols, left_index=True, right_index=True)
print(list(clin_data))

# --- margin_status: integer codes -----------------------------------------
# '[Unknown]' is folded into '[Not Available]', which is coded 0.
clin_data['margin_status'] = clin_data['margin_status'].replace('[Unknown]', '[Not Available]')
margin_map = {'[Not Available]':0,'Positive': 1,'Close':2,'Negative':3}
# Indexing the dict (instead of passing it to Series.map) raises KeyError on
# an unexpected label rather than silently producing NaN.
clin_data['margin_status'] = clin_data['margin_status'].map(lambda x: margin_map[x])
print(clin_data['margin_status'].unique())

['Modified Radical Mastectomy' 'Lumpectomy' 'Simple Mastectomy' 'NA_proc'
 'Other']
[array(['Lumpectomy', 'Modified Radical Mastectomy', 'NA_proc', 'Other',
       'Simple Mastectomy'], dtype=object)]
['bcr_patient_uuid', 'tumor_status', 'surgical_procedure_first', 'margin_status', 'lymph_nodes_examined_he_count', 'ajcc_tumor_pathologic_pt', 'ajcc_nodes_pathologic_pn', 'ajcc_metastasis_pathologic_pm', 'ajcc_pathologic_tumor_stage', 'er_status_by_ihc', 'pr_status_by_ihc', 'her2_status_by_ihc', 'histological_type', 'vital_status', 'Lumpectomy', 'Modified Radical Mastectomy', 'NA_proc', 'Other', 'Simple Mastectomy']
[3 2 1 0]


In [160]:
# Show the raw tumor_status labels before they are recoded.
print(clin_data['tumor_status'].unique())

# Integer code for each label; codes follow the order of the list.
tumor_map = {
    label: code
    for code, label in enumerate(['[Not Available]', 'WITH TUMOR', 'TUMOR FREE'])
}
# Fold '[Unknown]' into '[Not Available]', then look every label up in the
# map. The dict lookup raises KeyError for any label missing from tumor_map.
clin_data['tumor_status'] = (
    clin_data['tumor_status']
    .replace('[Unknown]', '[Not Available]')
    .map(tumor_map.__getitem__)
)

['WITH TUMOR' 'TUMOR FREE' '[Unknown]' '[Not Available]']


In [161]:
# Show the raw (string) values of the lymph node count column.
print(clin_data['lymph_nodes_examined_he_count'].unique())

# Use the string '-1' as the stand-in for '[Not Available]' so that the whole
# column can be converted to integers.
clin_data['lymph_nodes_examined_he_count'] = (
    clin_data['lymph_nodes_examined_he_count']
    .replace('[Not Available]', '-1')
    .astype(int)
)

['4' '1' '0' '[Not Available]' '3' '18' '9' '12' '2' '5' '26' '8' '10'
 '14' '6' '22' '27' '13' '21' '19' '7' '15' '11' '29' '25' '16' '24' '35'
 '17' '23' '28' '20']


In [162]:
# Show the raw pT labels present in the data.
print(clin_data['ajcc_tumor_pathologic_pt'].unique())

# Integer code for each pT label; the code is the label's position in the list.
tumor_pt_map = {
    label: code
    for code, label in enumerate([
        'TX',
        'T1', 'T1a', 'T1b', 'T1c',
        'T2', 'T2a', 'T2b',
        'T3', 'T3a',
        'T4', 'T4b', 'T4d',
    ])
}
# Dict lookup (not Series.map(dict)) so an unmapped label raises KeyError.
clin_data['ajcc_tumor_pathologic_pt'] = clin_data['ajcc_tumor_pathologic_pt'].map(
    tumor_pt_map.__getitem__
)

['TX' 'T2' 'T1c' 'T3' 'T1' 'T4b' 'T1b' 'T4d' 'T4' 'T2b' 'T1a' 'T3a' 'T2a']


In [163]:
# Show the raw pN labels present in the data.
print(clin_data['ajcc_nodes_pathologic_pn'].unique())

# Integer code for each pN label, written one group per line.
nodes_pn_map = {
    'NX': 0,
    'N0 (mol+)': 1, 'N0 (i-)': 2, 'N0 (i+)': 3, 'N0': 4,
    'N1mi': 5, 'N1': 6, 'N1a': 7, 'N1b': 8, 'N1c': 9,
    'N2': 10, 'N2a': 11,
    'N3': 12, 'N3a': 13, 'N3b': 14, 'N3c': 15,
}
# Dict lookup (not Series.map(dict)) so an unmapped label raises KeyError.
clin_data['ajcc_nodes_pathologic_pn'] = clin_data['ajcc_nodes_pathologic_pn'].map(
    nodes_pn_map.__getitem__
)

['NX' 'N1a' 'N0 (i+)' 'N2a' 'N0' 'N0 (i-)' 'N1' 'N3c' 'N1mi' 'N3a' 'N2'
 'N3' 'N1b' 'N1c' 'N0 (mol+)' 'N3b']


In [164]:
# Show the raw pM labels present in the data.
print(clin_data['ajcc_metastasis_pathologic_pm'].unique())

# Integer code for each pM label; the code is the label's position in the list.
meta_pm_map = {
    label: code
    for code, label in enumerate(['MX', 'M0', 'cM0 (i+)', 'M1'])
}
# Dict lookup (not Series.map(dict)) so an unmapped label raises KeyError.
clin_data['ajcc_metastasis_pathologic_pm'] = clin_data['ajcc_metastasis_pathologic_pm'].map(
    meta_pm_map.__getitem__
)

['MX' 'M0' 'M1' 'cM0 (i+)']


In [165]:
# Show the raw pathologic stage labels present in the data.
print(clin_data['ajcc_pathologic_tumor_stage'].unique())
# Placeholder labels are treated as the unknown stage, 'Stage X' (code 0).
clin_data['ajcc_pathologic_tumor_stage'] = clin_data['ajcc_pathologic_tumor_stage'].replace(['[Not Available]','[Discrepancy]'], 'Stage X')
# Ordinal code per stage label. The previous map listed 'Stage IIB' twice and
# assigned code 6 to every stage from IIB through IV, which made those stages
# indistinguishable after encoding; each stage now gets its own increasing
# code, in line with the other ordinal maps in this notebook.
tumor_stage_map = {
    'Stage X': 0,
    'Stage I': 1, 'Stage IA': 2, 'Stage IB': 3,
    'Stage II': 4, 'Stage IIA': 5, 'Stage IIB': 6,
    'Stage III': 7, 'Stage IIIA': 8, 'Stage IIIB': 9, 'Stage IIIC': 10,
    'Stage IV': 11,
}
# Indexing the dict (instead of passing it to Series.map) raises KeyError on
# an unexpected label rather than silently producing NaN.
clin_data['ajcc_pathologic_tumor_stage'] = clin_data['ajcc_pathologic_tumor_stage'].map(lambda x: tumor_stage_map[x])

['Stage X' 'Stage IIB' 'Stage IA' 'Stage IIIA' 'Stage IIA' 'Stage IV'
 'Stage I' 'Stage IIIC' 'Stage IB' 'Stage IIIB' '[Discrepancy]'
 'Stage III' 'Stage II' '[Not Available]']


In [166]:
# Show the raw ER status labels present in the data.
print(clin_data['er_status_by_ihc'].unique())

# Integer code per ER status label.
er_status_map = {
    'Indeterminate': 0,
    'Negative': 1,
    'Positive': 2,
}
# '[Not Evaluated]' is merged into 'Indeterminate' before the lookup; the dict
# lookup raises KeyError for any label missing from er_status_map.
clin_data['er_status_by_ihc'] = (
    clin_data['er_status_by_ihc']
    .replace('[Not Evaluated]', 'Indeterminate')
    .map(er_status_map.__getitem__)
)

['Positive' 'Negative' '[Not Evaluated]' 'Indeterminate']


In [167]:
# Show the raw PR status labels present in the data.
print(clin_data['pr_status_by_ihc'].unique())

# Integer code per PR status label.
pr_status_map = {
    'Indeterminate': 0,
    'Negative': 1,
    'Positive': 2,
}
# '[Not Evaluated]' is merged into 'Indeterminate' before the lookup; the dict
# lookup raises KeyError for any label missing from pr_status_map.
clin_data['pr_status_by_ihc'] = (
    clin_data['pr_status_by_ihc']
    .replace('[Not Evaluated]', 'Indeterminate')
    .map(pr_status_map.__getitem__)
)

['Positive' 'Negative' '[Not Evaluated]' 'Indeterminate']


In [168]:
# Show the raw HER2 status labels present in the data.
print(clin_data['her2_status_by_ihc'].unique())

# Integer code per HER2 status label.
her2_status_map = {
    'Indeterminate': 0,
    'Negative': 1,
    'Positive': 2,
}
# Labels that are merged into 'Indeterminate' before the lookup.
her2_indeterminate_labels = ['[Not Available]', '[Not Evaluated]', 'Equivocal']
# The dict lookup raises KeyError for any label missing from her2_status_map.
clin_data['her2_status_by_ihc'] = (
    clin_data['her2_status_by_ihc']
    .replace(her2_indeterminate_labels, 'Indeterminate')
    .map(her2_status_map.__getitem__)
)

['Negative' 'Positive' 'Indeterminate' 'Equivocal' '[Not Evaluated]'
 '[Not Available]']


In [169]:
# Show the raw histological type labels present in the data.
print(clin_data['histological_type'].unique())
# Collapse the free-text / placeholder labels into a single 'NA_hist'
# category so they share one indicator column.
clin_data['histological_type'] = clin_data['histological_type'].replace(['Other, specify','Mixed Histology (please specify)', '[Not Available]'], 'NA_hist')
print(clin_data['histological_type'].unique())
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
histo_type_hot = OneHotEncoder(dtype=int, categories='auto')
histo_type_out = histo_type_hot.fit_transform(clin_data[['histological_type']]).toarray()
print(histo_type_hot.categories_)
# One indicator column per category. The indicator frame is built on
# clin_data's own index so the index-on-index merge pairs every row with its
# own encoding even if the index is not 0..n-1.
histo_type_cols = pd.DataFrame(
    histo_type_out,
    columns=np.array(histo_type_hot.categories_).ravel(),
    index=clin_data.index,
)
clin_data = clin_data.merge(histo_type_cols, left_index=True, right_index=True)
print(list(clin_data))

['Infiltrating Lobular Carcinoma' 'Infiltrating Ductal Carcinoma'
 'Other, specify' 'Mixed Histology (please specify)' 'Mucinous Carcinoma'
 'Metaplastic Carcinoma' 'Infiltrating Carcinoma NOS'
 'Medullary Carcinoma' '[Not Available]']
['Infiltrating Lobular Carcinoma' 'Infiltrating Ductal Carcinoma'
 'NA_hist' 'Mucinous Carcinoma' 'Metaplastic Carcinoma'
 'Infiltrating Carcinoma NOS' 'Medullary Carcinoma']
[array(['Infiltrating Carcinoma NOS', 'Infiltrating Ductal Carcinoma',
       'Infiltrating Lobular Carcinoma', 'Medullary Carcinoma',
       'Metaplastic Carcinoma', 'Mucinous Carcinoma', 'NA_hist'],
      dtype=object)]
['bcr_patient_uuid', 'tumor_status', 'surgical_procedure_first', 'margin_status', 'lymph_nodes_examined_he_count', 'ajcc_tumor_pathologic_pt', 'ajcc_nodes_pathologic_pn', 'ajcc_metastasis_pathologic_pm', 'ajcc_pathologic_tumor_stage', 'er_status_by_ihc', 'pr_status_by_ihc', 'her2_status_by_ihc', 'histological_type', 'vital_status', 'Lumpectomy', 'Modified Radical M

In [170]:
# Write the encoded clinical table to disk as comma-separated text. No
# `index` argument is passed, so to_csv's default of also writing the
# DataFrame index applies.
processed_csv_path = "../data/clinical_data_processed.csv"
clin_data.to_csv(processed_csv_path, sep=',')