In [1]:
import pandas as pd
import pipeline as p
import cleaning as c

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

pd.set_option('mode.chained_assignment', None)

import datetime as dt

In [2]:
# Load the source frame from disk. `p.open_pkl` comes from the local
# `pipeline` module (presumably a thin pickle-loading wrapper — confirm there).
df = p.open_pkl('Data/breast_2000-2015_df.pkl')

In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
df.shape

(592051, 133)

### Notes on updated "cleaning":
Censoring Time
1. Convert DX Month to numeric
2. Create DX Date from DX month & year
3. Calculate Censoring Time = End of Study (12/31/15) - DX Date

Need to create 'OUTCOME' column: whether person died or not. How to determine this:
- Survival Time < Censoring Time
    - give or take 2 months? -- assumed day of DX Date was 1st of month. 
- some other column?
    - 'CODPUB' to the rescue! Re-code for Cause of Death. https://seer.cancer.gov/codrecode/1969_d03012018/index.html 
        - 00000 = Alive. Others = Dead
        - 41000 & 99999 = Unknown. Need to drop

In [51]:
# Run the updated cleaning routine from the local `cleaning` module.
# NOTE(review): presumably this implements the censoring-time / DEATH recoding
# described in the notes above — confirm against cleaning.clean_only_mod.
df4 = c.clean_only_mod(df)

In [52]:
# Share of each DEATH value (proportions rather than raw counts).
df4['DEATH'].value_counts(normalize=True)

0    0.786837
1    0.213163
Name: DEATH, dtype: float64

In [53]:
# Persist the cleaned frame via the project's `pkl_this` helper
# (presumably a pickle.dump wrapper — confirm in pipeline.py).
p.pkl_this('Data/df_cleaned_all_years.pkl', df4)

### Finish data prep
- drop all but desired feature columns
- drop na's

In [54]:
# Columns kept for the survival analysis: ten covariates, then the
# survival-time column (SRV_TIME_MON) and the DEATH outcome.
cols_to_include = [
    'MAR_STAT_MOD',
    'RACE_MOD',
    'AGE_DX',
    'GRADE',
    'TUMSIZ',
    'SURG',
    'SEQ_NUM',
    'POS_NODES',
    'HST_STGA',
    'INVAS',
    'SRV_TIME_MON',
    'DEATH',
]

In [55]:
# Keep only the modelling columns and discard rows with any missing value.
# Chaining `.dropna()` returns a fresh frame instead of mutating a selection
# of df4 in place — the pattern pandas flags with SettingWithCopyWarning.
# The resulting df5 is the same as before.
df5 = df4[cols_to_include].dropna()

In [60]:
# Row counts before and after dropping incomplete rows.
len(df4), len(df5)

(405145, 337674)

In [56]:
# Preview the first rows of the modelling frame.
df5.head()

Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,POS_NODES,HST_STGA,INVAS,SRV_TIME_MON,DEATH
0,1.0,1,45.0,3.0,40.0,1.0,0.0,1.0,2.0,1,95.0,0
1,0.0,2,49.0,2.0,50.0,1.0,0.0,1.0,2.0,1,76.0,1
4,1.0,1,69.0,2.0,15.0,1.0,0.0,0.0,1.0,1,95.0,0
5,1.0,2,65.0,3.0,34.0,1.0,1.0,0.0,1.0,1,95.0,0
6,1.0,1,34.0,3.0,15.0,1.0,0.0,0.0,1.0,1,95.0,0


In [61]:
# Persist the column-restricted, NA-free frame via the project's `pkl_this` helper.
p.pkl_this('Data/df_survival_analysis.pkl', df5)

### Train-test split & Preprocess data

Need to one-hot encode race for survival regression. One-hot encoding with `OneHotEncoder` was not behaving consistently (`'01'` was sometimes encoded as 0,0,0 and other times as 0,0,1), so use pandas `get_dummies` instead.

In [160]:
# Distinct RACE_MOD codes; these determine the dummy columns created below.
df5['RACE_MOD'].unique()

array(['01', '02', '03'], dtype=object)

In [183]:
# NOTE(review): in the visible cells X_train is first assigned by the
# train_test_split cell further down, so this cell raises NameError when the
# notebook is run top-to-bottom on a fresh kernel.
# `.loc[:5]` is label-based and includes label 5. `test_data` is not used
# again in the visible notebook — TODO confirm it is still needed.
test_data = X_train.loc[:5]

In [196]:
# Append one indicator column per RACE_MOD code (prefixed 'RACE_') to df5;
# the original RACE_MOD column is kept alongside them.
# `axis` is passed by keyword: pandas 2.0 made every pd.concat argument after
# `objs` keyword-only, so the positional form `concat(objs, 1)` raises TypeError.
df6 = pd.concat(
    [df5, pd.get_dummies(df5['RACE_MOD'], prefix='RACE', drop_first=False)],
    axis=1,
)

In [198]:
# Give the race indicator columns readable names.
df6 = df6.rename(columns={
    'RACE_01': 'RACE_WHITE',
    'RACE_02': 'RACE_BLACK',
    'RACE_03': 'RACE_OTHER',
})

In [199]:
# Preview the frame with the renamed race indicator columns appended.
df6.head()

Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,POS_NODES,HST_STGA,INVAS,SRV_TIME_MON,DEATH,RACE_WHITE,RACE_BLACK,RACE_OTHER
0,1.0,1,45.0,3.0,40.0,1.0,0.0,1.0,2.0,1,95.0,0,1,0,0
1,0.0,2,49.0,2.0,50.0,1.0,0.0,1.0,2.0,1,76.0,1,0,1,0
4,1.0,1,69.0,2.0,15.0,1.0,0.0,0.0,1.0,1,95.0,0,1,0,0
5,1.0,2,65.0,3.0,34.0,1.0,1.0,0.0,1.0,1,95.0,0,0,1,0
6,1.0,1,34.0,3.0,15.0,1.0,0.0,0.0,1.0,1,95.0,0,1,0,0


In [200]:
# Separate the DEATH label from the remaining columns.
# NOTE(review): X still carries the original RACE_MOD codes next to the three
# RACE_* indicators built from them — confirm downstream models expect both.
y = df6['DEATH']
X = df6.drop(columns='DEATH')

In [201]:
# 70/30 split, stratified on y so both partitions keep the same label mix;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12, stratify=y)

# Renumber the training rows 0..n-1. Rebinding (instead of inplace=True)
# avoids mutating objects pandas may treat as selections of X / y, which is
# what SettingWithCopyWarning flags. The test partition keeps its original
# index labels, exactly as before.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [203]:
# Persist the four survival-analysis partitions, one pickle per object.
for pkl_path, split in (
    ('Data/X_train_SA.pkl', X_train),
    ('Data/y_train_SA.pkl', y_train),
    ('Data/X_test_SA.pkl', X_test),
    ('Data/y_test_SA.pkl', y_test),
):
    p.pkl_this(pkl_path, split)

## Old code

In [4]:
# Earlier cleaning path from the local `cleaning` module. It returns two
# frames: per the previews below, df3 carries an extra BRST_SUB column that
# df2 does not have.
df2, df3 = c.clean_and_dropna(df)

In [5]:
# Preview the frame without the sub-type column.
df2.head()

Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,PRIMSITE,POS_NODES,HST_STGA,INVAS,TARGET
0,1.0,1,45.0,3.0,40.0,1.0,0,C504,1.0,2.0,1,1
1,0.0,2,49.0,2.0,50.0,1.0,0,C502,1.0,2.0,1,1
4,1.0,1,69.0,2.0,15.0,1.0,0,C504,0.0,1.0,1,1
5,1.0,2,65.0,3.0,34.0,1.0,1,C505,0.0,1.0,1,1
6,1.0,1,34.0,3.0,15.0,1.0,0,C506,0.0,1.0,1,1


In [6]:
# (rows, columns) of the frame without the sub-type column.
df2.shape

(222127, 12)

In [39]:
# Preview the frame that includes the BRST_SUB sub-type column.
df3.head()

Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,PRIMSITE,POS_NODES,INVAS,HST_STGA,BRST_SUB,TARGET
589,0.0,3,72.0,3.0,17.0,1.0,0,C504,0.0,1,1.0,3,1
688,1.0,3,72.0,2.0,15.0,1.0,2,C504,0.0,1,1.0,3,1
1108,0.0,3,56.0,3.0,24.0,1.0,3,C508,9.0,1,2.0,3,0
1267,0.0,3,85.0,3.0,8.0,1.0,2,C508,0.0,1,1.0,3,1
1702,0.0,3,83.0,1.0,11.0,1.0,2,C508,0.0,1,1.0,3,1


In [7]:
# (rows, columns) of the frame that includes BRST_SUB.
df3.shape

(19667, 13)

In [6]:
# Persist both cleaned frames (without and with the sub-type column).
for pkl_path, frame in (
    ('Data/breast_df_for_model.pkl', df2),
    ('Data/breast_df_with_sub_for_model.pkl', df3),
):
    p.pkl_this(pkl_path, frame)

## Train-Test split

Drop 'PRIMSITE' (primary site) -- later discovered to not be informative

In [7]:
# Label vector and predictor frame for the model without sub-type.
# PRIMSITE is removed together with the label.
y = df2['TARGET']
X = df2.drop(['TARGET', 'PRIMSITE'], axis='columns')

In [9]:
# 70/30 split, stratified on y so both partitions keep the same label mix;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12, stratify=y)

# Renumber the training rows 0..n-1. Rebinding (instead of inplace=True)
# avoids mutating objects pandas may treat as selections of X / y, which is
# what SettingWithCopyWarning flags. The test partition keeps its original
# index labels, exactly as before.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [10]:
# Persist the four unprocessed partitions, one pickle per object.
for pkl_path, split in (
    ('Data/X_train_unprocessed.pkl', X_train),
    ('Data/y_train_unprocessed.pkl', y_train),
    ('Data/X_test_unprocessed.pkl', X_test),
    ('Data/y_test_unprocessed.pkl', y_test),
):
    p.pkl_this(pkl_path, split)

Data with sub-type feature:

In [11]:
# Label vector and predictor frame for the data that includes the sub-type
# column. PRIMSITE is removed together with the label.
ysub = df3['TARGET']
Xsub = df3.drop(['TARGET', 'PRIMSITE'], axis='columns')

In [12]:
# Class balance of the sub-type label vector, as proportions.
ysub.value_counts(normalize=True)

1    0.784461
0    0.215539
Name: TARGET, dtype: float64

In [13]:
# 70/30 split of the sub-type data, stratified on ysub so both partitions keep
# the same label mix; random_state pins the shuffle for reproducibility.
X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    Xsub, ysub, test_size=0.3, random_state=12, stratify=ysub)

# Renumber the training rows 0..n-1. Rebinding (instead of inplace=True)
# avoids mutating objects pandas may treat as selections of Xsub / ysub, which
# is what SettingWithCopyWarning flags. The test partition keeps its original
# index labels, exactly as before.
X_train_sub = X_train_sub.reset_index(drop=True)
y_train_sub = y_train_sub.reset_index(drop=True)

In [14]:
# Persist the four sub-type partitions, one pickle per object.
for pkl_path, split in (
    ('Data/X_train_unproc_with_sub.pkl', X_train_sub),
    ('Data/y_train_unproc_with_sub.pkl', y_train_sub),
    ('Data/X_test_unproc_with_sub.pkl', X_test_sub),
    ('Data/y_test_unproc_with_sub.pkl', y_test_sub),
):
    p.pkl_this(pkl_path, split)

## Pre-processing Pipeline

In [24]:
# Preview the training predictors; the preprocessors below address these
# columns by position, so the column order shown here matters.
X_train.head()

Unnamed: 0,MAR_STAT_MOD,RACE_MOD,AGE_DX,GRADE,TUMSIZ,SURG,SEQ_NUM,POS_NODES,HST_STGA,INVAS
0,1.0,3,79.0,3.0,15.0,1.0,2,0.0,1.0,1
1,0.0,3,94.0,2.0,20.0,1.0,0,0.0,1.0,1
2,0.0,3,80.0,3.0,1.0,1.0,0,0.0,1.0,1
3,1.0,3,76.0,1.0,18.0,1.0,0,0.0,1.0,1
4,1.0,3,84.0,1.0,8.0,1.0,2,0.0,1.0,1


In [15]:
# Preprocessor for the frame without sub-type. Columns are addressed by
# position in X_train:
#   2, 4, 7 -> AGE_DX, TUMSIZ, POS_NODES : standardised
#   1       -> RACE_MOD                  : one-hot encoded; categories not seen
#                                          during fit encode as all zeros
# All remaining columns are passed through unchanged.
preprocessor1 = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', StandardScaler(), [2, 4, 7]),
        ('cat', OneHotEncoder(categories='auto', handle_unknown='ignore'), [1]),
    ],
)

In [16]:
# Preprocessor for the frame with sub-type. Columns are addressed by position
# (per the df3 preview with TARGET and PRIMSITE removed):
#   2, 4, 7 -> AGE_DX, TUMSIZ, POS_NODES : standardised
#   1, 10   -> RACE_MOD, BRST_SUB        : one-hot encoded; categories not seen
#                                          during fit encode as all zeros
# All remaining columns are passed through unchanged.
preprocessor2 = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', StandardScaler(), [2, 4, 7]),
        ('cat', OneHotEncoder(categories='auto', handle_unknown='ignore'), [1, 10]),
    ],
)

In [17]:
# Fit preprocessor1 on the training predictors and transform them in one step.
X_tr_proc = preprocessor1.fit_transform(X_train)

In [18]:
# First transformed row. Output order follows the transformer list: the three
# scaled numeric columns, the one-hot race columns, then the passthrough columns.
X_tr_proc[0]

array([ 1.41189499, -0.30159618, -0.39079147,  1.        ,  0.        ,
        0.        ,  1.        ,  3.        ,  1.        ,  2.        ,
        1.        ,  1.        ])

In [19]:
# Number of columns after preprocessing.
X_tr_proc.shape[1]

12

In [20]:
# Names of the one-hot columns produced by the fitted 'cat' encoder.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`; use whichever this install provides.
# NOTE: the newer method may label columns with the input column name rather
# than the positional 'x0_' prefix.
race_encoder = preprocessor1.named_transformers_['cat']
(race_encoder.get_feature_names_out()
 if hasattr(race_encoder, 'get_feature_names_out')
 else race_encoder.get_feature_names())

array(['x0_01', 'x0_02', 'x0_03'], dtype=object)

In [21]:
# Persist both ColumnTransformers via the project's `pkl_this` helper.
# NOTE(review): in the visible cells only preprocessor1 has been fitted (via
# fit_transform on X_train); preprocessor2 is pickled as constructed, i.e.
# unfitted — confirm the consumer fits it before calling transform.
p.pkl_this('Data/preprocess_wo_sub.pkl', preprocessor1)
p.pkl_this('Data/preprocess_with_sub.pkl', preprocessor2)