In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from decision_trees.config import data_path
from decision_trees.plotting_utils import *

# Use seaborn's "talk" plotting context (larger, presentation-sized plot elements).
sns.set_context("talk")

# settings
# Load the autoreload extension first, then enable mode 2 so that imported
# modules are reloaded before each cell executes; edits to the project's
# helper modules are then picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 1. Load In Data

Load in the data (cleaning it first if necessary).

In [2]:
# Read the raw breast-cancer CSV (path is relative to the project's data
# directory) and preview the first rows.
fpath = "raw/breast_cancer_raw.csv"
raw_csv_path = data_path / fpath
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Regional Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


## 2. Describe data

Things to consider:
* datatypes: cat, float, int, datetime
* distributions of numerical data ~ predictor
* multi-variate distributions 

https://ieee-dataport.org/open-access/seer-breast-cancer-data

more info on 6th edition stages: https://seer.cancer.gov/seerstat/variables/seer/ajcc-stage/6th/

Columns:
'Age': age of px at dx,
'Race': White, Black, Other, Other-Unspecified, Unknown (Other=Asian, Pacific Islander, American Indian)
'Marital Status': Single, Married, Separated, Divorced, Widowed. at time of dx,
'T Stage': size and extent of the main tumor (bigger is worse). T1-T4,
'N Stage': number of nearby affected lymph nodes. N1-N3,
'6th Stage': what stage of cancer - 6th edition stage of cancer, increases from IIA->IIB->IIIA->IIIB->IIIC,
'differentiate': range from Well->Moderate->Poorly->Undifferentiated, where undifferentiated is the worst,
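'Grade': range from 1-4 (numerical encoding of "differentiate" column; note that pandas reads this column as object dtype rather than int, so at least some raw values are not plain integers),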
'A Stage': Regional or Distant. Distant means more metastasized (worse),
'Tumor Size': exact size in mm,
'Estrogen Status': whether the tumor is Positive or Negative for estrogen receptors. If the cancer cells are estrogen-positive, they have receptors for estrogen, meaning estrogen levels affect the growth of the tumor. Which receptor type(s) the tumor is positive for (estrogen and/or progesterone) determines which treatment options are available,
'Progesterone Status': Positive or Negative to progesterone receptors, same concept as estrogen,
'Regional Node Examined': total number of regional lymph nodes that were removed and examined by the pathologist,
'Regional Node Positive': total number of regional lymph nodes that were examined and found to be positive,
'Survival Months': amount of time px has survived since dx (if px is alive, this number is the min number of months they've survived, could be longer),
'Status': Alive or Dead

In [3]:
# Summarise each column's dtype and non-null count to spot missing values and
# columns that still need a numeric/boolean conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Regional Node Positive  4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

In [4]:
# List the column names exactly as they appear in the raw file.
df.columns

Index(['Age', 'Race', 'Marital Status', 'T Stage', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Regional Node Positive', 'Survival Months', 'Status'],
      dtype='object')

### Column Rename

In [5]:
# Normalise the headers to snake_case, then give the 6th-edition stage column
# a name that is a valid Python identifier (a leading digit would rule out
# attribute-style access such as `df.stage`).
df = df.rename(columns=lambda label: label.lower().replace(" ", "_")).rename(
    columns={"6th_stage": "stage"}
)
print(df.columns)

Index(['age', 'race', 'marital_status', 't_stage', 'n_stage', 'stage',
       'differentiate', 'grade', 'a_stage', 'tumor_size', 'estrogen_status',
       'progesterone_status', 'regional_node_examined',
       'regional_node_positive', 'survival_months', 'status'],
      dtype='object')


### Numerical Representation

In [6]:
# Drop the "T" prefix so the T stage is available as an integer column.
t_stage_digits = df["t_stage"].str.replace("T", "")
df["t_stage_num"] = t_stage_digits.astype(int)
df[["t_stage_num", "t_stage"]].head()

Unnamed: 0,t_stage_num,t_stage
0,1,T1
1,2,T2
2,3,T3
3,1,T1
4,2,T2


In [7]:
# Drop the "N" prefix so the N stage is available as an integer column.
n_stage_digits = df["n_stage"].str.replace("N", "")
df["n_stage_num"] = n_stage_digits.astype(int)
df[["n_stage_num", "n_stage"]].head()

Unnamed: 0,n_stage_num,n_stage
0,1,N1
1,2,N2
2,3,N3
3,1,N1
4,1,N1


In [8]:
# Stages listed from least to most advanced; a stage's 1-based position in
# this list is its ordinal code.
stage_order = ["IIA", "IIB", "IIIA", "IIIB", "IIIC"]
stage_map = {label: rank for rank, label in enumerate(stage_order, start=1)}
df["stage_num"] = df["stage"].map(stage_map)
df[["stage", "stage_num"]].head()

Unnamed: 0,stage,stage_num
0,IIA,1
1,IIIA,3
2,IIIC,5
3,IIA,1
4,IIB,2


### Boolean Representation

In [9]:
# A single lookup table covers the three label vocabularies used below:
# receptor status (Positive/Negative), vital status (Alive/Dead) and
# A stage (Regional/Distant).
bool_map = dict(
    Positive=True,
    Negative=False,
    Alive=True,
    Dead=False,
    Regional=False,
    Distant=True,
)

# source column -> derived boolean column (added to the frame in this order)
flag_sources = {
    "estrogen_status": "has_estrogen",
    "progesterone_status": "has_progesterone",
    "status": "is_alive",
    "a_stage": "is_metastasized",
}
for source_col, flag_col in flag_sources.items():
    df[flag_col] = df[source_col].map(bool_map)

# Show every source column beside its flag so the mapping can be checked by eye.
preview_cols = [
    "estrogen_status",
    "has_estrogen",
    "progesterone_status",
    "has_progesterone",
    "a_stage",
    "is_metastasized",
    "status",
    "is_alive",
]
df[preview_cols].head()

Unnamed: 0,estrogen_status,has_estrogen,progesterone_status,has_progesterone,a_stage,is_metastasized,status,is_alive
0,Positive,True,Positive,True,Regional,False,Alive,True
1,Positive,True,Positive,True,Regional,False,Alive,True
2,Positive,True,Positive,True,Regional,False,Alive,True
3,Positive,True,Positive,True,Regional,False,Alive,True
4,Positive,True,Positive,True,Regional,False,Alive,True


### Ratio of Positive to Examined



In [10]:
# Check that the derived numeric and boolean columns now sit alongside the
# original ones.
df.columns

Index(['age', 'race', 'marital_status', 't_stage', 'n_stage', 'stage',
       'differentiate', 'grade', 'a_stage', 'tumor_size', 'estrogen_status',
       'progesterone_status', 'regional_node_examined',
       'regional_node_positive', 'survival_months', 'status', 't_stage_num',
       'n_stage_num', 'stage_num', 'has_estrogen', 'has_progesterone',
       'is_alive', 'is_metastasized'],
      dtype='object')

In [13]:
# Fraction of the examined regional lymph nodes that were found positive
# (positive / examined), rounded to 4 decimal places.
# A row with zero examined nodes has no defined fraction: plain division would
# yield inf (or NaN for 0 / 0) and that value would be written to the processed
# CSV. Mask those rows to NaN so an undefined fraction is always marked missing.
# Rows with at least one examined node are unaffected.
nodes_examined = df["regional_node_examined"]
frac_positive = (df["regional_node_positive"] / nodes_examined).where(
    nodes_examined > 0
)
df["regional_node_frac_positive"] = frac_positive.round(4)

df[
    ["regional_node_positive", "regional_node_examined", "regional_node_frac_positive"]
].head(10)

Unnamed: 0,regional_node_positive,regional_node_examined,regional_node_frac_positive
0,1,24,0.0417
1,5,14,0.3571
2,7,14,0.5
3,1,2,0.5
4,1,3,0.3333
5,2,18,0.1111
6,1,11,0.0909
7,1,9,0.1111
8,18,20,0.9
9,12,21,0.5714


In [14]:
# Write the frame, including all derived columns, to the processed-data
# location; the row index is not written to the file.
fpath = "processed/breast_cancer.csv"
processed_csv_path = data_path / fpath
df.to_csv(processed_csv_path, index=False)