# Breast Cancer Survival Analysis
Penny Yang, Danwen Li, Xinyu Li

In [1]:
import pandas as pd

# Load the BRCA patient table from a CSV in the current working directory,
# then preview the first rows to check that the columns parsed as expected.
dataset = pd.read_csv('./BRCA.csv')
dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead


In [2]:
# Number of missing entries in each column (summed down the rows).
dataset.isnull().sum(axis=0)

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

### 1. Data Wrangling

#### Drop all rows that contain missing values

In [3]:
# Keep only fully populated rows: a row with at least one missing value is discarded.
dataset = dataset.dropna(axis=0, how='any')

In [4]:
# Show the row count, per-column non-null counts and dtypes after the incomplete rows were removed.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Patient_ID          317 non-null    object 
 1   Age                 317 non-null    float64
 2   Gender              317 non-null    object 
 3   Protein1            317 non-null    float64
 4   Protein2            317 non-null    float64
 5   Protein3            317 non-null    float64
 6   Protein4            317 non-null    float64
 7   Tumour_Stage        317 non-null    object 
 8   Histology           317 non-null    object 
 9   ER status           317 non-null    object 
 10  PR status           317 non-null    object 
 11  HER2 status         317 non-null    object 
 12  Surgery_type        317 non-null    object 
 13  Date_of_Surgery     317 non-null    object 
 14  Date_of_Last_Visit  317 non-null    object 
 15  Patient_Status      317 non-null    object 
dtypes: float

#### Add a new column with the number of days between surgery and the last visit
#### Drop the columns 'Date_of_Last_Visit' and 'Date_of_Surgery'

In [5]:
# Parse both date columns (text such as '15-Jan-17') into datetimes, then store
# the whole-day gap between surgery and the last visit in a new column.
date_format = '%d-%b-%y'
for date_column in ('Date_of_Last_Visit', 'Date_of_Surgery'):
    dataset[date_column] = pd.to_datetime(dataset[date_column], format=date_format)

# Timedelta from surgery to last visit; .dt.days keeps only the integer day count.
follow_up = dataset['Date_of_Last_Visit'] - dataset['Date_of_Surgery']
dataset['Days Between Surgery And Last_Visit'] = follow_up.dt.days
dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status,Days Between Surgery And Last_Visit
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,2017-01-15,2017-06-19,Alive,155
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,2017-04-26,2018-11-09,Dead,562
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2017-09-08,2018-06-09,Alive,274
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,2017-01-25,2017-07-12,Alive,168
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,2017-05-06,2019-06-27,Dead,782


In [6]:
# Keep an independent snapshot of the table for the data visualizations.
# A plain `data = dataset` only creates a second name for the same DataFrame,
# so every later in-place modification of `dataset` (dropping columns,
# re-coding values) would silently change `data` as well. Copying freezes the
# table as it is at this point: date columns present, text labels intact.
data = dataset.copy()

In [7]:
# Remove the two raw date columns in place; the derived day-count column stays.
columns_drop = ['Date_of_Last_Visit', 'Date_of_Surgery']
dataset.drop(labels=columns_drop, axis='columns', inplace=True)
dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status,Days Between Surgery And Last_Visit
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive,155
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,Dead,562
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Alive,274
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive,168
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Dead,782


In [8]:
# Re-check the column list and dtypes now that the date columns have been removed.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Patient_ID                           317 non-null    object 
 1   Age                                  317 non-null    float64
 2   Gender                               317 non-null    object 
 3   Protein1                             317 non-null    float64
 4   Protein2                             317 non-null    float64
 5   Protein3                             317 non-null    float64
 6   Protein4                             317 non-null    float64
 7   Tumour_Stage                         317 non-null    object 
 8   Histology                            317 non-null    object 
 9   ER status                            317 non-null    object 
 10  PR status                            317 non-null    object 
 11  HER2 status                     

#### Convert string columns to numeric values

In [9]:
# Encode the text-valued status columns as 0/1 integers.
#
# Each column gets its own explicit mapping instead of a frame-wide
# `dataset.replace(...)`:
#   * only the intended columns are touched (a frame-wide replace would also
#     rewrite a matching string found in any other column);
#   * the result no longer depends on `replace` implicitly downcasting object
#     columns to integers, which pandas 2.2 deprecates.
status_codes = {
    # receptor status: 'Negative' -> 0, 'Positive' -> 1
    'ER status': {'Negative': 0, 'Positive': 1},
    'PR status': {'Negative': 0, 'Positive': 1},
    'HER2 status': {'Negative': 0, 'Positive': 1},
    # patient outcome: 'Dead' -> 0, 'Alive' -> 1
    # NOTE(review): survival-analysis tools commonly expect 1 to mark the event
    # (death); confirm the convention before using this column as an event flag.
    'Patient_Status': {'Dead': 0, 'Alive': 1},
}

for status_column, codes in status_codes.items():
    # Values without an entry in the mapping (including codes produced by an
    # earlier run of this cell) pass through unchanged, so the cell can be re-run.
    dataset[status_column] = dataset[status_column].map(
        lambda value: codes.get(value, value)
    )

dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status,Days Between Surgery And Last_Visit
0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.27368,III,Infiltrating Ductal Carcinoma,1,1,0,Modified Radical Mastectomy,1,155
1,TCGA-EW-A1OX,43.0,FEMALE,-0.42032,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,1,1,0,Lumpectomy,0,562
2,TCGA-A8-A079,69.0,FEMALE,0.21398,1.3114,-0.32747,-0.23426,III,Infiltrating Ductal Carcinoma,1,1,0,Other,1,274
3,TCGA-D8-A1XR,56.0,FEMALE,0.34509,-0.21147,-0.19304,0.12427,II,Infiltrating Ductal Carcinoma,1,1,0,Modified Radical Mastectomy,1,168
4,TCGA-BH-A0BF,56.0,FEMALE,0.22155,1.9068,0.52045,-0.31199,II,Infiltrating Ductal Carcinoma,1,1,0,Other,0,782


In [10]:
# Collect the distinct values of every remaining text column in one pass,
# then print them to see which categories still need to be encoded.
gender, tumour, histology, surgeryType = (
    dataset[column_name].unique()
    for column_name in ('Gender', 'Tumour_Stage', 'Histology', 'Surgery_type')
)
print(
    'gender: ', gender,
    '\n\ntumour: ', tumour,
    '\n\nhistology: ', histology,
    '\n\nsurgery type: ', surgeryType,
)

gender:  ['FEMALE' 'MALE'] 

tumour:  ['III' 'II' 'I'] 

histology:  ['Infiltrating Ductal Carcinoma' 'Mucinous Carcinoma'
 'Infiltrating Lobular Carcinoma'] 

surgery type:  ['Modified Radical Mastectomy' 'Lumpectomy' 'Other' 'Simple Mastectomy']


In [11]:
from sklearn.preprocessing import OrdinalEncoder

# Text-valued columns that still need a numeric code.
categorical_columns = ['Gender', 'Tumour_Stage', 'Histology', 'Surgery_type']

# With its default settings OrdinalEncoder numbers each column's categories in
# sorted order (e.g. 'I' -> 0, 'II' -> 1, 'III' -> 2) and returns float values.
encoder = OrdinalEncoder()
encoder.fit(dataset[categorical_columns])
original_encoded_data = encoder.transform(dataset[categorical_columns])

# Overwrite the text columns with their numeric codes.
dataset[categorical_columns] = original_encoded_data

dataset.head()

Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status,Days Between Surgery And Last_Visit
0,TCGA-D8-A1XD,36.0,0.0,0.080353,0.42638,0.54715,0.27368,2.0,0.0,1,1,0,1.0,1,155
1,TCGA-EW-A1OX,43.0,0.0,-0.42032,0.57807,0.61447,-0.031505,1.0,2.0,1,1,0,0.0,0,562
2,TCGA-A8-A079,69.0,0.0,0.21398,1.3114,-0.32747,-0.23426,2.0,0.0,1,1,0,2.0,1,274
3,TCGA-D8-A1XR,56.0,0.0,0.34509,-0.21147,-0.19304,0.12427,1.0,0.0,1,1,0,1.0,1,168
4,TCGA-BH-A0BF,56.0,0.0,0.22155,1.9068,0.52045,-0.31199,1.0,0.0,1,1,0,2.0,0,782


In [12]:
# Confirm the dtypes after encoding: the four ordinal-encoded columns are now float64
# and the status columns are int64; only Patient_ID is still expected to be text.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 333
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Patient_ID                           317 non-null    object 
 1   Age                                  317 non-null    float64
 2   Gender                               317 non-null    float64
 3   Protein1                             317 non-null    float64
 4   Protein2                             317 non-null    float64
 5   Protein3                             317 non-null    float64
 6   Protein4                             317 non-null    float64
 7   Tumour_Stage                         317 non-null    float64
 8   Histology                            317 non-null    float64
 9   ER status                            317 non-null    int64  
 10  PR status                            317 non-null    int64  
 11  HER2 status                     

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): in the recorded output 'ER status' and 'PR status' have std 0 (every
# remaining row is 1), so they cannot help a model distinguish patients — consider
# dropping them before the predictive step.
dataset.describe()

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status,Days Between Surgery And Last_Visit
count,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0,317.0
mean,58.725552,0.012618,-0.027232,0.949557,-0.095104,0.006713,1.053628,0.33123,1.0,1.0,0.091483,1.507886,0.804416,447.776025
std,12.827374,0.111797,0.543858,0.906153,0.589027,0.625965,0.656246,0.546047,0.0,0.0,0.28875,1.039155,0.397276,386.279467
min,29.0,0.0,-2.1446,-0.97873,-1.6274,-2.0255,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,49.0,0.0,-0.3506,0.36884,-0.53136,-0.38224,1.0,0.0,1.0,1.0,0.0,1.0,1.0,189.0
50%,58.0,0.0,0.005649,0.99713,-0.19304,0.038522,1.0,0.0,1.0,1.0,0.0,2.0,1.0,372.0
75%,67.0,0.0,0.33626,1.612,0.25121,0.43625,1.0,1.0,1.0,1.0,0.0,2.0,1.0,595.0
max,90.0,1.0,1.5936,3.4022,2.1934,1.6299,2.0,2.0,1.0,1.0,1.0,3.0,1.0,3019.0


### 2. Data Visualizations

### 3. Predictive Analytics