## Breast Cancer Mortality and Survival Prediction

Notebook 1: Data Understanding and Preparation

Author: Thanuja Liyanage

In [None]:
!pip install pandas



In [None]:
# pandas supplies the DataFrame used throughout this notebook
import pandas as pd

# Read the raw patient records from the CSV file in the working directory
cancer_dataset = pd.read_csv(filepath_or_buffer='cancer-dataset.csv')

# Preview the first five records
cancer_dataset.head(n=5)

Unnamed: 0,Patient_ID,Month_of_Birth,Age,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Mortality_Status
0,A0012,12,68.0,Female,Teaching,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1,60,Alive
1,A0013,12,50.0,Female,Medical,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5,62,Alive
2,A0014,11,58.0,Female,Engineering,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7,75,Alive
3,A0015,3,58.0,Female,Technology,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2.0,1,84,Alive
4,A0016,1,47.0,Female,Multimedia,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3.0,1,50,Alive


In [None]:
# Column names as a plain Python list
cancer_dataset.columns.tolist()

['Patient_ID',
 'Month_of_Birth',
 'Age',
 'Sex',
 'Occupation',
 'T_Stage',
 'N_Stage',
 '6th_Stage',
 'Differentiated',
 'Grade',
 'A_Stage',
 'Tumor_Size',
 'Estrogen_Status',
 'Progesterone_Status',
 'Regional_Node_Examined',
 'Reginol_Node_Positive',
 'Survival_Months',
 'Mortality_Status']

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing data
cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Patient_ID              4024 non-null   object 
 1   Month_of_Birth          4024 non-null   int64  
 2   Age                     4015 non-null   float64
 3   Sex                     4020 non-null   object 
 4   Occupation              43 non-null     object 
 5   T_Stage                 4024 non-null   object 
 6   N_Stage                 4024 non-null   object 
 7   6th_Stage               4024 non-null   object 
 8   Differentiated          4024 non-null   object 
 9   Grade                   4024 non-null   int64  
 10  A_Stage                 4024 non-null   object 
 11  Tumor_Size              4021 non-null   float64
 12  Estrogen_Status         4024 non-null   object 
 13  Progesterone_Status     4024 non-null   object 
 14  Regional_Node_Examined  4023 non-null   

In [None]:
# Summary statistics for the numeric columns (reveals implausible min/max values)
cancer_dataset.describe()

Unnamed: 0,Month_of_Birth,Age,Grade,Tumor_Size,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months
count,4024.0,4015.0,4024.0,4021.0,4023.0,4024.0,4024.0
mean,6.481362,54.107098,2.150596,30.419299,14.373602,4.158052,71.472167
std,3.475442,11.715528,0.638234,21.16108,8.129293,5.109331,25.361855
min,1.0,-50.0,1.0,-75.0,1.0,1.0,1.0
25%,3.0,47.0,2.0,16.0,9.0,1.0,56.0
50%,6.0,54.0,2.0,25.0,14.0,2.0,73.0
75%,10.0,61.0,3.0,38.0,19.0,5.0,90.0
max,12.0,502.0,4.0,140.0,61.0,46.0,760.0


In [None]:
# Count / unique / top / freq summary for the object-dtype (text) columns
cancer_dataset.describe(include=object)

Unnamed: 0,Patient_ID,Sex,Occupation,T_Stage,N_Stage,6th_Stage,Differentiated,A_Stage,Estrogen_Status,Progesterone_Status,Mortality_Status
count,4024,4020,43,4024,4024,4024,4024,4024,4024,4024,4024
unique,4024,2,40,4,3,5,4,2,2,2,7
top,A4035,Female,House Person,T2,N1,IIA,Moderately differentiated,Regional,Positive,Positive,Alive
freq,1,4001,2,1786,2732,1305,2351,3932,3755,3326,3399


In [None]:
# (rows, columns) of the raw DataFrame
cancer_dataset.shape

(4024, 18)

In [None]:
# Number of missing values per column before any cleaning
cancer_dataset.isna().sum()

Unnamed: 0,0
Patient_ID,0
Month_of_Birth,0
Age,9
Sex,4
Occupation,3981
T_Stage,0
N_Stage,0
6th_Stage,0
Differentiated,0
Grade,0


In [None]:
# Correct the misspelled column name ('Reginol' -> 'Regional')
cancer_dataset = cancer_dataset.rename(
    columns={'Reginol_Node_Positive': 'Regional_Node_Positive'}
)

In [None]:
# 'Occupation' is populated for only 43 of the 4024 rows, so it is discarded.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
cancer_dataset = cancer_dataset.drop(columns=['Occupation'], errors='ignore')

In [None]:
# Encode the target as 0 (alive) / 1 (dead).
# Labels are stripped and lower-cased first so that spelling variants such as
# 'ALIVE' or ' Dead ' collapse onto the two keys below; any label that still
# does not match becomes NaN.
cancer_dataset['Mortality_Status'] = (
    cancer_dataset['Mortality_Status']
    .str.strip()
    .str.lower()
    .map({'alive': 0, 'dead': 1})
)

In [None]:
# Fill missing values.
# Each column is reassigned instead of calling fillna(..., inplace=True) on the
# selected column: that chained in-place form operates on an intermediate object,
# triggers a FutureWarning, and no longer updates the DataFrame in pandas 3.0.
cancer_dataset['Age'] = cancer_dataset['Age'].fillna(cancer_dataset['Age'].median())  # Fill missing age with median
cancer_dataset['Sex'] = cancer_dataset['Sex'].fillna(cancer_dataset['Sex'].mode()[0])  # Fill missing sex with mode
cancer_dataset['Tumor_Size'] = cancer_dataset['Tumor_Size'].fillna(cancer_dataset['Tumor_Size'].median())  # Fill missing tumor size with median
cancer_dataset['Regional_Node_Examined'] = cancer_dataset['Regional_Node_Examined'].fillna(cancer_dataset['Regional_Node_Examined'].median())  # Fill missing nodes with median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cancer_dataset['Age'].fillna(cancer_dataset['Age'].median(), inplace=True)  # Fill missing age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cancer_dataset['Sex'].fillna(cancer_dataset['Sex'].mode()[0], inplace=True)  # Fill missing sex with mode
The behavior will c

In [None]:
# Re-check missing values per column after imputation
cancer_dataset.isna().sum()

Unnamed: 0,0
Patient_ID,0
Month_of_Birth,0
Age,0
Sex,0
T_Stage,0
N_Stage,0
6th_Stage,0
Differentiated,0
Grade,0
A_Stage,0


In [None]:
# Define function to detect outliers using IQR
def find_outliers_IQR(data, factor=1.5):
    """Return the values of ``data`` that lie outside the IQR fences.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to inspect.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences at
        ``q1 - factor * IQR`` and ``q3 + factor * IQR``.

    Returns
    -------
    pandas.Series
        The entries of ``data`` strictly below the lower fence or strictly
        above the upper fence, keeping their original index labels.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    IQR = q3 - q1
    lower_fence = q1 - factor * IQR
    upper_fence = q3 + factor * IQR
    outliers = data[(data < lower_fence) | (data > upper_fence)]
    return outliers

In [None]:
# Define function to remove outliers using IQR
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    column : str
        Name of the numeric column used to decide which rows to keep.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences at
        ``q1 - factor * IQR`` and ``q3 + factor * IQR``.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the rows whose value is between the fences
        (inclusive). Rows where ``column`` is NaN fail both comparisons and
        are therefore dropped as well.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    IQR = q3 - q1
    lower_bound = q1 - factor * IQR
    upper_bound = q3 + factor * IQR
    within_fences = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    # Explicit copy so later column assignments on the result do not raise
    # SettingWithCopyWarning.
    df_cleaned = df[within_fences].copy()
    return df_cleaned

In [None]:
# Numeric columns that go through IQR-based outlier detection and removal
numeric_cols = [
    'Age',
    'Tumor_Size',
    'Regional_Node_Examined',
    'Regional_Node_Positive',
    'Survival_Months',
]

In [None]:
import plotly.express as px

# Report the IQR outlier count for every numeric column and draw its boxplot
# before any rows are removed.
print("Detecting outliers in numerical columns...")
for col in numeric_cols:
    outliers = find_outliers_IQR(cancer_dataset[col])
    print(f"Number of outliers in {col}: {outliers.size}")

    # Interactive Plotly boxplot of the unfiltered column
    fig = px.box(
        data_frame=cancer_dataset,
        y=col,
        title=f'Boxplot for {col} (Before Removing Outliers)',
    )
    fig.show()

Detecting outliers in numerical columns...
Number of outliers in Age: 4


Number of outliers in Tumor_Size: 221


Number of outliers in Regional_Node_Examined: 73


Number of outliers in Regional_Node_Positive: 344


Number of outliers in Survival_Months: 19


In [None]:
# Filter the dataset one numeric column at a time; each pass works on the
# rows that survived the previous passes.
for col in numeric_cols:
    cancer_dataset = remove_outliers(df=cancer_dataset, column=col)

In [None]:
# Verify removal by checking outliers again.
# Counts can still be non-zero here because the quartiles are recomputed on
# the already-filtered data.
for col in numeric_cols:
    outliers = find_outliers_IQR(cancer_dataset[col])
    print(f"Remaining outliers in {col}: {len(outliers)}")

    # Boxplot after removing outliers (the title previously said "Before")
    fig = px.box(cancer_dataset, y=col, title=f'Boxplot for {col} (After Removing Outliers)')
    fig.show()

print("\nOutlier removal completed successfully.")

Remaining outliers in Age: 0


Remaining outliers in Tumor_Size: 167


Remaining outliers in Regional_Node_Examined: 5


Remaining outliers in Regional_Node_Positive: 160


Remaining outliers in Survival_Months: 6



Outlier removal completed successfully.


In [None]:
# Summary statistics after outlier removal, one row per numeric column
cancer_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Month_of_Birth,3257.0,6.454406,3.474967,1.0,3.0,6.0,9.0,12.0
Age,3257.0,53.946577,8.904536,30.0,47.0,54.0,61.0,69.0
Grade,3257.0,2.115444,0.63007,1.0,2.0,2.0,3.0,4.0
Tumor_Size,3257.0,25.815781,14.445754,1.0,15.0,22.0,32.0,70.0
Regional_Node_Examined,3257.0,13.035923,7.109377,1.0,8.0,13.0,18.0,34.0
Regional_Node_Positive,3257.0,2.40743,1.809946,1.0,1.0,2.0,3.0,8.0
Survival_Months,3257.0,73.058029,21.266127,8.0,58.0,74.0,91.0,107.0
Mortality_Status,3257.0,0.111145,0.31436,0.0,0.0,0.0,0.0,1.0


In [None]:
# Discard every row that still contains at least one missing value
cancer_dataset = cancer_dataset.dropna(axis=0, how='any')

In [None]:
# Cast these three columns to integer dtype
cancer_dataset = cancer_dataset.astype(
    {
        "Age": "int",
        "Tumor_Size": "int",
        "Mortality_Status": "int",
    }
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes, keeping the fitted
# encoder per column in `label_encoders`.
label_encoders = {}
categorical_cols = [
    'Sex',
    'T_Stage',
    'N_Stage',
    '6th_Stage',
    'Differentiated',
    'A_Stage',
    'Estrogen_Status',
    'Progesterone_Status',
]

for col in categorical_cols:
    le = LabelEncoder().fit(cancer_dataset[col])
    cancer_dataset[col] = le.transform(cancer_dataset[col])
    label_encoders[col] = le

In [None]:
# Preview the first rows after cleaning and label encoding
cancer_dataset.head()

Unnamed: 0,Patient_ID,Month_of_Birth,Age,Sex,T_Stage,N_Stage,6th_Stage,Differentiated,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Regional_Node_Positive,Survival_Months,Mortality_Status
0,A0012,12,68,1,0,0,0,1,3,1,4,1,1,24.0,1,60,0
1,A0013,12,50,1,1,1,2,0,2,1,35,1,1,14.0,5,62,0
2,A0014,11,58,1,2,2,4,0,2,1,63,1,1,14.0,7,75,0
3,A0015,3,58,1,0,0,0,1,3,1,18,1,1,2.0,1,84,0
4,A0016,1,47,1,1,0,1,1,3,1,41,1,1,3.0,1,50,0


In [None]:
# Build one table per modelling task by removing the other task's target.
# Classification keeps Mortality_Status as its target.
classification_df = cancer_dataset.drop(columns='Survival_Months')
# Regression keeps Survival_Months as its target.
regression_df = cancer_dataset.drop(columns='Mortality_Status')

In [None]:
# Removing 'Patient_ID' before saving datasets.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# safe to re-run after the column has already been removed.
classification_df = classification_df.drop(columns=['Patient_ID'], errors='ignore')
regression_df = regression_df.drop(columns=['Patient_ID'], errors='ignore')

In [None]:
# Output file names for the two cleaned tables
classification_file = "classification_dataset.csv"
regression_file = "regression_dataset.csv"

# Write both tables to CSV without the DataFrame index
classification_df.to_csv(path_or_buf=classification_file, index=False)
regression_df.to_csv(path_or_buf=regression_file, index=False)

In [None]:
# Report where the two cleaned datasets were written
print(
    "\nData preprocessing complete. Cleaned datasets saved:",
    f"- Classification dataset: {classification_file}",
    f"- Regression dataset: {regression_file}",
    sep="\n",
)


Data preprocessing complete. Cleaned datasets saved:
- Classification dataset: classification_dataset.csv
- Regression dataset: regression_dataset.csv
