In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Location of the dataset CSV. The default is the original author's local
# Windows path; set the BREAST_CANCER_DATA_PATH environment variable to load
# the file from another location without editing the notebook.
DATA_PATH = os.environ.get(
    'BREAST_CANCER_DATA_PATH',
    r'C:\Users\hardi\Breast-Cancer-Survival-Prediction\data\data.csv',
)

df = pd.read_csv(DATA_PATH)
df.head()

## Quick EDA

In [None]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count / unique / top / freq) of the text columns,
# transposed so that each column of the dataset becomes one row.
df.describe(include='object').transpose()

In [None]:
# Number of patients per gender (original author's observation: mostly female).
df['Gender'].value_counts()

In [None]:
# Bar chart of how many patients fall into each Patient_Status class,
# with the classes ordered by label.
status_counts = df['Patient_Status'].value_counts().sort_index()
ax = status_counts.plot.bar(figsize=(7, 7), title='Distribution of the Target variable')
ax.set_xlabel('Patient Status')

Since the target variable is imbalanced, the training data should be resampled (by oversampling the minority class or undersampling the majority class) so that the model generalizes better.

In [None]:
# Pie chart of the share of patients in each tumour stage (stages in sorted order).
# ylabel=" " blanks the automatic y-axis label; autopct prints each slice's
# percentage with three decimals.
df.Tumour_Stage.value_counts().sort_index().plot(kind='pie',figsize=(7,7),subplots=True,
                                                title='No of patients with different stages of cancer',
                                                ylabel=" ",legend=False,autopct='%1.3f%%')

In [None]:
# Bar chart of the number of patients per histology type (most frequent first).
df['Histology'].value_counts().plot(kind='bar', subplots=True, legend=False, figsize=(7, 7))

## Preprocessing Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, OrdinalEncoder

In [None]:
# Drop ER and PR status.
# NOTE(review): presumably these columns carry no information in this dataset
# (e.g. a single value) - confirm with df['ER status'].value_counts().
# drop() without inplace returns a new frame instead of mutating the loaded one.
# The cell still rebinds `df`, so re-run the loading cell before re-running it.
df = df.drop(columns=['ER status', 'PR status'])

# Encode Tumour_Stage, Histology, and Surgery_type using OrdinalEncoder.
# Each inner list fixes the integer code of a category by its position.
# NOTE(review): only Tumour_Stage has a natural order; for Histology and
# Surgery_type the codes impose an arbitrary ranking - consider one-hot encoding.
ordinal_columns = ['Tumour_Stage', 'Histology', 'Surgery_type']
ordinal_encoder = OrdinalEncoder(categories=[['I', 'II', 'III', 'IV'],
                                              ['Infiltrating Ductal Carcinoma', 'Infiltrating Lobular Carcinoma', 'Mucinous Carcinoma'],
                                              ['Other', 'Lumpectomy', 'Modified Radical Mastectomy','Simple Mastectomy']])
df[ordinal_columns] = ordinal_encoder.fit_transform(df[ordinal_columns])

# Encode Gender and HER2 status as 0/1 indicator columns (first level dropped).
# The dtype is set explicitly because get_dummies otherwise produces bool
# (pandas >= 2.0) or uint8 (older pandas) columns, and a column filter that
# only accepts int64/float64 would silently leave those features out.
df = pd.get_dummies(df, columns=['Gender', 'HER2 status'], drop_first=True, dtype=np.int64)

# Extract year, month, and day from Date_of_Surgery and Date_of_Last_Visit.
# Each date column is parsed once. The parts are stored as float64, which can
# hold NaN for missing dates and does not depend on the integer width the
# installed pandas version uses for DatetimeIndex.year/.month/.day.
for date_column, prefix in (('Date_of_Surgery', 'Surgery'), ('Date_of_Last_Visit', 'LastVisit')):
    parsed_dates = pd.DatetimeIndex(df[date_column])
    for part in ('Year', 'Month', 'Day'):
        df[f'{prefix}_{part}'] = np.asarray(getattr(parsed_dates, part.lower()), dtype='float64')
    # The raw date string is no longer needed once its parts are extracted.
    df = df.drop(columns=date_column)

In [None]:
# Feature columns handled by the numeric pipeline: every numeric or boolean
# column except the target. Selecting by dtype family instead of the exact
# names 'int64'/'float64' keeps int32/uint8/bool columns (for example the
# indicator columns created by pd.get_dummies) from being left out. This
# matters because ColumnTransformer discards every column it is not given.
numerical_columns = (
    df.drop(columns=["Patient_Status"], errors="ignore")
      .select_dtypes(include=["number", "bool"])
      .columns
      .tolist()
)

# Numeric features: fill missing values with the column mean, then standardise.
numerical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the mode, then one-hot encode.
# NOTE: this pipeline is not registered in `preprocessor` below, so it has no
# effect at the moment; it is kept for when raw categorical columns are added.
categorical_preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_preprocessor, numerical_columns),
    ],
    remainder="drop",  # the default, spelled out: unlisted columns are discarded
)

In [None]:
from sklearn.model_selection import train_test_split

# Rows without a recorded Patient_Status cannot supervise a classifier, so
# remove them; this also keeps a missing label from being treated as an extra
# class. It is a no-op when the target column is complete.
labelled = df.dropna(subset=["Patient_Status"])

X = labelled.drop(columns="Patient_Status")
y = labelled["Patient_Status"]

# Stratify on the target so the class proportions of the imbalanced target are
# preserved in both the training and the test split.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the preprocessing on the training split only and reuse the fitted
# statistics on the test split, so nothing from the test data leaks into training.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Map the class labels to integer codes, learned from the training labels.
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train_raw)
y_test = label_encoder.transform(y_test_raw)

## Evaluating accuracy across all models

In [None]:
!pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

# Fit lazypredict's collection of classifiers on the training split and
# evaluate them on the test split.
# NOTE(review): `model` receives the first value returned by
# LazyClassifier.fit (presumably the per-model score table - confirm); the
# name is rebound to the neural network further down.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
model, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(predictions)

## Neural Network

In [None]:
import torch

# Wrap the preprocessed NumPy arrays as torch tensors. torch.from_numpy keeps
# each array's dtype and shares its memory with the array (no copy is made).
X_train_tensor, X_test_tensor, y_train_tensor, y_test_tensor = map(
    torch.from_numpy, (X_train, X_test, y_train, y_test)
)

In [None]:
import torch.nn as nn
import torch.optim as optim

class PyTorchModel(nn.Module):
    """Fully connected classifier with four hidden ReLU layers and dropout.

    ``forward`` returns raw class scores (logits). This is the input that
    ``nn.CrossEntropyLoss`` expects: the loss applies log-softmax itself, so
    feeding it softmax probabilities would normalise the scores twice and
    flatten the gradients. Call ``predict_proba`` when probabilities are needed.

    Args:
        input_shape: Shape of an input batch as ``(batch, n_features)``; only
            ``input_shape[1]`` is read.
        num_classes: Number of output classes.
        units: Widths of the four hidden layers (the first four entries are used).
        dropout: Dropout probability applied after every hidden layer.
        learning_rate: Accepted for backward compatibility but not used by the
            model; the learning rate is configured on the optimizer.
    """

    def __init__(self, input_shape, num_classes, units=(16, 16, 16, 16), dropout=0.1, learning_rate=8e-3):
        super().__init__()
        self.fc1 = nn.Linear(input_shape[1], units[0])
        self.fc2 = nn.Linear(units[0], units[1])
        self.fc3 = nn.Linear(units[1], units[2])
        self.fc4 = nn.Linear(units[2], units[3])
        self.fc5 = nn.Linear(units[3], num_classes)

        self.relu = nn.ReLU()
        # Only used by predict_proba(); forward() deliberately returns logits.
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the class logits for the batch ``x`` (one row per sample)."""
        # Cast the input to the dtype of the weights, e.g. float64 arrays
        # coming from NumPy are converted to float32.
        x = x.to(self.fc1.weight.dtype)
        for hidden_layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.dropout(self.relu(hidden_layer(x)))
        return self.fc5(x)

    def predict_proba(self, x):
        """Return class probabilities: softmax over the logits, rows sum to 1.

        Dropout stays active unless the model has been put in ``eval()`` mode.
        """
        return self.softmax(self(x))

In [None]:
# Seed torch's random number generator so weight initialisation and dropout
# masks are repeatable from run to run.
torch.manual_seed(42)

LEARNING_RATE = 8e-3

input_shape = (None, X_train_tensor.shape[1])
# One output unit per class known to the label encoder, rather than a
# hard-coded count that can drift out of sync with the data.
num_classes = len(label_encoder.classes_)

# Define the model, optimizer and loss function
model = PyTorchModel(input_shape=input_shape, num_classes=num_classes)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_loop(model, optimizer, criterion, X_train, y_train, X_val, y_val, epochs=10, batch_size=1):
    """Train ``model`` on mini-batches and print validation metrics every epoch.

    Batches are taken from ``X_train``/``y_train`` in order (no shuffling).
    The model is switched to training mode for the optimisation steps and to
    evaluation mode for validation, so layers such as dropout do not distort
    the reported validation loss and accuracy. The mode the model had on
    entry is restored before returning.

    Args:
        model: The ``nn.Module`` to optimise.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(output, target)``.
        X_train, y_train: Training inputs and labels (labels are cast to long).
        X_val, y_val: Validation inputs and labels (labels are cast to long).
        epochs: Number of passes over the training data.
        batch_size: Number of samples per optimisation step; must be >= 1.

    Returns:
        None. One line per epoch is printed with the sample-weighted average
        of the batch losses, the validation loss and the validation accuracy.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The validation labels never change, so convert them once up front.
    y_val = y_val.long()
    n_samples = len(X_train)
    was_training = model.training

    try:
        for epoch in range(epochs):
            # Training phase: dropout and similar layers are active.
            model.train()
            running_loss = 0.0
            for start in range(0, n_samples, batch_size):
                # Extract a batch of data
                batch_X = X_train[start:start + batch_size]
                batch_y = y_train[start:start + batch_size].long()

                # Forward pass
                output = model(batch_X)
                loss = criterion(output, batch_y)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Weight by batch length so a shorter final batch does not
                # skew the epoch average.
                running_loss += loss.item() * len(batch_X)

            # Average over the whole epoch instead of reporting only the last
            # batch; NaN when there is no training data at all.
            train_loss = running_loss / n_samples if n_samples else float("nan")

            # Validation phase: deterministic forward pass without gradients.
            model.eval()
            with torch.no_grad():
                val_output = model(X_val)
                val_loss = criterion(val_output, y_val).item()
                val_accuracy = (val_output.argmax(dim=1) == y_val).float().mean().item()

            # Print the result for an epoch
            print(f"Epoch:{epoch+1}  "
                  f"train_loss:{train_loss:.4f}  "
                  f"val_loss:{val_loss:.4f}  "
                  f"val_acc:{val_accuracy:.4f}")
    finally:
        # Leave the model in whatever mode the caller had set.
        model.train(was_training)




In [None]:
# Run 20 training epochs, validating on the held-out test tensors after each one.
train_loop(
    model,
    optimizer,
    criterion,
    X_train=X_train_tensor,
    y_train=y_train_tensor,
    X_val=X_test_tensor,
    y_val=y_test_tensor,
    epochs=20,
)

The neural network can be improved further by tuning the number of units per layer, the learning rate, the number of layers, etc.