In [5]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [7]:
print(df.shape)

(569, 33)


* `inplace=True` → makes the change directly in df (doesn’t create a copy).

In [8]:
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [10]:
print(df.shape)

(569, 31)


In [11]:
print(df.iloc[:4, 0])

0    M
1    M
2    M
3    M
Name: diagnosis, dtype: object


* Train Test Split

In [12]:
X_spilt = df.iloc[:, 1:]
y_split = df.iloc[:, 0]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_spilt, y_split, random_state=42, test_size=0.20)

* Simple tutorial on `Scaling` & `LabelEncoding`

* `Scaling:`

In [14]:
X_train1 = [10, 20, 30]
X_test1  = [15, 25]

# fit_transform on training
# fit → finds mean=20, std=8.16
# transform → scales training data
# x_train_scaled = (x_train - mean) / std
X_train_scaled = [-1.22, 0.0, 1.22]

# transform on test
# uses the same mean & std from training
# x_test_scaled = (x_test - mean) / std
X_test_scaled = [-0.61, 0.61]

* `LabelEncoding:`

In [15]:
y_train1 = ["cat", "dog", "dog", "cat"]
y_test1  = ["dog", "cat"]

# fit_transform on training
# fit → learns mapping: {"cat":0, "dog":1}
# transform → converts training labels
y_train_encoded = [0, 1, 1, 0]

# transform on test (uses same mapping)
y_test_encoded = [1, 0]

* Scaling 

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
X_train

array([[-1.44075296, -0.43531947, -1.36208497, ...,  0.9320124 ,
         2.09724217,  1.88645014],
       [ 1.97409619,  1.73302577,  2.09167167, ...,  2.6989469 ,
         1.89116053,  2.49783848],
       [-1.39998202, -1.24962228, -1.34520926, ..., -0.97023893,
         0.59760192,  0.0578942 ],
       ...,
       [ 0.04880192, -0.55500086, -0.06512547, ..., -1.23903365,
        -0.70863864, -1.27145475],
       [-0.03896885,  0.10207345, -0.03137406, ...,  1.05001236,
         0.43432185,  1.21336207],
       [-0.54860557,  0.31327591, -0.60350155, ..., -0.61102866,
        -0.3345212 , -0.84628745]])

In [18]:
y_train

68     B
181    M
63     B
248    B
60     B
      ..
71     B
106    B
270    B
435    M
102    B
Name: diagnosis, Length: 455, dtype: object

* Label Encoding 

In [19]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test  = encoder.transform(y_test)

In [20]:
y_train

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,

In [21]:
y_test

array([0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1])

In [22]:
print(type(X_train), type(y_train))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [23]:
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor  = torch.from_numpy(X_test.astype(np.float32))
y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor  = torch.from_numpy(y_test.astype(np.float32))

In [24]:
X_train_tensor.shape, y_train_tensor.shape

(torch.Size([455, 30]), torch.Size([455]))

* Building Model

In [34]:
from torch import nn

class BinaryClassificationModel(nn.Module):

    def __init__(self, in_features):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(in_features=in_features,
                      out_features=1),
            nn.Sigmoid()
        )

    def forward(self, X):
        y_pred = self.model(X)
        return y_pred

* Training Pipeline  -> Try to follow this for training loop

In [35]:
# set reproducibility
import torch
torch.manual_seed(42)
# define hyper parameters
lr = 0.1 # learning rate
epochs = 50

# create empty loss list to track values
train_loss_values = []
test_loss_values = []
epoch_count = []

# create an instance of the Model
input_features = X_train_tensor.shape[1]
model = BinaryClassificationModel(input_features)

# create loss function
loss_function = nn.BCELoss()

# define optimizer
optimizer = torch.optim.SGD(params=model.parameters(),
                           lr=lr)

for epoch in range(epochs):

    # put model in training mode
    model.train()

    # forward pass
    y_pred = model(X_train_tensor)

    # calculate loss
    loss = loss_function(y_pred, y_train_tensor.view(-1, 1))

    # Clear gradients
    optimizer.zero_grad()

    # apply backpropagation
    loss.backward()

    # update parameters
    optimizer.step()

    # put model in evaluation mode
    model.eval()

    with torch.inference_mode():
        # forward pass on test
        test_pred = model(X_test_tensor)

        # calculate test loss
        test_loss = loss_function(test_pred, y_test_tensor.view(-1,1))

        if epoch % 10 == 0:
            epoch_count.append(epoch)
            train_loss_values.append(loss.detach().numpy())
            test_loss_values.append(test_loss.detach().numpy())
            print(f"Epoch: {epoch} | BCE Train Loss: {loss} | BCE Test Loss: {test_loss} ")

Epoch: 0 | BCE Train Loss: 0.6703537702560425 | BCE Test Loss: 0.5113815069198608 
Epoch: 10 | BCE Train Loss: 0.2561724781990051 | BCE Test Loss: 0.21949444711208344 
Epoch: 20 | BCE Train Loss: 0.19322893023490906 | BCE Test Loss: 0.16543683409690857 
Epoch: 30 | BCE Train Loss: 0.16379277408123016 | BCE Test Loss: 0.14018133282661438 
Epoch: 40 | BCE Train Loss: 0.1463005691766739 | BCE Test Loss: 0.12496013939380646 


In [39]:
# model evaluation
with torch.no_grad():
  y_test_pred = model.forward(X_test_tensor)
  y_test_pred = (y_test_pred > 0.9).float()
  accuracy = (y_test_pred == y_test_tensor).float().mean()
  print(f'Accuracy: {accuracy.item()}')

Accuracy: 0.5538627505302429
