In [1]:
# joepareti54@gmail.com  May 3 2022
# binary classification: cancer vs. no cancer
# using a dataset shipped with scikit-learn
#
# the model class definition is similar to the one used for linear regression,
# but the output of the linear layer is passed through a sigmoid function

In [2]:
import numpy as np
import torch
from torch import nn, optim
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#

In [3]:
# Load the scikit-learn breast cancer dataset and pull out the feature
# matrix (X) and the target vector (y).
bc = datasets.load_breast_cancer()
X = bc.data
y = bc.target

In [4]:
# Report the dataset dimensions: rows of X are samples, columns are features.
n_samples, n_features = X.shape
print('input shape ',X.shape)
print('label shape ',y.shape)
print('n_samples ', n_samples)
print('n_features ', n_features)

input shape  (569, 30)
label shape  (569,)
n_samples  569
n_features  30


In [5]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('input shape trn ', X_train.shape)
print('input shape tst ', X_test.shape)

input shape trn  (455, 30)
input shape tst  (114, 30)


In [6]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so test-set statistics never influence
# the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_TRAIN, X_TEST = scaler.transform(X_train), scaler.transform(X_test)

In [7]:
# Convert the NumPy arrays to PyTorch tensors. Each array is cast to float32
# first because torch.from_numpy keeps the dtype of the source array.
XX_train, XX_test = (
    torch.from_numpy(features.astype(np.float32)) for features in (X_TRAIN, X_TEST)
)
print('shape of train tensor ', XX_train.shape)
print('shape of test tensor ', XX_test.shape)
YY_train, YY_test = (
    torch.from_numpy(labels.astype(np.float32)) for labels in (y_train, y_test)
)

shape of train tensor  torch.Size([455, 30])
shape of test tensor  torch.Size([114, 30])


In [8]:
# For background on how nn.Linear's input and output sizes are chosen, see:
# https://stackoverflow.com/questions/54916135/what-is-the-class-definition-of-nn-linear-in-pytorch
#
class BinaryClassification(nn.Module):
    """A single linear layer whose output is passed through a sigmoid.

    Args:
        input_size: number of input features per sample.
        output_size: number of output units of the linear layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer, then squash the result with a sigmoid."""
        return torch.sigmoid(self.linear(x))
        

In [9]:
# Seed PyTorch's global RNG so that nn.Linear's random weight initialisation,
# and with it the whole training run, is repeatable on Restart & Run All.
torch.manual_seed(42)

# One output unit: the model emits a single sigmoid score per sample.
model = BinaryClassification(XX_train.shape[1], 1)
# NOTE(review): MSELoss on a sigmoid output does train, but nn.BCELoss is the
# conventional loss for binary classification. Left unchanged here so the
# training objective stays the same; consider switching.
loss = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 100000  # number of optimisation steps

In [10]:
# Add a trailing dimension so the 1-D label tensors become (N, 1) columns.
yy_train, yy_test = YY_train.unsqueeze(1), YY_test.unsqueeze(1)

In [11]:
# Full-batch training: every iteration runs the whole training tensor through
# the model, backpropagates the loss and applies one optimizer update.
for epoch in range(epochs):
    output = model(XX_train)
    LOSS = loss(output, yy_train)
    # Clear the gradients left over from the previous iteration before backprop.
    optimizer.zero_grad()
    LOSS.backward()
    optimizer.step()
    # Report progress every 10000 iterations.
    if not epoch % 10000:
        print('epoch', epoch, 'loss', LOSS.item())

epoch 0 loss 0.331028014421463
epoch 10000 loss 0.006876879837363958
epoch 20000 loss 0.0025529274716973305
epoch 30000 loss 0.0022152799647301435
epoch 40000 loss 0.0021987820509821177
epoch 50000 loss 0.0021979310549795628
epoch 60000 loss 0.0021978511940687895
epoch 70000 loss 0.002197830704972148
epoch 80000 loss 0.00219782255589962
epoch 90000 loss 0.0021978176664561033


In [12]:
#test model
# Inference only: no_grad stops autograd from recording a graph for the
# test-set forward pass, so y_hat carries no gradient history.
with torch.no_grad():
    y_hat = model(XX_test)

# Print the model output next to the true label for the first samples; the
# bound guards against test sets with fewer than 20 rows.
for i in range(min(20, len(y_hat))):
    print(y_hat[i].item(), yy_test[i].item())

1.0 1.0
0.0 0.0
0.0 0.0
1.0 1.0
1.0 1.0
0.0 0.0
0.0 0.0
1.7295785418606282e-31 0.0
0.9932057857513428 1.0
1.0 1.0
1.0 1.0
3.909506575849516e-37 0.0
1.0 1.0
1.8312136808408115e-18 0.0
1.0 1.0
2.6416767455771666e-35 0.0
1.0 1.0
1.0 1.0
1.0 1.0
0.0 0.0


In [23]:
# compute accuracy
with torch.no_grad():
    # Round the sigmoid outputs to hard 0/1 predictions and compare them
    # element-wise with the (N, 1) label tensor.
    predicted = y_hat.round()
    is_correct = predicted.eq(yy_test)
    # Fraction of matching predictions over the number of test samples.
    acc = is_correct.sum() / float(len(YY_test))
    print(acc)


tensor(0.9561)
