In [89]:
import torch
import sklearn
import sklearn.neural_network
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import accuracy_score


In [90]:
# utility function
# load train and test data
def load_train_test_data():
    '''
    This function can be used to load the preprocessed dataset and output the same training and testing data within our 
    different notebooks.

    We decided to do a 85-15 split since our dataset is not very big and we want to maximize the training data while 
    preserving the test data to some extent. <font color='red'>We have a bit more data due to the oversampling now. 
    
    returns: X_train, X_test, y_train, y_test
    '''
    import pandas as pd
    from sklearn.model_selection import train_test_split
    
    # load the preprocessed dataset
    df = pd.read_csv('data/diabetes_dataset_preprocessed.csv')
    
    # split the dataset into features and target
    y = df['Diabetic']
    X = df.drop('Diabetic', axis=1)
    
    # perform the train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    # print information about the datasets
    print('A snippet of our training data:')
    display(X_train.head())
    print("There are {} entries with {} columns in our training data.".format(X_train.shape[0], X_train.shape[1]))
    print("There are {} entries with {} columns in our testing data.".format(X_test.shape[0], X_test.shape[1]))

    return (X_train, X_test, y_train, y_test)

In [91]:
# Define the network architecture
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)   # hidden layer
        self.predict = torch.nn.Linear(n_hidden, n_output)   # output layer
        
    def forward(self, x):
        x = torch.relu(self.hidden(x))      # activation function for hidden layer
        x = self.predict(x)             # linear output
        return x


In [92]:
# Load the data
X_train, X_test, y_train, y_test = load_train_test_data()

A snippet of our training data:


Unnamed: 0,Age,PhysicallyActive,BMI,Sleep,SoundSleep,JunkFood,Stress,BPLevel,Pregnancies,UrinationFreq,Gender_Male,Family_Diabetes_yes,Smoking_yes,Pdiabetes_yes,highBP_yes,Alcohol_yes,RegularMedicine_yes,BMI_high
554,0.0,3.0,0.77493,0.053296,-1.347851,0.0,1.0,1.0,-0.476208,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
46,2.0,3.0,2.295772,0.805294,0.235535,0.0,2.0,1.0,-0.476208,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1
968,2.0,0.0,0.965035,0.053296,0.235535,0.0,3.0,2.0,-0.476208,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1
816,2.0,2.0,0.204615,0.053296,-0.29226,0.0,1.0,2.0,-0.476208,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1
1120,3.0,2.0,0.01451,0.805294,-1.347851,0.0,1.0,1.0,-0.476208,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1


There are 1091 entries with 18 columns in our training data.
There are 193 entries with 18 columns in our testing data.


In [93]:
X_test

Unnamed: 0,Age,PhysicallyActive,BMI,Sleep,SoundSleep,JunkFood,Stress,BPLevel,Pregnancies,UrinationFreq,Gender_Male,Family_Diabetes_yes,Smoking_yes,Pdiabetes_yes,highBP_yes,Alcohol_yes,RegularMedicine_yes,BMI_high
808,0.0,2.0,-0.365701,0.053296,0.235535,1.0,1.0,1.0,-0.476208,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0
186,0.0,2.0,-0.745911,0.805294,0.235535,1.0,1.0,1.0,-0.476208,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
583,2.0,1.0,1.345246,0.053296,-0.292260,0.0,1.0,1.0,-0.476208,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
553,2.0,1.0,1.345246,-1.450701,-0.292260,0.0,1.0,2.0,-0.476208,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1
405,0.0,1.0,-0.555806,-1.450701,-0.292260,0.0,0.0,1.0,-0.476208,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.0,3.0,0.584825,-0.698703,0.235535,1.0,1.0,1.0,1.489871,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
92,0.0,3.0,-0.936016,0.053296,0.235535,1.0,2.0,1.0,-0.476208,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
864,0.0,1.0,-0.936016,0.805294,0.763330,1.0,1.0,1.0,-0.476208,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
296,3.0,2.0,-0.555806,0.053296,0.235535,0.0,2.0,2.0,-0.476208,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0


In [94]:
y_test

808    0
186    1
583    1
553    0
405    0
      ..
123    0
92     0
864    0
296    1
710    0
Name: Diabetic, Length: 193, dtype: int64

In [95]:
# Convert the data to tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.int64)
y_test = torch.tensor(y_test.values, dtype=torch.int64)

In [99]:
# define the network
X_train.shape[0]
net = Net(n_feature=X_train.shape[1], n_hidden=10, n_output=2)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
loss = torch.nn.CrossEntropyLoss()

batch_size = 100
num_epoch = 20
next_epoch = 1

In [100]:
for epoch in range(next_epoch, next_epoch+num_epoch):
        
        # Make an entire pass (an 'epoch') over the training data in batch_size chunks
        for i in range(0, len(X_train)):        
            X = X_train[i]     # Slice out a mini-batch of features
            y = y_train[i]     # Slice out a mini-batch of targets
    
            y_pred = net(X)                 # Make predictions (final-layer activations)
            l = loss(y_pred, y)                # Compute loss with respect to predictions
            
            net.zero_grad()                   # Reset all gradient accumulators to zero (PyTorch thing)
            l.backward()                        # Compute gradient of loss wrt all parameters (backprop!)
            optimizer.step()                    # Use the gradients to take a step with SGD.
            
        print("Epoch %2d: loss on final training batch: %.4f" % (epoch, l.item()))

print("Epoch %2d: loss on test set: %.4f" % (epoch, loss(net(X_test), y_test)))
next_epoch = epoch+1

Epoch  1: loss on final training batch: 0.4131
Epoch  2: loss on final training batch: 0.5267
Epoch  3: loss on final training batch: 0.4480
Epoch  4: loss on final training batch: 0.3082
Epoch  5: loss on final training batch: 0.0726
Epoch  6: loss on final training batch: 0.4424
Epoch  7: loss on final training batch: 0.0438
Epoch  8: loss on final training batch: 0.4352
Epoch  9: loss on final training batch: 0.0653
Epoch 10: loss on final training batch: 0.0989
Epoch 11: loss on final training batch: 0.0896
Epoch 12: loss on final training batch: 0.1676
Epoch 13: loss on final training batch: 0.0395
Epoch 14: loss on final training batch: 0.0403
Epoch 15: loss on final training batch: 0.0205
Epoch 16: loss on final training batch: 0.0139
Epoch 17: loss on final training batch: 0.0009
Epoch 18: loss on final training batch: 0.0286
Epoch 19: loss on final training batch: 0.0099
Epoch 20: loss on final training batch: 0.0033
Epoch 20: loss on test set: 0.2002


In [98]:
# print the accuracy
y_pred = F.softmax(net(X_test)).argmax(dim=1).float()
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))


Accuracy: 0.8601036269430051


  y_pred = F.softmax(net(X_test)).argmax(dim=1).float()
