## Data loading

In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
file_path= 'diabetes_scale.txt'

# Read the data from the file
with open(file_path, 'r') as file:
    data = file.readlines()

# Create a DataFrame
col_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']
df = pd.DataFrame([line.strip().split() for line in data], columns=["Outcome"] + col_names)

# Split each feature column into two columns: Feature ID and Feature Value
for col in col_names:
    feature_data = df[col].str.split(":", expand=True)
    df[col] = feature_data[1].astype(float)

df["Outcome"] = df["Outcome"].astype(int)
df = df.dropna()

# Extract features and groundtruth columns
X = df.iloc[:,1:].values
y = df.iloc[:,0].values

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val  = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

X_train.shape, X_val.shape, X_test.shape

((607, 8), (76, 8), (76, 8))

In [8]:
# def build_tensor_data(X_train, X_test, X_val, y_train, y_test, y_val):
#     '''
#     Convert our data into 32-bit tensors
#     '''
#     X_test = torch.tensor(X_test).to(dtype=torch.float32)
#     X_val = torch.tensor(X_val, dtype=torch.float32)
#     X_train = torch.tensor(X_train, dtype=torch.float32)

#     # map from [-1, 1] to [0, 1]
#     if y_train.min() < 0:
#         y_train = (y_train + 1) / 2
#         y_test = (y_test + 1) / 2
#         y_val = (y_val + 1) / 2

#     y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
#     y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)
#     y_val = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)
    
#     return X_train, X_test, X_val, y_train, y_test, y_val

# X_train, X_test, X_val, y_train, y_test, y_val = build_tensor_data(X_train, X_test, X_val, y_train, y_test, y_val)


def build_tensor(X, y):
    X = torch.tensor(X, dtype=torch.float32)
    if y.min() < 0:
        y = (y + 1) / 2
    y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)
    return X, y

X_train, y_train = build_tensor(X_train, y_train)
X_test, y_test = build_tensor(X_test, y_test)
X_val, y_val = build_tensor(X_val, y_val)

  X = torch.tensor(X, dtype=torch.float32)
  y = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)


In [10]:
# class Dataset(torch.utils.data.Dataset):
#     'Characterizes a dataset for PyTorch'
#     def __init__(self, features, labels):
#         'Initialization'
#         self.labels = labels
#         self.features = features

#     def __len__(self):
#         'Denotes the total number of samples'
#         return len(self.labels)

#     def __getitem__(self, i):
#         'Generates one sample of data'
#         return self.features[i], self.labels[i]

from torch.utils.data import TensorDataset
    
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset( X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

In [11]:
# create the dataloader class simply by passing the custom dataset object we created
from torch.utils.data import DataLoader

batch_size = 32

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# demo / test -- this returns the first "shuffled" entry in the dataset
for images, labels in train_dataloader:
    pass

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def metric_eval(y_true, y_pred, isPrint=False):
    
    # y_true = y_true.detach().numpy()
    # y_pred = y_pred.detach().numpy()
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    if isPrint:

        print(f'Precision: {precision:.2f}')
        print(f'Recall: {recall:.2f}')
        print(f'F1-score: {f1:.2f}')

        print('Confusion Matrix:')
        print(conf_matrix)
    
    return precision, recall, f1, conf_matrix

The training loop is simple.
First we reset the gradients so that we don't use the gradients from the previous rows.  
Next we predict labels using a batch from the dataloader and evaluate the loss.
Then we backpropagate the loss by calling `loss.backward`, and tell the optimizer to step towards the minimum with that loss.

## MLP architecture

In [13]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size=8, hidden_size1=64, hidden_size2=64, output_size=1):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        
        self.sigmoid =  nn.Sigmoid()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size1)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batchnorm1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.batchnorm2(x)
        x = self.relu2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.sigmoid(x)

        return x
    
model = MLP()
print(model)

MLP(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [26]:
def train_model(model, train_dataloader, val_dataset, loss_fn, optimizer, num_epochs, learning_rate = 1, show_metrics = True):
    
    best_acc = -np.inf
    
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        for inputs, labels in train_dataloader:  # batch-wise
            
            outputs = model(inputs)
            optimizer.zero_grad()
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            break
            
        # Print the average loss for this epoch
        # print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_dataloader)}')
        
        X_val, y_val = val_dataset.tensors[0], val_dataset.tensors[1]
        y_pred = model(X_val)
        y_pred = (y_pred >= 0.5).int()
        # print(y_pred[:10], y_val[:10])
        acc = (y_pred == y_val).float().mean()
        acc = float(acc)
        # print(f'Accuracy {acc}')
        ## Other metrics -
        if show_metrics == True:
            metric_eval(y_val, y_pred)

In [27]:
input_size = 8
hidden_size1 = 300
hidden_size2 = 100
output_size = 1

mlp = MLP(input_size, hidden_size1, hidden_size2, output_size)

learning_rate = 0.01
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(mlp.parameters(),lr=learning_rate)

train_model(mlp, train_dataloader, val_dataset, loss_fn, optimizer, num_epochs=30 , show_metrics = True)

Experimentation with MLP

* increase num of layers
* change activation function
* change learning rate
* change weight initialization

### Use 5-fold CV

In [55]:
from sklearn.model_selection import KFold

def MLPCrossValidationTest(X, y, learning_rate = 1, num_epochs = 100):
    
    # init historical metric arrays
    acc_arr, p_arr, r_arr, f1_arr = [], [], [], []
    
    kf = KFold(n_splits=5)
    
    for train_index, val_index in kf.split(X):
        # print("TRAIN:", train_index, "VAL:", val_index)
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # convert to [0, 1] and into tensor datatype
        X_train, y_train = build_tensor(X_train, y_train) 
        X_val, y_val = build_tensor(X_val, y_val)
        
        # create datasets and dataloader
        train_dataset = TensorDataset(X_train, y_train)
        val_dataset = TensorDataset(X_val, y_val)
        train_dataloader = DataLoader(train_dataset, batch_size = 32, shuffle=True)
        
        ##########
        ########## Adjust layers here
        mlp = MLP(input_size=8, hidden_size1=256, hidden_size2=256, output_size=1)
        
        # model params
        learning_rate = 0.01
        loss_fn = nn.BCELoss()
        optimizer = torch.optim.SGD(mlp.parameters(),lr=learning_rate)

        # train model
        train_model(mlp, train_dataloader, val_dataset, loss_fn, optimizer, num_epochs=num_epochs, learning_rate = learning_rate, show_metrics = False)
        
        # validation
        X_val = val_dataset.tensors[0]
        y_val = val_dataset.tensors[1]
        
        predictions = mlp(X_val)
        predictions = (predictions >= 0.5).int()
        
        test_acc = accuracy_score(predictions, y_val)  # again, val <-> test here
        # print(f'Test acc for learning rate {learning_rate}',test_acc)
        p, r, f1, _ = metric_eval(predictions, y_val)
        
        acc_arr.append(test_acc)
        p_arr.append(p)
        r_arr.append(r)
        f1_arr.append(f1)
        
    return acc_arr, p_arr, r_arr, f1_arr

In [49]:
# nodes are 8, 64, 64, 1

for learn_r in [1, 0.1, 0.01, 0.001]:

    acc_arr, p_arr, r_arr, f1_arr = MLPCrossValidationTest(X, y, learning_rate = learn_r)
    print('acc, precision, recall, f1')
    print(f'lr = {learn_r} : ', [np.mean(x) for x in [acc_arr, p_arr, r_arr, f1_arr] ] )

acc, precision, recall, f1
lr = 1 :  [0.7325984663645869, 0.8721080275419798, 0.7553931073842579, 0.808435408157125]
acc, precision, recall, f1
lr = 0.1 :  [0.7443795747647263, 0.8897922358023186, 0.766344573602986, 0.8192576232094572]
acc, precision, recall, f1
lr = 0.01 :  [0.7457215057511328, 0.8838491444129287, 0.7656109503035514, 0.8183189245968471]
acc, precision, recall, f1
lr = 0.001 :  [0.7377831997211572, 0.8880790012393277, 0.7565072614324857, 0.8148051701262965]


It would seem learning rate has little effect given number of epochs = 100 (which is what we used for SLP)

## Try networks of various widths

I adjusted MLP width settings inside of the MLPCrossValidationTest function

In [52]:
# for 8, 8, 8, 1
lr = 0.01
acc_arr, p_arr, r_arr, f1_arr = MLPCrossValidationTest(X, y, learning_rate = lr)
print('acc, precision, recall, f1')
print(f'lr = {lr} : ', [np.mean(x) for x in [acc_arr, p_arr, r_arr, f1_arr] ] )

acc, precision, recall, f1
lr = 0.01 :  [0.6864761240850471, 0.7752677777903066, 0.7656477258720572, 0.7557518309535288]


In [54]:
# 8, 4, 4, 1
lr = 0.01
acc_arr, p_arr, r_arr, f1_arr = MLPCrossValidationTest(X, y, learning_rate = lr)
print('acc, precision, recall, f1')
print(f'lr = {lr} : ', [np.mean(x) for x in [acc_arr, p_arr, r_arr, f1_arr] ] )

acc, precision, recall, f1
lr = 0.01 :  [0.6205820843499477, 0.8581298454479569, 0.6638457732691339, 0.743868671210518]


In [56]:
# 8, 256, 256, 1
lr = 0.01
acc_arr, p_arr, r_arr, f1_arr = MLPCrossValidationTest(X, y, learning_rate = lr)
print('acc, precision, recall, f1')
print(f'lr = {lr} : ', [np.mean(x) for x in [acc_arr, p_arr, r_arr, f1_arr] ] )

acc, precision, recall, f1
lr = 0.01 :  [0.7681334959916348, 0.8635565052397212, 0.7991670538542021, 0.8284484933339848]


MLP with different weight initialization