### Neural Network for Dataset 1 (Regression)

In [2002]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,accuracy_score,classification_report
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import pandas

In [2003]:
# CHOOSE DATASET

# 1. Regression
diabetes = datasets.load_diabetes(as_frame=True)
X = diabetes.data.values
y = diabetes.target.values

# 2. Classification
# diabetes = datasets.fetch_openml(name="diabetes", version=1, as_frame=True)
# X = diabetes.data.values
# y = diabetes.target.astype(str).map({'tested_positive': 1, 'tested_negative': 0}).values

In [2004]:
#train test spliting
test_size=0.2
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=test_size, random_state=42)

In [2005]:
# Standardize features
scaler=StandardScaler()
Xtr= scaler.fit_transform(Xtr)
Xte= scaler.transform(Xte)

### NN Architecture
The architecture can be tuned by changing the number of layers, layer size and regularization (dropout). Dropout prevents overfitting by randomly "dropping out" (setting to zero) a fraction of the neurons during training, which forces the network to learn more robust and generalized features. Regularization is determined later together with other hyperparameters.  
  
It can be very hard to determine an optimal architecture but I ended up using the following:  
**Regression**: 3 hidden layers with 64 neurons each.  
**Classification**: 4 hidden layers with 64 neurons in the first three and 32 neurons in the last.

In [2006]:
class MLP(nn.Module):
    def __init__(self, input_size, output_size=1, dropout_prob=0.5):
        super(MLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, 64)
        # self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 64)
        self.out = nn.Linear(64, output_size)
        
        self.dropout = nn.Dropout(p=dropout_prob)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        # x = F.relu(self.fc2(x))
        # x = self.dropout(x)
        
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        
        x = self.out(x)
        return x

### Hyperparameters
- **`num_epochs`** – Number of training passes over the entire dataset. Don't want too many epochs to avoid overfitting to noise.  
- **`lr`** – Learning rate. Step size for updating weights during training. Controls how fast the model learns.  
- **`dropout`** – Fraction of neurons randomly dropped during training to reduce overfitting. In this task I'm using 10% but for toy datasets, higher fraction could be used.  
- **`batch_size`** – Number of samples processed before updating the model. If too much RAM is being used, this value could for example be dropped to 32.  
  
After tuning the model, the following values were chosen for the different datasets:  
**Regression**: `num_epochs`=60, `lr`=0.001, `dropout`=0.1, `batch_size`=64  
**Classification**: `num_epochs`=75, `lr`=0.001, `dropout`=0.1, `batch_size`=64  
  
Important to note is that these values are not necessarily fully optimized. Training the model over and over with the same hyperparameters can give very different performance.

In [2007]:
num_epochs=60 
lr=0.001
dropout=0.1
batch_size=64 

In [2008]:
Xtr = torch.tensor(Xtr, dtype=torch.float32)
ytr = torch.tensor(ytr, dtype=torch.float32)
Xte = torch.tensor(Xte, dtype=torch.float32)
yte = torch.tensor(yte, dtype=torch.float32)

# Wrap Xtr and ytr into a dataset
train_dataset = TensorDataset(Xtr, ytr)

# Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [2009]:
# Model, Loss, Optimizer
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  
# Ignoring this line since I'm not using cuda

model = MLP(input_size=Xtr.shape[1], dropout_prob=dropout)#.to(device)
# criterion = nn.BCEWithLogitsLoss()  # for binary classification
criterion = nn.MSELoss() # for regression
optimizer = optim.Adam(model.parameters(), lr=lr) #can use different optimizer such as AdamW but not necessary

In [2010]:
# Training loop
for epoch in range(num_epochs):
    model.train() #train or evolve
    epoch_loss = 0.0

    for batch_x, batch_y in train_dataloader:
        batch_x = batch_x#.to(device)
        batch_y = batch_y#.to(device)

        logits = model(batch_x)
        loss = criterion(logits, batch_y.view(-1, 1))

        optimizer.zero_grad()
        loss.backward() #directly related to the forward function defined above
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

Epoch [1/60], Loss: 29575.9437
Epoch [2/60], Loss: 30131.5693
Epoch [3/60], Loss: 29604.6950
Epoch [4/60], Loss: 29306.4294
Epoch [5/60], Loss: 28574.7874
Epoch [6/60], Loss: 28728.4570
Epoch [7/60], Loss: 27837.8532
Epoch [8/60], Loss: 27302.4528
Epoch [9/60], Loss: 26152.4359
Epoch [10/60], Loss: 23954.2630
Epoch [11/60], Loss: 21583.5163
Epoch [12/60], Loss: 18228.2782
Epoch [13/60], Loss: 14263.4688
Epoch [14/60], Loss: 10740.4442
Epoch [15/60], Loss: 8089.3803
Epoch [16/60], Loss: 6009.1539
Epoch [17/60], Loss: 5237.9879
Epoch [18/60], Loss: 5130.6556
Epoch [19/60], Loss: 4511.1023
Epoch [20/60], Loss: 4366.9094
Epoch [21/60], Loss: 3985.4761
Epoch [22/60], Loss: 4150.6009
Epoch [23/60], Loss: 3794.2279
Epoch [24/60], Loss: 4077.5017
Epoch [25/60], Loss: 3815.5411
Epoch [26/60], Loss: 3694.2213
Epoch [27/60], Loss: 3857.4960
Epoch [28/60], Loss: 3602.5615
Epoch [29/60], Loss: 3520.0992
Epoch [30/60], Loss: 3633.3056
Epoch [31/60], Loss: 3381.4645
Epoch [32/60], Loss: 3406.5404
Epo

We print `mean_squared_error` and `accuracy_score` as indications of the performance of the model for regression and classification respectively.

In [2011]:
y_pred=model(Xte)
# Performance metric for regression
print(f'MSE:{mean_squared_error(yte.detach().numpy(),y_pred.detach().numpy())}') 

# Performance metric for classification
# print(f'ACC:{accuracy_score(yte.detach().numpy(),y_pred.detach().numpy()>0.5)}') 


MSE:2840.69287109375
