In [1]:
import os
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

In [2]:
num_workers = os.cpu_count()
print(f"Number of workers: {num_workers}")

Number of workers: 16


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

#### Load dataset

In [4]:
bc = datasets.load_wine()
X, y = bc.data, bc.target

In [5]:
X = torch.from_numpy(X.astype(np.float32))
y = torch.from_numpy(y.astype(np.int64))
# y = y.view(y.shape[0], 1)
n_samples, n_features = X.shape

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=1
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=1
)

X_train.shape, X_val.shape, X_test.shape

(torch.Size([113, 13]), torch.Size([29, 13]), torch.Size([36, 13]))

### Prepare Dataset and Dataloader

In [7]:
class LRDataset(Dataset):

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.y)

In [8]:
train_ds = LRDataset(X_train, y_train)
val_ds = LRDataset(X_val, y_val)
test_ds = LRDataset(X_test, y_test)

In [9]:
num_workers = 0

train_dl = DataLoader(
    dataset=train_ds, batch_size=32, shuffle=True, num_workers=num_workers
)
val_dl = DataLoader(
    dataset=val_ds, batch_size=32, shuffle=False, num_workers=num_workers
)
test_dl = DataLoader(
    dataset=test_ds, batch_size=32, shuffle=False, num_workers=num_workers
)

### Define MLP

In [10]:
class MLP(nn.Module):
    def __init__(self, n_features, n_classes):
        super(MLP, self).__init__()
        self.linear1 = nn.Linear(n_features, 16)
        self.linear2 = nn.Linear(16, 8)
        self.linear3 = nn.Linear(8, 4)
        self.linear4 = nn.Linear(4, n_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.relu(self.linear1(x))
        out = self.relu(self.linear2(out))
        out = self.relu(self.linear3(out))
        out = self.linear4(out)
        return out

In [11]:
class MLPSequential(nn.Module):
    def __init__(self, n_features, n_classes):
        super(MLPSequential, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 4),
            nn.ReLU(),
            nn.Linear(4, n_classes),
        )

    def forward(self, x):
        return self.layers(x)

In [12]:
class MLPDict(nn.Module):
    def __init__(self, n_features, n_classes):
        super(MLPDict, self).__init__()
        self.layers = nn.ModuleDict(
            {
                "linear1": nn.Linear(n_features, 16),
                "relu1": nn.ReLU(),
                "linear2": nn.Linear(16, 8),
                "relu2": nn.ReLU(),
                "linear3": nn.Linear(8, 4),
                "relu3": nn.ReLU(),
                "linear4": nn.Linear(4, n_classes),  # No activation here
            }
        )

    def forward(self, x):
        x = self.layers["relu1"](self.layers["linear1"](x))
        x = self.layers["relu2"](self.layers["linear2"](x))
        x = self.layers["relu3"](self.layers["linear3"](x))
        x = self.layers["linear4"](x)  # No activation here
        return x

In [13]:
class ImprovedMLP(nn.Module):
    def __init__(self, n_features, n_classes, dropout_prob=0.3):
        super(ImprovedMLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 32),
            nn.BatchNorm1d(32),  # Batch Normalization
            nn.ReLU(),
            nn.Dropout(dropout_prob),  # Dropout
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),  # Batch Normalization
            nn.ReLU(),
            nn.Dropout(dropout_prob),  # Dropout
            nn.Linear(16, 8),
            nn.BatchNorm1d(8),  # Batch Normalization
            nn.ReLU(),
            nn.Dropout(dropout_prob),  # Dropout
            nn.Linear(8, n_classes),  # Output layer (no activation for logits)
        )

    def forward(self, x):
        return self.layers(x)

#### Instantiate Model and Define Loss & Optimizer

In [14]:
model = ImprovedMLP(n_features=n_features, n_classes=3).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.005)

#### Run training

In [15]:
writer = SummaryWriter(log_dir="../../runs/mlp")

In [16]:
verbose = 5
n_epochs = 100
train_losses = []
val_losses = []

In [17]:
for epoch in tqdm(range(n_epochs)):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_dl:
        # Move data to device
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)

        # Backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()

    epoch_loss /= len(train_dl)
    train_losses.append(epoch_loss)

    # Validation loss calculation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for X_val_batch, y_val_batch in val_dl:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = model(X_val_batch)
            val_loss += loss_fn(y_val_pred, y_val_batch).item()

        val_loss /= len(val_dl)
        val_losses.append(val_loss)

    writer.add_scalars("Loss", {"Train": epoch_loss, "Validation": val_loss}, epoch)
    if verbose and (epoch + 1) % verbose == 0:
        print(
            f"Epoch {epoch + 1}/{n_epochs} | Training Loss: {epoch_loss:.4f} | Validation Loss: {val_loss:.4f}"
        )

print(f"Final Training Loss: {epoch_loss:.4f} | Validation Loss: {val_loss:.4f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch 5/100 | Training Loss: 0.8230 | Validation Loss: 0.7369
Epoch 10/100 | Training Loss: 0.6941 | Validation Loss: 0.6809
Epoch 15/100 | Training Loss: 0.6412 | Validation Loss: 0.9672
Epoch 20/100 | Training Loss: 0.5505 | Validation Loss: 0.5552
Epoch 25/100 | Training Loss: 0.4821 | Validation Loss: 0.4061
Epoch 30/100 | Training Loss: 0.3709 | Validation Loss: 0.3464
Epoch 35/100 | Training Loss: 0.4447 | Validation Loss: 0.7510
Epoch 40/100 | Training Loss: 0.4030 | Validation Loss: 1.4630
Epoch 45/100 | Training Loss: 0.3261 | Validation Loss: 0.1573
Epoch 50/100 | Training Loss: 0.3382 | Validation Loss: 0.1091
Epoch 55/100 | Training Loss: 0.2313 | Validation Loss: 0.1240
Epoch 60/100 | Training Loss: 0.2730 | Validation Loss: 0.1185
Epoch 65/100 | Training Loss: 0.2669 | Validation Loss: 0.0640
Epoch 70/100 | Training Loss: 0.2845 | Validation Loss: 0.1252
Epoch 75/100 | Training Loss: 0.2118 | Validation Loss: 0.0498
Epoch 80/100 | Training Loss: 0.3696 | Validation Loss: 

In [18]:
# Plot losses
writer.flush()
writer.close()