In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from ray import tune
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Import training and testing sets
# The dataset directory is kept in one place so it only has to be changed once
# when the notebook is run on another machine.
DATA_DIR = "/home/s2106664/msc_project/training_testing_dataset"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv")
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv")

# 1. Hyperparameter tuning using validation datasets (subset datasets)

## 1.1 Create datasets and dataloader

In [None]:
# Toy subset: the first 10,000 training rows; squeeze() collapses the label
# frame to one dimension when it has a single column.
X_train_toy = X_train.iloc[:10000]
y_train_toy = y_train.iloc[:10000].squeeze()

(10000,)

In [4]:
# Use the GPU when PyTorch reports that CUDA is available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

Using device: cuda


In [22]:
# Wrap the toy subset in tensors (float32 features, int64 labels) and serve it
# in shuffled mini-batches of 64 samples.
X_train_tensor = torch.tensor(X_train_toy.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_toy.to_numpy(), dtype=torch.long)

dataset = TensorDataset(X_train_tensor, y_train_tensor)
loader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

In [None]:
class MLP(nn.Module):
    """Feed-forward network with a single hidden ReLU layer.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units. The outputs are raw logits (no
            final activation is applied). Defaults to 1 so that callers
            building a single-logit model may omit it.
    """

    def __init__(self, input_dim, hidden_dim, output_dim=1):
        super().__init__()
        # Layers stay inside `self.net`, so parameter names remain
        # net.0.* (hidden layer) and net.2.* (output layer).
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the logits for a batch `x` whose last dimension is `input_dim`."""
        return self.net(x)

In [None]:
"""
input_dim = X_train_tensor.shape[1]
hidden_dim = 128
output_dim = 1

model = MLP(input_dim, hidden_dim, output_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
"""

In [None]:
"""
epoch_num = 10

model.train()
for epoch in range(epoch_num):
    total_loss = 0 
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")
"""

In [None]:
def train_fold(X_train, y_train, X_val, y_val, config, device):
    """Train a fresh MLP on one fold and return its validation accuracy.

    Features are standardised with statistics fitted on the training split
    only. A single-logit MLP is then trained with BCEWithLogitsLoss and Adam,
    and scored on the validation split with a 0.5 probability threshold.

    Args:
        X_train: Training features, 2-D array-like of shape (n_train, n_features).
        y_train: Training labels (array, list, pandas Series or single-column
            frame). Flattened to 1-D and cast to float32.
            NOTE(review): assumed to be binary 0/1, as the loss and the
            threshold require — confirm against the dataset.
        X_val: Validation features with the same columns as `X_train`.
        y_val: Validation labels, handled like `y_train`.
        config: Mapping with the keys "batch_size", "hidden_dim", "lr" and
            "epochs".
        device: Device (or device string) the model and batches are moved to.

    Returns:
        float: Fraction of validation samples whose thresholded prediction
        equals the label.
    """
    # Normalise labels to flat float32 arrays up front so pandas objects and
    # (n, 1) column vectors behave the same as plain 1-D arrays.
    y_train = np.asarray(y_train, dtype=np.float32).reshape(-1)
    y_val = np.asarray(y_val, dtype=np.float32).reshape(-1)

    # Fit the scaler on the training split only to avoid leaking validation
    # statistics into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    # BCEWithLogitsLoss needs targets shaped like the (batch, 1) logits.
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32)

    train_loader = DataLoader(
        TensorDataset(X_train_tensor, y_train_tensor),
        batch_size=config["batch_size"],
        shuffle=True,
    )

    # output_dim is passed explicitly: the model emits one logit per sample.
    model = MLP(
        input_dim=X_train.shape[1],
        hidden_dim=config["hidden_dim"],
        output_dim=1,
    ).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

    model.train()
    for _ in range(config["epochs"]):
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            loss = criterion(model(x_batch), y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        logits = model(X_val_tensor.to(device))
        # Flatten to 1-D so the comparison with y_val is element-wise rather
        # than broadcast.
        preds = (torch.sigmoid(logits) >= 0.5).cpu().numpy().reshape(-1)
    return float((preds == y_val).mean())

In [None]:
def train_cv(config):
    """Run 5-fold stratified cross-validation for one hyperparameter config.

    Reads the module-level names `X` (features), `y` (labels) and `device`;
    they must be defined before this function is called. Each fold is trained
    and scored with `train_fold`, and the mean fold accuracy is reported to
    Ray Tune under the metric name "accuracy".

    Args:
        config: Hyperparameter mapping forwarded unchanged to `train_fold`.
    """
    # Convert to numpy first: the fold indices are positional, and indexing a
    # DataFrame with them (X[train_idx]) would look them up as column labels.
    features = np.asarray(X)
    labels = np.asarray(y).reshape(-1)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = [
        train_fold(
            features[train_idx], labels[train_idx],
            features[val_idx], labels[val_idx],
            config, device,
        )
        for train_idx, val_idx in kfold.split(features, labels)
    ]

    avg_acc = float(np.mean(accuracies))

    tune.report(accuracy=avg_acc)  # Report metric back to Ray Tune

# Example config search space
# "lr", "batch_size" and "hidden_dim" are sampled per trial; "epochs" is fixed.
search_space = {
    "lr": tune.loguniform(1e-4, 1e-2),
    "batch_size": tune.choice([32, 64, 128]),
    "hidden_dim": tune.choice([32, 64, 128]),
    "epochs": 10,
}

# NOTE(review): train_cv reads module-level `X` and `y`; no cell visible in this
# notebook defines them — make sure they exist before starting the run.

# Run tuning
analysis = tune.run(
    train_cv,
    config=search_space,
    # Request a GPU only when one was detected: a trial that asks for a GPU
    # cannot be scheduled on a CPU-only machine.
    resources_per_trial={"cpu": 1, "gpu": 1 if device == "cuda" else 0},
    num_samples=20,
)
# `mode` is required here because no default mode was passed to tune.run;
# accuracy is a higher-is-better metric.
print("Best config:", analysis.get_best_config(metric="accuracy", mode="max"))