Split data for testing

In [None]:
import polars as pl
import torch
from sklearn.model_selection import train_test_split

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name only accepts CUDA devices and raises when CUDA is
# unavailable, so only query it when a GPU was actually selected.
_device_label = torch.cuda.get_device_name(DEVICE) if DEVICE.type == "cuda" else "cpu"
print(f"Running with device: {_device_label}")


def split_train_test(csv: str) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Load ``csv`` and split it into a train/validation frame and a test frame.

    The split is stratified on the "Class Label" column, assigns 10% of the
    rows to the test frame and uses ``random_state=0``, so repeated calls on
    the same file select the same test rows.

    Returns:
        ``(train_val_df, test_df)``; in each frame the "index" column is
        regenerated after the split.
    """
    indexed_df = pl.read_csv(csv).with_row_index()
    class_labels = indexed_df["Class Label"].to_numpy()

    keep_rows, holdout_rows = train_test_split(
        range(len(indexed_df)),
        test_size=0.1,
        random_state=0,
        stratify=class_labels,
    )

    def _subset(row_ids) -> pl.DataFrame:
        # Pick the selected rows, then swap the file-wide index for one local to the subset.
        picked = indexed_df.filter(pl.col("index").is_in(row_ids))
        return picked.drop("index").with_row_index()

    return _subset(keep_rows), _subset(holdout_rows)

In [None]:
import optuna
from torch import nn

from src.cci.models import LambdaModule


def suggest_mlp(trial: optuna.Trial) -> list[nn.Module]:
    """Sample an MLP architecture from ``trial`` and return its layers in order.

    The stack is: Flatten, Dropout, LazyLinear, ReLU, then ``n_hidden_layers``
    (1 to 3) repetitions of Dropout, Linear, ReLU, then Dropout and a Linear
    layer with a single output, and finally a ``LambdaModule`` that reshapes
    the output with ``view(-1)``. Every dropout rate is drawn from [0.1, 0.5]
    and every layer width from [1, 1000]; parameters are named
    ``dropout_<k>`` / ``linear_<k>`` with ``k`` counting the stages from 1.
    """

    def _dropout(stage: int) -> nn.Dropout:
        return nn.Dropout(trial.suggest_float(f"dropout_{stage}", 0.1, 0.5))

    def _width(stage: int) -> int:
        return trial.suggest_int(f"linear_{stage}", 1, 1000)

    depth = trial.suggest_int("n_hidden_layers", 1, 3)

    # Input stage: the lazy layer infers its input size on the first forward pass.
    stack: list[nn.Module] = [nn.Flatten(), _dropout(1)]
    width = _width(1)
    stack += [nn.LazyLinear(width), nn.ReLU()]

    # Hidden stages 2 .. depth + 1; the width is suggested before the dropout rate.
    for stage in range(2, depth + 2):
        next_width = _width(stage)
        stack += [_dropout(stage), nn.Linear(width, next_width), nn.ReLU()]
        width = next_width

    # Output stage: a single output per sample.
    stack += [_dropout(depth + 2), nn.Linear(width, 1)]

    # Collapse the output to one dimension.
    stack.append(LambdaModule(lambda x: x.view(-1)))
    return stack

In [None]:
import os
from pathlib import Path

import aim
import numpy as np
import optuna
import polars as pl
import torch
from aim import Text
from aim.optuna import AimCallback
from aim.pytorch import track_gradients_dists, track_params_dists
from rich.live import Live
from rich.progress import (
    Progress,
    TextColumn,
    TimeElapsedColumn,
)
from rich.table import Table
from sklearn.model_selection import StratifiedKFold
from torch import nn, optim, tensor
from torch.nn import functional as F
from torch.utils.data import DataLoader

from src.cci.dataset.dataset import TransitionDataset
from src.cci.dataset.transforms import CropSample, RandomSample, ToTensor
from src.cci.metrics import Metrics
from torch.utils import bottleneck

# Length handed to RandomSample / CropSample when the datasets are built
# (presumably the number of samples kept per signal -- confirm against the transforms).
sample_length = 1500
# Name of this hyperparameter search; same value as the experiment_name given to the AimCallback below.
EXPERIMENT_NAME = "MLP_parameter_search"


def _evaluate_into(model: nn.Module, batches, loss_fn: nn.Module, metric: Metrics) -> None:
    """Run one gradient-free pass over ``batches`` and feed predictions and loss into ``metric``."""
    model.eval()
    with torch.no_grad():
        for data in batches:
            sample, label = data["signal"].to(DEVICE), data["label"].to(DEVICE)
            logits = model(sample)
            loss = loss_fn(logits, label.float())
            metric.update(torch.sigmoid(logits), label, loss)


def fit_model(
    model: nn.Module,
    opt: optim.Optimizer,
    loss_fn: nn.BCEWithLogitsLoss,
    val_loss_fn: nn.BCEWithLogitsLoss,
    dataloaders: dict[str, DataLoader],
    metrics: dict[str, Metrics],
    run: aim.Run,
    epochs: int,
):
    """Train ``model`` for ``epochs`` epochs, then evaluate it on the test loader.

    Args:
        model: Network to train; it receives ``data["signal"]`` and must return logits.
        opt: Optimizer stepping the model's parameters.
        loss_fn: Loss used for the training batches.
        val_loss_fn: Loss used for the validation and test batches.
        dataloaders: Loaders under the keys "train", "val" and "test".
        metrics: ``Metrics`` trackers under the same three keys.
        run: Aim run that receives the metrics and the parameter/gradient distributions.
        epochs: Number of training epochs.

    Returns:
        ``(f1, loss)`` taken from ``metrics["val"].best_metrics``.

    The test pass uses the weights from the epoch whose validation loss equals
    the tracked best loss; if no such epoch is seen, the final weights are used.

    Open question from the original author: should the returned value be the
    f1 at the best loss, or a combination such as alpha * f1 - (1 - alpha) * loss?
    """
    table = Table(f"Training model: {type(model).__name__}")
    metric_info = Progress(TextColumn("{task.description}"))
    task_metrics = metric_info.add_task("Metrics")
    progress = Progress(*Progress.get_default_columns(), TimeElapsedColumn())
    task_epoch = progress.add_task("Epochs")
    task_train = progress.add_task("Training")
    task_validation = progress.add_task("Validation")
    table.add_row(progress)
    table.add_row(metric_info)

    # Snapshot of the weights at the best validation loss seen so far.
    best_state = None

    with Live(table):
        plot_cm = True

        for epoch in progress.track(range(1, epochs + 1), description="Epochs", task_id=task_epoch):
            progress.reset(task_train)
            progress.reset(task_validation)

            # Training
            metrics["train"].reset()
            model.train()
            for data in progress.track(dataloaders["train"], description="Training", task_id=task_train):
                sample, label = data["signal"].to(DEVICE), data["label"].to(DEVICE)
                opt.zero_grad()
                logits = model(sample)

                loss = loss_fn(logits, label.float())
                loss.backward()
                opt.step()

                predictions = torch.sigmoid(logits)
                metrics["train"].update(predictions, label, loss)
            metrics["train"].upload_metrics_epoch(run, epoch, plot_cm)

            # Validation
            metrics["val"].reset()
            _evaluate_into(
                model,
                progress.track(dataloaders["val"], description="Validation", task_id=task_validation),
                val_loss_fn,
                metrics["val"],
            )
            metrics["val"].upload_metrics_epoch(run, epoch, plot_cm)

            validation_values = metrics["val"].compute()
            training_values = metrics["train"].compute()
            metric_info.update(
                task_id=task_metrics,
                description=f"\nTraining\n Acc:{training_values['acc']:.3f}\n Loss:{training_values['loss']:.3f}\n"
                f"Validation\n Acc:{validation_values['acc']:.3f}\n Loss:{validation_values['loss']:.3f}\n",
            )

            # Keep a copy of the weights whenever this epoch holds the best validation loss.
            # NOTE(review): assumes Metrics.best_metrics["loss"] is already updated for this
            # epoch at this point -- confirm against the Metrics implementation.
            if validation_values["loss"] == metrics["val"].best_metrics["loss"]:
                # clone() is required: state_dict() tensors share storage with the live parameters.
                best_state = {name: value.detach().clone() for name, value in model.state_dict().items()}

            # Track weights
            track_params_dists(model, run)
            track_gradients_dists(model, run)
            # The flag set here applies to the next epoch's uploads.
            plot_cm = epoch % 10 == 0

        # Testing: evaluate the best checkpoint rather than the last-epoch weights.
        if best_state is not None:
            model.load_state_dict(best_state)
        _evaluate_into(
            model,
            progress.track(dataloaders["test"], description="Testing"),
            val_loss_fn,
            metrics["test"],
        )

        metrics["train"].upload_training_end(run)
        metrics["val"].upload_training_end(run)
        metrics["test"].upload_test(run)

        return metrics["val"].best_metrics["f1"], metrics["val"].best_metrics["loss"]


# Tuner
# Aim callback used both to decorate the objective and as a study callback; it
# records the two objectives under the names "f1" and "loss".
aim_callback = AimCallback(metric_name=["f1", "loss"], as_multirun=True, experiment_name=EXPERIMENT_NAME)


@aim_callback.track_in_aim()
def objective(trial: optuna.Trial) -> tuple[float, float]:
    """Optuna objective: 5-fold cross-validate one sampled MLP configuration.

    Samples the optimizer, learning rate, batch size and (through
    ``suggest_mlp``) the architecture, trains one freshly initialised model
    per stratified fold of the train/validation split with ``fit_model``, and
    averages the per-fold results.

    Returns:
        ``(avg_f1, avg_loss)``: the fold-averaged best validation f1 and loss.

    Raises:
        KeyError: if the ``OOCHA_DIR`` environment variable is not set.
    """
    dataset_csv = "data/clean_df.csv"
    aim_callback.experiment["dataset"] = {
        "samples": sample_length,
        "preprocessing": {},
        "set": dataset_csv,
        "test_set": {"augmentation": "random_shift"},
    }

    optimizer_name = trial.suggest_categorical("Optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16, 32])

    fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    train_val_df, test_df = split_train_test(dataset_csv)
    (labels,) = train_val_df.select("Class Label")
    labels = labels.to_numpy()
    root_dir = Path(os.environ["OOCHA_DIR"])
    f1_scores = []
    loss_scores = []
    for fold_idx, (train_idx, val_idx) in enumerate(fold.split(np.zeros(len(train_val_df)), labels)):
        # Split data
        train_df = train_val_df.filter(pl.col("index").is_in(train_idx))
        val_df = train_val_df.filter(pl.col("index").is_in(val_idx))
        train_dataset = TransitionDataset(
            train_df,
            root_dir,
            transforms=[
                RandomSample(sample_length),
                ToTensor(),
            ],
        )
        val_dataset = TransitionDataset(
            val_df,
            root_dir,
            transforms=[
                CropSample(sample_length),
                ToTensor(),
            ],
        )
        test_dataset = TransitionDataset(
            test_df,
            root_dir,
            transforms=[
                CropSample(sample_length),
                ToTensor(),
            ],
        )
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size,
        )
        dataloaders = {"train": train_loader, "val": val_loader, "test": test_loader}
        metrics = {
            "train": Metrics("train", len(train_dataset), DEVICE, fold_idx),
            "val": Metrics("val", len(val_dataset), DEVICE, fold_idx),
            "test": Metrics("test", len(test_dataset), DEVICE, fold_idx),
        }

        # Build a fresh network for every fold. Reusing one set of layer objects
        # would let later folds start from weights already trained on their own
        # validation samples. Within a trial Optuna returns the already-sampled
        # value for a repeated parameter name, so every fold gets the same architecture.
        model = nn.Sequential(*suggest_mlp(trial)).to(DEVICE)
        if fold_idx == 0:
            aim_callback.experiment.track(Text(f"{model}"), "Model Architecture")
        opt = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
        # Keep the pos_weight buffer on the same device as the logits.
        loss_fn = nn.BCEWithLogitsLoss(
            pos_weight=tensor(train_dataset.get_pos_weight()),
        ).to(DEVICE)
        val_loss_fn = nn.BCEWithLogitsLoss()

        # Train the model
        f1, loss = fit_model(
            model, opt, loss_fn, val_loss_fn, dataloaders, metrics, run=aim_callback.experiment, epochs=1000
        )
        f1_scores.append(f1)
        loss_scores.append(loss)

    avg_f1 = np.average(f1_scores)
    avg_loss = np.average(loss_scores)
    return float(avg_f1), float(avg_loss)


# Two-objective study: the first objective (f1) is maximized and the second
# (loss) is minimized, matching the (avg_f1, avg_loss) order returned by objective.
study = optuna.create_study(directions=["maximize", "minimize"])
study.set_metric_names(["f1", "loss"])


# Run 10 trials; aim_callback is also registered as a study callback.
study.optimize(objective, n_trials=10, callbacks=[aim_callback])