# Base Notebook to Run Experiments

## Notebook Setup

### Download Dependencies

In [None]:
# Install uv, which is used below to sync the project environment and (later) to launch main.py.
!pip install uv
# Clone the experiment code into /gcr and make it the working directory, so that
# relative invocations such as `uv run main.py` resolve inside the repository.
!git clone https://github.com/jorgesilva2407/poc.git /gcr
%cd /gcr
# Sync the cloned project's environment with uv, using the flags exactly as given here.
!uv sync --no-dev --no-cache-dir
# Optuna is imported directly by this notebook, so it is installed with pip as well.
!pip install "optuna[database]"

### Import Dependencies
- Import dependencies
- Mount Google Drive
- Define global variables
    - Create the directories if they do not already exist

In [None]:
import os
import json
import subprocess
from enum import Enum
from pathlib import Path
from dataclasses import dataclass

import optuna
from google.colab import drive

# Mount Google Drive at /content/drive; every directory below lives under that mount.
drive.mount("/content/drive")

# Directories on Google Drive used by the experiments.
DATA_DIR = "/content/drive/MyDrive/poc/data"
OPTUNA_DIR = "/content/drive/MyDrive/poc/optuna"
TENSORBOARD_LOG_DIR = "/content/drive/MyDrive/poc/tensorboard"
ARTIFACTS_LOG_DIR = "/content/drive/MyDrive/poc/artifacts"

# File handed to main.py via --optuna-metric-file and read back by run_model.
METRIC_FILE = "/tmp/metric.json"

# Create each directory (and any missing parents); already-existing ones are left untouched.
for _required_dir in (TENSORBOARD_LOG_DIR, ARTIFACTS_LOG_DIR, DATA_DIR, OPTUNA_DIR):
    os.makedirs(_required_dir, exist_ok=True)

## Optuna Setup

### Define Hyper Parameter Types

In [None]:
class HParamType(Enum):
    """Tags for the kinds of hyperparameter this notebook can search over.

    Each member is the tag that one of the ``HParam`` subclasses passes to the
    base-class constructor.
    """

    LOGUNIFORM = "loguniform"  # used by LogUniform
    UNIFORM = "uniform"  # used by Uniform
    INTEGER = "integer"  # used by Integer
    CATEGORICAL = "categorical"  # used by Categorical


@dataclass
class HParam:
    """Base class for a single entry of a search space.

    Subclasses fix ``type`` in their own constructor and override ``suggest``.
    """

    # Tag identifying which kind of hyperparameter this instance is.
    type: HParamType

    def suggest(self, trial, name: str):
        """Ask ``trial`` for a value of the parameter called ``name``.

        The base implementation always raises ``NotImplementedError``;
        subclasses provide the actual sampling call.
        """
        raise NotImplementedError


@dataclass
class Categorical(HParam):
    """Hyperparameter picked from an explicit list of candidate values."""

    choices: list[int | float | str]

    def __init__(self, choices: list[int | float | str]):
        # Hand-written constructor: callers supply only the candidates, the
        # type tag is always CATEGORICAL.
        self.choices = choices
        super().__init__(type=HParamType.CATEGORICAL)

    def suggest(self, trial, name: str):
        """Return the value ``trial.suggest_categorical`` picks for ``name``."""
        picked = trial.suggest_categorical(name, self.choices)
        return picked


@dataclass
class Uniform(HParam):
    """Float hyperparameter sampled on a linear scale between ``low`` and ``high``."""

    low: float
    high: float

    def __init__(self, low: float, high: float):
        # Hand-written constructor: callers supply only the bounds, the type
        # tag is always UNIFORM.
        super().__init__(HParamType.UNIFORM)
        self.low = low
        self.high = high

    def suggest(self, trial, name: str):
        """Ask ``trial`` for a float value of ``name`` between ``low`` and ``high``.

        Uses ``Trial.suggest_float``; ``Trial.suggest_uniform`` is deprecated
        since Optuna 3.0 and emits a ``FutureWarning`` on every call.
        """
        return trial.suggest_float(name, self.low, self.high)


@dataclass
class LogUniform(HParam):
    """Float hyperparameter sampled on a logarithmic scale between ``low`` and ``high``."""

    low: float
    high: float

    def __init__(self, low: float, high: float):
        # Hand-written constructor: callers supply only the bounds, the type
        # tag is always LOGUNIFORM.
        super().__init__(HParamType.LOGUNIFORM)
        self.low = low
        self.high = high

    def suggest(self, trial, name: str):
        """Ask ``trial`` for a log-scaled float value of ``name``.

        Uses ``Trial.suggest_float(..., log=True)``; ``Trial.suggest_loguniform``
        is deprecated since Optuna 3.0 and emits a ``FutureWarning`` on every call.
        """
        return trial.suggest_float(name, self.low, self.high, log=True)


@dataclass
class Integer(HParam):
    """Integer hyperparameter bounded by ``low`` and ``high``."""

    low: int
    high: int

    def __init__(self, low: int, high: int):
        # Hand-written constructor: callers supply only the bounds, the type
        # tag is always INTEGER.
        self.low = low
        self.high = high
        super().__init__(type=HParamType.INTEGER)

    def suggest(self, trial, name: str):
        """Return the value ``trial.suggest_int`` picks for ``name``."""
        picked = trial.suggest_int(name, self.low, self.high)
        return picked

### Helper Function to get a set of Hyper Parameters to use

In [None]:
def suggest_params(trial, search_space):
    """Sample one value per search-space entry.

    Args:
        trial: Object forwarded to every ``suggest`` call.
        search_space: Mapping of parameter name to an object exposing
            ``suggest(trial, name)``.

    Returns:
        A dict with the same keys, in the same order, as ``search_space``,
        each mapped to the value returned by that entry's ``suggest``.
    """
    return {
        name: hparam.suggest(trial, name) for name, hparam in search_space.items()
    }

### Model Runner

In [None]:
def run_model(
    model_name: str,
    params: dict,
    all_csv: str,
    train_csv: str,
    val_csv: str,
    test_csv: str,
    batch_size: int = 64,
) -> float:
    """Run one configuration through ``main.py`` and return its reported metric.

    Launches ``uv run main.py`` in a subprocess with the fixed logger /
    artifact-saver / experiment-tracker flags, the dataset paths and every
    entry of ``params`` appended as ``<flag> <value>``. The subprocess output
    (stdout and stderr merged) is streamed to the notebook line by line, and
    the result is then read from ``METRIC_FILE``.

    Args:
        model_name: Value passed as ``--model``.
        params: Mapping of command-line flag to value; each pair is appended
            to the command, with the value converted via ``str``.
        all_csv: Value passed as ``--all-interactions-csv``.
        train_csv: Value passed as ``--train-interactions-csv``.
        val_csv: Value passed as ``--validation-interactions-csv``.
        test_csv: Value passed as ``--test-interactions-csv``.
        batch_size: Value passed as ``--batch-size``. Defaults to 64.

    Returns:
        The entry stored under the ``"value"`` key of the JSON document in
        ``METRIC_FILE``.

    Raises:
        subprocess.CalledProcessError: If the subprocess exits with a
            non-zero status.
        FileNotFoundError: If the subprocess exits cleanly but ``METRIC_FILE``
            does not exist afterwards.
    """
    cmd = [
        "uv",
        "run",
        "main.py",
        "--model",
        model_name,
        "--logger",
        "TensorBoard",
        "--tensorboard-log-dir",
        TENSORBOARD_LOG_DIR,
        "--artifact-saver",
        "Local",
        "--optuna-metric-file",
        METRIC_FILE,
        "--local-artifacts-path",
        ARTIFACTS_LOG_DIR,
        "--experiment-tracker",
        "Optuna",
        "--batch-size",
        batch_size,
        "--all-interactions-csv",
        all_csv,
        "--train-interactions-csv",
        train_csv,
        "--validation-interactions-csv",
        val_csv,
        "--test-interactions-csv",
        test_csv,
    ]

    for flag, value in params.items():
        cmd.extend([flag, value])

    # Popen needs strings; paths and numbers are converted here in one place.
    cmd = [str(part) for part in cmd]

    print(" ".join(cmd))

    # Remove any metric left over from an earlier run. Without this, a run that
    # fails before writing its metric would silently return the previous
    # trial's value as its own.
    metric_path = Path(METRIC_FILE)
    metric_path.unlink(missing_ok=True)

    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as process:
        for line in process.stdout:
            print(line, end="", flush=True)

    # Leaving the ``with`` block waits for the process, so returncode is set here.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, cmd)

    with open(metric_path, "r") as f:
        metric = json.load(f)
    return metric["value"]

### Define the Objective of the Optuna Study

In [None]:
def objective_factory(model_name: str, search_space: dict, dataset: str):
    """Build the objective callable for one model / search space / dataset.

    The returned function takes a trial, samples a value for every entry of
    ``search_space``, checks that the dataset directory and its four CSV
    files exist under ``DATA_DIR / dataset``, and returns the result of
    ``run_model`` for the sampled values.

    Raises (from the returned function):
        ValueError: If the dataset directory or any expected CSV is missing.
    """

    def objective(trial):
        sampled = suggest_params(trial, search_space)

        dataset_root = Path(DATA_DIR) / dataset
        split_root = dataset_root / "split"
        all_csv = dataset_root / "all_interactions.csv"
        train_csv = split_root / "train.csv"
        val_csv = split_root / "val.csv"
        test_csv = split_root / "test_neg_samples.csv"

        # Collect every missing path so the error reports all of them at once.
        missing_paths = []
        for candidate in (dataset_root, all_csv, train_csv, val_csv, test_csv):
            if not candidate.exists():
                missing_paths.append(candidate)

        if missing_paths:
            raise ValueError(f"Missing paths: {missing_paths}")

        return run_model(model_name, sampled, all_csv, train_csv, val_csv, test_csv)

    return objective

### Run an Optuna Study

In [None]:
def optimize_model(
    model_name: str, search_space: dict, dataset: str, n_trials: int = 20
):
    """Create (or resume) the Optuna study for ``model_name`` and run trials.

    The study is named ``"<model_name>Optimization"``, maximizes the objective,
    samples with ``TPESampler(seed=42)`` and is stored in the SQLite file
    ``OPTUNA_DIR/<model_name>.db``. Because ``load_if_exists=True``, calling
    this again for the same model continues the stored study instead of
    starting a new one.

    Args:
        model_name: Model identifier; used for the study name, the database
            file name and the objective.
        search_space: Mapping of command-line flag to ``HParam`` instance.
        dataset: Dataset name forwarded to ``objective_factory``.
        n_trials: Number of trials to run in this call. Defaults to 20.

    Returns:
        The ``optuna`` study object after optimization.
    """
    db_path = Path(OPTUNA_DIR) / f"{model_name}.db"
    study = optuna.create_study(
        study_name=f"{model_name}Optimization",
        direction="maximize",
        storage=f"sqlite:///{db_path}",
        load_if_exists=True,
        sampler=optuna.samplers.TPESampler(seed=42),
    )

    objective = objective_factory(model_name, search_space, dataset)
    study.optimize(objective, n_trials=n_trials)

    # U+2705 is the check-mark emoji. It is written as an escape so the message
    # cannot be corrupted by an encoding round-trip of the notebook file.
    print("\n\u2705 Optimization complete!")
    print(f"Best value: {study.best_value}")
    print(f"Best params: {study.best_params}")

    return study

## Run the Study

Update the model, dataset and search space as needed

In [None]:
# Passed to main.py as `--model`; also used to name the Optuna study and its SQLite file.
model = "your_model_name"
# Name of the sub-directory of DATA_DIR that holds all_interactions.csv and the split/ CSVs.
dataset = "your_dataset_name"
# Each key is appended verbatim to the main.py command line, followed by the value
# sampled for it, so keys must be flags that main.py accepts.
# NOTE(review): the --hparamN entries look like placeholders — replace them with real flags.
search_space = {
    "--hparam1": Categorical(choices=["option1", "option2", "option3"]),
    "--hparam2": Uniform(low=0.0, high=1.0),
    "--hparam3": LogUniform(low=1e-5, high=1e-1),
    "--hparam4": Integer(low=1, high=100),
    "--learning-rate": LogUniform(low=1e-5, high=1e-2),
    "--weight-decay": LogUniform(low=1e-6, high=1e-3),
}

In [None]:
# Run the study with the configuration defined in the previous cell.
optimize_model(
    model_name=model,
    search_space=search_space,
    dataset=dataset,
    n_trials=20,
)