In [None]:
# ============================================
# 📘 MLOps Project 1 — DistilBERT MRPC Tuning
# Week 2+ setup (supports Bonus Task)
# ============================================


from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, AutoConfig
)
from datasets import load_dataset
import numpy as np
import evaluate
import torch
import wandb
import random
from pathlib import Path
import matplotlib.pyplot as plt

# --- Initialize Weights & Biases ---
# Log in once for the whole session; PROJECT_NAME is the W&B project that
# train_model() passes to wandb.init(project=...).
wandb.login()
PROJECT_NAME = "MLOPS_p1_distilbert"

# --- Reproducibility ---
# Seed every random number generator used below with the same value:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch on all CUDA devices.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Set the cuDNN "deterministic" flag and switch off cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# --- Metric ---
# GLUE/MRPC metric object; compute_metrics() calls metric.compute() on it.
metric = evaluate.load("glue", "mrpc")

# --- Short, descriptive run name ---
def make_run_name(cfg):
    """Build a short run label from the three tuned hyperparameters.

    Args:
        cfg: Mapping providing ``learning_rate``, ``weight_decay`` and
            ``warmup_ratio``.

    Returns:
        A string of the form ``lr<lr>_wd<wd>_wr<wr>``.

    Raises:
        KeyError: If one of the three keys is missing from ``cfg``.
    """
    lr, wd, wr = (cfg[key] for key in ("learning_rate", "weight_decay", "warmup_ratio"))
    return f"lr{lr}_wd{wd}_wr{wr}"

# --- Metric computation for Trainer ---
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into metric scores for the Trainer.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of the logits) against ``labels``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# --- Training Function ---
def train_model(config=None):
    """Train DistilBERT on MRPC with the given hyperparameters and log to W&B.

    Args:
        config: Hyperparameter mapping (a dict, or anything ``dict()`` accepts).
            Required keys: ``learning_rate``, ``weight_decay``,
            ``warmup_ratio``, ``classifier_dropout``,
            ``per_device_train_batch_size``, ``gradient_accumulation_steps``,
            ``lr_scheduler_type`` and ``optimizer_type``.
            Optional keys: ``adam_beta1`` (default 0.9) and ``adam_beta2``
            (default 0.999).

    Returns:
        The metrics dict returned by ``trainer.evaluate()`` after training.

    Raises:
        TypeError: If ``config`` is None.
        KeyError: If a required hyperparameter is missing.
    """
    # Fail early with a readable message instead of the opaque
    # "'NoneType' object is not iterable" that dict(None) would raise.
    if config is None:
        raise TypeError("train_model() requires a hyperparameter mapping, got None")

    # Normalize config object
    config_dict = config if isinstance(config, dict) else dict(config)

    # Generate descriptive run name
    run_name = make_run_name(config_dict)

    with wandb.init(project=PROJECT_NAME, name=run_name, config=config_dict):
        # --- Load data ---
        dataset = load_dataset("glue", "mrpc")
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        def preprocess_function(examples):
            # Tokenize the two sentences of each example as a pair.
            return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

        encoded_dataset = dataset.map(preprocess_function, batched=True)

        # --- Model setup ---
        model_config = AutoConfig.from_pretrained("distilbert-base-uncased")
        model_config.num_labels = 2
        # DistilBERT's config has no `hidden_dropout_prob` field, so setting it
        # had no effect on the model. The dropout applied in the sequence
        # classification head is `seq_classif_dropout`.
        model_config.seq_classif_dropout = config_dict["classifier_dropout"]

        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            config=model_config,
        )

        # --- Training Arguments ---
        args = TrainingArguments(
            output_dir="./results",
            # `eval_strategy` is the current name of `evaluation_strategy`;
            # the Week 3 sweep code in this file already uses it.
            eval_strategy="epoch",
            save_strategy="epoch",  # Save each epoch for Bonus Task plots
            save_total_limit=1,
            learning_rate=config_dict["learning_rate"],
            weight_decay=config_dict["weight_decay"],
            per_device_train_batch_size=config_dict["per_device_train_batch_size"],
            gradient_accumulation_steps=config_dict["gradient_accumulation_steps"],
            num_train_epochs=3,
            lr_scheduler_type=config_dict["lr_scheduler_type"],
            warmup_ratio=config_dict["warmup_ratio"],
            optim=config_dict["optimizer_type"],
            # Honor the Adam betas carried in the config; they were previously
            # logged to W&B but never handed to the optimizer.
            adam_beta1=config_dict.get("adam_beta1", 0.9),
            adam_beta2=config_dict.get("adam_beta2", 0.999),
            logging_steps=50,
            report_to="wandb",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
        )

        # --- Trainer ---
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=encoded_dataset["train"],
            eval_dataset=encoded_dataset["validation"],
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
        )

        # --- Train & Evaluate ---
        trainer.train()
        eval_results = trainer.evaluate()

        # Log final results under their raw keys (e.g. `eval_accuracy`)
        wandb.log(eval_results)
        return eval_results

# ============================================
# Sweep Configuration for Week 2
# ============================================

# Hyperparameters held constant for every Week 2 run.
default_config = dict(
    optimizer_type="adamw_torch",
    lr_scheduler_type="linear",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    classifier_dropout=0.1,
)

# Full 4 x 4 x 4 grid over the three tuned hyperparameters
# (learning rate outermost, warmup ratio innermost).
sweep_configs = [
    dict(learning_rate=lr, weight_decay=wd, warmup_ratio=wr)
    for lr in (1e-5, 2e-5, 3e-5, 5e-5)
    for wd in (0.0, 0.01, 0.05, 0.1)
    for wr in (0.0, 0.05, 0.1, 0.2)
]

# Optional: keep a reproducible random subset of 12 grid points for Week 2.
random.seed(42)
sweep_configs = random.sample(sweep_configs, k=12)

# ============================================
# Run Experiments
# ============================================
Path("results").mkdir(exist_ok=True)

for cfg in sweep_configs:
    # Fixed defaults first, then the sampled values on top.
    full_cfg = {**default_config, **cfg}
    train_model(full_cfg)


### Manual Model optimization

In [None]:
import wandb
import pandas as pd

def inspect_run_details(entity, project_name, num_runs=2):
    """Print config, summary and recent history of the first runs in a project.

    Useful for finding out where a run stores its metrics.

    Args:
        entity: W&B entity (user or team) owning the project.
        project_name: Name of the W&B project.
        num_runs: Maximum number of runs to print.

    Returns:
        None. All output goes to stdout.
    """
    api = wandb.Api()
    runs = api.runs(f"{entity}/{project_name}")

    print(f"Total runs in project: {len(runs)}\n")

    for i, run in enumerate(runs):
        if i >= num_runs:
            break

        print("="*80)
        print(f"RUN {i+1}: {run.name}")
        print(f"ID: {run.id}")
        print(f"State: {run.state}")
        print(f"Created: {run.created_at}")
        print("="*80)

        print("\n--- CONFIG ---")
        print(f"Type: {type(run.config)}")
        if hasattr(run.config, '__dict__'):
            print("Config attributes:", dir(run.config))
        try:
            config_dict = dict(run.config) if hasattr(run.config, '__iter__') else {}
            for key, val in config_dict.items():
                if not key.startswith('_'):
                    print(f"  {key}: {val}")
        except Exception as e:
            # Best effort: report the reason instead of hiding it.
            print(f"  Could not extract config: {e}")

        print("\n--- SUMMARY ---")
        print(f"Type: {type(run.summary)}")
        if hasattr(run.summary, '__dict__'):
            print("Summary attributes:", [a for a in dir(run.summary) if not a.startswith('_')])

        try:
            if hasattr(run.summary, '_json_dict'):
                print("\nSummary _json_dict:")
                for key, val in run.summary._json_dict.items():
                    print(f"  {key}: {val}")
        except Exception as e:
            print(f"  Could not read summary _json_dict: {e}")

        try:
            if hasattr(run.summary, 'keys'):
                summary_keys = list(run.summary.keys())
                print(f"\nSummary keys: {summary_keys}")
        except Exception as e:
            print(f"  Could not list summary keys: {e}")

        print("\n--- HISTORY (last 5 logged steps) ---")
        try:
            # history(samples=5) would return 5 rows *sampled* across the whole
            # run; fetch the default history and keep its tail so the rows
            # really are the most recent ones.
            history = run.history().tail(5)
            print(f"History shape: {history.shape}")
            print(f"History columns: {list(history.columns)}")
            print("\nLast 5 rows:")
            print(history)
        except Exception as e:
            print(f"Could not get history: {e}")

        print("\n\n")


def get_all_runs_with_history(entity, project_name):
    """Collect config values and final history metrics for every run.

    For each history column whose name does not start with ``_``, the last
    non-null logged value is reported. Columns are logged on different steps,
    so reading only the final history row would return NaN for every metric
    that was not part of the very last log call.

    Args:
        entity: W&B entity (user or team) owning the project.
        project_name: Name of the W&B project.

    Returns:
        A ``pandas.DataFrame`` with one row per run: ``run_id``, ``run_name``,
        ``state``, one ``config_<key>`` column per public config key, and one
        column per history metric.
    """
    api = wandb.Api()
    runs = api.runs(f"{entity}/{project_name}")

    data = []

    for run in runs:
        row = {
            "run_id": run.id,
            "run_name": run.name,
            "state": run.state,
        }

        try:
            config_dict = dict(run.config) if hasattr(run.config, '__iter__') else {}
            for key, val in config_dict.items():
                if not key.startswith('_'):
                    row[f"config_{key}"] = val
        except Exception as e:
            # Best effort: keep the run in the table, but say what went wrong.
            print(f"Warning: Could not read config for run {run.name}: {e}")

        try:
            history = run.history()
            for col in history.columns:
                if col.startswith('_'):
                    continue
                logged = history[col].dropna()
                if not logged.empty:
                    row[col] = logged.iloc[-1]
        except Exception as e:
            print(f"Warning: Could not get history for run {run.name}: {e}")

        data.append(row)

    return pd.DataFrame(data)


def _first_present(columns, *candidates):
    """Return the first name in ``candidates`` found in ``columns``, else None."""
    return next((name for name in candidates if name in columns), None)


def get_metrics_summary(entity, project_name):
    """Build a per-run table of key hyperparameters and final metrics.

    Eval metrics come from the last history row that has an accuracy value;
    train metrics come from the last row that has a train-loss value.

    Args:
        entity: W&B entity (user or team) owning the project.
        project_name: Name of the W&B project.

    Returns:
        A ``pandas.DataFrame`` with one row per run. The columns ``run_id``,
        ``run_name``, ``state``, the six config columns and the six metric
        columns are present whenever at least one run exists; values that
        could not be found are ``None``.
    """
    api = wandb.Api()
    runs = api.runs(f"{entity}/{project_name}")

    config_keys = (
        "learning_rate", "weight_decay", "warmup_ratio",
        "classifier_dropout", "per_device_train_batch_size", "optimizer_type",
    )
    metric_keys = (
        "eval_accuracy", "eval_f1", "eval_loss",
        "final_train_loss", "final_learning_rate", "final_grad_norm",
    )

    data = []

    for run in runs:
        row = {
            "run_id": run.id,
            "run_name": run.name,
            "state": run.state,
        }
        # Pre-fill so every column exists even for runs without config/metrics;
        # callers sort on `eval_accuracy` and need the column to be there.
        row.update(dict.fromkeys(config_keys + metric_keys))

        try:
            config_dict = dict(run.config) if hasattr(run.config, '__iter__') else {}
            for key in config_keys:
                row[key] = config_dict.get(key, None)
        except Exception as e:
            print(f"Warning: Could not read config for run {run.name}: {e}")

        try:
            history = run.history()
            columns = history.columns

            # The underscore names are what this file logs explicitly.
            # NOTE(review): the slash-separated fallbacks are the names the
            # Hugging Face W&B callback is expected to emit - confirm against
            # this project's actual history columns.
            eval_col = _first_present(columns, "eval_accuracy", "eval/accuracy")
            if eval_col is not None:
                eval_rows = history[history[eval_col].notna()]
                if len(eval_rows) > 0:
                    final_eval = eval_rows.iloc[-1]
                    prefix = eval_col[:-len("accuracy")]  # "eval_" or "eval/"
                    row["eval_accuracy"] = final_eval.get(eval_col, None)
                    row["eval_f1"] = final_eval.get(prefix + "f1", None)
                    row["eval_loss"] = final_eval.get(prefix + "loss", None)

            train_col = _first_present(columns, "train_loss", "train/loss")
            if train_col is not None:
                train_rows = history[history[train_col].notna()]
                if len(train_rows) > 0:
                    final_train = train_rows.iloc[-1]
                    prefix = train_col[:-len("loss")]  # "train_" or "train/"
                    row["final_train_loss"] = final_train.get(train_col, None)
                    row["final_learning_rate"] = final_train.get(prefix + "learning_rate", None)
                    row["final_grad_norm"] = final_train.get(prefix + "grad_norm", None)
        except Exception as e:
            print(f"Warning: Could not process run {run.name}: {e}")

        data.append(row)

    df = pd.DataFrame(data)
    return df


# Script entry point: inspect one run, tabulate all runs, print them ranked by
# validation accuracy and export the table to CSV.
if __name__ == "__main__":
    ENTITY = "janick-steffen-hslu"
    PROJECT = "MLOPS_p1_distilbert"

    print("STEP 1: Deep inspection of run structure")
    print("="*80)
    inspect_run_details(ENTITY, PROJECT, num_runs=1)

    print("\n\nSTEP 2: Get all metrics from history")
    print("="*80)
    df = get_metrics_summary(ENTITY, PROJECT)

    print(f"\nFound {len(df)} runs")
    print(f"\nColumns: {list(df.columns)}")

    print("\n\nAll runs sorted by eval_accuracy:")
    # An empty project, or runs without eval metrics, can yield a frame with no
    # `eval_accuracy` column; sorting would then raise KeyError before the CSV
    # export below is reached.
    if "eval_accuracy" in df.columns:
        df_sorted = df.sort_values("eval_accuracy", ascending=False)
    else:
        print("(no eval_accuracy column found; showing runs unsorted)")
        df_sorted = df

    cols_to_show = [c for c in [
        "run_name", "eval_accuracy", "eval_f1",
        "learning_rate", "weight_decay", "warmup_ratio"
    ] if c in df.columns]

    print(df_sorted[cols_to_show])

    df.to_csv("wandb_runs_summary.csv", index=False)
    print("\n\nExported to wandb_runs_summary.csv")

#### Optimized by Claude

In [None]:
"""
Optimal Hyperparameter Configuration
Based on 12 manual tuning runs in Week 2

Analysis Summary:
- Learning Rate 3e-5: Best average performance (85.13%) and highest max (85.78%)
- Weight Decay 0.1: Good regularization, prevents overfitting
- Warmup Ratio 0.2: Stable training with gradual LR warmup

Best Result: 85.78% validation accuracy, 90.17% F1, 0.3456 loss
"""

# Use this configuration for your next training run.
# The dict is named `optimal_config` because the code below reads it under
# that name; it was previously defined only as `hyperparameter_config`, which
# made the run below fail with a NameError.
optimal_config = {
    "learning_rate": 3e-5,
    "weight_decay": 0.1,
    "warmup_ratio": 0.2,
    "optimizer_type": "adamw_torch",
    "lr_scheduler_type": "linear",
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 1,
    "adam_beta1": 0.9,
    "adam_beta2": 0.999,
    "classifier_dropout": 0.1,
}

# Backward-compatible alias for code that still uses the earlier name.
hyperparameter_config = optimal_config

# Run this configuration
if __name__ == "__main__":
    # Initialize wandb
    wandb.login()

    # Train with optimal config
    print("Training with optimal hyperparameters...")
    print(f"LR: {optimal_config['learning_rate']}")
    print(f"Weight Decay: {optimal_config['weight_decay']}")
    print(f"Warmup Ratio: {optimal_config['warmup_ratio']}")

    results = train_model(optimal_config)
    print(f"\nResults: {results}")


# Candidate values around the Week 2 optimum, intended for further
# fine-tuning (Week 3 or the bonus task).
fine_tuning_ranges = dict(
    learning_rate=[2.5e-5, 3e-5, 3.5e-5, 4e-5],
    weight_decay=[0.08, 0.09, 0.1, 0.11, 0.12],
    warmup_ratio=[0.15, 0.18, 0.2, 0.22, 0.25],
)

# Heading and dict on consecutive lines, exactly as two separate prints would.
print("\nFor Week 3 automatic optimization, use these ranges:", fine_tuning_ranges, sep="\n")

# Week 3 - automated optimal config with Bayesian, grid and random search

In [None]:
# ============================================
# MLOps Project 1 - Week 3: Automatic Hyperparameter Optimization
# Using Wandb Sweeps with Bayesian Optimization
# ============================================

# Run this first in Colab
# !pip install -q transformers datasets evaluate wandb

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, AutoConfig
)
from datasets import load_dataset
import numpy as np
import evaluate
import torch
import wandb
import random
from pathlib import Path

# --- Initialize Weights & Biases ---
# Log in once for the session; PROJECT_NAME is the W&B project in which the
# sweep is created further down (wandb.sweep(..., project=PROJECT_NAME)).
wandb.login()
PROJECT_NAME = "MLOPS_p1_distilbert"

# --- Reproducibility ---
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and set the cuDNN determinism flags.

    Args:
        seed: Value handed to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed once at import time; train_sweep() calls set_seed(42) again inside
# every run.
set_seed(42)

# --- Metric ---
# GLUE/MRPC metric object; compute_metrics() calls metric.compute() on it.
metric = evaluate.load("glue", "mrpc")

def compute_metrics(eval_pred):
    """Score arg-max predictions against the reference labels.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted classes (arg-max
        over the last axis of the logits) and ``labels``.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

# --- Training Function for Sweep ---
def train_sweep():
    """Train DistilBERT on MRPC for a single run of a W&B sweep.

    Meant to be passed as ``function=`` to ``wandb.agent``. Hyperparameters
    are read from ``wandb.config`` after ``wandb.init()``; the run is renamed
    to a label built from learning rate, weight decay and warmup ratio.

    Returns:
        None. Metrics are logged to the active W&B run.
    """
    with wandb.init() as run:
        config = wandb.config

        # Create descriptive run name similar to manual runs
        run.name = (f"lr{config.learning_rate:.0e}_"
                    f"wd{config.weight_decay}_"
                    f"wr{config.warmup_ratio}_"
                    f"sweep_bayesian")

        set_seed(42)

        # --- Load data ---
        dataset = load_dataset("glue", "mrpc")
        tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

        def preprocess_function(examples):
            # Tokenize the two sentences of each example as a pair.
            return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

        encoded_dataset = dataset.map(preprocess_function, batched=True)

        # --- Model setup ---
        model_config = AutoConfig.from_pretrained("distilbert-base-uncased")
        model_config.num_labels = 2
        # DistilBERT's config has no `hidden_dropout_prob` field, so setting it
        # had no effect on the model. The dropout applied in the sequence
        # classification head is `seq_classif_dropout`.
        model_config.seq_classif_dropout = config.classifier_dropout

        # Note: Warning about uninitialized weights is expected and normal
        # The classification head is randomly initialized and will be trained
        model = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            config=model_config,
        )

        # --- Training Arguments ---
        args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch",  # Updated from evaluation_strategy
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=config.per_device_train_batch_size,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            num_train_epochs=3,
            lr_scheduler_type=config.lr_scheduler_type,
            warmup_ratio=config.warmup_ratio,
            optim=config.optimizer_type,
            logging_steps=50,
            report_to="wandb",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            seed=42,
            data_seed=42,
        )

        # --- Trainer ---
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=encoded_dataset["train"],
            eval_dataset=encoded_dataset["validation"],
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
        )

        # --- Train & Evaluate ---
        trainer.train()
        eval_results = trainer.evaluate()

        # The sweep configuration in this file optimizes a metric called
        # `eval_accuracy`. Log the raw evaluate() keys (as train_model does) so
        # that exact name exists in the run, alongside the `final_*` copies.
        wandb.log({
            **eval_results,
            "final_eval_accuracy": eval_results["eval_accuracy"],
            "final_eval_f1": eval_results["eval_f1"],
            "final_eval_loss": eval_results["eval_loss"],
        })

# ============================================
# Sweep Configuration
# ============================================

# Bayesian sweep definition: three uniformly sampled hyperparameters plus the
# values held fixed after the Week 2 findings.
sweep_config = {
    "name": "week3_bayesian_sweep",
    "method": "bayes",
    "metric": {"name": "eval_accuracy", "goal": "maximize"},
    "parameters": {
        # Focus on the 3 most important hyperparameters
        **{
            param: {"distribution": "uniform", "min": low, "max": high}
            for param, (low, high) in (
                ("learning_rate", (2.5e-5, 4e-5)),
                ("weight_decay", (0.08, 0.12)),
                ("warmup_ratio", (0.15, 0.25)),
            )
        },
        # Fixed parameters based on Week 2 findings
        **{
            param: {"value": fixed}
            for param, fixed in (
                ("optimizer_type", "adamw_torch"),
                ("lr_scheduler_type", "linear"),
                ("per_device_train_batch_size", 16),
                ("gradient_accumulation_steps", 1),
                ("classifier_dropout", 0.1),
            )
        },
    },
}

# ============================================
# Initialize and Run Sweep
# ============================================

# Register the sweep with W&B and keep its id for the agent and the result URL.
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)

# Banner: one line per argument, same text as individual prints would emit.
print(
    f"Sweep ID: {sweep_id}",
    "Running 12 sweep runs with Bayesian optimization...",
    "Searching around optimal region:",
    "  Learning Rate: [2.5e-5, 4e-5]",
    "  Weight Decay: [0.08, 0.12]",
    "  Warmup Ratio: [0.15, 0.25]",
    sep="\n",
)

# Run sweep agent (12 runs); each run executes train_sweep().
wandb.agent(sweep_id, function=train_sweep, count=12)

print("\nSweep completed!")
print(f"View results at: https://wandb.ai/{wandb.api.default_entity}/{PROJECT_NAME}/sweeps/{sweep_id}")

In [None]:
# ============================================
# Quick Check - Verify Your Sweep is Ready
# Run this BEFORE the full analysis
# ============================================

import wandb

def quick_check_sweep(entity, project_name, sweep_id):
    """Check that a sweep exists, has finished runs and logged eval accuracy.

    Progress and problems are printed to stdout.

    Args:
        entity: W&B entity (user or team) owning the project.
        project_name: Name of the W&B project.
        sweep_id: Id of the sweep to check.

    Returns:
        True when the sweep was found, has at least one finished run, and the
        first finished run has a non-empty history column whose name contains
        both "eval" and "accuracy"; False otherwise.
    """
    api = wandb.Api()

    print("="*80)
    print("QUICK SWEEP CHECK")
    print("="*80)

    try:
        sweep = api.sweep(f"{entity}/{project_name}/{sweep_id}")
    except Exception as e:
        print("\n❌ ERROR: Could not find sweep!")
        print(f"   {e}")
        print("\nCheck:")
        print(f"  - ENTITY: {entity}")
        print(f"  - PROJECT: {project_name}")
        print(f"  - SWEEP_ID: {sweep_id}")
        return False

    print(f"\n✓ Sweep found: {sweep.name}")
    print(f"  State: {sweep.state}")
    print(f"  URL: {sweep.url}")

    runs = list(sweep.runs)
    print(f"\n✓ Number of runs: {len(runs)}")

    if len(runs) == 0:
        print("\n❌ ERROR: No runs in this sweep!")
        return False

    # Check run states
    finished = sum(1 for r in runs if r.state == "finished")
    running = sum(1 for r in runs if r.state == "running")
    failed = sum(1 for r in runs if r.state == "failed")
    crashed = sum(1 for r in runs if r.state == "crashed")

    print("\nRun states:")
    print(f"  ✓ Finished: {finished}")
    if running > 0:
        print(f"  ⏳ Running: {running}")
    if failed > 0:
        print(f"  ❌ Failed: {failed}")
    if crashed > 0:
        print(f"  ❌ Crashed: {crashed}")

    if finished == 0:
        print("\n⚠️  WARNING: No finished runs yet!")
        print("   Wait for runs to complete before analysis.")
        return False

    # Check first finished run for data
    finished_runs = [r for r in runs if r.state == "finished"]
    first_run = finished_runs[0]

    print(f"\n✓ Checking first finished run: {first_run.name}")

    try:
        history = first_run.history()

        # Check for eval metrics
        eval_cols = [c for c in history.columns if 'eval' in c.lower() and 'accuracy' in c.lower()]

        if len(eval_cols) == 0:
            print("\n❌ ERROR: No eval accuracy column found!")
            print(f"   Available columns: {[c for c in history.columns if not c.startswith('_')][:10]}")
            return False

        acc_col = eval_cols[0]
        print(f"\n✓ Found accuracy column: '{acc_col}'")

        # Check if it has data
        acc_data = history[acc_col].dropna()
        if len(acc_data) == 0:
            print(f"\n❌ ERROR: '{acc_col}' column is empty!")
            return False

        final_acc = acc_data.iloc[-1]
        print(f"  Final accuracy: {final_acc:.4f} ({final_acc*100:.2f}%)")

        # Check for other eval metrics
        f1_cols = [c for c in history.columns if 'eval' in c.lower() and 'f1' in c.lower()]
        if len(f1_cols) > 0:
            f1_col = f1_cols[0]
            # F1 is informational only: an empty column must not fail the
            # whole readiness check through an IndexError on iloc[-1].
            f1_data = history[f1_col].dropna()
            if len(f1_data) > 0:
                final_f1 = f1_data.iloc[-1]
                print(f"  Final F1: {final_f1:.4f} ({final_f1*100:.2f}%)")

    except Exception as e:
        print(f"\n❌ ERROR reading run data: {e}")
        return False

    # Check config
    print("\n✓ Checking config...")
    try:
        config = dict(first_run.config) if hasattr(first_run.config, '__iter__') else {}

        if 'learning_rate' in config:
            print(f"  LR: {config['learning_rate']}")
            print(f"  WD: {config.get('weight_decay', 'N/A')}")
            print(f"  WR: {config.get('warmup_ratio', 'N/A')}")
        else:
            print("  ⚠️  Config not in standard format")
            print("     (Will extract from run names instead)")
    except Exception as e:
        print(f"  ⚠️  Could not read config: {e}")

    print("\n" + "="*80)
    print("✅ SWEEP READY FOR ANALYSIS!")
    print("="*80)
    print("\nYou can now run:")
    print("  python analyze_sweep_results.py")
    print(f"\nOr use SWEEP_ID: {sweep_id}")

    return True


# Script entry point: run the readiness check for one hard-coded sweep.
if __name__ == "__main__":
    # UPDATE THESE VALUES
    ENTITY = "janick-steffen-hslu"
    PROJECT = "MLOPS_p1_distilbert"
    SWEEP_ID = "425e5baw"

    print("Checking sweep readiness...")
    print(f"Entity: {ENTITY}")
    print(f"Project: {PROJECT}")
    print(f"Sweep ID: {SWEEP_ID}\n")

    ready = quick_check_sweep(ENTITY, PROJECT, SWEEP_ID)

    if not ready:
        # Warning sign restored from its mis-decoded form.
        print("\n⚠️  Sweep is not ready for analysis yet.")
        print("   Fix the issues above and try again.")