# 🧪 UNSW-NB15 Full Experiment Grid

**Complete 90-Experiment Pipeline (Experiments 0-89)**

This notebook runs the full experiment grid:
- **2 Tasks**: Binary, Multiclass
- **3 Models**: Logistic Regression, Random Forest, XGBoost
- **3 Strategies**: S0 (None), S1 (Class Weight), S2a (RandomOverSampler)
- **5 Seeds**: 42-46

**Total**: 2 × 3 × 3 × 5 = **90 experiments**

---

In [None]:
# @title 🚀 **Run Full Experiment Grid (0-89)**
# @markdown ### Configuration
# @markdown Modify these settings as needed:

# Git repository holding the experiment code; it is cloned into PROJECT_DIR.
REPO_URL = "https://github.com/StartDust/ML_PAPER_REVIEW.git"  # @param {type:"string"}
# Branch handed to `git clone -b` when the repository is first cloned.
BRANCH = "main"  # @param {type:"string"}
# When True, the local results/metrics directory is deleted before the run.
FORCE_FRESH_RUN = True  # @param {type:"boolean"}
# Minimum number of seconds between periodic rsync copies to Google Drive.
SYNC_INTERVAL_SECONDS = 60  # @param {type:"integer"}

# @markdown ---
# @markdown ### Execution
# @markdown Click **Runtime > Run all** or press **Ctrl+F9** to start.

import os
import sys
import subprocess
import time
import shutil
from pathlib import Path
from datetime import datetime

# Local checkout location, and the Drive folder under which each run gets
# its own timestamped sub-directory.
PROJECT_DIR = "/content/ml_project"
DRIVE_BASE_DIR = "/content/drive/MyDrive/UNSW_Archive"

# ==============================================================================
# STEP 1: Mount Google Drive
# ==============================================================================
print("üìÅ Mounting Google Drive for Results...")
from google.colab import drive
drive.mount('/content/drive')

# ==============================================================================
# STEP 2: Setup Unique Results Directory
# ==============================================================================
# A second-resolution timestamp keeps successive runs in separate folders.
TIMESTAMP = f"{datetime.now():%Y%m%d_%H%M%S}"
RESULTS_DIR = os.path.join(DRIVE_BASE_DIR, f"run_{TIMESTAMP}")
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
print(f"‚úÖ Results will be saved to: {RESULTS_DIR}")

# ==============================================================================
# STEP 3: Clone or Update Repository
# ==============================================================================
print("\nüì• Setting up code from GitHub...")

if os.path.exists(PROJECT_DIR):
    print(f"   Repository exists, pulling latest...")
    !cd {PROJECT_DIR} && git pull
else:
    print(f"   Cloning from {REPO_URL}...")
    !git clone -b {BRANCH} {REPO_URL} {PROJECT_DIR}

os.chdir(PROJECT_DIR)
print(f"   üìÇ Working directory: {os.getcwd()}")

# ==============================================================================
# STEP 4: Install Dependencies
# ==============================================================================
print("\nüì¶ Installing Dependencies...")
# Quiet install of everything listed in requirements.txt. The path is relative,
# so this depends on the current working directory containing that file.
!pip install -q -r requirements.txt

# ==============================================================================
# STEP 5: Optimize Configuration for Colab
# ==============================================================================
print("\n‚öôÔ∏è  Optimizing Configuration for Colab...")
import yaml

# Source config shipped with the repository, and the rewritten copy that is
# handed to main.py for this run.
config_path = Path(PROJECT_DIR) / "configs" / "main.yaml"
temp_config_path = Path(PROJECT_DIR) / "configs" / "colab_optimized.yaml"

# An empty YAML document parses to None; fall back to an empty mapping so the
# lookups below do not fail with a TypeError.
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f) or {}

# Work on a single `experiments` mapping for both reads and writes. A config
# whose section is missing or null gets one created instead of raising
# KeyError/TypeError on the assignments below.
experiments_cfg = config.get('experiments') or {}
config['experiments'] = experiments_cfg

original_jobs = experiments_cfg.get('n_jobs', 'Unknown')
print(f"   ‚ÑπÔ∏è  Original n_jobs: {original_jobs}")

# Force sequential pipeline (prevents fork bomb on Colab)
experiments_cfg['n_jobs'] = 1

# Calculate total experiments; the defaults describe the standard grid and
# are used only when the config omits a key.
n_seeds = experiments_cfg.get('n_seeds', 5)
tasks = experiments_cfg.get('tasks', ['binary', 'multi'])
models = experiments_cfg.get('models', ['lr', 'rf', 'xgb'])
strategies = experiments_cfg.get('strategies', ['s0', 's1', 's2a'])

TOTAL_EXPERIMENTS = len(tasks) * len(models) * len(strategies) * n_seeds

print(f"   üìä Experiment Grid:")
print(f"      - Tasks:      {tasks}")
print(f"      - Models:     {models}")
print(f"      - Strategies: {strategies}")
print(f"      - Seeds:      {n_seeds} (42-{41 + n_seeds})")
print(f"      - Total:      {TOTAL_EXPERIMENTS} experiments (0-{TOTAL_EXPERIMENTS - 1})")

with open(temp_config_path, "w", encoding="utf-8") as f:
    yaml.dump(config, f, default_flow_style=False)

print(f"   ‚úÖ Optimized config saved.")

# ==============================================================================
# STEP 6: Clean Previous Results (if requested)
# ==============================================================================
if FORCE_FRESH_RUN:
    # Report the index range computed from the config rather than a fixed
    # "0-89", which is only correct for the default 90-experiment grid.
    print(f"\nüßπ Cleaning previous results for fresh run (0-{TOTAL_EXPERIMENTS - 1})...")
    # Only the metrics directory is removed; other folders under results/ are kept.
    metrics_dir = Path(PROJECT_DIR) / "results" / "metrics"
    if metrics_dir.exists():
        shutil.rmtree(metrics_dir)
        print(f"   ‚úÖ Cleared previous metrics.")
    else:
        print(f"   ‚ÑπÔ∏è  No previous metrics found.")
else:
    print("\nüîÑ Incremental mode: Keeping existing results.")

# ==============================================================================
# STEP 7: Run Full Experiment Grid with Sync
# ==============================================================================
print("\n" + "=" * 70)
print("üöÄ STARTING FULL EXPERIMENT GRID (0-89)")
print("=" * 70)
print(f"   Config:        {temp_config_path}")
print(f"   Sync Interval: {SYNC_INTERVAL_SECONDS}s")
print(f"   Drive Target:  {RESULTS_DIR}")
print("=" * 70 + "\n")

LOCAL_RESULTS = str(Path(PROJECT_DIR) / "results")

# Run main.py in subprocess
proc = subprocess.Popen(
    [sys.executable, "main.py", "--config", str(temp_config_path)],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1
)

last_sync_time = time.time()
exit_code = 0

try:
    while True:
        return_code = proc.poll()
        
        if proc.stdout:
            line = proc.stdout.readline()
            if line:
                print(line, end='')
        
        current_time = time.time()
        if current_time - last_sync_time >= SYNC_INTERVAL_SECONDS:
            !rsync -avq --include='*/' --include='*.json' --include='*.png' --include='*.csv' --exclude='*' {LOCAL_RESULTS}/ "{RESULTS_DIR}/"
            print(f"üíæ Synced artifacts to Drive...")
            last_sync_time = current_time
        
        if return_code is not None:
            remaining = proc.stdout.read() if proc.stdout else ""
            if remaining:
                print(remaining)
            exit_code = return_code
            break
        
        time.sleep(0.1)

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è  Interrupted by user. Saving progress...")
    proc.terminate()
    proc.wait()
    exit_code = 130

# ==============================================================================
# STEP 8: Final Sync and Cleanup
# ==============================================================================
print("\nüì§ Performing final full sync to Drive...")
!rsync -av {LOCAL_RESULTS}/ "{RESULTS_DIR}/"

if os.path.exists(temp_config_path):
    os.remove(temp_config_path)
    print(f"üóëÔ∏è  Cleaned up temp config.")

# ==============================================================================
# SUMMARY
# ==============================================================================
# Final report: grid size, the subprocess exit status, and where results went.
print("\n" + "=" * 70)
print("üìä EXECUTION SUMMARY")
print("=" * 70)
print(f"   Total Experiments: {TOTAL_EXPERIMENTS} (0-{TOTAL_EXPERIMENTS - 1})")
print(f"   Exit Code:         {exit_code}")
print(f"   Results Saved To:  {RESULTS_DIR}")
print(f"   Completed at:      {datetime.now().isoformat()}")

if exit_code == 0:
    # Use the count computed from the config; a fixed "90" is wrong as soon
    # as the grid in the config differs from the default.
    print(f"\n‚úÖ ALL {TOTAL_EXPERIMENTS} EXPERIMENTS COMPLETED SUCCESSFULLY!")
else:
    print(f"\n‚ö†Ô∏è  Run completed with exit code {exit_code}")

print("=" * 70)

---

## 📁 View Results

After the run completes, your results will be in Google Drive at:
```
My Drive/UNSW_Archive/run_YYYYMMDD_HHMMSS/
├── metrics/          # JSON metrics for each experiment
├── figures/          # Confusion matrices, ROC curves, etc.
├── tables/           # Aggregated CSV summaries
├── learning_curves/  # Training curves (XGBoost only)
└── logs/             # Execution logs
```