# VLM Cross-Modal Deep Hashing — Training + Dashboard

Unified notebook: train a cross-modal hashing model (SigLIP2 → binary codes) with a live dashboard.

**Features:**
- GPU training with real-time metrics
- Live dashboard via ngrok (Training Progress, Loss Curves, System Stats)
- Checkpoints saved to Google Drive
- System stats show actual Colab GPU/RAM resources

In [None]:
# Cell 1: GPU Check + Google Drive Mount
# Verifies that PyTorch can see a CUDA device, mounts Google Drive, and creates
# the Drive folders used for checkpoints and monitor data.
import torch

# Stop right away when no CUDA device is visible to PyTorch.
assert torch.cuda.is_available(), "No GPU detected — switch to a GPU runtime."
gpu_name = torch.cuda.get_device_name(0)
# Dividing total_memory by 1024**3 gives the GiB figure printed below (labelled "GB").
vram = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"GPU: {gpu_name} ({vram:.1f} GB)")

# Mount Drive at /content/drive; the directories below live under MyDrive.
from google.colab import drive
drive.mount("/content/drive")

# `-p` creates missing parents and does not fail when the directory already exists.
!mkdir -p /content/drive/MyDrive/vlm_quantization/checkpoints
!mkdir -p /content/drive/MyDrive/vlm_quantization/monitor

In [None]:
# Cell 2: Clone Repo + Install Dependencies + Load .env
# `2>/dev/null || true` silences clone errors and forces a zero exit status, so
# re-running the cell with an existing checkout is a quiet no-op.
# NOTE: this also hides genuine clone failures (e.g. network errors).
!git clone https://github.com/hyunlord/vlm_quantization.git /content/vlm_quantization 2>/dev/null || true
# Make the repo root the working directory for the commands that follow.
%cd /content/vlm_quantization
# Update the checkout (relevant when the clone above was skipped).
!git pull
!pip install -q -r requirements.txt
# pyngrok is installed explicitly here; Cell 5 imports it for the dashboard tunnel.
!pip install -q pyngrok

import os


def load_env_file(path):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Only the first ``=`` separates key from value, so values may
    themselves contain ``=``. A leading ``export `` on the key is dropped and
    one pair of matching surrounding quotes (single or double) is removed
    from the value, so ``TOKEN="abc"`` yields ``abc`` rather than ``"abc"``.
    Lines with an empty key are skipped.

    Args:
        path: Path of the file to read (decoded as UTF-8).

    Returns:
        The number of environment variables that were set.

    Raises:
        OSError: If the file cannot be opened.
    """
    loaded = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, val = line.split("=", 1)
            key = key.strip()
            # Accept shell-style "export KEY=VALUE" lines.
            if key.startswith("export "):
                key = key[len("export "):].strip()
            if not key:
                # An empty variable name is rejected by the OS; skip the line.
                continue
            val = val.strip()
            # Drop one pair of matching quotes wrapped around the value.
            if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
                val = val[1:-1]
            os.environ[key] = val
            loaded += 1
    return loaded


# Optional secrets/config (e.g. NGROK_AUTH_TOKEN, read in Cell 5) kept on Drive.
env_path = "/content/drive/MyDrive/vlm_quantization/.env"
if os.path.exists(env_path):
    load_env_file(env_path)
    print(".env loaded from Google Drive")
else:
    print(f"No .env found at {env_path} — create one if needed")

In [None]:
%%time
# Cell 3: Load COCO Dataset (zip cached on Drive, extracted to local SSD)
import os, shutil

DRIVE_CACHE = "/content/drive/MyDrive/data/coco_zips"
LOCAL_COCO  = "/content/data/coco"

SOURCES = {
    "train2014": {
        "url": "http://images.cocodataset.org/zips/train2014.zip",
        "zip": "train2014.zip",
        "folder": "train2014",
    },
    "val2014": {
        "url": "http://images.cocodataset.org/zips/val2014.zip",
        "zip": "val2014.zip",
        "folder": "val2014",
    },
    "annotations": {
        "url": "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
        "zip": "annotations_trainval2014.zip",
        "folder": "annotations",
    },
}

os.makedirs(DRIVE_CACHE, exist_ok=True)
os.makedirs(LOCAL_COCO, exist_ok=True)

for i, (name, src) in enumerate(SOURCES.items(), 1):
    local_dir = f"{LOCAL_COCO}/{src['folder']}"
    drive_zip = f"{DRIVE_CACHE}/{src['zip']}"
    tmp_zip   = f"/tmp/{src['zip']}"

    if os.path.isdir(local_dir):
        print(f"  [{i}/3] {name} — already extracted locally, skipping")
        continue

    if os.path.isfile(drive_zip):
        print(f"  [{i}/3] {name} — copying cached zip from Drive...")
        shutil.copy2(drive_zip, tmp_zip)
    else:
        print(f"  [{i}/3] {name} — downloading...")
        !wget -q --show-progress {src['url']} -O {tmp_zip}
        print(f"         caching zip to Drive...")
        shutil.copy2(tmp_zip, drive_zip)

    print(f"         extracting to local disk...")
    !unzip -q {tmp_zip} -d {LOCAL_COCO}/
    os.remove(tmp_zip)

# Karpathy split JSON
KARPATHY_JSON = f"{LOCAL_COCO}/dataset_coco.json"
if not os.path.isfile(KARPATHY_JSON):
    KARPATHY_URL = "https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip"
    karpathy_zip = "/tmp/caption_datasets.zip"
    drive_karpathy = f"{DRIVE_CACHE}/caption_datasets.zip"

    if os.path.isfile(drive_karpathy):
        print("  [K] Karpathy JSON — copying cached zip from Drive...")
        shutil.copy2(drive_karpathy, karpathy_zip)
    else:
        print("  [K] Karpathy JSON — downloading...")
        !wget -q --show-progress {KARPATHY_URL} -O {karpathy_zip}
        print("         caching zip to Drive...")
        shutil.copy2(karpathy_zip, drive_karpathy)

    print("         extracting dataset_coco.json...")
    !unzip -q -j {karpathy_zip} "dataset_coco.json" -d {LOCAL_COCO}/
    os.remove(karpathy_zip)
else:
    print("  [K] Karpathy JSON — already present, skipping")

# Verify: every expected folder and the Karpathy JSON must exist locally.
for folder in ("train2014", "val2014", "annotations"):
    folder_path = f"{LOCAL_COCO}/{folder}"
    assert os.path.isdir(folder_path), f"{folder} missing!"
assert os.path.isfile(KARPATHY_JSON), "dataset_coco.json missing!"

import json

# Count images per split label, then drop the parsed JSON again.
with open(KARPATHY_JSON) as fh:
    kdata = json.load(fh)
splits = {}
for entry in kdata["images"]:
    label = entry["split"]
    if label not in splits:
        splits[label] = 0
    splits[label] += 1
del kdata

n_train = len(os.listdir(f"{LOCAL_COCO}/train2014"))
n_val = len(os.listdir(f"{LOCAL_COCO}/val2014"))
print(f"\nCOCO ready: {LOCAL_COCO}")
print(f"  train2014: {n_train:,} images")
print(f"  val2014:   {n_val:,} images")
print(f"  Karpathy splits: {splits}")

In [None]:
# Cell 4: Build Frontend (static export for dashboard)
!curl -fsSL https://deb.nodesource.com/setup_20.x | bash - > /dev/null 2>&1
!apt-get -qq install -y nodejs > /dev/null 2>&1
print(f"Node.js version: ", end="")
!node --version
!cd /content/vlm_quantization/monitor/frontend && npm install --silent 2>/dev/null && npm run build 2>/dev/null
print("Frontend built -> monitor/frontend/out/")

In [None]:
# Cell 5: Start Monitoring Server + ngrok Dashboard
import os
import threading
import time

import uvicorn
from pyngrok import ngrok

# Environment setup - metrics DB and checkpoints on Google Drive
MONITOR_DIR = "/content/drive/MyDrive/vlm_quantization/monitor"
CKPT_DIR = "/content/drive/MyDrive/vlm_quantization/checkpoints"
for drive_dir in (MONITOR_DIR, CKPT_DIR):
    os.makedirs(drive_dir, exist_ok=True)

# Export both locations through the process environment.
os.environ.update(
    {
        "MONITOR_DB_PATH": f"{MONITOR_DIR}/metrics.db",
        "CHECKPOINT_DIR": CKPT_DIR,
    }
)

# Start server in background thread
import socket


def run_server():
    """Run the monitoring app with uvicorn on port 8000; blocks until it exits."""
    uvicorn.run("monitor.server.app:app", host="0.0.0.0", port=8000, log_level="warning")


# daemon=True: the thread must not keep the interpreter alive on shutdown.
server_thread = threading.Thread(target=run_server, daemon=True)
server_thread.start()

# Poll the port instead of sleeping a fixed 3 s: continue as soon as the server
# accepts connections, and notice when it never comes up (e.g. the thread died
# on an import error) instead of silently carrying on.
_deadline = time.monotonic() + 30
server_ready = False
while time.monotonic() < _deadline:
    try:
        with socket.create_connection(("127.0.0.1", 8000), timeout=1):
            server_ready = True
            break
    except OSError:
        if not server_thread.is_alive():
            break  # the server thread exited; waiting longer cannot help
        time.sleep(0.5)

if not server_ready:
    print("WARNING: monitoring server is not listening on port 8000 —")
    print("  check the output above for uvicorn/import errors")

# Setup ngrok tunnel
# The auth token is optional; it is only registered when the variable is set.
token = os.getenv("NGROK_AUTH_TOKEN", "")
if token:
    ngrok.set_auth_token(token)

try:
    tunnel = ngrok.connect(8000)
    dashboard_url = tunnel.public_url
    banner = "=" * 60
    print(banner)
    print(f"  DASHBOARD: {dashboard_url}")
    print(banner)
    print(f"  Metrics DB: {os.environ['MONITOR_DB_PATH']}")
    print(f"  Checkpoints: {CKPT_DIR}")
    print("\n  Training metrics will appear in real-time!")
    print("  System stats show THIS Colab's GPU/RAM.")
except Exception as exc:
    # Translate known ngrok error codes into short hints; anything else is
    # printed as-is. The local URL is always shown afterwards.
    detail = str(exc)
    endpoint_busy = "ERR_NGROK_334" in detail or "endpoint already" in detail.lower()
    if "ERR_NGROK_8012" in detail:
        failure_lines = ["ERROR: ngrok tunnel failed - server not running on port 8000"]
    elif endpoint_busy:
        failure_lines = [
            "ERROR: ngrok endpoint already in use.",
            "  -> Restart runtime or kill existing tunnel",
        ]
    else:
        failure_lines = [f"ngrok error: {exc}"]
    failure_lines.append("\nServer running locally on http://localhost:8000")
    for text in failure_lines:
        print(text)

In [None]:
# Cell 6: Start Training
# Metrics are sent to localhost:8000 -> visible in dashboard via ngrok
# Checkpoints saved to Google Drive
# Runs train.py from the repo root with configs/colab.yaml. PYTHONPATH is set
# to the repo root for this one command only.
!cd /content/vlm_quantization && PYTHONPATH=/content/vlm_quantization python train.py --config configs/colab.yaml

In [None]:
# Cell 7: (Optional) Optuna Hyperparameter Search
# Runs 50 trials with 5 epochs each — takes several hours
# NOTE(review): the trial count is set below via --n-trials; the 5-epoch figure
# is not set in this cell — presumably it comes from optuna_search.py or the
# config; confirm.
# Study storage and the exported best config are written under OPTUNA_DIR on Drive.
OPTUNA_DIR = "/content/drive/MyDrive/vlm_quantization/optuna"
!mkdir -p {OPTUNA_DIR}
# OPTUNA_DIR is an absolute path, so the storage URL expands to
# sqlite:////content/... (four slashes). The repo is updated with `git pull`
# before the search starts.
!cd /content/vlm_quantization && git pull && PYTHONPATH=/content/vlm_quantization python optuna_search.py \
    --config configs/colab.yaml \
    --n-trials 50 \
    --storage sqlite:///{OPTUNA_DIR}/optuna_results.db \
    --export-config {OPTUNA_DIR}/best_config.yaml

In [None]:
# Cell 8: Retrain with Best Optuna Config
OPTUNA_DIR = "/content/drive/MyDrive/vlm_quantization/optuna"
BEST_CONFIG = f"{OPTUNA_DIR}/best_config.yaml"

import os
assert os.path.exists(BEST_CONFIG), f"Best config not found: {BEST_CONFIG}\nRun Cell 7 (Optuna search) first."

print("Best config contents:")
!cat {BEST_CONFIG}
print("\n--- Starting full training with best hyperparameters ---\n")
!cd /content/vlm_quantization && PYTHONPATH=/content/vlm_quantization python train.py --config {BEST_CONFIG}