# LLM Memorization Project — Colab Runner

Questo notebook è un template riutilizzabile per far girare il progetto su Google Colab.

Supporta 2 modalità:
- **Git clone** del repo
- **Upload di uno ZIP** del progetto e unzip

Nota: per evitare di scaricare l’intero split del dataset su Colab, useremo `--streaming` (aggiunto in `run_experiment.py`).

## 1) Verifica runtime Colab (Python/GPU)
Controlla subito versione Python, GPU e spazio disco.

In [None]:
import os, sys, shutil, platform, json, time, pathlib

# Report the interpreter, the OS and the space on the Colab scratch disk.
print("Python:", sys.version)
print("Platform:", platform.platform())
print("Disk usage (/content):", shutil.disk_usage("/content"))

# torch is optional at this stage: any failure while importing or probing it
# is printed instead of aborting the cell.
try:
    import torch

    print("torch:", torch.__version__)
    cuda_ok = torch.cuda.is_available()
    print("CUDA available:", cuda_ok)
    if cuda_ok:
        print("CUDA device:", torch.cuda.get_device_name(0))
except Exception as e:
    print("Torch not available yet:", repr(e))

# List the GPUs through nvidia-smi; a non-zero exit status is treated as
# "tool not available".
ret = os.system("nvidia-smi -L")
if ret != 0:
    print("nvidia-smi non disponibile (ok se runtime CPU)")

## 2) Installazione dipendenze (pip/apt)
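Prepariamo l'ambiente: impostiamo la cache Hugging Face in `/content/hf` (per evitare problemi di spazio) e aggiorniamo `pip`. Le dipendenze di `requirements.txt` vengono installate più avanti, nella sezione 2b, una volta dentro la cartella del repo.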

In [None]:
# Cache HF/Transformers
os.environ["HF_HOME"] = "/content/hf"
os.environ["TRANSFORMERS_CACHE"] = "/content/hf/transformers"
pathlib.Path(os.environ["HF_HOME"]).mkdir(parents=True, exist_ok=True)

# (Opzionale) install deps di sistema. Di solito non serve per questo progetto.
# !apt-get update -qq

CLEAN_REINSTALL = False
if CLEAN_REINSTALL:
    !pip -q uninstall -y transformers datasets accelerate datasketches tqdm numpy

!pip -q install -U pip

# Se hai già caricato/clonato il repo, esegui questo dopo aver fatto %cd nel repo
# (qui lo lasciamo, verrà rieseguito più sotto quando sappiamo la cartella)
print("Done")

## 3) Clona/aggiorna repository (opzionale)
Scegli **UNA** modalità:
- `METHOD = "git"` se hai un repo pubblico/privato accessibile
- `METHOD = "zip"` se vuoi caricare un archivio del progetto

In [None]:
# Scegli: "git" oppure "zip"
METHOD = "zip"

REPO_URL = "https://github.com/<user>/<repo>.git"  # TODO: se usi git
REPO_DIRNAME = "llm_memorization_project"  # cartella attesa dopo clone/unzip

if METHOD == "git":
    %cd /content
    if not pathlib.Path(REPO_DIRNAME).exists():
        !git clone {REPO_URL} {REPO_DIRNAME}
    %cd /content/{REPO_DIRNAME}

elif METHOD == "zip":
    from google.colab import files
    %cd /content
    uploaded = files.upload()  # carica es. llm_memorization_project.zip
    if not uploaded:
        raise RuntimeError("Nessun file caricato")

    zip_name = next(iter(uploaded.keys()))
    print("Uploaded:", zip_name)

    # Estrai in /content
    !unzip -q -o {zip_name} -d /content

    # Prova a trovare la cartella del repo (se lo zip contiene una top-folder)
    candidates = [p for p in pathlib.Path("/content").glob("**/run_experiment.py") if "site-packages" not in str(p)]
    if not candidates:
        raise RuntimeError("Non trovo run_experiment.py dopo unzip; controlla la struttura dello ZIP")

    repo_root = candidates[0].parent
    # Se run_experiment.py è dentro una sottocartella, risali finché trovi requirements.txt
    while repo_root != repo_root.parent and not (repo_root / "requirements.txt").exists():
        repo_root = repo_root.parent

    print("Repo root:", repo_root)
    %cd {repo_root}
else:
    raise ValueError("METHOD deve essere 'git' o 'zip'")

# Sanity check
for needed in ["run_experiment.py", "requirements.txt", "prompts/code_gen_prompt.txt"]:
    assert pathlib.Path(needed).exists(), f"Missing {needed}"
print("Repo OK")

## 4) Mount Google Drive e gestione percorsi
Se vuoi persistire i risultati anche dopo che Colab si resetta, monta Drive.

In [None]:
import pathlib

# Set to True to also keep a copy of the outputs on Google Drive.
MOUNT_DRIVE = False
DRIVE_BASE = "/content/drive/MyDrive"

if MOUNT_DRIVE:
    from google.colab import drive
    drive.mount("/content/drive")

# Local outputs go to <current working directory>/reports.
BASE_DIR = pathlib.Path.cwd()
OUT_DIR = BASE_DIR.joinpath("reports")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# DRIVE_OUT_DIR is only defined when Drive is mounted.
if MOUNT_DRIVE:
    DRIVE_OUT_DIR = pathlib.Path(DRIVE_BASE, "llm_memorization_project_outputs")
    DRIVE_OUT_DIR.mkdir(parents=True, exist_ok=True)
    print("Drive output dir:", DRIVE_OUT_DIR)

for label, location in (("BASE_DIR:", BASE_DIR), ("OUT_DIR:", OUT_DIR)):
    print(label, location)

## 5) Upload dataset/file locali (fallback senza Drive)
Di solito NON serve: il dataset arriva da Hugging Face via `datasets`. Usa questa cella solo se devi caricare file extra.

In [None]:
# Set to True to upload extra files through the browser.
UPLOAD_EXTRA_FILES = False

# Archive extensions that shutil.unpack_archive recognises by default.
ARCHIVE_SUFFIXES = (".zip", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz")

if UPLOAD_EXTRA_FILES:
    import shutil

    from google.colab import files
    uploaded = files.upload()
    print("Uploaded:", list(uploaded))

    # Unpack uploaded zip/tar archives into /content/data_upload. This is done
    # in Python rather than through a shell command so that file names with
    # spaces or parentheses cannot break the command line, and so that tar
    # archives are handled as well as zip files.
    data_dir = pathlib.Path("/content/data_upload")
    for name in uploaded:
        if name.endswith(ARCHIVE_SUFFIXES):
            shutil.unpack_archive(name, data_dir)
    if data_dir.exists():
        total_bytes = sum(p.stat().st_size for p in data_dir.rglob("*") if p.is_file())
        print("/content/data_upload size (bytes):", total_bytes)
else:
    print("Skipping")

## 6) Configurazione variabili (path, hyperparam, flags)
Qui imposti modello, numero campioni, token, seed e flags Colab-friendly.

In [None]:
# Central configuration for the run
CFG = {
    "dataset": "Nan-Do/code-search-net-python",
    "split": "train",
    "n": 5,  # keep it small for a first run on Colab
    "seed": 42,
    "model": "Qwen/Qwen2-0.5B-Instruct",
    "prompt": "prompts/code_gen_prompt.txt",
    "max_new_tokens": 256,
    "perturb": True,
    "streaming": True,
    "streaming_buffer_size": 10_000,
    "output_name": "results_colab.json",
}

# Store the configuration next to the outputs for reproducibility, then echo it.
cfg_json = json.dumps(CFG, indent=2)
cfg_path = OUT_DIR / "colab_cfg.json"
cfg_path.write_text(cfg_json, encoding="utf-8")
print("Saved config:", cfg_path)
print(cfg_json)

## 2b) Installa requirements del progetto
Esegui questa cella DOPO aver clonato o estratto il progetto (sezione 3), con la cartella del repo come directory corrente, e PRIMA di lanciare la pipeline (sezione 7).

In [None]:
!pip -q install -r requirements.txt

# Versioni per riproducibilità
import datasets, transformers
print("datasets:", datasets.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", __import__("accelerate").__version__)
print("datasketches:", __import__("datasketches").__version__)
print("numpy:", __import__("numpy").__version__)

## 7) Esecuzione pipeline/script principali
Esegue `run_experiment.py` e salva risultati in `reports/analysis_<timestamp>/`.

In [None]:
import os
import subprocess

# Build the command line from the central configuration
cmd = [
    "python", "run_experiment.py",
    "--dataset", CFG["dataset"],
    "--split", CFG["split"],
    "--n", str(CFG["n"]),
    "--seed", str(CFG["seed"]),
    "--model", CFG["model"],
    "--prompt", CFG["prompt"],
    "--max-new-tokens", str(CFG["max_new_tokens"]),
    "--output", CFG["output_name"],
]

# On/off switches are passed as --flag / --no-flag (run_experiment.py is
# expected to declare them with argparse's BooleanOptionalAction).
cmd.append("--perturb" if CFG["perturb"] else "--no-perturb")
cmd.append("--streaming" if CFG["streaming"] else "--no-streaming")
cmd += ["--streaming-buffer-size", str(CFG["streaming_buffer_size"])]

print("Running:", " ".join(cmd))

# Log file (handy for debugging)
log_path = OUT_DIR / "colab_run.log"

# A Python child writing to a pipe buffers its stdout in blocks; ask for
# unbuffered output so lines appear while the run is in progress.
child_env = {**os.environ, "PYTHONUNBUFFERED": "1"}

# Tee: merged stdout+stderr goes to both the notebook and the log file.
# Popen is used as a context manager so the pipe is closed and the child is
# reaped even when the loop exits early.
with open(log_path, "w", encoding="utf-8") as lf, subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    encoding="utf-8",
    errors="replace",  # undecodable bytes in the output must not abort the run
    env=child_env,
) as p:
    try:
        for line in p.stdout:
            print(line, end="")
            lf.write(line)
            lf.flush()  # keep the log usable even if the session dies mid-run
    except BaseException:
        # Interrupted or failed while streaming: do not leave the experiment
        # running in the background.
        p.kill()
        raise
    rc = p.wait()

print("Exit code:", rc)
print("Log:", log_path)
if rc != 0:
    raise RuntimeError(f"run_experiment.py failed with code {rc}")

## 8) Logging e output (stdout + file)
Trova l’ultima cartella `reports/analysis_*` e mostra i file prodotti.

In [None]:
def latest_analysis_dir(reports_dir: pathlib.Path) -> pathlib.Path | None:
    if not reports_dir.exists():
        return None
    dirs = [p for p in reports_dir.glob("analysis_*") if p.is_dir()]
    if not dirs:
        return None
    return max(dirs, key=lambda p: p.stat().st_mtime)

# Newest analysis_* folder under the reports directory (None when there is none)
latest = latest_analysis_dir(OUT_DIR)
print("Latest analysis dir:", latest)

if latest:
    # Deliberately not named `files`: other cells import `files` from
    # google.colab, and rebinding that global to a list would break them.
    # The loop variable is likewise not `p`, which the run cell uses for the
    # subprocess handle.
    produced_files = sorted(path for path in latest.rglob("*") if path.is_file())
    for path in produced_files:
        print("-", path.relative_to(BASE_DIR), f"({path.stat().st_size} bytes)")

    report_md = latest / "REPORT.md"
    results_json = latest / CFG["output_name"]
    print("\nREPORT.md:", report_md)
    print("results.json:", results_json)
else:
    print("Nessuna cartella analysis_* trovata")

# Show the last 40 lines of the run log
if log_path.exists():
    print("\n--- Tail log ---")
    print("\n".join(log_path.read_text(encoding="utf-8").splitlines()[-40:]))

## 9) Salvataggio artefatti su Drive (modelli, report, zip)
Se Drive è montato, copia l’ultima cartella di risultati e/o crea uno ZIP in Drive.

In [None]:
import shutil

if not latest:
    print("Nessun run da copiare")
elif not MOUNT_DRIVE:
    print("Drive non montato: setta MOUNT_DRIVE=True nella sezione 4")
else:
    # Mirror the run folder on Drive, replacing any earlier copy of the same run.
    dest = DRIVE_OUT_DIR / latest.name
    if dest.exists():
        shutil.rmtree(dest)
    shutil.copytree(latest, dest)
    print("Copied to:", dest)

    zip_in_drive = DRIVE_OUT_DIR / f"{latest.name}.zip"
    if zip_in_drive.exists():
        zip_in_drive.unlink()

    # make_archive appends ".zip" itself, so it gets the path without the
    # extension. Only the final suffix is dropped: str.replace(".zip", "")
    # would also strip any ".zip" occurring earlier in the path and write the
    # archive somewhere else.
    shutil.make_archive(str(zip_in_drive.with_suffix("")), "zip", root_dir=latest)
    print("ZIP in Drive:", zip_in_drive, f"({zip_in_drive.stat().st_size} bytes)")

## 10) Download risultati su locale
Crea uno ZIP dell’ultima cartella e scaricalo sul tuo PC.

In [None]:
from google.colab import files

# Nothing to download unless a results folder was found earlier.
if not latest:
    raise RuntimeError("Nessun run trovato")

# Build the archive in /content, replacing any stale one with the same name.
zip_path = pathlib.Path("/content") / f"{latest.name}.zip"
if zip_path.exists():
    zip_path.unlink()

# make_archive appends ".zip" itself, so it gets the path without the
# extension. Only the final suffix is dropped: str.replace(".zip", "") would
# also strip any ".zip" occurring earlier in the path.
shutil.make_archive(str(zip_path.with_suffix("")), "zip", root_dir=latest)
print("Created:", zip_path, f"({zip_path.stat().st_size} bytes)")

# Trigger the browser download of the archive.
files.download(str(zip_path))

## 11) Test rapidi / smoke test
Esegui un run ultrarapido per verificare che import/dataset/model funzionino (utile prima di lanciare run lunghi).

In [None]:
# Smoke test: 2 esempi, pochi token, niente perturb
smoke_cmd = [
    "python", "run_experiment.py",
    "--dataset", CFG["dataset"],
    "--split", CFG["split"],
    "--n", "2",
    "--seed", str(CFG["seed"]),
    "--model", CFG["model"],
    "--prompt", CFG["prompt"],
    "--max-new-tokens", "64",
    "--no-perturb",
    "--streaming",
    "--streaming-buffer-size", "2000",
    "--output", "results_smoke.json",
]
print("Smoke:", " ".join(smoke_cmd))

rc = subprocess.call(smoke_cmd)
print("Exit code:", rc)
if rc != 0:
    raise RuntimeError("Smoke test failed")

latest_smoke = latest_analysis_dir(OUT_DIR)
print("Latest after smoke:", latest_smoke)
if latest_smoke:
    print("REPORT:", latest_smoke / "REPORT.md")
    print("JSON:", latest_smoke / "results_smoke.json")