# White Box vs Black Box Knowledge Distillation Experiment

This notebook runs the complete experiment pipeline from the GitHub repository.

1.  **Setup**: Clones the repo and installs dependencies.
2.  **Mount Drive**: Connects to Google Drive for **persistent storage**.
3.  **Hugging Face Login**: Authenticates with the Hugging Face Hub (required for Llama-2).
4.  **Data Generation**: Runs the Llama-2 teacher (skipped if the data already exists on Drive).
5.  **Training**: Trains the TinyLlama student (resumes from the last checkpoint on Drive).

**Note**: Select an **A100 GPU** runtime for fast data generation.

In [None]:
# @title 1. Clone Repository & Install Dependencies
import os

# Ensure we start from the root content directory to avoid nesting hell
%cd /content

REPO_URL = "https://github.com/j8ck1632/white-box-vs-black-box-kd-llms.git"
REPO_NAME = "white-box-vs-black-box-kd-llms"

if not os.path.exists(REPO_NAME):
    !git clone {REPO_URL}
    %cd {REPO_NAME}
else:
    %cd {REPO_NAME}
    !git fetch origin
    !git reset --hard origin/main

!pip install -r requirements.txt
!pip install flash-attn --no-build-isolation

In [None]:
# @title 2. Mount Google Drive (Persistence Layer)
# Data and results are written under the mounted Drive folder so they are
# still available if the Colab session disconnects.
from google.colab import drive

drive.mount('/content/drive')

# Experiment folder layout on Google Drive
DRIVE_BASE = "/content/drive/MyDrive/wbvb_experiment"
OFFLINE_DATA_DIR, RESULTS_DIR = (
    os.path.join(DRIVE_BASE, subdir)
    for subdir in ("offline_teacher_data", "results")
)

# Make sure every folder exists (no-op for folders that are already there)
for experiment_dir in (DRIVE_BASE, OFFLINE_DATA_DIR, RESULTS_DIR):
    os.makedirs(experiment_dir, exist_ok=True)
    print(f"✅ Checked/Created directory: {experiment_dir}")

print(f"\nExperiment root: {DRIVE_BASE}")

In [None]:
# @title 3. Hugging Face Login (Required for Llama-2)
# Authenticate this runtime with the Hugging Face Hub before the teacher-data
# cell runs; per the title above, the Llama-2 teacher requires it.
from huggingface_hub import notebook_login
# Prompts for a Hugging Face access token inside the notebook.
notebook_login()

In [None]:
# @title 4. Generate Offline Teacher Data
# Checks if data already exists on Drive to avoid re-running.

os.environ["OFFLINE_BATCH_SIZE"] = "8"
os.environ["WBVB_OFFLINE_DATA_PATH"] = OFFLINE_DATA_DIR # Defined in previous cell
os.environ["PYTHONPATH"] = "."

# Check if parquet file exists
parquet_path = os.path.join(OFFLINE_DATA_DIR, "offline_teacher_data.parquet")

if os.path.exists(parquet_path):
    print(f"✅ Found existing teacher data at {parquet_path}. Skipping generation.")
else:
    print("⚠️ No existing data found. Starting generation (this takes time)...")
    !python src/offline_teacher_data.py

In [None]:
# @title 5. Run Student Training (Resumable)
# Results are saved directly to Drive.
# If interrupted, simply re-run this cell.

os.environ["WBVB_OUTPUT_PATH"] = RESULTS_DIR # Defined in step 2

!python src/train_student.py --seeds "0,1,2,3,4,5,6"