##### 00 — Project Overview & Reproducibility (Entry Notebook)

This is the entry notebook for the repo.

What it does:
1) Forces the working directory to the repo root (so imports work).
2) Validates repo structure (`src/`, `notebooks/`, `PROJECT_STATE.md`).
3) Ensures `src/configs/project.yaml` exists (single source of truth).
4) Creates local folders (`data/*`, `runs/`) safely.
5) Sets reproducibility controls (seed + deterministic).
6) Creates a RUN_ID and writes `runs/<RUN_ID>/meta.json`.

Non-negotiable rules:
- Only real datasets and real results produced by this repo.
- No toy/synthetic/example data.
- Every run must save metadata to `runs/<RUN_ID>/meta.json`.


Imports (single place for shared imports)

In [1]:
# [CELL 00-01] Imports (keep shared imports here; avoid repeating)
import os
import sys
import json
from pathlib import Path

import yaml

Bootstrap: force repo root + sys.path

In [2]:
# [CELL 00-02] Bootstrap: locate repo root reliably (Windows-safe)

import os
import sys
from pathlib import Path

# Remember where the kernel started so the change of working directory is visible in the output.
CWD = Path(os.getcwd()).resolve()
print("Initial CWD:", CWD)

def find_repo_root(start: Path) -> Path:
    """
    Locate the repository root by searching upward from ``start``.

    Markers are tried in priority order across ``start`` and all of its
    ancestors: a ``PROJECT_STATE.md`` anywhere on the path wins over a
    ``.git`` entry, even when the ``.git`` entry is closer to ``start``.

    If neither marker is found, a machine-specific legacy checkout location
    is used as a last resort, but only when that directory actually exists.

    Args:
        start: Directory to begin the upward search from.

    Returns:
        The directory nearest to ``start`` that contains the highest-priority
        marker found, or the legacy fallback directory.

    Raises:
        FileNotFoundError: No marker was found and the legacy fallback
            directory does not exist on this machine.
    """
    candidates = [start, *start.parents]

    # Exhaust every ancestor for one marker before trying the next, so the
    # more specific PROJECT_STATE.md always takes precedence over .git.
    for marker in ("PROJECT_STATE.md", ".git"):
        for p in candidates:
            if (p / marker).exists():
                print(f"  Found {marker} in: {p}")
                return p

    # Last resort: hardcoded fallback. It is only meaningful on the machine it
    # was written for, so never hand back a path that is not there -- the
    # caller would otherwise fail later in os.chdir() with no explanation.
    fallback = Path(r"D:\00_DS-ML-Workspace\mooc-coldstart-session-meta").resolve()
    if fallback.is_dir():
        print(f"  Using fallback: {fallback}")
        return fallback

    raise FileNotFoundError(
        f"Could not locate the repo root from {start}: no PROJECT_STATE.md or "
        ".git found in it or any parent directory. "
        "Start the notebook from inside the repository."
    )

REPO_ROOT = find_repo_root(CWD)

# Work from the repo root from here on, and put it first on sys.path.
os.chdir(REPO_ROOT)
repo_str = str(REPO_ROOT)
if repo_str not in sys.path:
    sys.path.insert(0, repo_str)

print("REPO_ROOT:", REPO_ROOT)
print("CWD now:", Path.cwd())

# Validation checks: presence of each expected top-level entry
checks = {
    entry: (REPO_ROOT / entry).exists()
    for entry in ("src", "notebooks", "PROJECT_STATE.md")
}

print("Validation checks:")
for name, exists in checks.items():
    status = ("❌", "✅")[exists]
    print(f"  {status} {name}")

# CRITICAL: stop here unless the detected root really contains PROJECT_STATE.md
if not checks["PROJECT_STATE.md"]:
    print(
        "\n⚠️  WARNING: PROJECT_STATE.md not found!",
        f"Current REPO_ROOT: {REPO_ROOT}",
        "Please verify you're in the correct directory.",
        sep="\n",
    )
    raise FileNotFoundError("PROJECT_STATE.md not found in detected repo root!")

Initial CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks
  Found PROJECT_STATE.md in: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
REPO_ROOT: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
CWD now: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
Validation checks:
  ✅ src
  ✅ notebooks
  ✅ PROJECT_STATE.md


Ensure src/configs/project.yaml

In [3]:
# [CELL 00-03] Ensure config exists at: src/configs/project.yaml
# An existing file is never overwritten; the defaults below are written only when it is missing.

default_cfg = """project:
  name: mooc-coldstart-session-meta

paths:
  data_raw: data/raw
  data_processed: data/processed
  runs: runs

repro:
  seed: 42
  deterministic: true

training:
  num_workers: 2
  pin_memory: false
"""

cfg_dir = REPO_ROOT / "src" / "configs"
cfg_path = cfg_dir / "project.yaml"
cfg_dir.mkdir(parents=True, exist_ok=True)

if cfg_path.exists():
    print("Exists:", cfg_path)
else:
    cfg_path.write_text(default_cfg, encoding="utf-8")
    print("Created:", cfg_path)

print("Config path:", cfg_path.resolve())


Exists: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\src\configs\project.yaml
Config path: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\src\configs\project.yaml


Validate/Create src/ structure 

In [4]:
# [CELL 00-04] Repo structure validation (and minimal fix)
# Creates src/ and src/utils/ plus their __init__.py files when any of them is
# absent (safe, no data assumptions). Intended to avoid
# ModuleNotFoundError: No module named 'src'

src_dir = REPO_ROOT / "src"
utils_dir = src_dir / "utils"
init_src = src_dir / "__init__.py"
init_utils = utils_dir / "__init__.py"

# Directories first (parent before child) ...
for pkg_dir in (src_dir, utils_dir):
    if not pkg_dir.exists():
        pkg_dir.mkdir(parents=True, exist_ok=True)
        print("Created:", pkg_dir)

# ... then the empty package markers.
for init_file in (init_src, init_utils):
    if not init_file.exists():
        init_file.write_text("", encoding="utf-8")
        print("Created:", init_file)

print("src exists:", src_dir.exists())
print("src/utils exists:", utils_dir.exists())


src exists: True
src/utils exists: True


Data Inventory Cell

In [5]:
# [CELL 00-05] Data inventory (no assumptions)
# Reads the data locations from the project config, makes sure both folders
# exist, then lists every file found under the raw-data folder with its size.

print("\n" + "="*70)
print("DATA INVENTORY")
print("="*70)

# Load config to get paths
CFG = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
DATA_RAW = REPO_ROOT / CFG["paths"]["data_raw"]
DATA_PROCESSED = REPO_ROOT / CFG["paths"]["data_processed"]

# Create directories if missing
DATA_RAW.mkdir(parents=True, exist_ok=True)
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print(f"\nRaw data directory: {DATA_RAW}")
print(f"Exists: {DATA_RAW.exists()}")

# DATA_RAW was created just above, so no "directory missing" branch is needed
# here: the inventory can always run.
all_files = sorted(DATA_RAW.rglob("*"))
files = [f for f in all_files if f.is_file()]

if not files:
    print("\n⚠️  No datasets found in data/raw/")
    print("\nExpected datasets (from PROJECT_STATE.md):")
    print("  - Source: XuetangX (large MOOC dataset)")
    print("  - Target: MARS (small MOOC dataset)")
    print("\nPlease add datasets before proceeding to Notebook 01.")
else:
    print(f"\nFound {len(files)} file(s):")
    for f in files:
        size_mb = f.stat().st_size / (1024**2)  # bytes -> MiB
        rel_path = f.relative_to(DATA_RAW)
        print(f"  - {rel_path}")
        print(f"    Size: {size_mb:.2f} MB")

print(f"\nProcessed data directory: {DATA_PROCESSED}")
print(f"Exists: {DATA_PROCESSED.exists()}")


DATA INVENTORY

Raw data directory: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\raw
Exists: True

Found 8 file(s):
  - mars\.gitkeep
    Size: 0.00 MB
  - mars\explicit_ratings_en.csv
    Size: 0.14 MB
  - mars\implicit_ratings_en.csv
    Size: 0.69 MB
  - mars\items_en.csv
    Size: 2.31 MB
  - mars\users_en.csv
    Size: 0.12 MB
  - xuetangx\Test.csv
    Size: 268.93 MB
  - xuetangx\Train.csv
    Size: 1042.72 MB
  - xuetangx\user_info.csv
    Size: 111.60 MB

Processed data directory: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed
Exists: True


Run Initialization Cell

In [6]:
# [CELL 00-06] Initialize reproducibility + RUN_ID
# Seeds the run from the project config, creates a RUN_ID, and writes the run
# metadata via the project helpers in src/utils.

# These imports stay in this cell: they rely on the sys.path bootstrap in cell 00-02.
from src.utils.repro import ReproConfig, set_seed
from src.utils.runlog import make_run_id, collect_run_info, ensure_dirs, write_run_meta

# Load config
CFG = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
paths_cfg = CFG["paths"]

# Keep DATA_RAW / DATA_PROCESSED as Paths anchored at REPO_ROOT, the same way
# cell 00-05 defines them, instead of rebinding them to relative strings.
DATA_RAW = REPO_ROOT / paths_cfg["data_raw"]
DATA_PROCESSED = REPO_ROOT / paths_cfg["data_processed"]
RUNS_DIR = paths_cfg["runs"]

# Ensure directories (the helper receives the config strings, relative to the
# current working directory, which cell 00-02 set to the repo root)
ensure_dirs(paths_cfg["data_raw"], paths_cfg["data_processed"], RUNS_DIR)

# Create RUN_ID
RUN_ID = make_run_id("exp")
print(f"RUN_ID: {RUN_ID}")

# Set seed
seed = int(CFG["repro"]["seed"])
deterministic = bool(CFG["repro"]["deterministic"])
set_seed(ReproConfig(seed=seed, deterministic=deterministic))
print(f"Seed: {seed} | Deterministic: {deterministic}")

# Collect environment info
info = collect_run_info(RUN_ID)
print("\nEnvironment Info:")
print(f"  Python: {info.python}")
print(f"  PyTorch: {info.torch}")
print(f"  CUDA available: {info.cuda_available}")
print(f"  Platform: {info.platform}")

# Save metadata (the config snapshot is stored alongside the environment info)
meta_path = write_run_meta(RUNS_DIR, info, extra={"config": CFG, "notebook": "00_project_overview"})
print(f"\nMetadata saved to: {meta_path}")

RUN_ID: exp_20251227_053631
Seed: 42 | Deterministic: True

Environment Info:
  Python: 3.11.14
  PyTorch: 2.9.1+cpu
  CUDA available: False
  Platform: Windows 10

Metadata saved to: runs\exp_20251227_053631\meta.json
