Notebook intent + rules (print-only)

In [1]:
# [CELL 00-00] Notebook intent + rules (print-only)

from datetime import datetime
# Banner cell: states what this notebook is for and the project rules. It does no work.
t0 = datetime.now()
print(
    f"[CELL 00-00] start={t0.isoformat(timespec='seconds')}",
    "Notebook: 00_bootstrap_and_conventions.ipynb",
    "Purpose: repo bootstrap + reproducibility conventions + shared utilities.",
    "Rules: real data only; user-level splits; chronological support→query; no leakage; deterministic runs.",
    "Outputs: ensures repo structure dirs; utilities for run registry + hashing + manifest/report writing.",
    "[CELL 00-00] done",
    sep="\n",  # one line per message, same as separate print() calls
)


[CELL 00-00] start=2026-02-01T02:01:04
Notebook: 00_bootstrap_and_conventions.ipynb
Purpose: repo bootstrap + reproducibility conventions + shared utilities.
Rules: real data only; user-level splits; chronological support→query; no leakage; deterministic runs.
Outputs: ensures repo structure dirs; utilities for run registry + hashing + manifest/report writing.
[CELL 00-00] done


Imports + environment snapshot

In [2]:
# [CELL 00-01] Imports + environment snapshot

from __future__ import annotations

import os
import sys
import json
import time
import uuid
import shutil
import hashlib
import platform
import subprocess
from dataclasses import dataclass, asdict
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

try:
    import torch
except Exception as e:
    torch = None
    print("[CELL 00-01] WARN: torch not available yet:", repr(e))

t0 = datetime.now()
print(f"[CELL 00-01] start={t0.isoformat(timespec='seconds')}")
# Environment snapshot: interpreter, OS, working directory and library versions.
# `torch` may be None here (import guarded above), hence the getattr fallback.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("Python:", sys.version.replace("\n", " ")),
            ("Platform:", platform.platform()),
            ("CWD:", Path.cwd().resolve()),
            ("NumPy:", np.__version__),
            ("Pandas:", pd.__version__),
            ("Torch:", getattr(torch, "__version__", None)),
        )
    ),
    sep="\n",
)
print("[CELL 00-01] done")


[CELL 00-01] start=2026-02-01T02:01:05
Python: 3.11.14 | packaged by conda-forge | (main, Jan 26 2026, 23:48:32) [GCC 14.3.0]
Platform: Linux-6.8.0-52-generic-x86_64-with-glibc2.39
CWD: /workspace/anonymous-users-mooc-session-meta/notebooks
NumPy: 2.4.1
Pandas: 3.0.0
Torch: 2.10.0+cu128
[CELL 00-01] done


Locate REPO_ROOT (Windows-safe) + define fixed paths

In [3]:
# [CELL 00-02] Locate REPO_ROOT (Windows-safe) + define fixed paths

def find_repo_root(start: Path) -> Path:
    """
    Return the nearest directory, at or above ``start``, that contains PROJECT_STATE.md.

    ``start`` is resolved first, then it and each of its parents are checked in
    order (closest first). PROJECT_STATE.md is the only marker consulted.

    Raises:
        RuntimeError: if no checked directory contains PROJECT_STATE.md.
    """
    origin = start.resolve()
    match = next(
        (folder for folder in (origin, *origin.parents) if (folder / "PROJECT_STATE.md").exists()),
        None,
    )
    if match is None:
        raise RuntimeError(
            "Could not find PROJECT_STATE.md in current or parent directories. "
            "Open this notebook from inside the repo, or create PROJECT_STATE.md at repo root."
        )
    return match

# Timer starts before the search so the elapsed figure below includes it.
T0 = time.time()
CWD = Path.cwd().resolve()
REPO_ROOT = find_repo_root(CWD)

# Canonical repo locations -- look paths up here instead of building new ones elsewhere.
PATHS = dict(
    PROJECT_STATE=REPO_ROOT / "PROJECT_STATE.md",
    META_REGISTRY=REPO_ROOT / "meta.json",
    DATA_RAW=REPO_ROOT / "data" / "raw",
    DATA_INTERIM=REPO_ROOT / "data" / "interim",
    DATA_PROCESSED=REPO_ROOT / "data" / "processed",
    NOTEBOOKS=REPO_ROOT / "notebooks",
    REPORTS=REPO_ROOT / "reports",
    SRC=REPO_ROOT / "src",
    RUNS=REPO_ROOT / "runs",
)

print(f"[CELL 00-02] start={datetime.now().isoformat(timespec='seconds')}")
print("REPO_ROOT:", REPO_ROOT)
for k, v in PATHS.items():
    print(f"  {k}: {v}")

print(f"[CELL 00-02] elapsed={time.time()-T0:.2f}s")


[CELL 00-02] start=2026-02-01T02:01:05
REPO_ROOT: /workspace/anonymous-users-mooc-session-meta
  PROJECT_STATE: /workspace/anonymous-users-mooc-session-meta/PROJECT_STATE.md
  META_REGISTRY: /workspace/anonymous-users-mooc-session-meta/meta.json
  DATA_RAW: /workspace/anonymous-users-mooc-session-meta/data/raw
  DATA_INTERIM: /workspace/anonymous-users-mooc-session-meta/data/interim
  DATA_PROCESSED: /workspace/anonymous-users-mooc-session-meta/data/processed
  NOTEBOOKS: /workspace/anonymous-users-mooc-session-meta/notebooks
  REPORTS: /workspace/anonymous-users-mooc-session-meta/reports
  SRC: /workspace/anonymous-users-mooc-session-meta/src
  RUNS: /workspace/anonymous-users-mooc-session-meta/runs
[CELL 00-02] elapsed=0.00s


Ensure required directories exist (idempotent)

In [4]:
# [CELL 00-03] Ensure required directories exist (idempotent)

T0 = time.time()

def ensure_dir(p: Path) -> None:
    """Create directory ``p`` and any missing parents; do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)

# Create the directory layout in one pass; order matches the listing below.
for _required_dir in (
    # Base dirs
    PATHS["DATA_RAW"] / "mars",
    PATHS["DATA_RAW"] / "xuetangx",
    PATHS["DATA_INTERIM"],
    PATHS["DATA_PROCESSED"] / "mars",
    PATHS["DATA_PROCESSED"] / "xuetangx",
    PATHS["REPORTS"],
    # Optional dirs (keep clean; created but not required)
    PATHS["SRC"],
    PATHS["RUNS"],
):
    ensure_dir(_required_dir)

print(f"[CELL 00-03] start={datetime.now().isoformat(timespec='seconds')}")
print("Ensured directories exist.")
# Spot-check a few of the directories just created.
print("data/raw/mars:", (PATHS["DATA_RAW"] / "mars").exists())
print("data/raw/xuetangx:", (PATHS["DATA_RAW"] / "xuetangx").exists())
print("reports:", PATHS["REPORTS"].exists())
print(f"[CELL 00-03] elapsed={time.time()-T0:.2f}s")


[CELL 00-03] start=2026-02-01T02:01:05
Ensured directories exist.
data/raw/mars: True
data/raw/xuetangx: True
reports: True
[CELL 00-03] elapsed=0.00s


Cell logging helpers (standardized prints)

In [5]:
# [CELL 00-04] Cell logging helpers (standardized prints)

def now_ts() -> str:
    """Return the current local time as a ``YYYYMMDD_HHMMSS`` string (one-second resolution)."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    """
    Print the standard opening lines for a cell and return its start time.

    Output is a blank line, ``[cell_id] title``, a ``start=<ISO timestamp>``
    line, then one ``[cell_id] key=value`` line per keyword argument.

    Returns:
        The ``time.time()`` value taken on entry; pass it to ``cell_end``.
    """
    started_at = time.time()
    prefix = f"[{cell_id}]"
    print(f"\n{prefix} {title}")
    print(f"{prefix} start={datetime.now().isoformat(timespec='seconds')}")
    for field, value in kwargs.items():
        print(f"{prefix} {field}={value}")
    return started_at

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    """
    Print the standard closing lines for a cell.

    Emits one ``[cell_id] key=value`` line per keyword argument, then the
    seconds elapsed since ``t0`` (a ``time.time()`` value) to two decimals,
    then a ``done`` line.
    """
    prefix = f"[{cell_id}]"
    for field, value in kwargs.items():
        print(f"{prefix} {field}={value}")
    elapsed = time.time() - t0
    print(f"{prefix} elapsed={elapsed:.2f}s")
    print(f"{prefix} done")

t0 = cell_start("CELL 00-04", "Initialized cell logging helpers", repo_root=str(REPO_ROOT))
cell_end("CELL 00-04", t0)



[CELL 00-04] Initialized cell logging helpers
[CELL 00-04] start=2026-02-01T02:01:05
[CELL 00-04] repo_root=/workspace/anonymous-users-mooc-session-meta
[CELL 00-04] elapsed=0.00s
[CELL 00-04] done


Seed everything + determinism controls

In [6]:
# [CELL 00-05] Seed everything + determinism controls

t0 = cell_start("CELL 00-05", "Global seeding + determinism")

GLOBAL_SEED = 20260106  # fixed seed for reproducibility; override per-run in config if needed

def seed_everything(seed: int) -> None:
    """
    Seed Python's ``random``, NumPy and (when available) PyTorch, and enable
    PyTorch's deterministic mode.

    Args:
        seed: integer seed applied to every generator.

    Side effects:
        * Reseeds the global ``random`` and ``np.random`` generators.
        * When ``torch`` was imported: seeds the CPU and all CUDA generators,
          sets cuDNN to deterministic / non-benchmark mode, calls
          ``torch.use_deterministic_algorithms(True)``, and sets the
          ``CUBLAS_WORKSPACE_CONFIG`` environment variable if the user has not
          set it already.
    """
    import os
    import random

    random.seed(seed)
    np.random.seed(seed)

    if torch is None:
        return

    # With deterministic algorithms enabled, cuBLAS-backed ops on CUDA >= 10.2
    # raise unless CUBLAS_WORKSPACE_CONFIG is ":4096:8" or ":16:8" (PyTorch
    # reproducibility notes). setdefault keeps any value chosen by the user.
    # NOTE(review): this only takes effect if it runs before the first cuBLAS
    # call in the process -- confirm no CUDA matmul happens before this cell.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Determinism (note: may reduce speed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Enabling the flag is normally silent; it is *later* ops lacking a
    # deterministic implementation that raise. The guard stays as a best-effort
    # safety net for builds where the call is unavailable.
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as e:
        print("[CELL 00-05] WARN: torch.use_deterministic_algorithms failed:", repr(e))

seed_everything(GLOBAL_SEED)

# DEVICE stays None when torch could not be imported; otherwise prefer CUDA.
if torch is None:
    DEVICE = None
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

cell_end("CELL 00-05", t0, seed=GLOBAL_SEED, device=DEVICE)



[CELL 00-05] Global seeding + determinism
[CELL 00-05] start=2026-02-01T02:01:05
[CELL 00-05] seed=20260106
[CELL 00-05] device=cuda
[CELL 00-05] elapsed=0.99s
[CELL 00-05] done


Robust JSON IO (atomic writes) + small helpers

In [7]:
# [CELL 00-06] Robust JSON IO (atomic writes) + small helpers

t0 = cell_start("CELL 00-06", "JSON IO utilities (atomic)")

def read_json(path: Path) -> Any:
    """
    Load and return the JSON document stored at ``path`` (read as UTF-8).

    Raises:
        RuntimeError: if ``path`` does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    raise RuntimeError(f"Missing required JSON file: {path}")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """
    Serialise ``obj`` as UTF-8 JSON to ``path`` via a temporary sibling file.

    The document is first written to ``<path><suffix>.tmp_<uuid>`` in the same
    directory and then moved over ``path`` with ``Path.replace``, so readers
    never see a half-written ``path``. Parent directories are created as needed.

    Args:
        path: destination file.
        obj: JSON-serialisable object.
        indent: indentation passed to ``json.dump``.

    Raises:
        Whatever ``json.dump`` or the filesystem raises (e.g. ``TypeError`` for
        a non-serialisable object). On any failure the temporary file is
        removed and ``path`` is left as it was.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # json.dump streams, so a failure leaves a partial temp file behind;
        # remove it so failed writes do not litter the target directory.
        tmp.unlink(missing_ok=True)
        raise

def pretty(obj: Any, max_len: int = 3000) -> None:
    """
    Print ``obj`` as 2-space-indented JSON (non-ASCII kept as-is).

    Text longer than ``max_len`` characters is cut at ``max_len`` and followed
    by a ``...<truncated>...`` marker on its own line.
    """
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    fits = len(text) <= max_len
    print(text if fits else text[:max_len] + "\n...<truncated>...")

cell_end("CELL 00-06", t0)



[CELL 00-06] JSON IO utilities (atomic)
[CELL 00-06] start=2026-02-01T02:01:06
[CELL 00-06] elapsed=0.00s
[CELL 00-06] done


File hashing + directory fingerprinting (for real-data provenance)

In [8]:
# [CELL 00-07] File hashing + directory fingerprinting (for real-data provenance)

t0 = cell_start("CELL 00-07", "Dataset fingerprint utilities")

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """
    Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so it is
    never loaded into memory whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def list_files(root: Path, pattern: str = "*") -> List[Path]:
    """
    Return a sorted list of regular files under ``root`` whose names match ``pattern``.

    The search is recursive (``Path.rglob``) and skips directories. A ``root``
    that does not exist yields an empty list.
    """
    if root.exists():
        return sorted(entry for entry in root.rglob(pattern) if entry.is_file())
    return []

def dir_fingerprint(root: Path, pattern: str = "*") -> Dict[str, Any]:
    """
    Build a content fingerprint of the files under ``root`` matching ``pattern``.

    Each file contributes a record with its path relative to ``root``, its size
    in bytes and its SHA-256. An aggregate SHA-256 is then computed over those
    records in ``list_files`` order.

    Relative paths are stored with forward slashes (``as_posix``), so the same
    data gives the same ``relpath`` values and ``agg_sha256`` on Windows and
    POSIX. On POSIX this is identical to the previous ``str()`` rendering.

    Returns:
        A dict with keys ``root`` (str), ``n_files`` (int), ``files`` (list of
        ``{"relpath", "bytes", "sha256"}`` dicts) and ``agg_sha256`` (hex str).
    """
    rows = [
        {
            # as_posix(): separator-independent, keeps the fingerprint portable.
            "relpath": p.relative_to(root).as_posix(),
            "bytes": p.stat().st_size,
            "sha256": sha256_file(p),
        }
        for p in list_files(root, pattern=pattern)
    ]
    # stable aggregate hash (record layout kept as-is so existing hashes stay comparable)
    agg = hashlib.sha256()
    for r in rows:
        agg.update((r["relpath"] + "|" + str(r["bytes"]) + "|" + r["sha256"]).encode("utf-8"))
    return {
        "root": str(root),
        "n_files": len(rows),
        "files": rows,
        "agg_sha256": agg.hexdigest(),
    }

# Sanity: do not hash huge raw dirs here (we only define tools)
cell_end("CELL 00-07", t0)



[CELL 00-07] Dataset fingerprint utilities
[CELL 00-07] start=2026-02-01T02:01:06
[CELL 00-07] elapsed=0.00s
[CELL 00-07] done


Run directory convention + report/config/manifest scaffolding

In [9]:
# [CELL 00-08] Run directory convention + report/config/manifest scaffolding

t0 = cell_start("CELL 00-08", "Run tagging + output dir utilities")

@dataclass
class RunContext:
    """
    Identity of a single notebook run.

    Attributes:
        notebook: name of the notebook the run belongs to.
        run_tag: tag identifying the run within that notebook.
        out_dir: output directory as a string, laid out as reports/<notebook>/<RUN_TAG>.
        run_id: uuid4 identifier of the run.
    """

    notebook: str
    run_tag: str
    out_dir: str
    run_id: str

def start_run(notebook_name: str) -> RunContext:
    """
    Allocate a fresh output directory for a run of ``notebook_name``.

    The directory is ``PATHS["REPORTS"] / notebook_name / <run_tag>``, where
    ``run_tag`` comes from ``now_ts()``. That tag has one-second resolution, so
    the directory is created exclusively: if it already exists (another run
    started within the same second), the first 8 hex characters of this run's
    id are appended to the tag rather than silently sharing the directory and
    overwriting that run's files.

    Returns:
        A ``RunContext`` carrying the notebook name, the final run tag, the
        output directory (as a string) and a new uuid4 hex ``run_id``.
    """
    run_id = uuid.uuid4().hex
    run_tag = now_ts()
    out_dir = PATHS["REPORTS"] / notebook_name / run_tag
    try:
        out_dir.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        # Same-second collision: disambiguate with part of the unique run id.
        run_tag = f"{run_tag}_{run_id[:8]}"
        out_dir = PATHS["REPORTS"] / notebook_name / run_tag
        out_dir.mkdir(parents=True, exist_ok=False)
    rc = RunContext(
        notebook=notebook_name,
        run_tag=run_tag,
        out_dir=str(out_dir),
        run_id=run_id,
    )
    print("[start_run] out_dir:", out_dir)
    return rc

def init_run_files(rc: RunContext, config: Dict[str, Any]) -> Tuple[Path, Path, Path]:
    """
    Write the three starter files of a run into ``rc.out_dir``.

    * ``config.json``   -- ``config`` exactly as given.
    * ``report.json``   -- run identity plus empty sections to be filled in later.
    * ``manifest.json`` -- run identity plus an empty ``artifacts`` list.

    Files are written in that order with ``write_json_atomic``.

    Returns:
        ``(report_path, config_path, manifest_path)``.
    """
    run_dir = Path(rc.out_dir)
    report_path, config_path, manifest_path = (
        run_dir / filename for filename in ("report.json", "config.json", "manifest.json")
    )

    write_json_atomic(config_path, config)

    # Identity fields shared by report and manifest, in serialisation order.
    identity = {
        "run_id": rc.run_id,
        "notebook": rc.notebook,
        "run_tag": rc.run_tag,
    }

    # report skeleton (filled later per notebook)
    write_json_atomic(
        report_path,
        {
            **identity,
            "created_at": datetime.now().isoformat(timespec="seconds"),
            "repo_root": str(REPO_ROOT),
            "metrics": {},
            "key_findings": [],
            "sanity_samples": {},
            "data_fingerprints": {},
            "notes": [],
        },
    )

    # manifest skeleton; artifacts is a list of {path, bytes, sha256}
    write_json_atomic(manifest_path, {**identity, "artifacts": []})

    return report_path, config_path, manifest_path

cell_end("CELL 00-08", t0)



[CELL 00-08] Run tagging + output dir utilities
[CELL 00-08] start=2026-02-01T02:01:06
[CELL 00-08] elapsed=0.00s
[CELL 00-08] done


meta.json registry (append-only) + helpers

In [10]:
# [CELL 00-09] meta.json registry (append-only) + helpers

t0 = cell_start("CELL 00-09", "meta.json run registry (append-only)")

def load_meta_registry(path: Path) -> Dict[str, Any]:
    """
    Return the run registry stored at ``path``.

    When the file does not exist, a fresh empty registry
    (``schema_version`` 1, no runs) is returned and nothing is written to disk.
    """
    if path.exists():
        return read_json(path)
    return {"schema_version": 1, "runs": []}

def append_run_to_registry(path: Path, run_record: Dict[str, Any]) -> None:
    """
    Append ``run_record`` to the ``runs`` list of the registry at ``path``.

    The registry is loaded with ``load_meta_registry``, extended in memory and
    written back with ``write_json_atomic``.

    Raises:
        RuntimeError: if the loaded registry has no ``runs`` key or it is not a list.
    """
    registry = load_meta_registry(path)
    has_run_list = "runs" in registry and isinstance(registry["runs"], list)
    if not has_run_list:
        raise RuntimeError("meta.json invalid: missing 'runs' list")
    registry["runs"].append(run_record)
    write_json_atomic(path, registry)

# Create an empty registry on first use; leave an existing one untouched (idempotent).
if PATHS["META_REGISTRY"].exists():
    print("meta.json exists:", PATHS["META_REGISTRY"])
else:
    write_json_atomic(PATHS["META_REGISTRY"], {"schema_version": 1, "runs": []})
    print("Created meta.json:", PATHS["META_REGISTRY"])

# Small preview: only the number of registered runs is shown.
reg_preview = load_meta_registry(PATHS["META_REGISTRY"])
print("meta.json runs:", len(reg_preview.get("runs", [])))

cell_end("CELL 00-09", t0)



[CELL 00-09] meta.json run registry (append-only)
[CELL 00-09] start=2026-02-01T02:01:06
Created meta.json: /workspace/anonymous-users-mooc-session-meta/meta.json
meta.json runs: 0
[CELL 00-09] elapsed=0.00s
[CELL 00-09] done


[CELL 00-10] Skipped

Generic validation helpers (leakage checks scaffolding)

In [11]:
# [CELL 00-11] Generic validation helpers (leakage checks scaffolding)

t0 = cell_start("CELL 00-11", "Validation helpers (used across notebooks)")

def assert_disjoint(a: set, b: set, name_a: str, name_b: str) -> None:
    """
    Raise if sets ``a`` and ``b`` share any element.

    Raises:
        RuntimeError: with the overlap size and one example element; ``name_a``
            and ``name_b`` label the two sets in the message.
    """
    shared = a.intersection(b)
    if not shared:
        return
    example = next(iter(shared))
    raise RuntimeError(f"Leakage: {name_a} ∩ {name_b} not empty (n={len(shared)}). Example={example}")

def assert_nonempty(x: Any, name: str) -> None:
    """
    Raise if ``x`` is None or is a sized object of length zero.

    Objects without ``__len__`` (for example plain numbers) only get the None check.

    Raises:
        RuntimeError: ``"<name> is None"`` or ``"<name> is empty"``.
    """
    if x is None:
        raise RuntimeError(f"{name} is None")
    is_sized = hasattr(x, "__len__")
    if is_sized and not len(x):
        raise RuntimeError(f"{name} is empty")

def assert_sorted_non_decreasing(arr: np.ndarray, name: str) -> None:
    """
    Raise if the 1-D sequence ``arr`` ever decreases from one element to the next.

    Args:
        arr: values in their stored order. Any array-like is accepted; it is
            coerced with ``np.asarray`` so the neighbour comparison is always
            element-wise. Without the coercion a plain list would be compared
            lexicographically as a whole, letting e.g. ``[1, 3, 2]`` pass.
        name: label used in the error message.

    Raises:
        RuntimeError: if any element is smaller than its predecessor. Equal
            neighbours are allowed; empty and single-element inputs pass.
    """
    values = np.asarray(arr)
    if np.any(values[1:] < values[:-1]):
        raise RuntimeError(f"{name} is not sorted non-decreasing (chronology violation)")

def assert_support_query_disjoint(support_idx: np.ndarray, query_idx: np.ndarray) -> None:
    """
    Raise if any index appears in both ``support_idx`` and ``query_idx``.

    Both arrays are converted element by element to Python ints before comparison.

    Raises:
        RuntimeError: with the overlap size and one example index.
    """
    support = {int(i) for i in support_idx.tolist()}
    query = {int(i) for i in query_idx.tolist()}
    overlap = support & query
    if not overlap:
        return
    raise RuntimeError(f"Support/Query index overlap (n={len(overlap)}). Example={next(iter(overlap))}")

cell_end("CELL 00-11", t0)



[CELL 00-11] Validation helpers (used across notebooks)
[CELL 00-11] start=2026-02-01T02:01:06
[CELL 00-11] elapsed=0.00s
[CELL 00-11] done


Quick self-test: create a dummy run record (no data touched)

In [12]:
# [CELL 00-12] Quick self-test: create a dummy run record (no data touched)

t0 = cell_start("CELL 00-12", "Self-test run scaffolding (no dataset required)")

# Exercise the run scaffolding end to end: directory, starter files, registry entry.
NOTEBOOK_NAME = "00_bootstrap_and_conventions"
rc = start_run(NOTEBOOK_NAME)

config = dict(
    notebook=NOTEBOOK_NAME,
    global_seed=GLOBAL_SEED,
    device=DEVICE,
    repo_root=str(REPO_ROOT),
    # Paths are stored as strings so the config is JSON-serialisable.
    paths={name: str(location) for name, location in PATHS.items()},
)

report_path, config_path, manifest_path = init_run_files(rc, config)

run_record = dict(
    run_id=rc.run_id,
    notebook=rc.notebook,
    run_tag=rc.run_tag,
    out_dir=rc.out_dir,
    created_at=datetime.now().isoformat(timespec="seconds"),
)

append_run_to_registry(PATHS["META_REGISTRY"], run_record)

print(
    *(f"Wrote: {written}" for written in (report_path, config_path, manifest_path)),
    sep="\n",
)
print("Updated meta.json (append-only)")

cell_end("CELL 00-12", t0)



[CELL 00-12] Self-test run scaffolding (no dataset required)
[CELL 00-12] start=2026-02-01T02:01:06
[start_run] out_dir: /workspace/anonymous-users-mooc-session-meta/reports/00_bootstrap_and_conventions/20260201_020106
Wrote: /workspace/anonymous-users-mooc-session-meta/reports/00_bootstrap_and_conventions/20260201_020106/report.json
Wrote: /workspace/anonymous-users-mooc-session-meta/reports/00_bootstrap_and_conventions/20260201_020106/config.json
Wrote: /workspace/anonymous-users-mooc-session-meta/reports/00_bootstrap_and_conventions/20260201_020106/manifest.json
Updated meta.json (append-only)
[CELL 00-12] elapsed=0.00s
[CELL 00-12] done


What Notebook 01 must do (checklist only)

In [13]:
# [CELL 00-13] What Notebook 01 must do (checklist only)

t0 = cell_start("CELL 00-13", "Next notebook checklist")

# Hand-off note for the next notebook; this cell only prints.
print(
    "Next: 01_ingest_mars.ipynb",
    "Checklist:",
    "  - Detect raw MARS files under data/raw/mars/ (no synthetic data).",
    "  - Load into a normalized events table (interim) with schema print + counts.",
    "  - Save dataset fingerprint(s) + small sanity samples into report.json.",
    "  - Do NOT sessionize here (that is Notebook 02).",
    "\nNo action required now. Proceed to Notebook 01 when ready.",
    sep="\n",
)

cell_end("CELL 00-13", t0)



[CELL 00-13] Next notebook checklist
[CELL 00-13] start=2026-02-01T02:01:06
Next: 01_ingest_mars.ipynb
Checklist:
  - Detect raw MARS files under data/raw/mars/ (no synthetic data).
  - Load into a normalized events table (interim) with schema print + counts.
  - Save dataset fingerprint(s) + small sanity samples into report.json.
  - Do NOT sessionize here (that is Notebook 02).

No action required now. Proceed to Notebook 01 when ready.
[CELL 00-13] elapsed=0.00s
[CELL 00-13] done
