Bootstrap: repo root + paths + logger

In [1]:
# [CELL 03-00] Bootstrap: repo root + paths + logger

import json
import time
import uuid
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd

# Record when the bootstrap cell started and log it together with the resolved
# working directory (the starting point for the repo-root search further down).
# Note: `t0` is a datetime here; later cells rebind the same name to the float
# returned by cell_start().
t0 = datetime.now()
print(f"[CELL 03-00] start={t0.isoformat(timespec='seconds')}")
print("[CELL 03-00] CWD:", Path.cwd().resolve())

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    The root is the nearest directory -- the resolved ``start`` itself
    included -- that contains an entry named ``PROJECT_STATE.md``.

    Args:
        start: Directory to begin the search from.

    Returns:
        The first directory on the way up that holds the marker file.

    Raises:
        RuntimeError: If neither ``start`` nor any of its ancestors holds it.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    # Closest match wins: candidates are ordered from `origin` outward.
    hit = next((d for d in candidates if (d / "PROJECT_STATE.md").exists()), None)
    if hit is None:
        raise RuntimeError("Could not find PROJECT_STATE.md. Open notebook from within the repo.")
    return hit

# Resolve the repository root from the current working directory; this raises
# if the notebook was opened outside the repo (no PROJECT_STATE.md found).
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 03-00] REPO_ROOT:", REPO_ROOT)

# Central table of repo-relative locations used by the cells below.
# Nothing is created here; the entries are only path objects.
PATHS = {
    "META_REGISTRY": REPO_ROOT / "meta.json",            # run registry file
    "DATA_INTERIM": REPO_ROOT / "data" / "interim",      # holds the DuckDB file
    "DATA_PROCESSED": REPO_ROOT / "data" / "processed",  # vocab / pairs outputs
    "REPORTS": REPO_ROOT / "reports",                    # per-run report folders
}
# Echo every configured path so the recorded output documents the layout used.
for k, v in PATHS.items():
    print(f"[CELL 03-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    """Print a cell banner and return the wall-clock start time.

    Output, one line each and all prefixed with ``[cell_id]``: a blank line
    followed by the title, the current local time (second precision), then
    one ``key=value`` line per keyword argument in the order given.

    Args:
        cell_id: Label placed in square brackets on every line.
        title: Human-readable cell title.
        **kwargs: Extra context to log.

    Returns:
        ``time.time()`` sampled before anything is printed; hand it to
        ``cell_end`` to report the elapsed time.
    """
    started = time.time()
    banner = [
        f"\n[{cell_id}] {title}",
        f"[{cell_id}] start={datetime.now().isoformat(timespec='seconds')}",
    ]
    banner.extend(f"[{cell_id}] {k}={v}" for k, v in kwargs.items())
    for line in banner:
        print(line)
    return started

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    """Print the closing lines for a cell.

    Emits one ``key=value`` line per keyword argument, then the seconds
    elapsed since ``t0`` (two decimals) and a final ``done`` line, each
    prefixed with ``[cell_id]``.

    Args:
        cell_id: Label placed in square brackets on every line.
        t0: Start time as returned by ``cell_start`` (``time.time()`` based).
        **kwargs: Result values to log before the timing line.
    """
    tag = f"[{cell_id}]"
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    # Sample the clock only after the result lines have been printed.
    elapsed = time.time() - t0
    print(f"{tag} elapsed={elapsed:.2f}s")
    print(f"{tag} done")

# Closing marker for the bootstrap cell (this cell logs by hand rather than via cell_end).
print("[CELL 03-00] done")


[CELL 03-00] start=2026-01-06T22:23:21
[CELL 03-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 03-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 03-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 03-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 03-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 03-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 03-00] done


JSON IO (Timestamp-safe) + hashing

In [2]:
# [CELL 03-01] JSON IO + hashing (Timestamp-safe)

# Log the cell banner and keep the start time for the elapsed-time line at the end of the cell.
t0 = cell_start("CELL 03-01", "JSON IO + hashing (Timestamp-safe)")

def _json_default(o):
    try:
        import pandas as pd
        if isinstance(o, (pd.Timestamp,)):
            return o.isoformat()
    except Exception:
        pass
    try:
        import numpy as np
        if isinstance(o, (np.integer,)):
            return int(o)
        if isinstance(o, (np.floating,)):
            return float(o)
        if isinstance(o, (np.bool_,)):
            return bool(o)
    except Exception:
        pass
    try:
        from datetime import datetime, date
        if isinstance(o, (datetime, date)):
            return o.isoformat()
    except Exception:
        pass
    return str(o)

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize ``obj`` as UTF-8 JSON to ``path`` via a temp file and rename.

    The JSON is first written to a uniquely named sibling file
    (``<name><suffix>.tmp_<uuid>``) and then moved over ``path`` with
    ``Path.replace``, so readers never observe a half-written file.
    Parent directories are created when missing.

    If serialization or the rename fails, the temp file is removed before the
    original exception is re-raised (it used to be left behind on disk).

    Args:
        path: Destination file.
        obj: Object to serialize; values ``json`` cannot encode natively go
            through ``_json_default``.
        indent: Indentation passed to ``json.dump``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent, default=_json_default)
        tmp.replace(path)
    except BaseException:
        # Best-effort cleanup; never mask the original error with a cleanup error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def read_json(path: Path) -> Any:
    """Load and return the JSON document stored at ``path`` (UTF-8).

    Args:
        path: File to read.

    Returns:
        The decoded JSON value.

    Raises:
        RuntimeError: If ``path`` does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as fh:
            return json.load(fh)
    raise RuntimeError(f"Missing JSON file: {path}")

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole content never has to be held in memory at once.

    Args:
        path: File to hash.
        chunk_size: Number of bytes requested per read.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter(callable, sentinel) stops at the first empty read (EOF).
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def safe_artifact_record(path: Path) -> Dict[str, Any]:
    """Describe an artifact file for the manifest without failing on a locked file.

    Args:
        path: File to describe; it is stat'ed unconditionally, so it must exist.

    Returns:
        A dict with keys ``path`` (string form), ``bytes`` (file size),
        ``sha256`` (hex digest, or ``None`` when hashing was not possible) and
        ``sha256_error`` (``None``, or a ``PermissionError`` description).
        Only ``PermissionError`` raised while hashing is tolerated; it is
        logged as a warning. Any other error propagates.
    """
    size = int(path.stat().st_size)
    digest = None
    failure = None
    try:
        digest = sha256_file(path)
    except PermissionError as exc:
        failure = f"PermissionError: {exc}"
        print("[CELL 03-01] WARN: locked, cannot hash now:", path)
    return {"path": str(path), "bytes": size, "sha256": digest, "sha256_error": failure}

# Close the cell log: prints the elapsed time since `t0` and the "done" marker.
cell_end("CELL 03-01", t0)



[CELL 03-01] JSON IO + hashing (Timestamp-safe)
[CELL 03-01] start=2026-01-06T22:23:37
[CELL 03-01] elapsed=0.00s
[CELL 03-01] done


Start run + init report/config/manifest + meta.json append

In [3]:
# [CELL 03-02] Start run + init files + meta.json append-only

t0 = cell_start("CELL 03-02", "Start run")

# Identity of this run: a human-readable, second-resolution timestamp tag
# (used as the report folder name) plus a random hex id.
NOTEBOOK_NAME = "03_build_vocab_and_pairs_mars"
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID = uuid.uuid4().hex

# Per-run report folder: reports/<notebook>/<run_tag>/
OUT_DIR = PATHS["REPORTS"] / NOTEBOOK_NAME / RUN_TAG
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Files written into the run folder by this notebook.
REPORT_PATH = OUT_DIR / "report.json"
CONFIG_PATH = OUT_DIR / "config.json"
MANIFEST_PATH = OUT_DIR / "manifest.json"

# Input: DuckDB database and the sessionized events view read by the cells below.
DUCKDB_PATH = PATHS["DATA_INTERIM"] / "mars.duckdb"
EVENTS_VIEW = "mars_events_sessionized"  # produced by Notebook 02

# Output folders for the vocabulary JSON files and the pairs parquet.
# These are NOT run-scoped: every run writes to the same locations.
OUT_BASE = PATHS["DATA_PROCESSED"] / "mars"
VOCAB_DIR = OUT_BASE / "vocab"
PAIRS_DIR = OUT_BASE / "pairs"
VOCAB_DIR.mkdir(parents=True, exist_ok=True)
PAIRS_DIR.mkdir(parents=True, exist_ok=True)

# Run configuration, persisted to config.json so the run can be audited later.
# The "pairs" section is read back by the pair-building cell (CELL 03-05).
CFG = {
    "notebook": NOTEBOOK_NAME,
    "run_id": RUN_ID,
    "run_tag": RUN_TAG,
    "inputs": {"duckdb_path": str(DUCKDB_PATH), "events_view": EVENTS_VIEW},
    "outputs": {
        "vocab_dir": str(VOCAB_DIR),
        "pairs_dir": str(PAIRS_DIR),
        "reports_out_dir": str(OUT_DIR),
    },
    "pairs": {
        "task": "next_item_prediction",
        "min_session_len": 2,      # shorter sessions are skipped: a pair needs a prefix and a label
        "max_prefix_len": 50,      # longer prefixes keep only their most recent entries
        "store_prefix_as": "list_int",  # prefixes hold vocabulary indices (item_idx), unpadded
    }
}
write_json_atomic(CONFIG_PATH, CFG)

# Skeleton of the run report; the empty containers are filled in by CELL 03-07,
# which re-reads this file from disk before updating it.
report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "metrics": {},
    "key_findings": [],
    "sanity_samples": {},
    "data_fingerprints": {},
    "notes": [],
}
write_json_atomic(REPORT_PATH, report)

# Manifest of produced artifacts; starts empty and is appended to in CELL 03-07.
manifest = {"run_id": RUN_ID, "notebook": NOTEBOOK_NAME, "run_tag": RUN_TAG, "artifacts": []}
write_json_atomic(MANIFEST_PATH, manifest)

# meta.json append-only
# Register this run in the repo-wide registry. Existing entries are never
# modified; a new record is appended and the file is rewritten atomically.
META_PATH = PATHS["META_REGISTRY"]
if META_PATH.exists():
    meta = read_json(META_PATH)
else:
    # First run in this repo: start from an empty registry (written once below).
    meta = {"schema_version": 1, "runs": []}

# Fail with a clear message instead of a bare KeyError/TypeError when an
# existing registry file has an unexpected shape.
if not isinstance(meta, dict):
    raise RuntimeError(f"Unexpected meta registry format (expected a JSON object): {META_PATH}")
meta_runs = meta.setdefault("runs", [])
if not isinstance(meta_runs, list):
    raise RuntimeError(f"Unexpected meta registry format ('runs' must be a list): {META_PATH}")

meta_runs.append({
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "out_dir": str(OUT_DIR),
    "created_at": datetime.now().isoformat(timespec="seconds"),
})
write_json_atomic(META_PATH, meta)

cell_end("CELL 03-02", t0, out_dir=str(OUT_DIR))



[CELL 03-02] Start run
[CELL 03-02] start=2026-01-06T22:23:55
[CELL 03-02] out_dir=C:\anonymous-users-mooc-session-meta\reports\03_build_vocab_and_pairs_mars\20260106_222355
[CELL 03-02] elapsed=0.03s
[CELL 03-02] done


DuckDB: verify events view schema

In [4]:
# [CELL 03-03] DuckDB verify mars_events_sessionized

t0 = cell_start("CELL 03-03", "Verify events view exists", duckdb=str(DUCKDB_PATH), view=EVENTS_VIEW)

import duckdb

if not DUCKDB_PATH.exists():
    raise RuntimeError(f"Missing DuckDB: {DUCKDB_PATH}. Run Notebook 01 & 02 first.")

# Read-only connection; on success it is deliberately left open because the
# following cells keep querying through `con`.
con = duckdb.connect(str(DUCKDB_PATH), read_only=True)
try:
    # Counting rows also proves the view exists (the query fails otherwise).
    n = int(con.execute(f"SELECT COUNT(*) FROM {EVENTS_VIEW}").fetchone()[0])
    schema_df = con.execute(f"DESCRIBE {EVENTS_VIEW}").fetchdf()

    print("[CELL 03-03] n_events:", n)
    print("[CELL 03-03] schema head:")
    print(schema_df.head(50).to_string(index=False))

    # Columns this notebook requires from the sessionized events view.
    needed = {"user_id", "item_id", "ts_epoch", "session_id", "pos_in_sess", "sess_len"}
    missing = needed - set(schema_df["column_name"].tolist())
    if missing:
        raise RuntimeError(f"Missing required columns in {EVENTS_VIEW}: {missing}")
except Exception:
    # Do not keep the database file open (and possibly locked) when validation
    # fails; the notebook cannot continue from here anyway.
    con.close()
    raise

cell_end("CELL 03-03", t0, n_events=n)



[CELL 03-03] Verify events view exists
[CELL 03-03] start=2026-01-06T22:24:16
[CELL 03-03] duckdb=C:\anonymous-users-mooc-session-meta\data\interim\mars.duckdb
[CELL 03-03] view=mars_events_sessionized
[CELL 03-03] n_events: 3659
[CELL 03-03] schema head:
column_name              column_type null  key default extra
    user_id                   BIGINT  YES None    None  None
    item_id                   BIGINT  YES None    None  None
     rating                   BIGINT  YES None    None  None
         ts TIMESTAMP WITH TIME ZONE  YES None    None  None
   ts_epoch                   BIGINT  YES None    None  None
   sess_num                   DOUBLE  YES None    None  None
 session_id                  VARCHAR  YES None    None  None
pos_in_sess                   BIGINT  YES None    None  None
   sess_len                   BIGINT  YES None    None  None
     ts_raw                  VARCHAR  YES None    None  None
[CELL 03-03] n_events=3659
[CELL 03-03] elapsed=0.06s
[CELL 03-03] done


Build item vocabulary (stable ordering)

In [5]:
# [CELL 03-04] Build item vocabulary (item_id -> item_idx)

t0 = cell_start("CELL 03-04", "Build item vocabulary")

# Deterministic ordering: the distinct item ids are sorted by the database.
# The DESCRIBE output recorded for CELL 03-03 reports item_id as BIGINT, so this
# ORDER BY is a numeric sort, not a string compare.
# NOTE(review): item_id is nullable in that schema; a NULL would show up here
# as one extra "item" -- confirm upstream guarantees there are none.
items_df = con.execute(f"""
SELECT DISTINCT item_id
FROM {EVENTS_VIEW}
ORDER BY item_id
""").fetchdf()

n_items = int(items_df.shape[0])
print("[CELL 03-04] n_items:", n_items)
print("[CELL 03-04] first10 item_id:")
print(items_df.head(10).to_string(index=False))

# item2id: raw item id (as a string) -> 0-based index following the sort order above.
# id2item: the inverse mapping, index -> raw item id string.
item2id = {str(it): int(i) for i, it in enumerate(items_df["item_id"].astype(str).tolist())}
id2item = {int(i): str(it) for it, i in item2id.items()}

item2id_path = VOCAB_DIR / "item2id.json"
id2item_path = VOCAB_DIR / "id2item.json"

# JSON object keys are always strings, so the int keys of id2item are stored
# as strings on disk and come back as strings when the file is read.
write_json_atomic(item2id_path, item2id)
write_json_atomic(id2item_path, id2item)

print("[CELL 03-04] wrote:", item2id_path)
print("[CELL 03-04] wrote:", id2item_path)

cell_end("CELL 03-04", t0, n_items=n_items)



[CELL 03-04] Build item vocabulary
[CELL 03-04] start=2026-01-06T22:24:31
[CELL 03-04] n_items: 776
[CELL 03-04] first10 item_id:
 item_id
     510
     511
     512
     513
     514
     515
     516
     526
     527
     528
[CELL 03-04] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\vocab\item2id.json
[CELL 03-04] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\vocab\id2item.json
[CELL 03-04] n_items=776
[CELL 03-04] elapsed=0.01s
[CELL 03-04] done


Create pairs table (prefix list[int] → label int)

This makes one row per prediction step within a session:

prefix = items up to position t-1

label = item at position t
No padding. No tensors. Just lists (later notebooks can tensorize if needed).

In [7]:
# [CELL 03-05] Build prefix->label pairs (session next-item)

t0 = cell_start("CELL 03-05", "Build prefix->label pairs (list[int] prefix)")

# Pull every event, ordered by session and in-session position, into pandas.
# The dataset is small, so materialising it in memory is acceptable.
seq_df = con.execute(f"""
SELECT
  session_id,
  user_id,
  ts_epoch,
  pos_in_sess,
  item_id
FROM {EVENTS_VIEW}
ORDER BY session_id, pos_in_sess
""").fetchdf()

print("[CELL 03-05] seq_df shape:", seq_df.shape)
print("[CELL 03-05] head:")
print(seq_df.head(5).to_string(index=False))

# Translate raw item ids into vocabulary indices. An id that is absent from
# item2id maps to NaN, which is treated as a hard error.
seq_df["item_idx"] = seq_df["item_id"].astype(str).map(item2id)
unmapped = seq_df["item_idx"].isna()
if unmapped.any():
    print("[CELL 03-05] ERROR: unmapped items head10:\n", seq_df[unmapped].head(10).to_string(index=False))
    raise RuntimeError("Found items not in vocab mapping.")

min_len = int(CFG["pairs"]["min_session_len"])
max_prefix = int(CFG["pairs"]["max_prefix_len"])

# One output row per prediction step: for a session of length L, every
# position 2..L becomes a label whose prefix is everything before it.
pairs_rows = []
for session_key, session_events in seq_df.groupby("session_id", sort=False):
    ordered = session_events.sort_values("pos_in_sess")
    item_seq = ordered["item_idx"].astype(int).tolist()
    if len(item_seq) >= min_len:
        owner = ordered["user_id"].iloc[0]
        # label_at is the 0-based index of the label; tpos is its 1-based position.
        for label_at in range(1, len(item_seq)):
            history = item_seq[:label_at]
            # Over-long prefixes keep only their most recent max_prefix entries.
            prefix = history[-max_prefix:] if len(history) > max_prefix else history
            pairs_rows.append({
                "session_id": session_key,
                "user_id": owner,
                "tpos": int(label_at + 1),
                "prefix": prefix,
                "prefix_len": int(len(prefix)),
                "label": int(item_seq[label_at]),
            })

pairs = pd.DataFrame(pairs_rows)
if len(pairs) == 0:
    raise RuntimeError("No pairs were generated. Check sessionization output.")

print("[CELL 03-05] pairs shape:", pairs.shape)
print("[CELL 03-05] pairs head3:")
print(pairs.head(3).to_string(index=False))

cell_end("CELL 03-05", t0, n_pairs=int(pairs.shape[0]))



[CELL 03-05] Build prefix->label pairs (list[int] prefix)
[CELL 03-05] start=2026-01-06T22:25:26
[CELL 03-05] seq_df shape: (3659, 5)
[CELL 03-05] head:
   session_id  user_id   ts_epoch  pos_in_sess  item_id
104074_000001   104074 1545205176            1    32033
104074_000002   104074 1563780428            1    52609
104074_000002   104074 1563780660            2    52616
104074_000002   104074 1563780864            3    52615
104074_000002   104074 1563780963            4    52610
[CELL 03-05] pairs shape: (2337, 6)
[CELL 03-05] pairs head3:
   session_id  user_id  tpos          prefix  prefix_len  label
104074_000002   104074     2           [521]           1    528
104074_000002   104074     3      [521, 528]           2    527
104074_000002   104074     4 [521, 528, 527]           3    522
[CELL 03-05] n_pairs=2337
[CELL 03-05] elapsed=0.37s
[CELL 03-05] done


Save pairs Parquet + register DuckDB view

In [8]:
# [CELL 03-06] Save pairs parquet + register DuckDB view

t0 = cell_start("CELL 03-06", "Write pairs.parquet + DuckDB view")

pairs_out = PAIRS_DIR / "pairs.parquet"
pairs.to_parquet(pairs_out, index=False, compression="zstd")
print("[CELL 03-06] wrote:", pairs_out, "bytes=", int(pairs_out.stat().st_size))

# Register a view pointing to the parquet.
# The read-only connection from the earlier cells is closed first so the
# database can be reopened in write mode.
con.close()
# Single quotes in the path are doubled so it is a valid SQL string literal.
pairs_sql_path = str(pairs_out).replace("'", "''")
conw = duckdb.connect(str(DUCKDB_PATH), read_only=False)
try:
    conw.execute("DROP VIEW IF EXISTS mars_pairs;")
    conw.execute(f"""
CREATE VIEW mars_pairs AS
SELECT * FROM read_parquet('{pairs_sql_path}')
""")
    chk = int(conw.execute("SELECT COUNT(*) FROM mars_pairs").fetchone()[0])
    print("[CELL 03-06] mars_pairs rows:", chk)
    # The view must expose exactly the rows that were just written.
    if chk != int(pairs.shape[0]):
        raise RuntimeError(
            f"mars_pairs row count {chk} does not match pairs written {int(pairs.shape[0])}"
        )
finally:
    # Always release the write connection, even when a statement above fails;
    # otherwise the database file can stay locked for the rest of the session.
    conw.close()
print("[CELL 03-06] closed DuckDB connection (avoid lock)")

cell_end("CELL 03-06", t0, pairs_rows=chk)



[CELL 03-06] Write pairs.parquet + DuckDB view
[CELL 03-06] start=2026-01-06T22:25:52
[CELL 03-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\pairs\pairs.parquet bytes= 27812
[CELL 03-06] mars_pairs rows: 2337
[CELL 03-06] closed DuckDB connection (avoid lock)
[CELL 03-06] pairs_rows=2337
[CELL 03-06] elapsed=0.10s
[CELL 03-06] done


Update report + manifest + close

In [9]:
# [CELL 03-07] Update report + manifest

t0 = cell_start("CELL 03-07", "Write report + manifest")

# Re-read both files from disk so this cell updates what CELL 03-02 persisted.
report = read_json(REPORT_PATH)
manifest = read_json(MANIFEST_PATH)

# Record the three artifacts produced by this notebook. Any earlier record for
# the same path is replaced, so re-running this cell refreshes the entries
# instead of piling up duplicates.
fresh_records = [safe_artifact_record(p) for p in (item2id_path, id2item_path, pairs_out)]
fresh_paths = {rec["path"] for rec in fresh_records}
manifest["artifacts"] = [
    rec for rec in manifest["artifacts"] if rec.get("path") not in fresh_paths
] + fresh_records

# Findings and notes are appended only once, for the same re-run safety.
finding = "Built item vocabulary (item2id/id2item) and prefix->label pairs for next-item prediction."
if finding not in report["key_findings"]:
    report["key_findings"].append(finding)

# Small samples and summary counts that let a reader sanity-check the outputs
# without opening the artifacts. These are plain assignments, so re-running
# simply overwrites them.
report["sanity_samples"]["vocab_first10"] = list(item2id.items())[:10]
report["sanity_samples"]["pairs_head3"] = pairs.head(3).to_dict(orient="records")
report["sanity_samples"]["pairs_counts"] = {
    "n_pairs": int(pairs.shape[0]),
    "n_sessions_in_pairs": int(pairs["session_id"].nunique()),
    "avg_prefix_len": float(pairs["prefix_len"].mean()),
    "p50_prefix_len": float(pairs["prefix_len"].median()),
}

note = "Pairs store variable-length prefix lists (no padding) to avoid introducing noise."
if note not in report["notes"]:
    report["notes"].append(note)

write_json_atomic(REPORT_PATH, report)
write_json_atomic(MANIFEST_PATH, manifest)

print("[CELL 03-07] updated:", REPORT_PATH)
print("[CELL 03-07] updated:", MANIFEST_PATH)

cell_end("CELL 03-07", t0, n_artifacts=len(manifest["artifacts"]))



[CELL 03-07] Write report + manifest
[CELL 03-07] start=2026-01-06T22:26:09
[CELL 03-07] updated: C:\anonymous-users-mooc-session-meta\reports\03_build_vocab_and_pairs_mars\20260106_222355\report.json
[CELL 03-07] updated: C:\anonymous-users-mooc-session-meta\reports\03_build_vocab_and_pairs_mars\20260106_222355\manifest.json
[CELL 03-07] n_artifacts=3
[CELL 03-07] elapsed=0.05s
[CELL 03-07] done
