Bootstrap + logger

In [1]:
# [CELL 06-00] Bootstrap: repo root + paths + logger

import json, time, uuid, hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict

import numpy as np
import pandas as pd

print(f"[CELL 06-00] start={datetime.now().isoformat(timespec='seconds')}")
print("[CELL 06-00] CWD:", Path.cwd().resolve())

def find_repo_root(start: Path) -> Path:
    start = start.resolve()
    for p in [start, *start.parents]:
        if (p / "PROJECT_STATE.md").exists():
            return p
    raise RuntimeError("Could not find PROJECT_STATE.md. Open notebook from within the repo.")

REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 06-00] REPO_ROOT:", REPO_ROOT)

PATHS = {
    "META_REGISTRY": REPO_ROOT / "meta.json",
    "DATA_INTERIM": REPO_ROOT / "data" / "interim",
    "DATA_PROCESSED": REPO_ROOT / "data" / "processed",
    "REPORTS": REPO_ROOT / "reports",
}
for k, v in PATHS.items():
    print(f"[CELL 06-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    t = time.time()
    print(f"\n[{cell_id}] {title}")
    print(f"[{cell_id}] start={datetime.now().isoformat(timespec='seconds')}")
    for k, v in kwargs.items():
        print(f"[{cell_id}] {k}={v}")
    return t

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    for k, v in kwargs.items():
        print(f"[{cell_id}] {k}={v}")
    print(f"[{cell_id}] elapsed={time.time()-t0:.2f}s")
    print(f"[{cell_id}] done")

print("[CELL 06-00] done")


[CELL 06-00] start=2026-01-06T23:42:02
[CELL 06-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 06-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 06-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 06-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 06-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 06-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 06-00] done


JSON IO + hashing (Timestamp-safe)

In [2]:
# [CELL 06-01] JSON IO + hashing (Timestamp-safe)

t0 = cell_start("CELL 06-01", "JSON IO + hashing")

def _json_default(o):
    try:
        import pandas as pd
        if isinstance(o, (pd.Timestamp,)):
            return o.isoformat()
    except Exception:
        pass
    try:
        import numpy as np
        if isinstance(o, (np.integer,)): return int(o)
        if isinstance(o, (np.floating,)): return float(o)
        if isinstance(o, (np.bool_,)): return bool(o)
    except Exception:
        pass
    try:
        from datetime import datetime, date
        if isinstance(o, (datetime, date)): return o.isoformat()
    except Exception:
        pass
    return str(o)

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    with tmp.open("w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=indent, default=_json_default)
    tmp.replace(path)

def read_json(path: Path) -> Any:
    if not path.exists():
        raise RuntimeError(f"Missing JSON file: {path}")
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        while True:
            b = f.read(chunk_size)
            if not b: break
            h.update(b)
    return h.hexdigest()

def safe_artifact_record(path: Path) -> Dict[str, Any]:
    rec = {"path": str(path), "bytes": int(path.stat().st_size), "sha256": None, "sha256_error": None}
    try:
        rec["sha256"] = sha256_file(path)
    except PermissionError as e:
        rec["sha256_error"] = f"PermissionError: {e}"
        print("[CELL 06-01] WARN: locked, cannot hash now:", path)
    return rec

cell_end("CELL 06-01", t0)



[CELL 06-01] JSON IO + hashing
[CELL 06-01] start=2026-01-06T23:42:05
[CELL 06-01] elapsed=0.00s
[CELL 06-01] done


Start run (report/config/manifest + meta.json)

In [3]:
# [CELL 06-02] Start run + init report/config/manifest + meta.json append-only

t0 = cell_start("CELL 06-02", "Start run")

NOTEBOOK_NAME = "06_baselines_mars_global"
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID = uuid.uuid4().hex

OUT_DIR = PATHS["REPORTS"] / NOTEBOOK_NAME / RUN_TAG
OUT_DIR.mkdir(parents=True, exist_ok=True)

REPORT_PATH = OUT_DIR / "report.json"
CONFIG_PATH = OUT_DIR / "config.json"
MANIFEST_PATH = OUT_DIR / "manifest.json"

DUCKDB_PATH = PATHS["DATA_INTERIM"] / "mars.duckdb"

BASE = PATHS["DATA_PROCESSED"] / "mars"
VOCAB_ITEM2ID = BASE / "vocab" / "item2id.json"
SPLIT_DIR = BASE / "user_splits"
EP_DIR = BASE / "episodes"

PAIRS_TRAIN_PQ = SPLIT_DIR / "pairs_train.parquet"
PAIRS_VAL_PQ   = SPLIT_DIR / "pairs_val.parquet"
PAIRS_TEST_PQ  = SPLIT_DIR / "pairs_test.parquet"

EP_INDEX_PQ = EP_DIR / "episodes_index.parquet"
EP_LONG_PQ  = EP_DIR / "episodes_long.parquet"

CFG = {
    "notebook": NOTEBOOK_NAME,
    "run_id": RUN_ID,
    "run_tag": RUN_TAG,
    "seed": 20260106,
    "inputs": {
        "duckdb_path": str(DUCKDB_PATH),
        "item2id": str(VOCAB_ITEM2ID),
        "pairs_train": str(PAIRS_TRAIN_PQ),
        "pairs_val": str(PAIRS_VAL_PQ),
        "pairs_test": str(PAIRS_TEST_PQ),
        "episodes_index": str(EP_INDEX_PQ),
        "episodes_long": str(EP_LONG_PQ),
    },
    "eval": {"cutoffs": [5, 10, 20]},
    "baselines": ["popularity"],  # we will add session-knn + gru4rec next
}
write_json_atomic(CONFIG_PATH, CFG)

report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "metrics": {},
    "key_findings": [],
    "sanity_samples": {},
    "data_fingerprints": {},
    "notes": [],
}
write_json_atomic(REPORT_PATH, report)
write_json_atomic(MANIFEST_PATH, {"run_id": RUN_ID, "notebook": NOTEBOOK_NAME, "run_tag": RUN_TAG, "artifacts": []})

META_PATH = PATHS["META_REGISTRY"]
if not META_PATH.exists():
    write_json_atomic(META_PATH, {"schema_version": 1, "runs": []})
meta = read_json(META_PATH)
meta["runs"].append({"run_id": RUN_ID, "notebook": NOTEBOOK_NAME, "run_tag": RUN_TAG, "out_dir": str(OUT_DIR),
                     "created_at": datetime.now().isoformat(timespec="seconds")})
write_json_atomic(META_PATH, meta)

print("[CELL 06-02] out_dir:", OUT_DIR)
cell_end("CELL 06-02", t0)



[CELL 06-02] Start run
[CELL 06-02] start=2026-01-06T23:42:10
[CELL 06-02] out_dir: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210
[CELL 06-02] elapsed=0.01s
[CELL 06-02] done


Validate required artifacts exist (hard fail if missing)

In [4]:
# [CELL 06-03] Validate required artifacts exist

t0 = cell_start("CELL 06-03", "Validate artifacts exist")

req = [VOCAB_ITEM2ID, PAIRS_TRAIN_PQ, PAIRS_VAL_PQ, PAIRS_TEST_PQ, EP_INDEX_PQ, EP_LONG_PQ]
missing = [str(p) for p in req if not Path(p).exists()]
if missing:
    raise RuntimeError("Missing required artifacts:\n" + "\n".join(missing))

print("[CELL 06-03] OK: all required files exist")
cell_end("CELL 06-03", t0)



[CELL 06-03] Validate artifacts exist
[CELL 06-03] start=2026-01-06T23:42:16
[CELL 06-03] OK: all required files exist
[CELL 06-03] elapsed=0.00s
[CELL 06-03] done


Global seeding + load config

In [5]:
# [CELL 06-04] Global seeding + load config

t0 = cell_start("CELL 06-04", "Seed everything + load config")

seed = int(CFG["seed"])
np.random.seed(seed)

print("[CELL 06-04] seed:", seed)
print("[CELL 06-04] run_tag:", RUN_TAG)
print("[CELL 06-04] OUT_DIR:", OUT_DIR)
print("[CELL 06-04] inputs:", CFG["inputs"])
print("[CELL 06-04] eval cutoffs:", CFG["eval"]["cutoffs"])

cell_end("CELL 06-04", t0)



[CELL 06-04] Seed everything + load config
[CELL 06-04] start=2026-01-06T23:42:18
[CELL 06-04] seed: 20260106
[CELL 06-04] run_tag: 20260106_234210
[CELL 06-04] OUT_DIR: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210
[CELL 06-04] inputs: {'duckdb_path': 'C:\\anonymous-users-mooc-session-meta\\data\\interim\\mars.duckdb', 'item2id': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\vocab\\item2id.json', 'pairs_train': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\user_splits\\pairs_train.parquet', 'pairs_val': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\user_splits\\pairs_val.parquet', 'pairs_test': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\user_splits\\pairs_test.parquet', 'episodes_index': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\episodes\\episodes_index.parquet', 'episodes_long': 'C:\\anonymous-users-mooc-session-meta\\data\\processed\\mars\\episodes\\ep

Load vocab + basic counts (sanity)

In [6]:
# [CELL 06-05] Load vocab + sanity

t0 = cell_start("CELL 06-05", "Load item2id + sanity")

item2id = read_json(Path(VOCAB_ITEM2ID))
n_items = len(item2id)

print("[CELL 06-05] n_items:", n_items)
print("[CELL 06-05] item2id first5:", list(item2id.items())[:5])

cell_end("CELL 06-05", t0, n_items=n_items)



[CELL 06-05] Load item2id + sanity
[CELL 06-05] start=2026-01-06T23:42:21
[CELL 06-05] n_items: 776
[CELL 06-05] item2id first5: [('510', 0), ('511', 1), ('512', 2), ('513', 3), ('514', 4)]
[CELL 06-05] n_items=776
[CELL 06-05] elapsed=0.00s
[CELL 06-05] done


Load split pairs (train/val/test)

In [7]:
# [CELL 06-06] Load pairs_train/val/test (parquet)

t0 = cell_start("CELL 06-06", "Load split pairs parquet")

pairs_train = pd.read_parquet(PAIRS_TRAIN_PQ)
pairs_val   = pd.read_parquet(PAIRS_VAL_PQ)
pairs_test  = pd.read_parquet(PAIRS_TEST_PQ)

print("[CELL 06-06] pairs_train shape:", pairs_train.shape)
print("[CELL 06-06] pairs_val shape:", pairs_val.shape)
print("[CELL 06-06] pairs_test shape:", pairs_test.shape)

need_cols = {"label", "prefix_len", "user_id", "session_id", "tpos"}
for name, df in [("train", pairs_train), ("val", pairs_val), ("test", pairs_test)]:
    missing = need_cols - set(df.columns)
    if missing:
        raise RuntimeError(f"Missing columns in pairs_{name}: {missing}")

print("[CELL 06-06] head3 test:")
print(pairs_test.head(3).to_string(index=False))

cell_end("CELL 06-06", t0)



[CELL 06-06] Load split pairs parquet
[CELL 06-06] start=2026-01-06T23:42:24
[CELL 06-06] pairs_train shape: (1932, 6)
[CELL 06-06] pairs_val shape: (191, 6)
[CELL 06-06] pairs_test shape: (214, 6)
[CELL 06-06] head3 test:
   session_id  user_id  tpos   prefix  prefix_len  label
158057_000003   158057     2    [686]           1    687
174528_000001   174528     2     [36]           1     37
174528_000001   174528     3 [36, 37]           2     40
[CELL 06-06] elapsed=0.08s
[CELL 06-06] done


Build popularity ranking from TRAIN (deterministic)

We rank items by label frequency in pairs_train. (Deterministic tie-break by item id.)

In [8]:
# [CELL 06-07] Popularity ranking from TRAIN labels (deterministic) ✅ FIXED

t0 = cell_start("CELL 06-07", "Build popularity ranking (train labels)")

# Count label frequencies
label_counts = (
    pairs_train["label"]
    .astype(int)
    .value_counts()
    .rename_axis("item")
    .reset_index(name="count")
)

# Deterministic tie-break: higher count first, then smaller item id
label_counts = label_counts.sort_values(["count", "item"], ascending=[False, True]).reset_index(drop=True)

print("[CELL 06-07] label_counts head10:")
print(label_counts.head(10).to_string(index=False))

# Build rank map: item -> rank (1-index)
rank_map = {int(row["item"]): int(i + 1) for i, row in label_counts.iterrows()}

# Top-N list for retrieval
cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
topN = int(max(cutoffs))
top_items = label_counts["item"].astype(int).tolist()[:topN]

print("[CELL 06-07] topN:", topN)
print("[CELL 06-07] top_items:", top_items[:20])

# Save artifact for reproducibility
pop_path = OUT_DIR / "popularity_rank_top.json"
write_json_atomic(pop_path, {"topN": topN, "top_items": top_items, "seed": seed})

print("[CELL 06-07] wrote:", pop_path)

cell_end("CELL 06-07", t0, n_ranked=len(rank_map))



[CELL 06-07] Build popularity ranking (train labels)
[CELL 06-07] start=2026-01-06T23:42:28
[CELL 06-07] label_counts head10:
 item  count
    2     36
  435     21
    0     20
  257     19
  436     18
  339     17
  437     17
  398     15
  258     14
   36     13
[CELL 06-07] topN: 20
[CELL 06-07] top_items: [2, 435, 0, 257, 436, 339, 437, 398, 258, 36, 37, 438, 38, 39, 338, 397, 459, 733, 41, 399]
[CELL 06-07] wrote: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\popularity_rank_top.json
[CELL 06-07] n_ranked=658
[CELL 06-07] elapsed=0.03s
[CELL 06-07] done


Metrics helpers (HR/MRR/NDCG)

In [9]:
# [CELL 06-08] Metrics helpers (HR@K, MRR@K, NDCG@K)

t0 = cell_start("CELL 06-08", "Define metrics functions")

import math

def eval_one_label_rank(rank: int, K: int) -> Dict[str, float]:
    """
    rank: 1-index rank of the true item. If not present => large rank.
    """
    if rank <= K:
        hr = 1.0
        mrr = 1.0 / float(rank)
        ndcg = 1.0 / math.log2(float(rank) + 1.0)
    else:
        hr = 0.0
        mrr = 0.0
        ndcg = 0.0
    return {"HR": hr, "MRR": mrr, "NDCG": ndcg}

def eval_popularity(df: pd.DataFrame, rank_map: Dict[int,int], cutoffs: list[int]) -> Dict[str, float]:
    """
    Evaluate popularity ranking for next-item prediction.
    df must have column 'label' as int.
    """
    labels = df["label"].astype(int).tolist()

    # If label not in rank_map (unseen in train), rank = +inf (miss)
    ranks = [rank_map.get(int(y), 10**9) for y in labels]

    out = {}
    for K in cutoffs:
        hr = 0.0
        mrr = 0.0
        ndcg = 0.0
        for r in ranks:
            m = eval_one_label_rank(r, int(K))
            hr += m["HR"]
            mrr += m["MRR"]
            ndcg += m["NDCG"]
        n = float(len(ranks)) if len(ranks) else 1.0
        out[f"HR@{K}"] = hr / n
        out[f"MRR@{K}"] = mrr / n
        out[f"NDCG@{K}"] = ndcg / n
    out["n"] = int(len(ranks))
    return out

cell_end("CELL 06-08", t0)



[CELL 06-08] Define metrics functions
[CELL 06-08] start=2026-01-06T23:42:33
[CELL 06-08] elapsed=0.00s
[CELL 06-08] done


Evaluate popularity on GLOBAL TEST

In [10]:
# [CELL 06-09] Popularity baseline: GLOBAL test evaluation

t0 = cell_start("CELL 06-09", "Evaluate popularity baseline on pairs_test")

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
res_global_test = eval_popularity(pairs_test, rank_map, cutoffs)

print("[CELL 06-09] GLOBAL_TEST(pop) metrics:", res_global_test)

# write into report
report = read_json(REPORT_PATH)
report["metrics"]["popularity_global_test"] = res_global_test
report["key_findings"].append("Computed popularity baseline on global test (pairs_test).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-09", t0)



[CELL 06-09] Evaluate popularity baseline on pairs_test
[CELL 06-09] start=2026-01-06T23:42:37
[CELL 06-09] GLOBAL_TEST(pop) metrics: {'HR@5': 0.0794392523364486, 'MRR@5': 0.029049844236760124, 'NDCG@5': 0.041452696484700624, 'HR@10': 0.11682242990654206, 'MRR@10': 0.034292018988280666, 'NDCG@10': 0.05379310735740311, 'HR@20': 0.19158878504672897, 'MRR@20': 0.03934627281372891, 'NDCG@20': 0.07251532685907323, 'n': 214}
[CELL 06-09] elapsed=0.01s
[CELL 06-09] done


Evaluate popularity on EPISODE TEST (query only)

This uses episodes_index/long to select only query pairs from test episodes.

In [11]:
# [CELL 06-10] Popularity baseline: EPISODE test evaluation (query only)

t0 = cell_start("CELL 06-10", "Evaluate popularity on episode test (query only)")

# Load episode tables
ep_index = pd.read_parquet(EP_INDEX_PQ)
ep_long  = pd.read_parquet(EP_LONG_PQ)

print("[CELL 06-10] ep_index shape:", ep_index.shape)
print("[CELL 06-10] ep_long shape:", ep_long.shape)

# Only test episodes, and only feasible K values (we keep generic)
test_eps = ep_index[ep_index["split"] == "test"].copy()
print("[CELL 06-10] test_eps shape:", test_eps.shape)

if test_eps.shape[0] == 0:
    print("[CELL 06-10] WARNING: no test episodes available. We don't know yet for episodic test.")
    res_ep_test = {"n": 0}
else:
    # Query rows only for those episodes
    test_ep_ids = set(test_eps["episode_id"].astype(str).tolist())
    qrows = ep_long[(ep_long["episode_id"].isin(test_ep_ids)) & (ep_long["role"] == "query")].copy()
    print("[CELL 06-10] query rows:", qrows.shape)

    # Fetch labels by joining pair_id to mars_pairs_test_ts (view created in Notebook 05)
    import duckdb
    con_ep = duckdb.connect(str(DUCKDB_PATH), read_only=True)

    # Pull all needed labels in one go
    pair_ids = qrows["pair_id"].astype(int).drop_duplicates().tolist()
    if len(pair_ids) == 0:
        res_ep_test = {"n": 0}
    else:
        # Build IN list safely (small list)
        in_list = ",".join([str(int(x)) for x in pair_ids])
        df_labels = con_ep.execute(f"""
        SELECT pair_id, CAST(label AS INTEGER) AS label
        FROM mars_pairs_test_ts
        WHERE pair_id IN ({in_list})
        """).fetchdf()

        con_ep.close()

        if df_labels.shape[0] != len(pair_ids):
            print("[CELL 06-10] WARN: some pair_ids missing from mars_pairs_test_ts",
                  "expected", len(pair_ids), "got", df_labels.shape[0])

        # Merge back to get per-query true labels
        q_eval = qrows.merge(df_labels, on="pair_id", how="inner")
        print("[CELL 06-10] q_eval shape (after join):", q_eval.shape)

        res_ep_test = eval_popularity(q_eval, rank_map, cutoffs)

print("[CELL 06-10] EPISODE_TEST(pop) metrics:", res_ep_test)

report = read_json(REPORT_PATH)
report["metrics"]["popularity_episode_test_query"] = res_ep_test
report["key_findings"].append("Computed popularity baseline on episodic test query set (episodes_index/long).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-10", t0)



[CELL 06-10] Evaluate popularity on episode test (query only)
[CELL 06-10] start=2026-01-06T23:42:40
[CELL 06-10] ep_index shape: (53, 9)
[CELL 06-10] ep_long shape: (1713, 4)
[CELL 06-10] test_eps shape: (2, 9)
[CELL 06-10] query rows: (40, 4)
[CELL 06-10] q_eval shape (after join): (40, 5)
[CELL 06-10] EPISODE_TEST(pop) metrics: {'HR@5': 0.0, 'MRR@5': 0.0, 'NDCG@5': 0.0, 'HR@10': 0.0, 'MRR@10': 0.0, 'NDCG@10': 0.0, 'HR@20': 0.05, 'MRR@20': 0.0029411764705882353, 'NDCG@20': 0.011990623328406573, 'n': 40}
[CELL 06-10] elapsed=0.11s
[CELL 06-10] done


Write manifest (artifacts) + close out

In [12]:
# [CELL 06-11] Update manifest (artifacts)

t0 = cell_start("CELL 06-11", "Write manifest artifacts")

manifest = read_json(MANIFEST_PATH)

# record key artifacts for this run
for p in [Path(CONFIG_PATH), Path(REPORT_PATH), pop_path]:
    manifest["artifacts"].append(safe_artifact_record(Path(p)))

write_json_atomic(MANIFEST_PATH, manifest)
print("[CELL 06-11] updated:", MANIFEST_PATH)

cell_end("CELL 06-11", t0, n_artifacts=len(manifest["artifacts"]))



[CELL 06-11] Write manifest artifacts
[CELL 06-11] start=2026-01-06T23:42:44
[CELL 06-11] updated: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\manifest.json
[CELL 06-11] n_artifacts=3
[CELL 06-11] elapsed=0.04s
[CELL 06-11] done


Load sessionized tables (gap30m) + sanity

In [13]:
# [CELL 06-12] Load sessionized MARS sessions/events (gap30m) for Session-KNN  ✅ FIXED schema

t0 = cell_start("CELL 06-12", "Load sessions/events gap30m (parquet)")

SESS_DIR = PATHS["DATA_PROCESSED"] / "mars" / "sessions"
SESSIONS_PQ = SESS_DIR / "sessions_gap30m.parquet"
EVENTS_PQ   = SESS_DIR / "events_gap30m.parquet"

for p in [SESSIONS_PQ, EVENTS_PQ]:
    if not p.exists():
        raise RuntimeError(f"Missing required session file: {p}")

sessions = pd.read_parquet(SESSIONS_PQ)
events   = pd.read_parquet(EVENTS_PQ)

print("[CELL 06-12] sessions shape:", sessions.shape)
print("[CELL 06-12] events shape:", events.shape)
print("[CELL 06-12] sessions cols:", list(sessions.columns))
print("[CELL 06-12] events cols:", list(events.columns))

print("[CELL 06-12] sessions head3:")
print(sessions.head(3).to_string(index=False))
print("[CELL 06-12] events head3:")
print(events.head(3).to_string(index=False))

# ---- normalize sessions schema ----
# expected canonical:
# session_id, user_id, start_ts_epoch, end_ts_epoch, n_events
if "start_ts_epoch" not in sessions.columns:
    if "session_start_ts" in sessions.columns:
        sessions["start_ts_epoch"] = pd.to_datetime(sessions["session_start_ts"], utc=True).astype("int64") // 10**9
    else:
        raise RuntimeError("sessions missing start_ts_epoch and session_start_ts")

if "end_ts_epoch" not in sessions.columns:
    if "session_end_ts" in sessions.columns:
        sessions["end_ts_epoch"] = pd.to_datetime(sessions["session_end_ts"], utc=True).astype("int64") // 10**9
    else:
        raise RuntimeError("sessions missing end_ts_epoch and session_end_ts")

need_sess_cols = {"session_id", "user_id", "start_ts_epoch", "end_ts_epoch", "n_events"}
need_ev_cols   = {"session_id", "user_id", "ts_epoch", "pos_in_sess", "item_id"}

miss_s = need_sess_cols - set(sessions.columns)
miss_e = need_ev_cols - set(events.columns)
if miss_s:
    raise RuntimeError(f"Missing columns in sessions_gap30m after normalization: {miss_s}")
if miss_e:
    raise RuntimeError(f"Missing columns in events_gap30m: {miss_e}")

# types
sessions["session_id"] = sessions["session_id"].astype(str)
sessions["user_id"] = sessions["user_id"].astype(str)
events["session_id"] = events["session_id"].astype(str)
events["user_id"] = events["user_id"].astype(str)

print("[CELL 06-12] normalized sessions head3 (with epochs):")
print(sessions[["session_id","user_id","start_ts_epoch","end_ts_epoch","n_events"]].head(3).to_string(index=False))

cell_end("CELL 06-12", t0)



[CELL 06-12] Load sessions/events gap30m (parquet)
[CELL 06-12] start=2026-01-06T23:42:47
[CELL 06-12] sessions shape: (1322, 8)
[CELL 06-12] events shape: (3659, 10)
[CELL 06-12] sessions cols: ['session_id', 'user_id', 'sess_num', 'session_start_ts', 'session_end_ts', 'n_events', 'duration_sec', 'n_unique_items']
[CELL 06-12] events cols: ['user_id', 'item_id', 'rating', 'ts', 'ts_epoch', 'sess_num', 'session_id', 'pos_in_sess', 'sess_len', 'ts_raw']
[CELL 06-12] sessions head3:
   session_id  user_id  sess_num          session_start_ts            session_end_ts  n_events  duration_sec  n_unique_items
266810_000001   266810       1.0 2018-10-15 10:00:59+00:00 2018-10-15 10:00:59+00:00         1             0               1
411044_000002   411044       2.0 2020-04-11 19:23:35+00:00 2020-04-11 19:23:35+00:00         1             0               1
534018_000001   534018       1.0 2020-06-22 03:25:57+00:00 2020-06-22 03:25:57+00:00         1             0               1
[CELL 06-12] 

Attach split labels to sessions/events (train/val/test users)

In [14]:
# [CELL 06-13] Tag sessions/events with user split (train/val/test)

t0 = cell_start("CELL 06-13", "Attach split labels via user_split_map")

split_map_pq = PATHS["DATA_PROCESSED"] / "mars" / "user_splits" / "user_split_map.parquet"
if not split_map_pq.exists():
    raise RuntimeError(f"Missing split map: {split_map_pq}")

split_map = pd.read_parquet(split_map_pq)
split_map["user_id"] = split_map["user_id"].astype(str)

# cast ids to string for safe joins
sessions["user_id"] = sessions["user_id"].astype(str)
events["user_id"] = events["user_id"].astype(str)

sessions = sessions.merge(split_map, on="user_id", how="left")
events   = events.merge(split_map, on="user_id", how="left")

if sessions["split"].isna().any():
    n_na = int(sessions["split"].isna().sum())
    raise RuntimeError(f"Found {n_na} sessions with missing split label (user_id not in split_map).")

if events["split"].isna().any():
    n_na = int(events["split"].isna().sum())
    raise RuntimeError(f"Found {n_na} events with missing split label (user_id not in split_map).")

print("[CELL 06-13] sessions by split:")
print(sessions["split"].value_counts().to_string())
print("[CELL 06-13] events by split:")
print(events["split"].value_counts().to_string())

cell_end("CELL 06-13", t0)



[CELL 06-13] Attach split labels via user_split_map
[CELL 06-13] start=2026-01-06T23:42:52
[CELL 06-13] sessions by split:
split
train    1084
test      129
val       109
[CELL 06-13] events by split:
split
train    3016
test      343
val       300
[CELL 06-13] elapsed=0.01s
[CELL 06-13] done


Build session sequences (item lists) per split

We’ll build sequences from train sessions only, and evaluate on test sessions by predicting the next item at each step.

In [15]:
# [CELL 06-14] Build session sequences (ordered item_id lists) per split

t0 = cell_start("CELL 06-14", "Build sequences per session")

# ensure proper ordering
events_sorted = events.sort_values(["session_id", "pos_in_sess", "ts_epoch"], ascending=[True, True, True]).copy()

# group to sequences
seq_df = (
    events_sorted.groupby(["session_id", "split"], as_index=False)
    .agg(user_id=("user_id", "first"),
         items=("item_id", lambda x: [int(v) for v in x.tolist()]),
         n=("item_id", "size"))
)

print("[CELL 06-14] seq_df shape:", seq_df.shape)
print("[CELL 06-14] seq_df head3:")
print(seq_df.head(3).to_string(index=False))

# Keep only sessions with length >= 2 (needed for next-item prediction)
seq_df = seq_df[seq_df["n"] >= 2].reset_index(drop=True)

train_sessions = seq_df[seq_df["split"] == "train"].reset_index(drop=True)
val_sessions   = seq_df[seq_df["split"] == "val"].reset_index(drop=True)
test_sessions  = seq_df[seq_df["split"] == "test"].reset_index(drop=True)

print("[CELL 06-14] sessions >=2 by split:",
      {"train": int(train_sessions.shape[0]), "val": int(val_sessions.shape[0]), "test": int(test_sessions.shape[0])})

cell_end("CELL 06-14", t0)



[CELL 06-14] Build sequences per session
[CELL 06-14] start=2026-01-06T23:42:55
[CELL 06-14] seq_df shape: (1322, 5)
[CELL 06-14] seq_df head3:
   session_id split user_id                                                                                                                          items  n
104074_000001 train  104074                                                                                                                        [32033]  1
104074_000002 train  104074                                                         [52609, 52616, 52615, 52610, 52614, 52618, 52611, 52612, 52617, 52619] 10
104074_000003 train  104074 [45209, 45206, 45207, 45211, 45214, 45212, 45213, 45215, 45216, 45224, 45225, 45226, 45227, 45219, 45223, 45232, 45233, 45234] 18
[CELL 06-14] sessions >=2 by split: {'train': 450, 'val': 51, 'test': 60}
[CELL 06-14] elapsed=0.04s
[CELL 06-14] done


Session-KNN implementation (simple, logged)

This is a standard Session-KNN style:
- represent each session as set of items
- similarity = cosine over binary vectors (implemented via overlap / sqrt(lenA*lenB))
- score candidate items by weighted sum from top-N neighbor sessions
- evaluate next-item prediction

In [16]:
# [CELL 06-15] Session-KNN baseline (train sessions -> predict next items)

t0 = cell_start("CELL 06-15", "Session-KNN baseline")

import math
from collections import defaultdict

KNN_K = 100         # neighbors
KNN_SAMPLE = 5000   # max train sessions to consider (speed cap)
cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
maxK = max(cutoffs)

rng = np.random.default_rng(seed)

# Prepare train session index
train_items = train_sessions["items"].tolist()
train_lens = np.array([len(s) for s in train_items], dtype=np.int32)

# Optional speed cap: sample train sessions deterministically
if len(train_items) > KNN_SAMPLE:
    idx = rng.permutation(len(train_items))[:KNN_SAMPLE]
    train_items = [train_items[i] for i in idx]
    train_lens = train_lens[idx]
    print(f"[CELL 06-15] sampled train sessions for speed: {len(train_items)}")
else:
    print(f"[CELL 06-15] using all train sessions: {len(train_items)}")

# Build inverted index: item -> list of train session indices
inv = defaultdict(list)
for si, items in enumerate(train_items):
    # unique items per session for overlap-based similarity
    for it in set(items):
        inv[int(it)].append(si)

def session_sim(a_set, b_set, len_a, len_b) -> float:
    # cosine on binary vectors: |A∩B| / sqrt(|A||B|)
    inter = len(a_set & b_set)
    if inter == 0:
        return 0.0
    return float(inter) / math.sqrt(float(len_a) * float(len_b))

def recommend_next(prefix_items, exclude_set, topn=maxK):
    """
    prefix_items: list[int] observed so far in the test session
    exclude_set: items already seen in prefix (avoid recommending repeats)
    """
    a_set = set(prefix_items)
    len_a = len(a_set) if len(a_set) else 1

    # Candidate neighbor sessions: union of sessions containing any item in prefix
    cand_sessions = set()
    for it in a_set:
        cand_sessions.update(inv.get(int(it), []))

    if not cand_sessions:
        return []  # no neighbors

    # Score neighbor similarity
    sims = []
    for si in cand_sessions:
        b = set(train_items[si])
        s = session_sim(a_set, b, len_a, len(b) if len(b) else 1)
        if s > 0:
            sims.append((si, s))

    if not sims:
        return []

    sims.sort(key=lambda x: x[1], reverse=True)
    sims = sims[:KNN_K]

    # Score items from neighbors
    score = defaultdict(float)
    for si, s in sims:
        for it in train_items[si]:
            it = int(it)
            if it in exclude_set:
                continue
            score[it] += s

    if not score:
        return []

    ranked = sorted(score.items(), key=lambda x: (-x[1], x[0]))
    return [it for it, _ in ranked[:topn]]

def eval_sessions_knn(sess_df: pd.DataFrame, max_steps_per_session: int = 50) -> Dict[str, float]:
    """
    Evaluate next-item prediction on sessions.
    For each session, for t=1..len-1, predict item[t] from prefix item[:t].
    """
    hr = {K: 0.0 for K in cutoffs}
    mrr = {K: 0.0 for K in cutoffs}
    ndcg = {K: 0.0 for K in cutoffs}
    n = 0

    for items in sess_df["items"].tolist():
        L = len(items)
        steps = min(L - 1, max_steps_per_session)
        for t in range(1, 1 + steps):
            prefix = items[:t]
            true = int(items[t])
            recs = recommend_next(prefix, exclude_set=set(prefix), topn=maxK)
            n += 1
            # rank of true
            if true in recs:
                r = recs.index(true) + 1
            else:
                r = 10**9
            for K in cutoffs:
                if r <= K:
                    hr[K] += 1.0
                    mrr[K] += 1.0 / float(r)
                    ndcg[K] += 1.0 / math.log2(float(r) + 1.0)

    out = {"n": int(n)}
    denom = float(n) if n else 1.0
    for K in cutoffs:
        out[f"HR@{K}"] = hr[K] / denom
        out[f"MRR@{K}"] = mrr[K] / denom
        out[f"NDCG@{K}"] = ndcg[K] / denom
    return out

# Evaluate on global TEST sessions
res_sknn_test = eval_sessions_knn(test_sessions)

print("[CELL 06-15] SESSION_KNN global TEST metrics:", res_sknn_test)

# Save into report
report = read_json(REPORT_PATH)
report["metrics"]["session_knn_global_test"] = res_sknn_test
report["key_findings"].append("Computed Session-KNN baseline on test sessions (next-item prediction).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-15", t0)



[CELL 06-15] Session-KNN baseline
[CELL 06-15] start=2026-01-06T23:43:00
[CELL 06-15] using all train sessions: 450
[CELL 06-15] SESSION_KNN global TEST metrics: {'n': 214, 'HR@5': 0.5186915887850467, 'MRR@5': 0.33021806853582564, 'NDCG@5': 0.3770792651352578, 'HR@10': 0.5934579439252337, 'MRR@10': 0.3405670523661179, 'NDCG@10': 0.40162067741386975, 'HR@20': 0.6495327102803738, 'MRR@20': 0.3446143502341241, 'NDCG@20': 0.41598271648375024}
[CELL 06-15] elapsed=0.04s
[CELL 06-15] done


Load episode test query targets (pair_id → session_id, tpos, label)

In [17]:
# [CELL 06-16] Build episodic test query targets (session_id, tpos, label) from pair_id

t0 = cell_start("CELL 06-16", "Episode test query targets from mars_pairs_test_ts")

ep_index = pd.read_parquet(EP_INDEX_PQ)
ep_long  = pd.read_parquet(EP_LONG_PQ)

test_eps = ep_index[ep_index["split"] == "test"].copy()
print("[CELL 06-16] test_eps:", test_eps.shape)

if test_eps.shape[0] == 0:
    raise RuntimeError("No test episodes found. Cannot evaluate episodic Session-KNN.")

test_ep_ids = set(test_eps["episode_id"].astype(str).tolist())
qrows = ep_long[(ep_long["episode_id"].isin(test_ep_ids)) & (ep_long["role"] == "query")].copy()
q_pair_ids = qrows["pair_id"].astype(int).drop_duplicates().tolist()
print("[CELL 06-16] unique query pair_ids:", len(q_pair_ids), "total query rows:", qrows.shape[0])

import duckdb
con_ro = duckdb.connect(str(DUCKDB_PATH), read_only=True)

# Fetch pair metadata from mars_pairs_test_ts (created in Notebook 05)
in_list = ",".join([str(int(x)) for x in q_pair_ids])
pairs_meta = con_ro.execute(f"""
SELECT
  pair_id,
  CAST(user_id AS VARCHAR) AS user_id,
  CAST(session_id AS VARCHAR) AS session_id,
  CAST(tpos AS INTEGER) AS tpos,
  CAST(label AS INTEGER) AS label,
  CAST(label_ts_epoch AS BIGINT) AS label_ts_epoch
FROM mars_pairs_test_ts
WHERE pair_id IN ({in_list})
""").fetchdf()

con_ro.close()

print("[CELL 06-16] pairs_meta shape:", pairs_meta.shape)
print("[CELL 06-16] pairs_meta head5:")
print(pairs_meta.head(5).to_string(index=False))

# Join to add episode_id to each query row (some pair_id may repeat across episodes; keep rows)
q_eval = qrows.merge(pairs_meta, on="pair_id", how="inner")
print("[CELL 06-16] q_eval shape:", q_eval.shape)

if q_eval.shape[0] != qrows.shape[0]:
    print("[CELL 06-16] WARN: some query rows missing after join",
          "expected", qrows.shape[0], "got", q_eval.shape[0])

cell_end("CELL 06-16", t0)



[CELL 06-16] Episode test query targets from mars_pairs_test_ts
[CELL 06-16] start=2026-01-06T23:43:07
[CELL 06-16] test_eps: (2, 9)
[CELL 06-16] unique query pair_ids: 25 total query rows: 40
[CELL 06-16] pairs_meta shape: (25, 6)
[CELL 06-16] pairs_meta head5:
 pair_id user_id    session_id  tpos  label  label_ts_epoch
      25  234863 234863_000001     7    441      1538634777
      26  234863 234863_000001     8    442      1538635313
      27  234863 234863_000001     9    443      1538635501
      28  234863 234863_000001    10    444      1538635701
      29  234863 234863_000001    11    445      1538636791
[CELL 06-16] q_eval shape: (40, 9)
[CELL 06-16] elapsed=0.04s
[CELL 06-16] done


Build fast lookup: session_id → ordered items list

In [18]:
# [CELL 06-17] Build session->ordered items lookup for test sessions

t0 = cell_start("CELL 06-17", "Build session_id -> ordered items lookup")

# events_sorted was created in 06-14; ensure it's available
try:
    _ = events_sorted.shape
except Exception:
    events_sorted = events.sort_values(["session_id", "pos_in_sess", "ts_epoch"], ascending=[True, True, True]).copy()

# Build dict only for sessions we need (from q_eval)
need_sessions = set(q_eval["session_id"].astype(str).tolist())
print("[CELL 06-17] need_sessions:", len(need_sessions))

ev_need = events_sorted[events_sorted["session_id"].astype(str).isin(need_sessions)].copy()

# Create ordered list per session
sess_items = (
    ev_need.groupby("session_id")["item_id"]
    .apply(lambda x: [int(v) for v in x.tolist()])
    .to_dict()
)

print("[CELL 06-17] built sess_items:", len(sess_items))
# sanity sample
sample_k = next(iter(sess_items.keys()))
print("[CELL 06-17] sample session:", sample_k, "len:", len(sess_items[sample_k]), "items[:10]:", sess_items[sample_k][:10])

cell_end("CELL 06-17", t0)



[CELL 06-17] Build session_id -> ordered items lookup
[CELL 06-17] start=2026-01-06T23:43:13
[CELL 06-17] need_sessions: 1
[CELL 06-17] built sess_items: 1
[CELL 06-17] sample session: 234863_000001 len: 31 items[:10]: [43457, 43458, 43459, 43460, 43461, 43462, 43463, 43464, 43465, 43466]
[CELL 06-17] elapsed=0.00s
[CELL 06-17] done


Episodic Session-KNN evaluation (query-only)

In [19]:
# [CELL 06-18] Session-KNN episodic test evaluation (query-only points)

t0 = cell_start("CELL 06-18", "Session-KNN episodic test evaluation (query-only)")

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
maxK = max(cutoffs)

import math

def eval_rank(rank: int, K: int) -> Dict[str, float]:
    if rank <= K:
        return {
            "HR": 1.0,
            "MRR": 1.0 / float(rank),
            "NDCG": 1.0 / math.log2(float(rank) + 1.0),
        }
    return {"HR": 0.0, "MRR": 0.0, "NDCG": 0.0}

hr = {K: 0.0 for K in cutoffs}
mrr = {K: 0.0 for K in cutoffs}
ndcg = {K: 0.0 for K in cutoffs}
n = 0

miss_prefix = 0

for row in q_eval.itertuples(index=False):
    # reconstruct prefix from session items up to tpos-1 (tpos is 1-indexed in your data)
    sid = str(row.session_id)
    tpos = int(row.tpos)
    true = int(row.label)

    items = sess_items.get(sid)
    if items is None:
        miss_prefix += 1
        continue

    # events pos_in_sess starts at 1, so prefix length = tpos-1
    pref_len = max(tpos - 1, 1)
    prefix = items[:pref_len]

    recs = recommend_next(prefix, exclude_set=set(prefix), topn=maxK)

    n += 1
    if true in recs:
        r = recs.index(true) + 1
    else:
        r = 10**9

    for K in cutoffs:
        m = eval_rank(r, int(K))
        hr[K] += m["HR"]
        mrr[K] += m["MRR"]
        ndcg[K] += m["NDCG"]

out = {"n": int(n), "miss_prefix": int(miss_prefix)}
den = float(n) if n else 1.0
for K in cutoffs:
    out[f"HR@{K}"] = hr[K] / den
    out[f"MRR@{K}"] = mrr[K] / den
    out[f"NDCG@{K}"] = ndcg[K] / den

print("[CELL 06-18] SESSION_KNN episode TEST (query-only) metrics:", out)

report = read_json(REPORT_PATH)
report["metrics"]["session_knn_episode_test_query"] = out
report["key_findings"].append("Computed Session-KNN baseline on episodic test query points (cold-start protocol).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-18", t0)



[CELL 06-18] Session-KNN episodic test evaluation (query-only)
[CELL 06-18] start=2026-01-06T23:43:15
[CELL 06-18] SESSION_KNN episode TEST (query-only) metrics: {'n': 40, 'miss_prefix': 0, 'HR@5': 0.0, 'MRR@5': 0.0, 'NDCG@5': 0.0, 'HR@10': 0.0, 'MRR@10': 0.0, 'NDCG@10': 0.0, 'HR@20': 0.0, 'MRR@20': 0.0, 'NDCG@20': 0.0}
[CELL 06-18] elapsed=0.02s
[CELL 06-18] done


Diagnose why episodic Session-KNN = 0

In [20]:
# [CELL 06-19] Diagnose episodic Session-KNN failures (why all zeros)

t0 = cell_start("CELL 06-19", "Diagnose episodic Session-KNN=0")

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
maxK = max(cutoffs)

# Build train item set (from train sessions used by Session-KNN)
train_item_set = set()
for s in train_items:
    for it in s:
        train_item_set.add(int(it))
print("[CELL 06-19] train_item_set size:", len(train_item_set))

stats = {
    "n": 0,
    "recs_empty": 0,
    "no_candidate_sessions": 0,
    "true_in_prefix": 0,
    "true_not_in_train": 0,
    "avg_candidates": 0.0,
    "avg_recs_len": 0.0,
}

# helper: count candidate sessions quickly (same logic used in recommend_next)
def count_candidates(prefix_items):
    a_set = set(prefix_items)
    cand = set()
    for it in a_set:
        cand.update(inv.get(int(it), []))
    return len(cand)

for row in q_eval.itertuples(index=False):
    sid = str(row.session_id)
    tpos = int(row.tpos)
    true = int(row.label)

    items = sess_items.get(sid)
    if items is None:
        continue

    pref_len = max(tpos - 1, 0)
    prefix = items[:pref_len]

    stats["n"] += 1

    if true in set(prefix):
        stats["true_in_prefix"] += 1

    if true not in train_item_set:
        stats["true_not_in_train"] += 1

    nc = count_candidates(prefix) if len(prefix) else 0
    stats["avg_candidates"] += float(nc)
    if nc == 0:
        stats["no_candidate_sessions"] += 1

    recs = recommend_next(prefix, exclude_set=set(prefix), topn=maxK)
    stats["avg_recs_len"] += float(len(recs))
    if len(recs) == 0:
        stats["recs_empty"] += 1

# finalize averages
den = float(stats["n"]) if stats["n"] else 1.0
stats["avg_candidates"] /= den
stats["avg_recs_len"] /= den

print("[CELL 06-19] diagnostics:", stats)

# Also show a tiny sample of problematic rows
sample = q_eval.head(10)[["episode_id","session_id","tpos","label"]].copy()
sample["label_in_train"] = sample["label"].astype(int).apply(lambda x: int(x in train_item_set))
print("[CELL 06-19] sample query points head10:")
print(sample.to_string(index=False))

cell_end("CELL 06-19", t0)



[CELL 06-19] Diagnose episodic Session-KNN=0
[CELL 06-19] start=2026-01-06T23:43:20
[CELL 06-19] train_item_set size: 703
[CELL 06-19] diagnostics: {'n': 40, 'recs_empty': 0, 'no_candidate_sessions': 0, 'true_in_prefix': 0, 'true_not_in_train': 40, 'avg_candidates': 64.425, 'avg_recs_len': 20.0}
[CELL 06-19] sample query points head10:
         episode_id    session_id  tpos  label  label_in_train
test_K5_Q20_e000051 234863_000001     7    441               0
test_K5_Q20_e000051 234863_000001     8    442               0
test_K5_Q20_e000051 234863_000001     9    443               0
test_K5_Q20_e000051 234863_000001    10    444               0
test_K5_Q20_e000051 234863_000001    11    445               0
test_K5_Q20_e000051 234863_000001    12    445               0
test_K5_Q20_e000051 234863_000001    13    446               0
test_K5_Q20_e000051 234863_000001    14    447               0
test_K5_Q20_e000051 234863_000001    15    448               0
test_K5_Q20_e000051 234863_0000

Record coverage finding in report (no re-eval)

In [21]:
# [CELL 06-20] Record item-coverage finding (episodic test labels unseen in train)

t0 = cell_start("CELL 06-20", "Write coverage finding to report")

report = read_json(REPORT_PATH)

coverage_note = {
    "episode_test_query_n": int(40),
    "episode_test_query_true_not_in_train": int(40),
    "coverage_rate": float(0.0),
    "implication": "All episodic test query labels are unseen in train sessions; Session-KNN (and any train-item-only model) cannot hit them -> HR/MRR/NDCG = 0 by definition."
}

report["sanity_samples"]["episode_test_item_coverage"] = coverage_note
report["key_findings"].append("Episodic test query labels have 0% item coverage in train sessions; Session-KNN yields 0 by definition under this protocol.")

write_json_atomic(REPORT_PATH, report)

print("[CELL 06-20] wrote coverage note into report.json")

cell_end("CELL 06-20", t0)



[CELL 06-20] Write coverage finding to report
[CELL 06-20] start=2026-01-06T23:43:25
[CELL 06-20] wrote coverage note into report.json
[CELL 06-20] elapsed=0.01s
[CELL 06-20] done


GRU4Rec - Baseline

Torch seeding + device

In [22]:
# [CELL 06-21] Torch seed + device

t0 = cell_start("CELL 06-21", "Torch seeding + device")

import torch

torch.manual_seed(seed)
torch.use_deterministic_algorithms(False)  # CPU GRU is deterministic enough; avoid hard errors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("[CELL 06-21] torch:", torch.__version__)
print("[CELL 06-21] device:", device)

cell_end("CELL 06-21", t0)



[CELL 06-21] Torch seeding + device
[CELL 06-21] start=2026-01-06T23:43:29
[CELL 06-21] torch: 2.6.0+cu124
[CELL 06-21] device: cuda
[CELL 06-21] elapsed=7.76s
[CELL 06-21] done


Map events to vocab item indices + build session_id → idx-sequence

This makes sequences consistent with label in pairs (which is already item-index).

In [23]:
# [CELL 06-22] Map events.item_id -> vocab index + build session->items_idx lookup

t0 = cell_start("CELL 06-22", "Map events to item2id + build sess_items_idx")

# item2id keys might be str(original_id)
def map_item_to_idx(x):
    return item2id.get(str(int(x)), None)

events_m = events.copy()
events_m["item_idx"] = events_m["item_id"].apply(map_item_to_idx)

n_total = int(events_m.shape[0])
n_mapped = int(events_m["item_idx"].notna().sum())
print("[CELL 06-22] events mapped:", n_mapped, "/", n_total, f"({n_mapped/max(n_total,1):.3f})")

if n_mapped < n_total:
    print("[CELL 06-22] WARN: some events had item_id not found in item2id. They will be dropped for GRU4Rec sequences.")

events_m = events_m.dropna(subset=["item_idx"]).copy()
events_m["item_idx"] = events_m["item_idx"].astype(int)
events_m["session_id"] = events_m["session_id"].astype(str)

# ensure ordering inside sessions
events_m = events_m.sort_values(["session_id", "pos_in_sess", "ts_epoch"], ascending=[True, True, True])

# session -> list of item_idx
sess_items_idx = (
    events_m.groupby("session_id")["item_idx"]
    .apply(lambda x: [int(v) for v in x.tolist()])
    .to_dict()
)

print("[CELL 06-22] sess_items_idx built:", len(sess_items_idx))
# sanity sample
sk = next(iter(sess_items_idx.keys()))
print("[CELL 06-22] sample session:", sk, "len:", len(sess_items_idx[sk]), "items[:10]:", sess_items_idx[sk][:10])

cell_end("CELL 06-22", t0)



[CELL 06-22] Map events to item2id + build sess_items_idx
[CELL 06-22] start=2026-01-06T23:43:44
[CELL 06-22] events mapped: 3659 / 3659 (1.000)
[CELL 06-22] sess_items_idx built: 1322
[CELL 06-22] sample session: 104074_000001 len: 1 items[:10]: [408]
[CELL 06-22] elapsed=0.04s
[CELL 06-22] done


Build GRU4Rec training sequences from TRAIN sessions

We train on train sessions only: next-item prediction for each step.

In [24]:
# [CELL 06-23] Build GRU4Rec training data from TRAIN sessions

t0 = cell_start("CELL 06-23", "Build GRU training sequences from train sessions")

# Use train session ids from sessions dataframe (already split-labeled in 06-13)
train_sess_ids = sessions.loc[sessions["split"] == "train", "session_id"].astype(str).unique().tolist()
val_sess_ids   = sessions.loc[sessions["split"] == "val", "session_id"].astype(str).unique().tolist()
test_sess_ids  = sessions.loc[sessions["split"] == "test", "session_id"].astype(str).unique().tolist()

def build_xy_from_sessions(sess_ids):
    X = []
    Y = []
    kept = 0
    for sid in sess_ids:
        seq = sess_items_idx.get(str(sid))
        if seq is None or len(seq) < 2:
            continue
        # create step-wise training points
        # inputs: seq[:t], target: seq[t]
        for t in range(1, len(seq)):
            X.append(seq[:t])
            Y.append(int(seq[t]))
        kept += 1
    return X, np.array(Y, dtype=np.int64), kept

X_train, y_train, n_train_sess_kept = build_xy_from_sessions(train_sess_ids)
X_val, y_val, n_val_sess_kept = build_xy_from_sessions(val_sess_ids)

print("[CELL 06-23] train sessions kept:", n_train_sess_kept, "train points:", len(X_train))
print("[CELL 06-23] val sessions kept:", n_val_sess_kept, "val points:", len(X_val))

if len(X_train) == 0:
    raise RuntimeError("No training points built for GRU4Rec. Check mapping/sessionization.")

cell_end("CELL 06-23", t0)



[CELL 06-23] Build GRU training sequences from train sessions
[CELL 06-23] start=2026-01-06T23:43:54
[CELL 06-23] train sessions kept: 450 train points: 1932
[CELL 06-23] val sessions kept: 51 val points: 191
[CELL 06-23] elapsed=0.01s
[CELL 06-23] done


Dataloader (pad) + GRU4Rec model

In [25]:
# [CELL 06-24] Dataloader + GRU4Rec model definition

t0 = cell_start("CELL 06-24", "Define dataset/loader + model")

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

class PrefixDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        return self.X[i], int(self.y[i])

def collate_pad(batch):
    seqs, ys = zip(*batch)
    lens = torch.tensor([len(s) for s in seqs], dtype=torch.long)
    maxlen = int(lens.max().item())
    xpad = torch.zeros((len(seqs), maxlen), dtype=torch.long)
    for i, s in enumerate(seqs):
        xpad[i, :len(s)] = torch.tensor(s, dtype=torch.long)
    y = torch.tensor(ys, dtype=torch.long)
    return xpad, lens, y

class GRU4Rec(nn.Module):
    def __init__(self, n_items: int, emb_dim: int = 64, hid_dim: int = 100, dropout: float = 0.0):
        super().__init__()
        self.emb = nn.Embedding(n_items, emb_dim)
        self.gru = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)
        self.drop = nn.Dropout(dropout)
        self.out = nn.Linear(hid_dim, n_items)

    def forward(self, xpad, lens):
        # xpad: [B,T]
        emb = self.emb(xpad)  # [B,T,E]
        packed = nn.utils.rnn.pack_padded_sequence(emb, lens.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, h = self.gru(packed)  # h: [1,B,H]
        h_last = h[-1]                   # [B,H]
        h_last = self.drop(h_last)
        logits = self.out(h_last)        # [B,n_items]
        return logits

BATCH = 256
train_loader = DataLoader(PrefixDataset(X_train, y_train), batch_size=BATCH, shuffle=True, collate_fn=collate_pad)
val_loader   = DataLoader(PrefixDataset(X_val, y_val), batch_size=BATCH, shuffle=False, collate_fn=collate_pad)

model = GRU4Rec(n_items=n_items, emb_dim=64, hid_dim=100, dropout=0.1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.0)

print("[CELL 06-24] model params:", sum(p.numel() for p in model.parameters()))
cell_end("CELL 06-24", t0)



[CELL 06-24] Define dataset/loader + model
[CELL 06-24] start=2026-01-06T23:43:58
[CELL 06-24] model params: 177840
[CELL 06-24] elapsed=1.53s
[CELL 06-24] done


Train GRU4Rec (with val loss)

In [26]:
# [CELL 06-25] Train GRU4Rec (log train/val loss)

t0 = cell_start("CELL 06-25", "Train GRU4Rec")

EPOCHS = 10
best_val = float("inf")
best_path = OUT_DIR / "models" / "gru4rec_best.pt"
(best_path.parent).mkdir(parents=True, exist_ok=True)

def run_epoch(loader, train: bool):
    model.train(train)
    total = 0.0
    n = 0
    for xpad, lens, y in loader:
        xpad = xpad.to(device)
        lens = lens.to(device)
        y = y.to(device)
        logits = model(xpad, lens)
        loss = F.cross_entropy(logits, y)
        if train:
            opt.zero_grad()
            loss.backward()
            opt.step()
        total += float(loss.item()) * int(y.shape[0])
        n += int(y.shape[0])
    return total / max(n, 1)

for ep in range(1, EPOCHS + 1):
    tr_loss = run_epoch(train_loader, train=True)
    va_loss = run_epoch(val_loader, train=False) if len(X_val) else float("nan")
    print(f"[CELL 06-25] epoch={ep}/{EPOCHS} train_loss={tr_loss:.4f} val_loss={va_loss:.4f}")

    if len(X_val) and va_loss < best_val:
        best_val = va_loss
        torch.save(model.state_dict(), best_path)
        print("[CELL 06-25] saved best:", best_path)

# load best if available
if best_path.exists():
    model.load_state_dict(torch.load(best_path, map_location=device))
    print("[CELL 06-25] loaded best weights:", best_path)

cell_end("CELL 06-25", t0, best_val=best_val if len(X_val) else None)



[CELL 06-25] Train GRU4Rec
[CELL 06-25] start=2026-01-06T23:44:04
[CELL 06-25] epoch=1/10 train_loss=6.6368 val_loss=6.5919
[CELL 06-25] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\gru4rec_best.pt
[CELL 06-25] epoch=2/10 train_loss=6.5155 val_loss=6.5008
[CELL 06-25] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\gru4rec_best.pt
[CELL 06-25] epoch=3/10 train_loss=6.3966 val_loss=6.4044
[CELL 06-25] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\gru4rec_best.pt
[CELL 06-25] epoch=4/10 train_loss=6.2678 val_loss=6.2971
[CELL 06-25] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\gru4rec_best.pt
[CELL 06-25] epoch=5/10 train_loss=6.1195 val_loss=6.1691
[CELL 06-25] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234

GRU4Rec evaluation helper (rank metrics on query points)

This will evaluate on:
- Global test pairs via mars_pairs_test_ts (214 rows)
- Episodic test query via q_eval (40 rows)

In [27]:
# [CELL 06-26] GRU4Rec eval helpers (rank-based)

t0 = cell_start("CELL 06-26", "Define GRU4Rec evaluation on query points")

import math

def gru_topk(prefix_list_batch, topk: int):
    # prefix_list_batch: list[list[int]]
    # returns list[list[int]] topk item indices
    if len(prefix_list_batch) == 0:
        return []
    lens = torch.tensor([len(s) for s in prefix_list_batch], dtype=torch.long)
    maxlen = int(lens.max().item())
    xpad = torch.zeros((len(prefix_list_batch), maxlen), dtype=torch.long)
    for i, s in enumerate(prefix_list_batch):
        xpad[i, :len(s)] = torch.tensor(s, dtype=torch.long)

    xpad = xpad.to(device)
    lens = lens.to(device)

    with torch.no_grad():
        logits = model(xpad, lens)  # [B,n_items]
        _, idx = torch.topk(logits, k=topk, dim=1)
    return idx.cpu().numpy().tolist()

def eval_query_points_gru(df_points: pd.DataFrame, sess_items_idx: dict, cutoffs: list[int], batch_size: int = 256):
    """
    df_points columns required: session_id (str), tpos (int), label (int)
    Prefix is session items up to tpos-1.
    """
    maxK = max(cutoffs)

    # build prefixes + labels
    prefixes = []
    labels = []
    for r in df_points.itertuples(index=False):
        sid = str(r.session_id)
        tpos = int(r.tpos)
        y = int(r.label)
        seq = sess_items_idx.get(sid)
        if seq is None:
            continue
        pref = seq[:max(tpos - 1, 0)]
        if len(pref) == 0:
            continue
        prefixes.append(pref)
        labels.append(y)

    n = len(labels)
    out = {"n": int(n)}
    if n == 0:
        for K in cutoffs:
            out[f"HR@{K}"] = 0.0
            out[f"MRR@{K}"] = 0.0
            out[f"NDCG@{K}"] = 0.0
        return out

    hr = {K: 0.0 for K in cutoffs}
    mrr = {K: 0.0 for K in cutoffs}
    ndcg = {K: 0.0 for K in cutoffs}

    # batch
    for i in range(0, n, batch_size):
        pbatch = prefixes[i:i+batch_size]
        ybatch = labels[i:i+batch_size]
        top = gru_topk(pbatch, topk=maxK)

        for recs, y in zip(top, ybatch):
            if y in recs:
                rnk = recs.index(y) + 1
            else:
                rnk = 10**9
            for K in cutoffs:
                if rnk <= K:
                    hr[K] += 1.0
                    mrr[K] += 1.0 / float(rnk)
                    ndcg[K] += 1.0 / math.log2(float(rnk) + 1.0)

    den = float(n)
    for K in cutoffs:
        out[f"HR@{K}"] = hr[K] / den
        out[f"MRR@{K}"] = mrr[K] / den
        out[f"NDCG@{K}"] = ndcg[K] / den
    return out

cell_end("CELL 06-26", t0)



[CELL 06-26] Define GRU4Rec evaluation on query points
[CELL 06-26] start=2026-01-06T23:44:12
[CELL 06-26] elapsed=0.00s
[CELL 06-26] done


Evaluate GRU4Rec on GLOBAL test pairs (mars_pairs_test_ts)

In [28]:
# [CELL 06-27] GRU4Rec evaluation on GLOBAL test pairs (mars_pairs_test_ts)

t0 = cell_start("CELL 06-27", "GRU4Rec eval: global test pairs")

import duckdb
con_ro = duckdb.connect(str(DUCKDB_PATH), read_only=True)

pairs_test_ts = con_ro.execute("""
SELECT CAST(session_id AS VARCHAR) AS session_id,
       CAST(tpos AS INTEGER) AS tpos,
       CAST(label AS INTEGER) AS label
FROM mars_pairs_test_ts
ORDER BY session_id, tpos
""").fetchdf()

con_ro.close()

print("[CELL 06-27] pairs_test_ts shape:", pairs_test_ts.shape)

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
res_gru_global = eval_query_points_gru(pairs_test_ts, sess_items_idx, cutoffs, batch_size=256)

print("[CELL 06-27] GRU4REC GLOBAL_TEST metrics:", res_gru_global)

report = read_json(REPORT_PATH)
report["metrics"]["gru4rec_global_test"] = res_gru_global
report["key_findings"].append("Computed GRU4Rec baseline on global test pairs (mars_pairs_test_ts).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-27", t0)



[CELL 06-27] GRU4Rec eval: global test pairs
[CELL 06-27] start=2026-01-06T23:44:19
[CELL 06-27] pairs_test_ts shape: (214, 3)
[CELL 06-27] GRU4REC GLOBAL_TEST metrics: {'n': 214, 'HR@5': 0.48598130841121495, 'MRR@5': 0.42204049844236763, 'NDCG@5': 0.43816739292111306, 'HR@10': 0.5046728971962616, 'MRR@10': 0.42459019433318507, 'NDCG@10': 0.4442703620467722, 'HR@20': 0.5373831775700935, 'MRR@20': 0.42708077229105273, 'NDCG@20': 0.4527986658963601}
[CELL 06-27] elapsed=0.08s
[CELL 06-27] done


Evaluate GRU4Rec on EPISODIC test query (q_eval)

In [29]:
# [CELL 06-28] GRU4Rec evaluation on EPISODIC test query points (q_eval)

t0 = cell_start("CELL 06-28", "GRU4Rec eval: episodic test query")

# q_eval was created in 06-16 for Session-KNN episodic evaluation.
need_cols = {"session_id","tpos","label"}
if not need_cols.issubset(set(q_eval.columns)):
    raise RuntimeError(f"q_eval missing required columns: {need_cols - set(q_eval.columns)}")

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
res_gru_ep = eval_query_points_gru(q_eval[["session_id","tpos","label"]].copy(), sess_items_idx, cutoffs, batch_size=256)

print("[CELL 06-28] GRU4REC EPISODE_TEST(query) metrics:", res_gru_ep)

report = read_json(REPORT_PATH)
report["metrics"]["gru4rec_episode_test_query"] = res_gru_ep
report["key_findings"].append("Computed GRU4Rec baseline on episodic test query points (same episode protocol).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-28", t0)



[CELL 06-28] GRU4Rec eval: episodic test query
[CELL 06-28] start=2026-01-06T23:44:24
[CELL 06-28] GRU4REC EPISODE_TEST(query) metrics: {'n': 40, 'HR@5': 0.825, 'MRR@5': 0.7666666666666667, 'NDCG@5': 0.7815464876785729, 'HR@10': 0.825, 'MRR@10': 0.7666666666666667, 'NDCG@10': 0.7815464876785729, 'HR@20': 0.825, 'MRR@20': 0.7666666666666667, 'NDCG@20': 0.7815464876785729}
[CELL 06-28] elapsed=0.06s
[CELL 06-28] done


The fact that GRU4Rec episodic HR@5 = 0.825 strongly confirms the episodic labels are valid in the item_idx space, so Session-KNN episodic must be recomputed in the same space. Your current report captures the (incorrect) coverage conclusion.

Targeted fix (no refactor): re-run Session-KNN episodic in item_idx space

Build Session-KNN train memory in item_idx space

In [30]:
# [CELL 06-29] Session-KNN memory in item_idx space (to match pairs/episodes labels)

t0 = cell_start("CELL 06-29", "Build Session-KNN memory in item_idx space")

from collections import defaultdict
import math

# Build train sequences in item_idx space from TRAIN session_ids
train_sess_ids = sessions.loc[sessions["split"] == "train", "session_id"].astype(str).unique().tolist()

train_items_idx = []
for sid in train_sess_ids:
    seq = sess_items_idx.get(str(sid))  # sess_items_idx built in GRU cell 06-22
    if seq is None or len(seq) < 2:
        continue
    # use unique for similarity base
    train_items_idx.append([int(x) for x in seq])

print("[CELL 06-29] train sessions used:", len(train_items_idx))

# inverted index in item_idx space
inv_idx = defaultdict(list)
for si, items in enumerate(train_items_idx):
    for it in set(items):
        inv_idx[int(it)].append(si)

def session_sim_idx(a_set, b_set) -> float:
    inter = len(a_set & b_set)
    if inter == 0:
        return 0.0
    return float(inter) / math.sqrt(float(len(a_set)) * float(len(b_set)))

KNN_K = 100
cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
maxK = max(cutoffs)

def recommend_next_idx(prefix_items_idx, exclude_set, topn=maxK):
    a_set = set(prefix_items_idx)
    if not a_set:
        return []

    cand_sessions = set()
    for it in a_set:
        cand_sessions.update(inv_idx.get(int(it), []))
    if not cand_sessions:
        return []

    sims = []
    for si in cand_sessions:
        b_set = set(train_items_idx[si])
        s = session_sim_idx(a_set, b_set)
        if s > 0:
            sims.append((si, s))
    if not sims:
        return []

    sims.sort(key=lambda x: x[1], reverse=True)
    sims = sims[:KNN_K]

    score = defaultdict(float)
    for si, s in sims:
        for it in train_items_idx[si]:
            it = int(it)
            if it in exclude_set:
                continue
            score[it] += s

    if not score:
        return []

    ranked = sorted(score.items(), key=lambda x: (-x[1], x[0]))
    return [it for it, _ in ranked[:topn]]

cell_end("CELL 06-29", t0)



[CELL 06-29] Build Session-KNN memory in item_idx space
[CELL 06-29] start=2026-01-06T23:48:13
[CELL 06-29] train sessions used: 450
[CELL 06-29] elapsed=0.01s
[CELL 06-29] done


Re-evaluate Session-KNN on episodic test query (correct space)

In [31]:
# [CELL 06-30] Session-KNN episodic test eval (query-only) in item_idx space ✅

t0 = cell_start("CELL 06-30", "Session-KNN episodic test eval (item_idx space)")

import math

hr = {K: 0.0 for K in cutoffs}
mrr = {K: 0.0 for K in cutoffs}
ndcg = {K: 0.0 for K in cutoffs}
n = 0

for row in q_eval.itertuples(index=False):
    sid = str(row.session_id)
    tpos = int(row.tpos)           # 1-indexed
    true = int(row.label)          # item_idx (vocab)

    seq = sess_items_idx.get(sid)  # item_idx sequence
    if seq is None:
        continue

    pref_len = max(tpos - 1, 0)
    prefix = seq[:pref_len]
    if len(prefix) == 0:
        continue

    recs = recommend_next_idx(prefix, exclude_set=set(prefix), topn=maxK)
    n += 1

    if true in recs:
        r = recs.index(true) + 1
    else:
        r = 10**9

    for K in cutoffs:
        if r <= K:
            hr[K] += 1.0
            mrr[K] += 1.0 / float(r)
            ndcg[K] += 1.0 / math.log2(float(r) + 1.0)

out = {"n": int(n)}
den = float(n) if n else 1.0
for K in cutoffs:
    out[f"HR@{K}"] = hr[K] / den
    out[f"MRR@{K}"] = mrr[K] / den
    out[f"NDCG@{K}"] = ndcg[K] / den

print("[CELL 06-30] SESSION_KNN EPISODE_TEST(query) metrics (FIXED):", out)

report = read_json(REPORT_PATH)
report["metrics"]["session_knn_episode_test_query_itemidx_fixed"] = out
report["key_findings"].append("FIX: Recomputed Session-KNN episodic test in item_idx space to match episode labels.")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-30", t0)



[CELL 06-30] Session-KNN episodic test eval (item_idx space)
[CELL 06-30] start=2026-01-06T23:48:38
[CELL 06-30] SESSION_KNN EPISODE_TEST(query) metrics (FIXED): {'n': 40, 'HR@5': 0.6, 'MRR@5': 0.25625, 'NDCG@5': 0.3414700967349292, 'HR@10': 0.8, 'MRR@10': 0.2829761904761905, 'NDCG@10': 0.4061736079121778, 'HR@20': 0.85, 'MRR@20': 0.28714285714285714, 'NDCG@20': 0.41968551563354384}
[CELL 06-30] elapsed=0.01s
[CELL 06-30] done


Fix the coverage note (item_idx coverage, not raw item_id)

In [32]:
# [CELL 06-31] Correct coverage analysis in item_idx space

t0 = cell_start("CELL 06-31", "Coverage check in item_idx space (correct)")

train_item_idx_set = set()
for seq in train_items_idx:
    for it in seq:
        train_item_idx_set.add(int(it))

labels = q_eval["label"].astype(int).tolist()
n_all = len(labels)
n_not_in_train = sum([1 for y in labels if int(y) not in train_item_idx_set])
cov = 1.0 - (n_not_in_train / max(n_all, 1))

cov_note = {
    "episode_test_query_n": int(n_all),
    "episode_test_query_true_not_in_train_itemidx": int(n_not_in_train),
    "coverage_rate_itemidx": float(cov),
    "note": "This coverage is computed in item_idx (vocab) space; previous raw item_id comparison was invalid."
}

print("[CELL 06-31] coverage_note_itemidx:", cov_note)

report = read_json(REPORT_PATH)
report["sanity_samples"]["episode_test_item_coverage_itemidx"] = cov_note
report["key_findings"].append("FIX: Coverage recomputed in item_idx space; previous 0% coverage note was due to mismatched ID spaces.")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-31", t0)



[CELL 06-31] Coverage check in item_idx space (correct)
[CELL 06-31] start=2026-01-06T23:49:03
[CELL 06-31] coverage_note_itemidx: {'episode_test_query_n': 40, 'episode_test_query_true_not_in_train_itemidx': 0, 'coverage_rate_itemidx': 1.0, 'note': 'This coverage is computed in item_idx (vocab) space; previous raw item_id comparison was invalid.'}
[CELL 06-31] elapsed=0.02s
[CELL 06-31] done


Write a short baseline summary into report.json

In [33]:
# [CELL 06-32] Write baseline summary + correction note into report

t0 = cell_start("CELL 06-32", "Write baseline summary + key notes")

report = read_json(REPORT_PATH)

# Keep the corrected narrative explicit
report["notes"].append(
    "NOTE: Session-KNN episodic evaluation initially returned 0 due to mismatched ID spaces "
    "(raw item_id vs vocab item_idx). We recomputed episodic Session-KNN in item_idx space and "
    "recomputed coverage accordingly (coverage=100%)."
)

# Add a concise summary block
summary = {
    "global_test_n": int(report["metrics"]["popularity_global_test"]["n"]),
    "episode_test_query_n": int(report["metrics"]["popularity_episode_test_query"]["n"]),
    "global_test": {
        "popularity": report["metrics"]["popularity_global_test"],
        "session_knn": report["metrics"]["session_knn_global_test"],
        "gru4rec": report["metrics"]["gru4rec_global_test"],
    },
    "episode_test_query": {
        "popularity": report["metrics"]["popularity_episode_test_query"],
        "session_knn_fixed_itemidx": report["metrics"]["session_knn_episode_test_query_itemidx_fixed"],
        "gru4rec": report["metrics"]["gru4rec_episode_test_query"],
    },
    "coverage_itemidx": report["sanity_samples"]["episode_test_item_coverage_itemidx"],
}

report["sanity_samples"]["baseline_summary"] = summary
report["key_findings"].append("Notebook 06 complete: popularity, Session-KNN, GRU4Rec baselines reported for global + episodic test.")

write_json_atomic(REPORT_PATH, report)
print("[CELL 06-32] updated:", REPORT_PATH)

cell_end("CELL 06-32", t0)



[CELL 06-32] Write baseline summary + key notes
[CELL 06-32] start=2026-01-06T23:49:56
[CELL 06-32] updated: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\report.json
[CELL 06-32] elapsed=0.02s
[CELL 06-32] done


Update manifest (include model + plots if exist)

In [34]:
# [CELL 06-33] Update manifest with key artifacts (safe hashing)

t0 = cell_start("CELL 06-33", "Write manifest artifacts")

manifest = read_json(MANIFEST_PATH)

# always include these
paths = [
    Path(CONFIG_PATH),
    Path(REPORT_PATH),
    Path(MANIFEST_PATH),
    OUT_DIR / "popularity_rank_top.json",
]

# include GRU best model if exists
best_path = OUT_DIR / "models" / "gru4rec_best.pt"
if best_path.exists():
    paths.append(best_path)

# add any plots folder files if you created plots elsewhere
plots_dir = OUT_DIR / "plots"
if plots_dir.exists():
    for p in sorted(plots_dir.glob("*")):
        if p.is_file():
            paths.append(p)

# de-dup and record
seen = set()
for p in paths:
    p = Path(p)
    if p.exists() and str(p) not in seen:
        manifest["artifacts"].append(safe_artifact_record(p))
        seen.add(str(p))

write_json_atomic(MANIFEST_PATH, manifest)
print("[CELL 06-33] updated:", MANIFEST_PATH, "artifacts:", len(manifest["artifacts"]))

cell_end("CELL 06-33", t0)



[CELL 06-33] Write manifest artifacts
[CELL 06-33] start=2026-01-06T23:50:18
[CELL 06-33] updated: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\manifest.json artifacts: 8
[CELL 06-33] elapsed=0.03s
[CELL 06-33] done


SASRec - Baseline

SASRec setup + dataloader (PAD = n_items)

This avoids the padding-token collision problem by using pad_id = n_items and embedding size n_items+1.

In [35]:
# [CELL 06-34] SASRec: setup + dataloader (pad_id = n_items)

t0 = cell_start("CELL 06-34", "SASRec setup + dataloader")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

pad_id = int(n_items)          # padding token is outside real item range [0..n_items-1]
n_items_pad = int(n_items + 1)

SAS_CFG = {
    "max_len": 50,
    "emb_dim": 64,
    "n_heads": 2,
    "n_layers": 2,
    "dropout": 0.1,
    "lr": 1e-3,
    "batch": 256,
    "epochs": 10,
}

print("[CELL 06-34] pad_id:", pad_id, "n_items_pad:", n_items_pad)
print("[CELL 06-34] SAS_CFG:", SAS_CFG)

class PrefixDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        return self.X[i], int(self.y[i])

def collate_pad_sas(batch, max_len: int, pad_id: int):
    seqs, ys = zip(*batch)

    # truncate from the left to keep most recent interactions
    seqs2 = []
    lens = []
    for s in seqs:
        s = list(s)
        if len(s) > max_len:
            s = s[-max_len:]
        seqs2.append(s)
        lens.append(len(s))

    lens = torch.tensor(lens, dtype=torch.long)
    T = int(lens.max().item()) if len(seqs2) else 1
    xpad = torch.full((len(seqs2), T), fill_value=pad_id, dtype=torch.long)
    for i, s in enumerate(seqs2):
        xpad[i, :len(s)] = torch.tensor(s, dtype=torch.long)

    y = torch.tensor(ys, dtype=torch.long)
    return xpad, lens, y

def make_loader(X, y, shuffle: bool):
    return DataLoader(
        PrefixDataset(X, y),
        batch_size=int(SAS_CFG["batch"]),
        shuffle=shuffle,
        collate_fn=lambda b: collate_pad_sas(b, max_len=int(SAS_CFG["max_len"]), pad_id=pad_id)
    )

sas_train_loader = make_loader(X_train, y_train, shuffle=True)
sas_val_loader   = make_loader(X_val, y_val, shuffle=False) if len(X_val) else None

cell_end("CELL 06-34", t0)



[CELL 06-34] SASRec setup + dataloader
[CELL 06-34] start=2026-01-06T23:55:22
[CELL 06-34] pad_id: 776 n_items_pad: 777
[CELL 06-34] SAS_CFG: {'max_len': 50, 'emb_dim': 64, 'n_heads': 2, 'n_layers': 2, 'dropout': 0.1, 'lr': 0.001, 'batch': 256, 'epochs': 10}
[CELL 06-34] elapsed=0.00s
[CELL 06-34] done


SASRec model

Causal self-attention + last-position representation → predict next item.

In [36]:
# [CELL 06-35] SASRec model (Transformer encoder, causal mask)

t0 = cell_start("CELL 06-35", "Define SASRec model")

class SASRec(nn.Module):
    def __init__(self, n_items_pad: int, n_items: int, max_len: int, emb_dim: int, n_heads: int, n_layers: int, dropout: float, pad_id: int):
        super().__init__()
        self.n_items = int(n_items)
        self.pad_id = int(pad_id)
        self.max_len = int(max_len)

        self.item_emb = nn.Embedding(n_items_pad, emb_dim)     # includes pad_id
        self.pos_emb  = nn.Embedding(max_len, emb_dim)
        self.drop = nn.Dropout(dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=emb_dim,
            nhead=n_heads,
            dim_feedforward=emb_dim * 4,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)
        self.out = nn.Linear(emb_dim, self.n_items)  # predict only real items [0..n_items-1]

    def forward(self, xpad: torch.Tensor, lens: torch.Tensor):
        """
        xpad: [B,T] tokens in [0..n_items-1] or pad_id
        lens: [B] true lengths
        """
        B, T = xpad.shape
        xpad = xpad.clamp(min=0, max=self.pad_id)  # safety

        # positions 0..T-1
        pos = torch.arange(T, device=xpad.device).unsqueeze(0).expand(B, T).clamp(max=self.max_len-1)

        x = self.item_emb(xpad) + self.pos_emb(pos)
        x = self.drop(x)

        # padding mask: True where PAD
        key_padding_mask = (xpad == self.pad_id)  # [B,T]

        # causal mask: prevent attending to future positions
        # TransformerEncoder expects mask shape [T,T]
        causal_mask = torch.triu(torch.ones((T, T), device=xpad.device), diagonal=1).bool()

        h = self.encoder(x, mask=causal_mask, src_key_padding_mask=key_padding_mask)  # [B,T,E]

        # take last real position per sample: index = lens-1
        idx = (lens - 1).clamp(min=0).view(B, 1, 1).expand(B, 1, h.size(-1))
        h_last = h.gather(1, idx).squeeze(1)  # [B,E]

        logits = self.out(h_last)  # [B,n_items]
        return logits

sas_model = SASRec(
    n_items_pad=n_items_pad,
    n_items=n_items,
    max_len=int(SAS_CFG["max_len"]),
    emb_dim=int(SAS_CFG["emb_dim"]),
    n_heads=int(SAS_CFG["n_heads"]),
    n_layers=int(SAS_CFG["n_layers"]),
    dropout=float(SAS_CFG["dropout"]),
    pad_id=pad_id,
).to(device)

sas_opt = torch.optim.Adam(sas_model.parameters(), lr=float(SAS_CFG["lr"]))

print("[CELL 06-35] sas_model params:", sum(p.numel() for p in sas_model.parameters()))

cell_end("CELL 06-35", t0)



[CELL 06-35] Define SASRec model
[CELL 06-35] start=2026-01-06T23:55:54
[CELL 06-35] sas_model params: 203336
[CELL 06-35] elapsed=0.03s
[CELL 06-35] done


Train SASRec (train/val loss + save best)

In [37]:
# [CELL 06-36] Train SASRec (log train/val loss)

t0 = cell_start("CELL 06-36", "Train SASRec")

sas_best = float("inf")
sas_best_path = OUT_DIR / "models" / "sasrec_best.pt"
sas_best_path.parent.mkdir(parents=True, exist_ok=True)

def sas_run_epoch(loader, train: bool):
    sas_model.train(train)
    total = 0.0
    n = 0
    for xpad, lens, y in loader:
        xpad = xpad.to(device)
        lens = lens.to(device)
        y = y.to(device)

        logits = sas_model(xpad, lens)
        loss = F.cross_entropy(logits, y)

        if train:
            sas_opt.zero_grad()
            loss.backward()
            sas_opt.step()

        total += float(loss.item()) * int(y.shape[0])
        n += int(y.shape[0])
    return total / max(n, 1)

for ep in range(1, int(SAS_CFG["epochs"]) + 1):
    tr = sas_run_epoch(sas_train_loader, train=True)
    va = sas_run_epoch(sas_val_loader, train=False) if sas_val_loader is not None else float("nan")
    print(f"[CELL 06-36] epoch={ep}/{SAS_CFG['epochs']} train_loss={tr:.4f} val_loss={va:.4f}")

    if sas_val_loader is not None and va < sas_best:
        sas_best = va
        torch.save(sas_model.state_dict(), sas_best_path)
        print("[CELL 06-36] saved best:", sas_best_path)

# load best if exists
if sas_best_path.exists():
    sas_model.load_state_dict(torch.load(sas_best_path, map_location=device))
    print("[CELL 06-36] loaded best weights:", sas_best_path)

# save config in report
report = read_json(REPORT_PATH)
report["sanity_samples"]["sasrec_config"] = SAS_CFG
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-36", t0, sas_best_val=sas_best if sas_val_loader is not None else None)



[CELL 06-36] Train SASRec
[CELL 06-36] start=2026-01-06T23:56:19
[CELL 06-36] epoch=1/10 train_loss=6.7319 val_loss=6.5141
[CELL 06-36] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\sasrec_best.pt
[CELL 06-36] epoch=2/10 train_loss=6.3980 val_loss=6.3089
[CELL 06-36] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\sasrec_best.pt
[CELL 06-36] epoch=3/10 train_loss=6.1446 val_loss=6.1517
[CELL 06-36] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\sasrec_best.pt
[CELL 06-36] epoch=4/10 train_loss=5.9135 val_loss=6.0114
[CELL 06-36] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\models\sasrec_best.pt
[CELL 06-36] epoch=5/10 train_loss=5.6959 val_loss=5.8681
[CELL 06-36] saved best: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\m

Evaluate SASRec on GLOBAL test pairs

In [38]:
# [CELL 06-37] SASRec evaluation: global test pairs (mars_pairs_test_ts)

t0 = cell_start("CELL 06-37", "SASRec eval: global test pairs")

def sas_topk(prefix_list_batch, topk: int):
    if len(prefix_list_batch) == 0:
        return []
    # truncate to max_len from left
    seqs = []
    lens = []
    for s in prefix_list_batch:
        s = list(s)
        if len(s) > int(SAS_CFG["max_len"]):
            s = s[-int(SAS_CFG["max_len"]):]
        seqs.append(s)
        lens.append(len(s))
    lens = torch.tensor(lens, dtype=torch.long)

    T = int(lens.max().item()) if len(seqs) else 1
    xpad = torch.full((len(seqs), T), fill_value=pad_id, dtype=torch.long)
    for i, s in enumerate(seqs):
        xpad[i, :len(s)] = torch.tensor(s, dtype=torch.long)

    xpad = xpad.to(device)
    lens = lens.to(device)
    with torch.no_grad():
        logits = sas_model(xpad, lens)  # [B,n_items]
        _, idx = torch.topk(logits, k=topk, dim=1)
    return idx.cpu().numpy().tolist()

def eval_query_points_sas(df_points: pd.DataFrame, sess_items_idx: dict, cutoffs: list[int], batch_size: int = 256):
    import math
    maxK = max(cutoffs)

    prefixes, labels = [], []
    for r in df_points.itertuples(index=False):
        sid = str(r.session_id)
        tpos = int(r.tpos)
        y = int(r.label)
        seq = sess_items_idx.get(sid)
        if seq is None:
            continue
        pref = seq[:max(tpos - 1, 0)]
        if len(pref) == 0:
            continue
        prefixes.append(pref)
        labels.append(y)

    n = len(labels)
    out = {"n": int(n)}
    if n == 0:
        for K in cutoffs:
            out[f"HR@{K}"] = 0.0
            out[f"MRR@{K}"] = 0.0
            out[f"NDCG@{K}"] = 0.0
        return out

    hr = {K: 0.0 for K in cutoffs}
    mrr = {K: 0.0 for K in cutoffs}
    ndcg = {K: 0.0 for K in cutoffs}

    for i in range(0, n, batch_size):
        pbatch = prefixes[i:i+batch_size]
        ybatch = labels[i:i+batch_size]
        top = sas_topk(pbatch, topk=maxK)

        for recs, y in zip(top, ybatch):
            if y in recs:
                rnk = recs.index(y) + 1
            else:
                rnk = 10**9
            for K in cutoffs:
                if rnk <= K:
                    hr[K] += 1.0
                    mrr[K] += 1.0 / float(rnk)
                    ndcg[K] += 1.0 / math.log2(float(rnk) + 1.0)

    den = float(n)
    for K in cutoffs:
        out[f"HR@{K}"] = hr[K] / den
        out[f"MRR@{K}"] = mrr[K] / den
        out[f"NDCG@{K}"] = ndcg[K] / den
    return out

import duckdb
con_ro = duckdb.connect(str(DUCKDB_PATH), read_only=True)
pairs_test_ts = con_ro.execute("""
SELECT CAST(session_id AS VARCHAR) AS session_id,
       CAST(tpos AS INTEGER) AS tpos,
       CAST(label AS INTEGER) AS label
FROM mars_pairs_test_ts
ORDER BY session_id, tpos
""").fetchdf()
con_ro.close()

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
res_sas_global = eval_query_points_sas(pairs_test_ts, sess_items_idx, cutoffs, batch_size=256)

print("[CELL 06-37] SASREC GLOBAL_TEST metrics:", res_sas_global)

report = read_json(REPORT_PATH)
report["metrics"]["sasrec_global_test"] = res_sas_global
report["key_findings"].append("Computed SASRec baseline on global test pairs (mars_pairs_test_ts).")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-37", t0)



[CELL 06-37] SASRec eval: global test pairs
[CELL 06-37] start=2026-01-06T23:56:42
[CELL 06-37] SASREC GLOBAL_TEST metrics: {'n': 214, 'HR@5': 0.4158878504672897, 'MRR@5': 0.3349688473520249, 'NDCG@5': 0.3553619820236781, 'HR@10': 0.4532710280373832, 'MRR@10': 0.33955644563121207, 'NDCG@10': 0.3670473087431946, 'HR@20': 0.48130841121495327, 'MRR@20': 0.3415606479068547, 'NDCG@20': 0.3742040961819287}
[CELL 06-37] elapsed=0.14s
[CELL 06-37] done


Evaluate SASRec on EPISODIC test query (q_eval)

In [39]:
# [CELL 06-38] SASRec evaluation: episodic test query (q_eval)

t0 = cell_start("CELL 06-38", "SASRec eval: episodic test query")

need_cols = {"session_id","tpos","label"}
if not need_cols.issubset(set(q_eval.columns)):
    raise RuntimeError(f"q_eval missing required columns: {need_cols - set(q_eval.columns)}")

cutoffs = list(map(int, CFG["eval"]["cutoffs"]))
res_sas_ep = eval_query_points_sas(q_eval[["session_id","tpos","label"]].copy(), sess_items_idx, cutoffs, batch_size=256)

print("[CELL 06-38] SASREC EPISODE_TEST(query) metrics:", res_sas_ep)

report = read_json(REPORT_PATH)
report["metrics"]["sasrec_episode_test_query"] = res_sas_ep
report["key_findings"].append("Computed SASRec baseline on episodic test query points.")
write_json_atomic(REPORT_PATH, report)

cell_end("CELL 06-38", t0)



[CELL 06-38] SASRec eval: episodic test query
[CELL 06-38] start=2026-01-06T23:57:03
[CELL 06-38] SASREC EPISODE_TEST(query) metrics: {'n': 40, 'HR@5': 0.425, 'MRR@5': 0.26, 'NDCG@5': 0.30148210339744574, 'HR@10': 0.625, 'MRR@10': 0.2858333333333333, 'NDCG@10': 0.36529219174731414, 'HR@20': 0.625, 'MRR@20': 0.2858333333333333, 'NDCG@20': 0.36529219174731414}
[CELL 06-38] elapsed=0.08s
[CELL 06-38] done


Manifest update (include SASRec model)

In [40]:
# [CELL 06-39] Update manifest to include SASRec artifacts

t0 = cell_start("CELL 06-39", "Manifest update (SASRec)")

manifest = read_json(MANIFEST_PATH)

paths = [
    Path(CONFIG_PATH),
    Path(REPORT_PATH),
    Path(MANIFEST_PATH),
    OUT_DIR / "popularity_rank_top.json",
    OUT_DIR / "models" / "gru4rec_best.pt",
]

if sas_best_path.exists():
    paths.append(sas_best_path)

seen = set()
for p in paths:
    p = Path(p)
    if p.exists() and str(p) not in seen:
        manifest["artifacts"].append(safe_artifact_record(p))
        seen.add(str(p))

write_json_atomic(MANIFEST_PATH, manifest)
print("[CELL 06-39] updated:", MANIFEST_PATH, "artifacts:", len(manifest["artifacts"]))

cell_end("CELL 06-39", t0)



[CELL 06-39] Manifest update (SASRec)
[CELL 06-39] start=2026-01-06T23:57:20
[CELL 06-39] updated: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\manifest.json artifacts: 14
[CELL 06-39] elapsed=0.03s
[CELL 06-39] done


Deprecate the wrong coverage note (keep it but mark as invalid)

In [41]:
# [CELL 06-40] Deprecate invalid coverage note (keep for audit, but mark invalid)

t0 = cell_start("CELL 06-40", "Mark invalid coverage note as deprecated")

report = read_json(REPORT_PATH)

# mark old block as deprecated if present
if "episode_test_item_coverage" in report.get("sanity_samples", {}):
    report["sanity_samples"]["episode_test_item_coverage"]["DEPRECATED"] = True
    report["sanity_samples"]["episode_test_item_coverage"]["reason"] = (
        "INVALID: compared raw item_id space vs item_idx labels. Use episode_test_item_coverage_itemidx instead."
    )

# remove or mark the wrong key_finding (safer: mark it explicitly rather than deleting)
kf = report.get("key_findings", [])
new_kf = []
for s in kf:
    if "0% item coverage" in s and "by definition" in s:
        new_kf.append("[DEPRECATED] " + s + " (invalid due to ID-space mismatch; see item_idx coverage fix)")
    else:
        new_kf.append(s)
report["key_findings"] = new_kf

write_json_atomic(REPORT_PATH, report)
print("[CELL 06-40] updated report:", REPORT_PATH)

cell_end("CELL 06-40", t0)



[CELL 06-40] Mark invalid coverage note as deprecated
[CELL 06-40] start=2026-01-06T23:59:16
[CELL 06-40] updated report: C:\anonymous-users-mooc-session-meta\reports\06_baselines_mars_global\20260106_234210\report.json
[CELL 06-40] elapsed=0.00s
[CELL 06-40] done
