Imports + env info (Session-KNN baseline)

In [1]:
# [CELL 08-00] Imports + env info (Session-KNN baseline)

import os
import json
import math
import time
import glob
import hashlib
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import torch

print("[08-00] Imports OK")
print("[08-00] torch:", torch.__version__)
print("[08-00] pandas:", pd.__version__)
print("[08-00] numpy:", np.__version__)


[08-00] Imports OK
[08-00] torch: 2.9.1+cpu
[08-00] pandas: 2.3.3
[08-00] numpy: 2.4.0


Locate repo root + run tags + load protocol/config artifacts

In [2]:
# [CELL 08-01] Locate repo root + run tags + load protocol/config artifacts

def find_repo_root(start: Path) -> Path:
    """Walk from ``start`` up through its ancestors and return the repo root.

    A directory holding both ``PROJECT_STATE.md`` and ``meta.json`` wins;
    failing that, the nearest directory holding ``PROJECT_STATE.md`` alone is
    returned.

    Raises:
        FileNotFoundError: no ancestor (or ``start`` itself) has PROJECT_STATE.md.
    """
    lineage = (start, *start.parents)
    # Nearest-first list of every directory that carries the state marker.
    marked = [folder for folder in lineage if (folder / "PROJECT_STATE.md").exists()]
    if not marked:
        raise FileNotFoundError("Could not locate repo root (expected PROJECT_STATE.md).")

    for folder in marked:
        if (folder / "meta.json").exists():
            return folder
    return marked[0]

REPO_ROOT = find_repo_root(Path.cwd().resolve())
print("[08-01] REPO_ROOT:", REPO_ROOT)

RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
print("[08-01] RUN_TAG:", RUN_TAG)

# Fixed upstream run tags (do NOT change)
TARGET_TAG = "20251229_163357"
SOURCE_TAG = "20251229_232834"

def load_json(path: Path) -> dict:
    """Read the UTF-8 encoded JSON document at ``path`` and return the parsed object."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

cfg_path_repo = REPO_ROOT / "data/processed/supervised" / f"dataloader_config_{TARGET_TAG}_{SOURCE_TAG}.json"
sanity_path_repo = REPO_ROOT / "data/processed/supervised" / f"sanity_metrics_{TARGET_TAG}_{SOURCE_TAG}.json"
gaps_path_repo = REPO_ROOT / "data/processed/normalized_events" / "session_gap_thresholds.json"

dataloader_cfg = load_json(cfg_path_repo)
sanity_metrics = load_json(sanity_path_repo)
session_gaps = load_json(gaps_path_repo)

print("[08-01] Loaded dataloader_config keys:", list(dataloader_cfg.keys()))
print("[08-01] Loaded sanity_metrics keys:", list(sanity_metrics.keys()))
print("[08-01] Loaded session_gap_thresholds keys:", list(session_gaps.keys()))

# Enforce fixed decisions
assert session_gaps["target"]["primary_threshold_seconds"] == 1800, "Target gap must be 30m (1800s)."
assert session_gaps["source"]["primary_threshold_seconds"] == 600, "Source gap must be 10m (600s)."
print("[08-01] ✅ Session gaps confirmed: target=30m, source=10m")

# Protocol section of the dataloader config loaded above.
proto = dataloader_cfg["protocol"]
# NOTE(review): K_LIST is hard-coded here, not read from `proto`, although the
# summary printed below is labelled "Protocol from 06" — confirm these cutoffs
# match the ones stored in the protocol config.
K_LIST = [5, 10, 20]
# Largest cutoff; rankings only ever need to be this deep.
MAX_K = max(K_LIST)

# Values below are read from the protocol config and coerced to plain Python types.
MAX_PREFIX_LEN = int(proto["max_prefix_len"])
CAP_ENABLED = bool(proto["source_long_session_policy"]["enabled"])
CAP_SESSION_LEN = int(proto["source_long_session_policy"]["cap_session_len"])
CAP_STRATEGY = str(proto["source_long_session_policy"]["cap_strategy"])

print("[08-01] Protocol from 06:")
print("  K_LIST:", K_LIST)
print("  MAX_PREFIX_LEN:", MAX_PREFIX_LEN)
print("  CAP_ENABLED:", CAP_ENABLED)
print("  CAP_SESSION_LEN:", CAP_SESSION_LEN)
print("  CAP_STRATEGY:", CAP_STRATEGY)

print("\n[08-01] CHECKPOINT A")
print("Confirm all three JSON files loaded and session gaps asserts passed.")


[08-01] REPO_ROOT: C:\mooc-coldstart-session-meta
[08-01] RUN_TAG: 20260102_140713
[08-01] Loaded dataloader_config keys: ['target', 'source', 'protocol']
[08-01] Loaded sanity_metrics keys: ['run_tag_target', 'run_tag_source', 'created_at', 'target', 'source', 'notes']
[08-01] Loaded session_gap_thresholds keys: ['generated_from_run_tag', 'generated_at', 'target', 'source', 'decision_notes']
[08-01] ✅ Session gaps confirmed: target=30m, source=10m
[08-01] Protocol from 06:
  K_LIST: [5, 10, 20]
  MAX_PREFIX_LEN: 20
  CAP_ENABLED: True
  CAP_SESSION_LEN: 200
  CAP_STRATEGY: take_last

[08-01] CHECKPOINT A
Confirm all three JSON files loaded and session gaps asserts passed.


Resolve artifact paths (target tensors + source sequences) + existence checks

In [3]:
# [CELL 08-02] Resolve artifact paths (target tensors + source sequences) + existence checks

def must_exist(p: Path, label: str):
    """Return ``p`` unchanged when it exists on disk.

    Raises:
        FileNotFoundError: ``p`` does not exist; the message names ``label``.
    """
    if p.exists():
        return p
    raise FileNotFoundError(f"{label} not found: {p}")

# TARGET tensors (05B output)
TARGET_TENSOR_DIR = REPO_ROOT / "data/processed/tensor_target"
target_train_pt = TARGET_TENSOR_DIR / f"target_tensor_train_{TARGET_TAG}.pt"
target_val_pt   = TARGET_TENSOR_DIR / f"target_tensor_val_{TARGET_TAG}.pt"
target_test_pt  = TARGET_TENSOR_DIR / f"target_tensor_test_{TARGET_TAG}.pt"
target_vocab_json = TARGET_TENSOR_DIR / f"target_vocab_items_{TARGET_TAG}.json"
target_tensor_meta_json = TARGET_TENSOR_DIR / f"target_tensor_metadata_{TARGET_TAG}.json"

# SOURCE sequences (05C output)
SOURCE_SEQ_ROOT = REPO_ROOT / "data/processed/session_sequences" / f"source_sessions_{SOURCE_TAG}"
source_train_dir = SOURCE_SEQ_ROOT / "train"
source_val_dir   = SOURCE_SEQ_ROOT / "val"
source_test_dir  = SOURCE_SEQ_ROOT / "test"
source_vocab_json = SOURCE_SEQ_ROOT / f"source_vocab_items_{SOURCE_TAG}.json"

# checks
for p, lbl in [
    (target_train_pt, "target_train_pt"),
    (target_val_pt, "target_val_pt"),
    (target_test_pt, "target_test_pt"),
    (target_vocab_json, "target_vocab_json"),
    (target_tensor_meta_json, "target_tensor_meta_json"),
    (source_train_dir, "source_train_dir"),
    (source_val_dir, "source_val_dir"),
    (source_test_dir, "source_test_dir"),
    (source_vocab_json, "source_vocab_json"),
]:
    must_exist(p, lbl)

print("[08-02] ✅ All required artifacts exist")

print("\n[08-02] CHECKPOINT B")
print("If any artifact missing, STOP and paste the error.")


[08-02] ✅ All required artifacts exist

[08-02] CHECKPOINT B
If any artifact missing, STOP and paste the error.


Torch loader (PyTorch 2.6+) + vocab loading + infer vocab sizes + PAD/UNK resolve

In [4]:
# [CELL 08-03] Torch loader (PyTorch 2.6+) + vocab loading + infer vocab sizes + PAD/UNK resolve

def torch_load_repo_artifact(path, map_location="cpu"):
    """Load a repo-produced ``.pt`` artifact with full (non weights-only) unpickling.

    Args:
        path: Location of the artifact; converted with ``str()`` before loading.
        map_location: Forwarded to ``torch.load``.

    Returns:
        Whatever object was stored in the file.

    Whether ``torch.load`` accepts ``weights_only`` is decided by inspecting its
    signature instead of catching ``TypeError`` around the load itself. The old
    try/except also swallowed a ``TypeError`` raised *while unpickling* and then
    silently retried with different defaults, hiding the real failure.

    SECURITY: ``weights_only=False`` runs the full pickle machinery, which can
    execute arbitrary code. Only use this on artifacts produced by this repo.
    """
    import inspect  # local import: only this helper needs it

    path = str(path)
    try:
        accepts_weights_only = "weights_only" in inspect.signature(torch.load).parameters
    except (TypeError, ValueError):
        # Signature not introspectable; behave like an old torch without the flag.
        accepts_weights_only = False

    if accepts_weights_only:
        obj = torch.load(path, map_location=map_location, weights_only=False)
        print(f"[08-03] torch.load OK (weights_only=False): {path}")
        return obj

    obj = torch.load(path, map_location=map_location)
    print(f"[08-03] torch.load OK (no weights_only arg): {path}")
    return obj

target_vocab = load_json(target_vocab_json)
source_vocab = load_json(source_vocab_json)

def infer_vocab_size(vocab: dict, name: str) -> int:
    """Work out the vocabulary size from a loaded vocab JSON object.

    Resolution order:
      1. an explicit size key (``vocab_size``, ``n_items``, ``num_items``, ``size``);
      2. a ``vocab`` dict — token->id (max id + 1), id->token (max key + 1),
         or, if keys are not integers, its length;
      3. an ``item2id`` dict (max id + 1, or 0 when empty);
      4. an ``items`` list (its length).

    ``name`` only labels the log line that reports which rule was used.

    Raises:
        KeyError: none of the layouts above is present.
    """
    explicit_keys = ("vocab_size", "n_items", "num_items", "size")
    for k in explicit_keys:
        if k not in vocab:
            continue
        vs = int(vocab[k])
        print(f"[08-03] {name}: vocab_size from key '{k}' = {vs}")
        return vs

    mapping = vocab["vocab"] if "vocab" in vocab else None
    if isinstance(mapping, dict):
        if not mapping:
            print(f"[08-03] {name}: vocab empty -> 0")
            return 0

        # The first entry decides how the mapping is oriented.
        first_value = mapping[next(iter(mapping))]
        if isinstance(first_value, int):
            vs = max(mapping.values()) + 1
            print(f"[08-03] {name}: vocab_size from max(vocab values)+1 (token->id) = {vs}")
            return vs

        try:
            vs = max(int(raw_key) for raw_key in mapping) + 1
            print(f"[08-03] {name}: vocab_size from max(vocab keys)+1 (id->token) = {vs}")
            return vs
        except Exception:
            vs = len(mapping)
            print(f"[08-03] {name}: vocab_size fallback len(vocab) = {vs}")
            return vs

    item2id = vocab["item2id"] if "item2id" in vocab else None
    if isinstance(item2id, dict):
        known_ids = list(item2id.values())
        vs = (max(known_ids) + 1) if known_ids else 0
        print(f"[08-03] {name}: vocab_size from max(item2id values)+1 = {vs}")
        return vs

    item_list = vocab["items"] if "items" in vocab else None
    if isinstance(item_list, list):
        vs = len(item_list)
        print(f"[08-03] {name}: vocab_size from len(items) = {vs}")
        return vs

    raise KeyError(f"[08-03] {name}: Could not infer vocab_size. Keys={list(vocab.keys())}")

vocab_size_target = infer_vocab_size(target_vocab, "TARGET")
vocab_size_source = infer_vocab_size(source_vocab, "SOURCE")

# Resolve PAD/UNK for target (target vocab uses "pad_token"/"unk_token" + "vocab" token->id)
def get_special_id(vocab_obj: dict, token_key: str, fallback: int, name: str) -> int:
    """Resolve the integer id of a special token (e.g. PAD/UNK).

    ``vocab_obj[token_key]`` names the token; its id is then looked up in
    ``vocab_obj["vocab"]``. ``fallback`` is returned, and a message is printed,
    when the token name is missing or does not map to an int.
    """
    tok = vocab_obj.get(token_key)
    if tok is None:
        print(f"[08-03] {name}: missing {token_key}, fallback id={fallback}")
        return fallback

    mapping = vocab_obj.get("vocab", {})
    resolved = None
    if isinstance(mapping, dict) and tok in mapping:
        resolved = mapping[tok]
    if isinstance(resolved, int):
        return int(resolved)

    print(f"[08-03] {name}: could not resolve {token_key}='{tok}' in vocab mapping, fallback id={fallback}")
    return fallback

PAD_ID_TARGET = get_special_id(target_vocab, "pad_token", 0, "TARGET")
UNK_ID_TARGET = get_special_id(target_vocab, "unk_token", 1, "TARGET")

# Resolve PAD/UNK for source (explicit)
PAD_ID_SOURCE = int(source_vocab.get("pad_id", 0))
UNK_ID_SOURCE = int(source_vocab.get("unk_id", 1))

print("[08-03] PAD_ID_TARGET:", PAD_ID_TARGET, "| UNK_ID_TARGET:", UNK_ID_TARGET)
print("[08-03] PAD_ID_SOURCE:", PAD_ID_SOURCE, "| UNK_ID_SOURCE:", UNK_ID_SOURCE)

assert PAD_ID_TARGET == 0, "Target PAD must be 0."
assert PAD_ID_SOURCE == 0, "Source PAD must be 0."

# Build source token->id mapping (strings -> ids)
def build_token_to_id(vocab_obj: dict) -> dict:
    """Return a token->id dict derived from a loaded vocab JSON object.

    Layouts tried, in order:
      * ``vocab`` already token->id (returned as-is, same object);
      * ``vocab`` stored id->token (inverted, keys cast to int);
      * ``item2id`` dict (returned as-is);
      * ``items`` list (token -> list position).

    Raises:
        KeyError: no supported layout is present.
    """
    mapping = vocab_obj["vocab"] if "vocab" in vocab_obj else None
    if isinstance(mapping, dict) and len(mapping) > 0:
        first_key, first_value = next(iter(mapping.items()))
        if isinstance(first_value, int):
            return mapping
        try:
            int(first_key)  # probe: do the keys look like integer ids?
            return {token: int(raw_id) for raw_id, token in mapping.items()}
        except Exception:
            # Not an id->token table after all; fall through to other layouts.
            pass

    item2id = vocab_obj["item2id"] if "item2id" in vocab_obj else None
    if isinstance(item2id, dict):
        return item2id

    item_list = vocab_obj["items"] if "items" in vocab_obj else None
    if isinstance(item_list, list):
        return dict(zip(item_list, range(len(item_list))))

    raise KeyError(f"[08-03] Could not build token_to_id. Keys={list(vocab_obj.keys())}")

source_token_to_id = build_token_to_id(source_vocab)
print("[08-03] source_token_to_id size:", len(source_token_to_id))

# Map a source seq (np.ndarray/list of strings) to ids
def map_source_seq_to_ids(seq) -> np.ndarray:
    """Convert one source session into an int64 array of item ids.

    ``None`` and empty sequences give an empty array. A sequence whose first
    element is already an integer is cast directly; otherwise every element is
    looked up in ``source_token_to_id`` and unknown tokens become ``UNK_ID_SOURCE``.
    """
    if seq is None:
        return np.array([], dtype=np.int64)

    tokens = seq.tolist() if isinstance(seq, np.ndarray) else list(seq)
    if len(tokens) == 0:
        return np.array([], dtype=np.int64)

    # Only the first element is inspected to decide whether mapping is needed.
    if isinstance(tokens[0], (int, np.integer)):
        return np.asarray(tokens, dtype=np.int64)

    lookup = source_token_to_id.get
    return np.fromiter((lookup(tok, UNK_ID_SOURCE) for tok in tokens), dtype=np.int64)

print("[08-03] ✅ Vocab + mapping ready")

print("\n[08-03] CHECKPOINT C")
print("Confirm vocab sizes + PAD/UNK printed as expected before building KNN indexes.")


[08-03] TARGET: vocab_size from max(vocab values)+1 (token->id) = 747
[08-03] SOURCE: vocab_size from key 'vocab_size' = 1620
[08-03] PAD_ID_TARGET: 0 | UNK_ID_TARGET: 1
[08-03] PAD_ID_SOURCE: 0 | UNK_ID_SOURCE: 1
[08-03] source_token_to_id size: 1620
[08-03] ✅ Vocab + mapping ready

[08-03] CHECKPOINT C
Confirm vocab sizes + PAD/UNK printed as expected before building KNN indexes.


Metrics (reuse 06 protocol): HR/MRR/NDCG @ K={5,10,20}

In [5]:
# [CELL 08-04] Metrics (reuse 06 protocol): HR/MRR/NDCG @ K={5,10,20}

def init_metrics():
    """Return zeroed accumulators keyed ``"<metric>@<k>"`` for HR/MRR/NDCG at every k in ``K_LIST``."""
    totals = {}
    for m in ("HR", "MRR", "NDCG"):
        for k in K_LIST:
            totals[f"{m}@{k}"] = 0.0
    return totals

def update_metrics_from_rank(metrics: dict, rank0: int | None):
    if rank0 is None:
        return
    r = rank0 + 1
    for k in K_LIST:
        if r <= k:
            metrics[f"HR@{k}"] += 1.0
            metrics[f"MRR@{k}"] += 1.0 / r
            metrics[f"NDCG@{k}"] += 1.0 / math.log2(r + 1.0)

def finalize_metrics(metrics: dict, n: int) -> dict:
    """Turn running sums into per-example averages; every metric is 0.0 when ``n`` is not positive."""
    if n > 0:
        return {label: float(total / n) for label, total in metrics.items()}
    return {label: 0.0 for label in metrics}

print("[08-04] ✅ Metric functions ready")


[08-04] ✅ Metric functions ready


Session-KNN core (inverted index) + scoring

In [6]:
# [CELL 08-05] Session-KNN core (inverted index) + scoring
# Similarity: cosine on binary sets: sim = overlap / sqrt(|Q|*|S|)
# Scoring: sum(sim * pos_weight) over neighbor sessions and their items
# pos_weight: 1/(distance_from_end) where end distance = 1..L

from collections import defaultdict

# Hyper-parameters of the Session-KNN scorer; printed below and passed to
# score_items_from_neighbors by the evaluation functions.
SKNN_CFG = {
    "k_neighbors": 200,          # most-similar sessions kept for scoring
    "candidate_sessions_cap": 50000,  # safety cap; score_items_from_neighbors counts posting entries visited, not distinct sessions
    "pos_weighting": "inv_dist", # inv_dist | none
    "exclude_pad": True,  # NOTE(review): not read by any code visible here; PAD is always filtered — confirm or remove
}

print("[08-05] SKNN_CFG:", SKNN_CFG)

def build_inverted_index(sessions_items: list[np.ndarray], pad_id: int) -> tuple[list[int], dict[int, list[int]]]:
    """Index sessions by the distinct items they contain.

    Args:
        sessions_items: one integer array per session/prefix (``None`` or empty allowed).
        pad_id: padding id, never indexed and never counted.

    Returns:
        ``(session_lens, inv)`` where ``session_lens[i]`` is the number of
        distinct non-PAD items in session ``i`` (0 for ``None``/empty) and
        ``inv[item_id]`` lists the indices of the sessions containing that item.
    """
    postings = defaultdict(list)
    distinct_counts = []

    for position, session in enumerate(sessions_items):
        if session is None or session.size == 0:
            distinct_counts.append(0)
            continue

        # Each session contributes every distinct item at most once.
        distinct = np.unique(session)
        distinct = distinct[distinct != pad_id]
        distinct_counts.append(int(distinct.size))

        for item in distinct.tolist():
            postings[int(item)].append(position)

    return distinct_counts, postings

def score_items_from_neighbors(
    query_items: np.ndarray,
    sessions_items: list[np.ndarray],
    session_lens: list[int],
    inv: dict[int, list[int]],
    k_neighbors: int,
    candidate_sessions_cap: int,
    pos_weighting: str,
    pad_id: int,
) -> dict[int, float]:
    """
    Score candidate next items for one query prefix with Session-KNN.

    Steps:
      1. Collect indexed sessions sharing at least one distinct non-PAD item
         with the query, via the inverted index ``inv``.
      2. Similarity = overlap / sqrt(|query set| * |session set|), with the
         session set size taken from ``session_lens``.
      3. Keep the ``k_neighbors`` most similar sessions.
      4. Every non-PAD item occurrence in a neighbour adds ``sim`` ("none") or
         ``sim / distance_from_end`` ("inv_dist", last item has distance 1).

    Args:
        query_items: item ids of the query prefix.
        sessions_items: indexed sessions; ``inv`` holds positions into this list.
        session_lens: distinct-item count per indexed session.
        inv: item id -> list of session indices containing it.
        k_neighbors: number of most-similar sessions used for scoring.
        candidate_sessions_cap: stops candidate collection once this many
            posting entries have been visited.
        pos_weighting: "none" or "inv_dist".
        pad_id: padding id, removed from the query and from neighbour sessions.

    Returns:
        Plain dict item_id -> accumulated score; empty when the query has no
        non-PAD items or no candidate session has positive similarity.

    Raises:
        ValueError: ``pos_weighting`` is not recognised. This is only checked
            once a neighbour with at least one non-PAD item is being scored.
    """
    # query unique items for similarity
    q = np.unique(query_items)
    q = q[q != pad_id]
    if q.size == 0:
        return {}

    # accumulate overlap counts using inverted index
    overlap = defaultdict(int)
    # NOTE(review): `candidates` counts posting entries visited, so one session
    # reached through several query items is counted several times. Once the cap
    # is hit, remaining query items (np.unique yields ascending ids) contribute
    # nothing and overlaps may be under-counted — confirm this is acceptable.
    candidates = 0

    for it in q:
        posting = inv.get(int(it), [])
        # iterate through postings
        for s_idx in posting:
            overlap[s_idx] += 1
            candidates += 1
            if candidates >= candidate_sessions_cap:
                break
        # propagate the inner break so no further query items are visited
        if candidates >= candidate_sessions_cap:
            break

    if len(overlap) == 0:
        return {}

    # compute similarity for candidate sessions
    q_len = float(len(q))
    sims = []
    for s_idx, inter in overlap.items():
        s_len = float(session_lens[s_idx])
        if s_len <= 0:
            continue
        sim = float(inter) / math.sqrt(q_len * s_len)
        if sim > 0:
            sims.append((sim, s_idx))

    if len(sims) == 0:
        return {}

    # Sort on similarity only; the sort is stable, so equally similar sessions
    # keep the order in which they were first reached above.
    sims.sort(reverse=True, key=lambda x: x[0])
    sims = sims[:k_neighbors]

    # score items from neighbor sessions
    # NOTE(review): nothing here removes items that already occur in the query,
    # so they can be recommended back — confirm that is the intended protocol.
    scores = defaultdict(float)
    for sim, s_idx in sims:
        items = sessions_items[s_idx]
        if items is None or items.size == 0:
            continue
        # exclude PAD
        if pad_id is not None:
            items = items[items != pad_id]
        if items.size == 0:
            continue

        if pos_weighting == "none":
            # every occurrence counts, so repeated items accumulate sim repeatedly
            for it in items:
                scores[int(it)] += sim
        elif pos_weighting == "inv_dist":
            L = int(items.size)
            # distance_from_end: 1..L
            for pos, it in enumerate(items):
                dist = (L - pos)
                scores[int(it)] += sim * (1.0 / float(dist))
        else:
            raise ValueError(f"Unknown pos_weighting: {pos_weighting}")

    # return a plain dict rather than the defaultdict accumulator
    return dict(scores)

def topk_from_scores(scores: dict[int, float], k: int, pad_id: int) -> list[int]:
    """Return the ``k`` best item ids, highest score first.

    Ties are broken by ascending item id so the ranking is deterministic.
    ``pad_id`` is never returned.

    The caller's ``scores`` dict is left untouched. The previous version
    popped ``pad_id`` out of it, silently mutating the argument.

    Args:
        scores: item id -> score.
        k: maximum number of ids to return.
        pad_id: padding id to leave out of the ranking.

    Returns:
        Up to ``k`` item ids; an empty list when ``scores`` is empty.
    """
    if not scores:
        return []

    # Filter PAD while building the sort input instead of deleting it in place.
    ranked = sorted(
        ((item, score) for item, score in scores.items() if item != pad_id),
        key=lambda pair: (-pair[1], pair[0]),
    )
    return [item for item, _ in ranked[:k]]

print("[08-05] ✅ Session-KNN helpers ready")

print("\n[08-05] CHECKPOINT D")
print("If you want to adjust SKNN_CFG (k_neighbors/caps/pos_weighting), do it now before building indexes.")


[08-05] SKNN_CFG: {'k_neighbors': 200, 'candidate_sessions_cap': 50000, 'pos_weighting': 'inv_dist', 'exclude_pad': True}
[08-05] ✅ Session-KNN helpers ready

[08-05] CHECKPOINT D
If you want to adjust SKNN_CFG (k_neighbors/caps/pos_weighting), do it now before building indexes.


TARGET: build KNN index from target TRAIN prefixes (small => full in-memory)

In [7]:
# [CELL 08-06] TARGET: build KNN index from target TRAIN prefixes (small => full in-memory)
# Training "sessions" = each TRAIN row prefix items (PAD removed, attn_mask used).
# This keeps everything consistent with the supervised next-item setting.

# Load the target TRAIN tensors and turn every row into one indexed "session".
train_obj = torch_load_repo_artifact(target_train_pt, map_location="cpu")
assert isinstance(train_obj, dict), "Expected dict in target train pt"

input_ids = torch.as_tensor(train_obj["input_ids"]).detach().cpu().long()
attn_mask = torch.as_tensor(train_obj["attn_mask"]).detach().cpu().long()

print("[08-06] target_train input_ids:", tuple(input_ids.shape), "attn_mask:", tuple(attn_mask.shape))

# NOTE(review): only the prefix items of each row are indexed. eval_target_knn
# reads a "labels" entry from the val/test objects, but no label is appended to
# the sessions built here, so a neighbour can only vote for items inside its own
# prefix — confirm this is the intended baseline definition.
target_train_sessions = []
for i in range(input_ids.shape[0]):
    # keep positions flagged by the attention mask, then drop any remaining PAD ids
    mask = attn_mask[i].numpy().astype(bool)
    arr = input_ids[i].numpy()[mask].astype(np.int64)
    arr = arr[arr != PAD_ID_TARGET]
    # keep only last MAX_PREFIX_LEN (should already be)
    if arr.size > MAX_PREFIX_LEN:
        arr = arr[-MAX_PREFIX_LEN:]
    target_train_sessions.append(arr)

print("[08-06] target_train_sessions:", len(target_train_sessions),
      "| avg_len:", float(np.mean([len(s) for s in target_train_sessions])))

# Distinct-item counts per session plus item -> session postings.
target_session_lens, target_inv = build_inverted_index(target_train_sessions, PAD_ID_TARGET)
print("[08-06] target_inv items:", len(target_inv))

# quick posting stats
# (min()/max() below would raise on an empty index, i.e. if no row had any item)
post_sizes = np.array([len(v) for v in target_inv.values()], dtype=np.int64)
print("[08-06] posting sizes: min/median/p95/max =",
      int(post_sizes.min()), int(np.median(post_sizes)), int(np.quantile(post_sizes, 0.95)), int(post_sizes.max()))

print("\n[08-06] CHECKPOINT E")
print("Target KNN index built. Next we evaluate on target VAL/TEST.")


[08-03] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_train_20251229_163357.pt
[08-06] target_train input_ids: (1944, 20) attn_mask: (1944, 20)
[08-06] target_train_sessions: 1944 | avg_len: 6.370884773662551
[08-06] target_inv items: 631
[08-06] posting sizes: min/median/p95/max = 1 14 62 174

[08-06] CHECKPOINT E
Target KNN index built. Next we evaluate on target VAL/TEST.


TARGET evaluation (VAL + TEST) with Session-KNN

In [8]:
# [CELL 08-07] TARGET evaluation (VAL + TEST) with Session-KNN

def eval_target_knn(pt_path: Path, split_name: str) -> dict:
    """Evaluate Session-KNN on one target split stored as a ``.pt`` dict.

    The loaded object must provide ``input_ids``, ``attn_mask`` and ``labels``.
    Rows whose label equals ``PAD_ID_TARGET`` are skipped. Every other row is
    scored against the target TRAIN index and the label's rank inside the top
    ``MAX_K`` feeds the HR/MRR/NDCG sums.

    Returns:
        Averaged metrics plus ``_n_examples`` (number of rows evaluated).
    """
    obj = torch_load_repo_artifact(pt_path, map_location="cpu")
    assert isinstance(obj, dict), f"{split_name}: expected dict"

    def _as_long(field):
        # Same conversion for all three fields: CPU, detached, int64.
        return torch.as_tensor(obj[field]).detach().cpu().long()

    x = _as_long("input_ids")
    m = _as_long("attn_mask")
    y = _as_long("labels").numpy()

    # Arguments that are identical for every query.
    knn_kwargs = dict(
        sessions_items=target_train_sessions,
        session_lens=target_session_lens,
        inv=target_inv,
        k_neighbors=int(SKNN_CFG["k_neighbors"]),
        candidate_sessions_cap=int(SKNN_CFG["candidate_sessions_cap"]),
        pos_weighting=str(SKNN_CFG["pos_weighting"]),
        pad_id=PAD_ID_TARGET,
    )

    metrics = init_metrics()
    n = 0

    for i in range(x.shape[0]):
        label = int(y[i])
        if label == PAD_ID_TARGET:
            continue

        keep = m[i].numpy().astype(bool)
        q = x[i].numpy()[keep].astype(np.int64)
        q = q[q != PAD_ID_TARGET]
        if q.size > MAX_PREFIX_LEN:
            q = q[-MAX_PREFIX_LEN:]

        scores = score_items_from_neighbors(query_items=q, **knn_kwargs)
        top = topk_from_scores(scores, MAX_K, PAD_ID_TARGET)

        # 0-based rank of the label inside the top list, None if absent.
        rank0 = top.index(label) if label in top else None
        update_metrics_from_rank(metrics, rank0)
        n += 1

        if (i + 1) % 2000 == 0:
            print(f"[08-07] {split_name}: processed {i+1}/{x.shape[0]}")

    out = finalize_metrics(metrics, n)
    out["_n_examples"] = int(n)
    return out

t_val_knn = eval_target_knn(target_val_pt, "target_val")
t_test_knn = eval_target_knn(target_test_pt, "target_test")

print("[08-07] TARGET VAL (Session-KNN):", t_val_knn)
print("[08-07] TARGET TEST (Session-KNN):", t_test_knn)

print("\n[08-07] CHECKPOINT F")
print("Paste TARGET KNN metrics if you want to sanity-check before running SOURCE (heavy).")


[08-03] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_val_20251229_163357.pt
[08-03] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_test_20251229_163357.pt
[08-07] TARGET VAL (Session-KNN): {'HR@5': 0.30687830687830686, 'HR@10': 0.48148148148148145, 'HR@20': 0.6243386243386243, 'MRR@5': 0.11940035273368603, 'MRR@10': 0.14264508272444779, 'MRR@20': 0.15248653060646283, 'NDCG@5': 0.1662376128266329, 'NDCG@10': 0.22261874958963768, 'NDCG@20': 0.25863209838882517, '_n_examples': 189}
[08-07] TARGET TEST (Session-KNN): {'HR@5': 0.255, 'HR@10': 0.37, 'HR@20': 0.49, 'MRR@5': 0.08908333333333332, 'MRR@10': 0.10474999999999998, 'MRR@20': 0.1125188543826315, 'NDCG@5': 0.1301628535815534, 'NDCG@10': 0.16767645572690335, 'NDCG@20': 0.1973000384141053, '_n_examples': 200}

[08-07] CHECKPOINT F
Paste TARGET KNN metrics if you want to sanity-check before running SOURCE (heavy).

SOURCE: build a memory-safe KNN index from a deterministic sample of TRAIN sessions

In [9]:
# [CELL 08-08] SOURCE: build a memory-safe KNN index from a deterministic sample of TRAIN sessions
# Source is huge, so we build an index from a reproducible subset of sessions.
# IMPORTANT: This is model-side sampling (not protocol). It is logged in run_meta.json.

SOURCE_INDEX_CFG = {
    "sample_size_sessions": 100_000,   # adjust if you have RAM; 100k is a reasonable starting point
    "sample_mod": 50,                  # keep session if md5(session_id)%sample_mod == 0
    "max_shards_to_scan": None,        # None = scan until sample_size reached (or end)
}

print("[08-08] SOURCE_INDEX_CFG:", SOURCE_INDEX_CFG)

def list_parquet_shards(dir_path: Path) -> list[Path]:
    """Return the ``sessions_b*.parquet`` files directly under ``dir_path``, sorted.

    Raises:
        FileNotFoundError: the directory holds no matching shard.
    """
    pattern = str(dir_path / "sessions_b*.parquet")
    shard_paths = [Path(hit) for hit in glob.glob(pattern)]
    shard_paths.sort()
    if not shard_paths:
        raise FileNotFoundError(f"No shards found under {dir_path} (expected sessions_b*.parquet)")
    return shard_paths

train_shards = list_parquet_shards(source_train_dir)
val_shards   = list_parquet_shards(source_val_dir)
test_shards  = list_parquet_shards(source_test_dir)

print("[08-08] Source shards:", "train=", len(train_shards), "val=", len(val_shards), "test=", len(test_shards))

# detect seq col
probe = pd.read_parquet(train_shards[0])
seq_col = "items" if "items" in probe.columns else None
if seq_col is None:
    # fallback detector
    def detect_sequence_column(df: pd.DataFrame) -> str:
        """Pick the column of ``df`` that holds the per-session item sequence.

        A column with one of the conventional names wins; otherwise the first
        column whose first non-null value is a list, tuple or ndarray is used.

        Raises:
            KeyError: no column qualifies.
        """
        conventional = ("items", "item_ids", "sequence", "seq", "course_ids")
        by_name = next((c for c in conventional if c in df.columns), None)
        if by_name is not None:
            return by_name

        for c in df.columns:
            non_null = df[c].dropna()
            if len(non_null) > 0 and isinstance(non_null.iloc[0], (list, tuple, np.ndarray)):
                return c

        raise KeyError(f"Could not detect sequence column from columns={list(df.columns)}")
    seq_col = detect_sequence_column(probe)

print("[08-08] Using seq_col:", seq_col)
assert "session_id" in probe.columns, "[08-08] Expected session_id column for deterministic sampling"

def stable_mod(session_id, mod: int) -> int:
    """Deterministic bucket for a session: MD5 of ``str(session_id)`` (UTF-8), read as an integer, modulo ``mod``."""
    digest = hashlib.md5(str(session_id).encode("utf-8")).digest()
    # Big-endian bytes give the same integer as parsing the hex digest in base 16.
    return int.from_bytes(digest, "big") % int(mod)

# Build the in-memory source index from a deterministic sample of TRAIN sessions.
# A session is kept when md5(session_id) % sample_mod == 0, until
# sample_size_sessions sessions have been collected or the shards run out.
source_index_sessions = []
source_index_session_ids = []
scanned_sessions = 0
kept_sessions = 0

# Loop-invariant config values, read once.
sample_mod = int(SOURCE_INDEX_CFG["sample_mod"])
sample_target = int(SOURCE_INDEX_CFG["sample_size_sessions"])

t0 = time.time()
max_shards = SOURCE_INDEX_CFG["max_shards_to_scan"]
# Slice up front rather than breaking inside the loop. With the in-loop break,
# `shard_i` ended one past the last shard actually read, so the reported
# "shards_scanned" (here and in run_meta) was off by one whenever the limit hit.
shards_to_scan = train_shards if max_shards is None else train_shards[: max(int(max_shards), 0)]
shard_i = 0  # stays 0 when no shard is scanned at all

for shard_i, fp in enumerate(shards_to_scan, 1):
    df = pd.read_parquet(fp, columns=["session_id", seq_col])

    for sid, seq in zip(df["session_id"].values, df[seq_col].values):
        scanned_sessions += 1
        if stable_mod(sid, sample_mod) != 0:
            continue

        # A missing sequence cannot be indexed; eval_source_knn skips these too.
        # Without this guard len(None) below raises TypeError.
        if seq is None:
            continue

        # cap long sessions before mapping
        if CAP_ENABLED and len(seq) > CAP_SESSION_LEN and CAP_STRATEGY == "take_last":
            seq = seq[-CAP_SESSION_LEN:]

        arr = map_source_seq_to_ids(seq)
        arr = arr[arr != PAD_ID_SOURCE]
        if arr.size < 2:
            continue

        # The full capped session (<= CAP_SESSION_LEN) is kept, not just the last
        # MAX_PREFIX_LEN items, to match session-based neighbour structure.

        # range check
        if arr.min() < 0 or arr.max() >= vocab_size_source:
            raise ValueError(f"[08-08] mapped source id out of range in {fp}: min={arr.min()} max={arr.max()} vocab={vocab_size_source}")

        # arr is already a fresh int64 array; copy=False avoids a second copy.
        source_index_sessions.append(arr.astype(np.int64, copy=False))
        source_index_session_ids.append(str(sid))
        kept_sessions += 1

        if kept_sessions >= sample_target:
            break

    if shard_i % 25 == 0:
        dt = time.time() - t0
        print(f"[08-08] scanned_shards={shard_i}/{len(train_shards)} | scanned_sessions={scanned_sessions:,} kept={kept_sessions:,} | elapsed={dt:.1f}s")

    if kept_sessions >= sample_target:
        break

dt = time.time() - t0
print("[08-08] ✅ Source index sample built:",
      "kept_sessions=", f"{kept_sessions:,}",
      "| scanned_sessions=", f"{scanned_sessions:,}",
      "| shards_scanned=", shard_i,
      "| elapsed=", f"{dt:.1f}s")

source_session_lens, source_inv = build_inverted_index(source_index_sessions, PAD_ID_SOURCE)
print("[08-08] source_inv items:", len(source_inv))

post_sizes = np.array([len(v) for v in source_inv.values()], dtype=np.int64)
print("[08-08] posting sizes: min/median/p95/max =",
      int(post_sizes.min()), int(np.median(post_sizes)), int(np.quantile(post_sizes, 0.95)), int(post_sizes.max()))

print("\n[08-08] CHECKPOINT G")
print("If kept_sessions is too low, reduce sample_mod (e.g., 20) and rerun this cell.")
print("Next we evaluate source val/test with an optional pair cap for compute.")


[08-08] SOURCE_INDEX_CFG: {'sample_size_sessions': 100000, 'sample_mod': 50, 'max_shards_to_scan': None}
[08-08] Source shards: train= 1024 val= 1024 test= 1024
[08-08] Using seq_col: items
[08-08] scanned_shards=25/1024 | scanned_sessions=162,528 kept=3,310 | elapsed=0.9s
[08-08] scanned_shards=50/1024 | scanned_sessions=325,575 kept=6,585 | elapsed=1.7s
[08-08] scanned_shards=75/1024 | scanned_sessions=488,607 kept=9,823 | elapsed=2.7s
[08-08] scanned_shards=100/1024 | scanned_sessions=650,531 kept=13,105 | elapsed=3.5s
[08-08] scanned_shards=125/1024 | scanned_sessions=813,823 kept=16,526 | elapsed=4.4s
[08-08] scanned_shards=150/1024 | scanned_sessions=977,651 kept=19,826 | elapsed=5.3s
[08-08] scanned_shards=175/1024 | scanned_sessions=1,140,398 kept=23,115 | elapsed=6.4s
[08-08] scanned_shards=200/1024 | scanned_sessions=1,304,244 kept=26,373 | elapsed=7.4s
[08-08] scanned_shards=225/1024 | scanned_sessions=1,466,729 kept=29,689 | elapsed=8.6s
[08-08] scanned_shards=250/1024 | sc

SOURCE evaluation (VAL + TEST) with Session-KNN (streaming transitions)

In [10]:
# [CELL 08-09] SOURCE evaluation (VAL + TEST) with Session-KNN (streaming transitions)
# NOTE: Source has ~13.8M pairs per split; Session-KNN is expensive.
# We include an explicit compute cap (logged). Set to None to run full evaluation.
# This does NOT change the protocol definition; it's a runtime limit for feasibility.

SOURCE_EVAL_CFG = {
    "pair_cap": 300_000,   # None for full (may take a very long time); otherwise evaluates first N pairs encountered
    "log_every_pairs": 100_000,
}

print("[08-09] SOURCE_EVAL_CFG:", SOURCE_EVAL_CFG)

def eval_source_knn(shards: list[Path], split_name: str) -> dict:
    """Stream source sessions from parquet shards and evaluate Session-KNN.

    Each session is capped (per the long-session policy), mapped to ids and
    stripped of PAD. Every position ``t >= 1`` forms one (prefix, next item)
    pair, with the prefix limited to its last ``MAX_PREFIX_LEN`` items.
    Evaluation returns early once ``SOURCE_EVAL_CFG["pair_cap"]`` pairs have
    been scored (when that value is not ``None``).

    Returns:
        Averaged metrics plus ``_n_pairs``, ``_n_sessions_seen``,
        ``_n_unk_labels``, ``_pair_cap`` and, when capped, ``_note``.
    """
    pair_cap = SOURCE_EVAL_CFG["pair_cap"]
    log_every = int(SOURCE_EVAL_CFG["log_every_pairs"])

    metrics = init_metrics()
    n_pairs = 0
    n_sessions_seen = 0
    n_unk_labels = 0

    def _summarize(capped: bool) -> dict:
        # Package the running counters; shared by the capped and full exits.
        out = finalize_metrics(metrics, n_pairs)
        out["_n_pairs"] = int(n_pairs)
        out["_n_sessions_seen"] = int(n_sessions_seen)
        out["_n_unk_labels"] = int(n_unk_labels)
        if capped:
            out["_pair_cap"] = int(pair_cap)
            out["_note"] = "Evaluation capped for compute feasibility (see SOURCE_EVAL_CFG)."
        else:
            out["_pair_cap"] = None
        return out

    t0 = time.time()
    for fp in shards:
        shard_df = pd.read_parquet(fp, columns=[seq_col])

        for seq in shard_df[seq_col].values:
            if seq is None:
                continue

            if CAP_ENABLED and len(seq) > CAP_SESSION_LEN and CAP_STRATEGY == "take_last":
                seq = seq[-CAP_SESSION_LEN:]

            arr = map_source_seq_to_ids(seq)
            arr = arr[arr != PAD_ID_SOURCE]
            session_len = int(arr.size)
            if session_len < 2:
                continue

            n_sessions_seen += 1

            # One evaluation pair per transition: predict arr[t] from arr[:t].
            for t in range(1, session_len):
                y = int(arr[t])
                if y == PAD_ID_SOURCE:
                    continue
                if y == UNK_ID_SOURCE:
                    n_unk_labels += 1

                prefix = arr[:t]
                if prefix.size > MAX_PREFIX_LEN:
                    prefix = prefix[-MAX_PREFIX_LEN:]

                scores = score_items_from_neighbors(
                    query_items=prefix,
                    sessions_items=source_index_sessions,
                    session_lens=source_session_lens,
                    inv=source_inv,
                    k_neighbors=int(SKNN_CFG["k_neighbors"]),
                    candidate_sessions_cap=int(SKNN_CFG["candidate_sessions_cap"]),
                    pos_weighting=str(SKNN_CFG["pos_weighting"]),
                    pad_id=PAD_ID_SOURCE,
                )
                top = topk_from_scores(scores, MAX_K, PAD_ID_SOURCE)

                # 0-based rank of the true next item, None when not retrieved.
                rank0 = top.index(y) if y in top else None
                update_metrics_from_rank(metrics, rank0)

                n_pairs += 1

                if (n_pairs % log_every) == 0:
                    dt = time.time() - t0
                    print(f"[08-09] {split_name}: pairs={n_pairs:,} sessions_seen={n_sessions_seen:,} elapsed={dt:.1f}s")

                if pair_cap is not None and n_pairs >= int(pair_cap):
                    return _summarize(capped=True)

    return _summarize(capped=False)

s_val_knn = eval_source_knn(val_shards, "source_val")
s_test_knn = eval_source_knn(test_shards, "source_test")

print("[08-09] SOURCE VAL (Session-KNN):", s_val_knn)
print("[08-09] SOURCE TEST (Session-KNN):", s_test_knn)

print("\n[08-09] CHECKPOINT H")
print("Paste SOURCE KNN metrics. If you want full eval, set SOURCE_EVAL_CFG['pair_cap']=None and rerun (may be long).")


[08-09] SOURCE_EVAL_CFG: {'pair_cap': 300000, 'log_every_pairs': 100000}
[08-09] source_val: pairs=100,000 sessions_seen=5,825 elapsed=244.6s
[08-09] source_val: pairs=200,000 sessions_seen=12,015 elapsed=520.3s
[08-09] source_val: pairs=300,000 sessions_seen=18,120 elapsed=774.9s
[08-09] source_test: pairs=100,000 sessions_seen=6,060 elapsed=242.3s
[08-09] source_test: pairs=200,000 sessions_seen=12,119 elapsed=478.2s
[08-09] source_test: pairs=300,000 sessions_seen=18,311 elapsed=714.1s
[08-09] SOURCE VAL (Session-KNN): {'HR@5': 0.9701633333333334, 'HR@10': 0.9728133333333333, 'HR@20': 0.9742666666666666, 'MRR@5': 0.939244777777809, 'MRR@10': 0.9396163240741069, 'MRR@20': 0.9397188795157997, 'NDCG@5': 0.947181040573676, 'NDCG@10': 0.9480558182513753, 'NDCG@20': 0.9484253802103635, '_n_pairs': 300000, '_n_sessions_seen': 18120, '_n_unk_labels': 2, '_pair_cap': 300000, '_note': 'Evaluation capped for compute feasibility (see SOURCE_EVAL_CFG).'}
[08-09] SOURCE TEST (Session-KNN): {'HR@5

Write report artifacts to reports/08_session_knn_baseline/<RUN_TAG>/ + update meta.json

In [12]:
# [CELL 08-10] Write report artifacts to reports/08_session_knn_baseline/<RUN_TAG>/ + update meta.json

REPORT_DIR = REPO_ROOT / "reports" / "08_session_knn_baseline" / RUN_TAG
REPORT_DIR.mkdir(parents=True, exist_ok=True)

def save_json(obj: dict, path: Path):
    """Write ``obj`` to ``path`` as indented UTF-8 JSON without risking the old file.

    The payload is first serialized to a sibling ``<name>.tmp`` file and only
    then moved over ``path`` with ``os.replace``. If serialization fails (for
    example because ``obj`` contains a value ``json`` cannot encode), the
    previous content of ``path`` is left untouched and the temp file is removed.
    This matters because the same helper rewrites the shared ``meta.json``.

    Args:
        obj: JSON-serializable object to store.
        path: Destination file; missing parent directories are created.

    Raises:
        TypeError / ValueError: propagated from ``json.dump`` for
            non-serializable content.
        OSError: propagated from the filesystem operations.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the destination so os.replace never crosses filesystems.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(obj, f, indent=2, ensure_ascii=False)
        # os.replace overwrites an existing destination on every platform.
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temp file behind; re-raise unchanged.
        tmp_path.unlink(missing_ok=True)
        raise

# Provenance record for this run (serialized as run_meta.json further down).
# Key order is kept stable because it fixes the layout of the written file.
run_meta = dict(
    run_tag=RUN_TAG,
    created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    inputs={
        "target_run_tag": TARGET_TAG,
        "source_run_tag": SOURCE_TAG,
        # The remaining input references are all stored via str().
        **{
            key: str(location)
            for key, location in (
                ("target_train_pt", target_train_pt),
                ("target_val_pt", target_val_pt),
                ("target_test_pt", target_test_pt),
                ("target_vocab_json", target_vocab_json),
                ("target_tensor_metadata_json", target_tensor_meta_json),
                ("source_train_dir", source_train_dir),
                ("source_val_dir", source_val_dir),
                ("source_test_dir", source_test_dir),
                ("source_vocab_json", source_vocab_json),
                ("dataloader_config", cfg_path_repo),
                ("sanity_metrics", sanity_path_repo),
                ("session_gap_thresholds", gaps_path_repo),
            )
        },
    },
    protocol_reused_from_06=dict(
        K_LIST=K_LIST,
        MAX_PREFIX_LEN=MAX_PREFIX_LEN,
        PAD_ID_TARGET=PAD_ID_TARGET,
        PAD_ID_SOURCE=PAD_ID_SOURCE,
        CAP_ENABLED=CAP_ENABLED,
        CAP_SESSION_LEN=CAP_SESSION_LEN,
        CAP_STRATEGY=CAP_STRATEGY,
        pad_excluded_from_ranking=True,
    ),
    model=dict(
        name="Session-KNN",
        similarity="cosine_binary",
        pos_weighting=str(SKNN_CFG["pos_weighting"]),
        k_neighbors=int(SKNN_CFG["k_neighbors"]),
        candidate_sessions_cap=int(SKNN_CFG["candidate_sessions_cap"]),
    ),
    source_indexing=dict(
        sample_size_sessions=int(SOURCE_INDEX_CFG["sample_size_sessions"]),
        sample_mod=int(SOURCE_INDEX_CFG["sample_mod"]),
        # Stored as-is (no int() cast), unlike the two settings above.
        max_shards_to_scan=SOURCE_INDEX_CFG["max_shards_to_scan"],
        kept_sessions=int(len(source_index_sessions)),
        scanned_sessions=int(scanned_sessions),
        # NOTE(review): shard_i is not assigned in this cell; presumably it is
        # the loop variable left over from the source-indexing cell. Confirm it
        # holds a count rather than a last index.
        shards_scanned=int(shard_i),
    ),
    source_eval=SOURCE_EVAL_CFG,
)

# Metrics grouped by domain first, then by split.
results = dict(
    target=dict(val=t_val_knn, test=t_test_knn),
    source=dict(val=s_val_knn, test=s_test_knn),
)

# Both artifacts live side by side in this run's report folder.
save_json(run_meta, REPORT_DIR.joinpath("run_meta.json"))
save_json(results, REPORT_DIR.joinpath("results.json"))

# Register this run in the repo-level meta.json, starting from an empty
# registry when the file does not exist yet.
meta_path = REPO_ROOT / "meta.json"
if meta_path.exists():
    meta = load_json(meta_path)
else:
    meta = {"artifacts": {}}

# Create artifacts -> session_knn_baseline on demand, then file the entry
# for this run under its RUN_TAG.
meta.setdefault("artifacts", {}).setdefault("session_knn_baseline", {})[RUN_TAG] = dict(
    target_run_tag=TARGET_TAG,
    source_run_tag=SOURCE_TAG,
    report_dir=str(REPORT_DIR),
    results_json=str(REPORT_DIR / "results.json"),
    run_meta_json=str(REPORT_DIR / "run_meta.json"),
)
meta["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
save_json(meta, meta_path)

print(
    "[08-10] ✅ Wrote report files under:",
    REPORT_DIR,
)
print(
    "[08-10] ✅ Updated meta.json:",
    meta_path,
)

# Checkpoint banner: one call emits the blank line, the header and the hint.
print(
    "\n[08-10] CHECKPOINT I",
    "Paste: report dir + key metrics summary after you run all cells.",
    sep="\n",
)


[08-10] ✅ Wrote report files under: C:\mooc-coldstart-session-meta\reports\08_session_knn_baseline\20260102_140713
[08-10] ✅ Updated meta.json: C:\mooc-coldstart-session-meta\meta.json

[08-10] CHECKPOINT I
Paste: report dir + key metrics summary after you run all cells.


Footer summary

In [13]:
# [CELL 08-11] Footer summary

# Each tuple is one line of the summary; print() joins its fields with a space.
for summary_fields in (
    ("========== 08 Session-KNN Baseline Summary ==========",),
    ("RUN_TAG:", RUN_TAG),
    ("--- TARGET ---",),
    ("VAL :", t_val_knn),
    ("TEST:", t_test_knn),
    ("--- SOURCE ---",),
    ("VAL :", s_val_knn),
    ("TEST:", s_test_knn),
    ("Report dir:", REPORT_DIR),
    ("====================================================",),
):
    print(*summary_fields)


RUN_TAG: 20260102_140713
--- TARGET ---
VAL : {'HR@5': 0.30687830687830686, 'HR@10': 0.48148148148148145, 'HR@20': 0.6243386243386243, 'MRR@5': 0.11940035273368603, 'MRR@10': 0.14264508272444779, 'MRR@20': 0.15248653060646283, 'NDCG@5': 0.1662376128266329, 'NDCG@10': 0.22261874958963768, 'NDCG@20': 0.25863209838882517, '_n_examples': 189}
TEST: {'HR@5': 0.255, 'HR@10': 0.37, 'HR@20': 0.49, 'MRR@5': 0.08908333333333332, 'MRR@10': 0.10474999999999998, 'MRR@20': 0.1125188543826315, 'NDCG@5': 0.1301628535815534, 'NDCG@10': 0.16767645572690335, 'NDCG@20': 0.1973000384141053, '_n_examples': 200}
--- SOURCE ---
VAL : {'HR@5': 0.9701633333333334, 'HR@10': 0.9728133333333333, 'HR@20': 0.9742666666666666, 'MRR@5': 0.939244777777809, 'MRR@10': 0.9396163240741069, 'MRR@20': 0.9397188795157997, 'NDCG@5': 0.947181040573676, 'NDCG@10': 0.9480558182513753, 'NDCG@20': 0.9484253802103635, '_n_pairs': 300000, '_n_sessions_seen': 18120, '_n_unk_labels': 2, '_pair_cap': 300000, '_note': 'Evaluation capped 