Notebook header + imports (MostPop baseline)

In [1]:
# [CELL 07-00] Notebook header + imports (MostPop baseline)

import os
import json
import math
import time
from pathlib import Path
from datetime import datetime

import numpy as np

# torch is required (target tensors)
import torch

# pandas/pyarrow for source parquet shards (streaming)
import pandas as pd

print("[07-00] Imports OK")
print("[07-00] torch:", torch.__version__)
print("[07-00] pandas:", pd.__version__)


[07-00] Imports OK
[07-00] torch: 2.9.1+cpu
[07-00] pandas: 2.3.3


Locate repo root + define RUN_TAG + load protocol/config artifacts

In [2]:
# [CELL 07-01] Locate repo root + define RUN_TAG + load protocol/config artifacts

def find_repo_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the directory that looks like the repo root.

    Two passes are made over ``start`` and its ancestors (nearest first):
    the first accepts a directory holding both PROJECT_STATE.md and meta.json;
    only if no directory qualifies does the second pass accept one holding
    PROJECT_STATE.md alone.

    Raises:
        FileNotFoundError: if neither pass finds a matching directory.
    """
    lineage = (start, *start.parents)

    def _nearest_with(*required):
        # First directory (closest to `start`) in which every required file exists.
        return next(
            (folder for folder in lineage if all((folder / fname).exists() for fname in required)),
            None,
        )

    strict_hit = _nearest_with("PROJECT_STATE.md", "meta.json")
    if strict_hit is not None:
        return strict_hit

    # fallback: PROJECT_STATE only
    loose_hit = _nearest_with("PROJECT_STATE.md")
    if loose_hit is not None:
        return loose_hit

    raise FileNotFoundError("Could not locate repo root (expected PROJECT_STATE.md and meta.json).")

# The repo root is discovered from the current working directory, so the
# kernel must be started somewhere inside the repository tree.
REPO_ROOT = find_repo_root(Path.cwd().resolve())
print("[07-01] REPO_ROOT:", REPO_ROOT)

# New run tag for 07 (do NOT touch earlier run tags)
# Taken from the wall clock, so re-running this cell produces a new RUN_TAG.
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
print("[07-01] RUN_TAG:", RUN_TAG)

# Fixed upstream run tags (must not change)
# They are interpolated into the artifact file/directory names resolved below.
TARGET_TAG = "20251229_163357"
SOURCE_TAG = "20251229_232834"

# Load artifacts from the repo. No automatic fallback is implemented: if a file is missing, load_json raises and the paths below must be re-pointed manually.
def load_json(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Paths of the three JSON artifacts this notebook reads; the supervised ones
# are keyed by both upstream run tags.
cfg_path_repo = REPO_ROOT / "data/processed/supervised" / f"dataloader_config_{TARGET_TAG}_{SOURCE_TAG}.json"
sanity_path_repo = REPO_ROOT / "data/processed/supervised" / f"sanity_metrics_{TARGET_TAG}_{SOURCE_TAG}.json"
gaps_path_repo = REPO_ROOT / "data/processed/normalized_events" / "session_gap_thresholds.json"

# If repo paths aren't available (rare), you can point these to the uploaded copies manually.
print("[07-01] Expect config:", cfg_path_repo)
print("[07-01] Expect sanity:", sanity_path_repo)
print("[07-01] Expect gaps:", gaps_path_repo)

# Any missing file makes load_json raise here, before anything else runs.
dataloader_cfg = load_json(cfg_path_repo)
sanity_metrics = load_json(sanity_path_repo)
session_gaps = load_json(gaps_path_repo)

print("[07-01] Loaded dataloader_config keys:", list(dataloader_cfg.keys()))
print("[07-01] Loaded sanity_metrics keys:", list(sanity_metrics.keys()))
print("[07-01] Loaded session_gap_thresholds keys:", list(session_gaps.keys()))

# Enforce fixed decisions
# The thresholds are only verified here; nothing in this cell re-sessionizes data.
assert session_gaps["target"]["primary_threshold_seconds"] == 1800, "Target gap must be 30m (1800s)."
assert session_gaps["source"]["primary_threshold_seconds"] == 600, "Source gap must be 10m (600s)."

print("[07-01] ✅ Session gaps confirmed: target=30m, source=10m")

print("\n[07-01] CHECKPOINT A")
print("1) Confirm REPO_ROOT exists and points to mooc-coldstart-session-meta")
print("2) Confirm the three repo JSON files load without errors")


[07-01] REPO_ROOT: C:\mooc-coldstart-session-meta
[07-01] RUN_TAG: 20260102_133019
[07-01] Expect config: C:\mooc-coldstart-session-meta\data\processed\supervised\dataloader_config_20251229_163357_20251229_232834.json
[07-01] Expect sanity: C:\mooc-coldstart-session-meta\data\processed\supervised\sanity_metrics_20251229_163357_20251229_232834.json
[07-01] Expect gaps: C:\mooc-coldstart-session-meta\data\processed\normalized_events\session_gap_thresholds.json
[07-01] Loaded dataloader_config keys: ['target', 'source', 'protocol']
[07-01] Loaded sanity_metrics keys: ['run_tag_target', 'run_tag_source', 'created_at', 'target', 'source', 'notes']
[07-01] Loaded session_gap_thresholds keys: ['generated_from_run_tag', 'generated_at', 'target', 'source', 'decision_notes']
[07-01] ✅ Session gaps confirmed: target=30m, source=10m

[07-01] CHECKPOINT A
1) Confirm REPO_ROOT exists and points to mooc-coldstart-session-meta
2) Confirm the three repo JSON files load without errors


Resolve known artifact paths (target tensors + source sequences)

In [3]:
# [CELL 07-02] Resolve known artifact paths (target tensors + source sequences)

def must_exist(p: Path, label: str):
    """Return ``p`` unchanged if it exists on disk.

    Raises:
        FileNotFoundError: naming ``label`` and the path when ``p`` does not exist.
    """
    if p.exists():
        return p
    raise FileNotFoundError(f"{label} not found: {p}")

# TARGET tensors (05B output)
# One .pt file per split plus the vocab and tensor-metadata JSON, all keyed by TARGET_TAG.
TARGET_TENSOR_DIR = REPO_ROOT / "data/processed/tensor_target"
target_train_pt = TARGET_TENSOR_DIR / f"target_tensor_train_{TARGET_TAG}.pt"
target_val_pt   = TARGET_TENSOR_DIR / f"target_tensor_val_{TARGET_TAG}.pt"
target_test_pt  = TARGET_TENSOR_DIR / f"target_tensor_test_{TARGET_TAG}.pt"
target_vocab_json = TARGET_TENSOR_DIR / f"target_vocab_items_{TARGET_TAG}.json"
target_tensor_meta_json = TARGET_TENSOR_DIR / f"target_tensor_metadata_{TARGET_TAG}.json"

# SOURCE sequences (05C output)
# One directory per split under a root keyed by SOURCE_TAG; the vocab JSON sits in that root.
SOURCE_SEQ_ROOT = REPO_ROOT / "data/processed/session_sequences" / f"source_sessions_{SOURCE_TAG}"
source_train_dir = SOURCE_SEQ_ROOT / "train"
source_val_dir   = SOURCE_SEQ_ROOT / "val"
source_test_dir  = SOURCE_SEQ_ROOT / "test"
source_vocab_json = SOURCE_SEQ_ROOT / f"source_vocab_items_{SOURCE_TAG}.json"

print("[07-02] Target train pt:", target_train_pt)
print("[07-02] Source train dir:", source_train_dir)

# Existence checks (hard fail early)
# The first missing path raises FileNotFoundError; later paths are then not checked.
must_exist(target_train_pt, "target_train_pt")
must_exist(target_val_pt, "target_val_pt")
must_exist(target_test_pt, "target_test_pt")
must_exist(target_vocab_json, "target_vocab_json")
must_exist(target_tensor_meta_json, "target_tensor_meta_json")

must_exist(source_train_dir, "source_train_dir")
must_exist(source_val_dir, "source_val_dir")
must_exist(source_test_dir, "source_test_dir")
must_exist(source_vocab_json, "source_vocab_json")

print("[07-02] ✅ All required artifacts exist")

print("\n[07-02] CHECKPOINT B")
print("If any path is missing, paste the exact error here (do NOT proceed).")


[07-02] Target train pt: C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_train_20251229_163357.pt
[07-02] Source train dir: C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834\train
[07-02] ✅ All required artifacts exist

[07-02] CHECKPOINT B
If any path is missing, paste the exact error here (do NOT proceed).


Reuse protocol from 06: metric suite + PAD exclusion + K list

In [12]:
# [CELL 07-03] Reuse protocol from 06: metric suite + PAD exclusion + K list

# Cut-offs for HR/MRR/NDCG; MAX_K is also the length of the MostPop lists built later.
K_LIST = [5, 10, 20]
MAX_K = max(K_LIST)

# Protocol values are read from the dataloader config rather than redefined here.
proto = dataloader_cfg["protocol"]
MAX_PREFIX_LEN = int(proto["max_prefix_len"])
# NOTE(review): bool() of any non-empty string is True, so this assumes the
# config stores "enabled" as a JSON boolean — confirm against the config file.
CAP_ENABLED = bool(proto["source_long_session_policy"]["enabled"])
CAP_SESSION_LEN = int(proto["source_long_session_policy"]["cap_session_len"])
CAP_STRATEGY = str(proto["source_long_session_policy"]["cap_strategy"])

print("[07-03] Protocol from 06:")
print("  MAX_PREFIX_LEN:", MAX_PREFIX_LEN)
print("  CAP_ENABLED:", CAP_ENABLED)
print("  CAP_SESSION_LEN:", CAP_SESSION_LEN)
print("  CAP_STRATEGY:", CAP_STRATEGY)
print("  K_LIST:", K_LIST)

# Load vocab files to get PAD/UNK
target_vocab = load_json(target_vocab_json)
source_vocab = load_json(source_vocab_json)

def get_special_id(vocab_obj: dict, token_key: str, fallback: int, name: str) -> int:
    """Resolve the integer id of a special token (PAD/UNK) from a vocab object.

    Resolution order:
      1. ``vocab_obj[token_key]`` names a token that maps to an int in
         ``vocab_obj["vocab"]`` -> that id.
      2. An explicit id key derived from ``token_key`` ("pad_token" -> "pad_id",
         "unk_token" -> "unk_id") holds an int -> that id. This covers vocab
         files that store the ids directly instead of token names.
      3. Otherwise ``fallback`` is returned and the reason is printed.

    Args:
        vocab_obj: decoded vocab JSON.
        token_key: key holding the special token's name, e.g. "pad_token".
        fallback: id to return when nothing can be resolved.
        name: label used in the printed diagnostics (e.g. "TARGET").

    Returns:
        The resolved id, or ``fallback``.
    """
    tok = vocab_obj.get(token_key, None)
    mapping = vocab_obj.get("vocab", {})
    if tok is not None and isinstance(mapping, dict) and tok in mapping and isinstance(mapping[tok], int):
        return int(mapping[tok])

    # Explicitly stored id ("pad_id"/"unk_id") takes precedence over a blind fallback.
    suffix = "_token"
    id_key = token_key[: -len(suffix)] + "_id" if token_key.endswith(suffix) else None
    if id_key is not None and isinstance(vocab_obj.get(id_key), int):
        explicit_id = int(vocab_obj[id_key])
        print(f"[07-03] {name}: {token_key} not resolvable, using explicit {id_key}={explicit_id}")
        return explicit_id

    if tok is None:
        print(f"[07-03] {name}: missing {token_key}, fallback id={fallback}")
    else:
        print(f"[07-03] {name}: could not resolve {token_key}='{tok}' in vocab mapping, fallback id={fallback}")
    return fallback

# Resolve PAD/UNK ids for both vocabularies; 0 and 1 are the fallback ids passed in.
PAD_ID_TARGET = get_special_id(target_vocab, "pad_token", 0, "TARGET")
UNK_ID_TARGET = get_special_id(target_vocab, "unk_token", 1, "TARGET")
PAD_ID_SOURCE = get_special_id(source_vocab, "pad_token", 0, "SOURCE")
UNK_ID_SOURCE = get_special_id(source_vocab, "unk_token", 1, "SOURCE")

print("[07-03] PAD_ID_TARGET:", PAD_ID_TARGET, "| UNK_ID_TARGET:", UNK_ID_TARGET)
print("[07-03] PAD_ID_SOURCE:", PAD_ID_SOURCE, "| UNK_ID_SOURCE:", UNK_ID_SOURCE)

# Hard stop if either PAD id is not 0; only the PAD ids are asserted, not the UNK ids.
assert PAD_ID_TARGET == 0, "Target PAD must be 0 to match 06."
assert PAD_ID_SOURCE == 0, "Source PAD must be 0 to match 06."

def init_metrics():
    """Return a fresh accumulator dict with one 0.0 entry per metric and cut-off.

    Keys have the form "<metric>@<k>" for HR, MRR and NDCG at every k in K_LIST.
    """
    accumulators = {}
    for metric_name in ("HR", "MRR", "NDCG"):
        for k in K_LIST:
            accumulators[f"{metric_name}@{k}"] = 0.0
    return accumulators

def update_metrics_from_rank(metrics: dict, rank0: int):
    """Add one example's contribution to the running metric sums, in place.

    Args:
        metrics: accumulator dict with "HR@k", "MRR@k" and "NDCG@k" keys.
        rank0: 0-based position of the true item in the ranked list, or None
            when the item is not in the list (a miss contributes nothing).
    """
    if rank0 is None:
        return

    position = rank0 + 1  # 1-based
    for cutoff in K_LIST:
        if position > cutoff:
            # The hit lies beyond this cut-off, so it scores nothing here.
            continue
        metrics[f"HR@{cutoff}"] += 1.0
        metrics[f"MRR@{cutoff}"] += 1.0 / position
        metrics[f"NDCG@{cutoff}"] += 1.0 / math.log2(position + 1.0)

def finalize_metrics(metrics: dict, n: int) -> dict:
    """Turn accumulated metric sums into per-example averages.

    Returns a new dict with the same keys. Each value is ``sum / n`` as a
    float, or 0.0 for every key when ``n`` is not positive.
    """
    if n > 0:
        return {metric: float(total / n) for metric, total in metrics.items()}
    return {metric: 0.0 for metric in metrics}

print("[07-03] ✅ Metric functions ready (HR/MRR/NDCG @ K={5,10,20}, PAD excluded via ranking construction)")


[07-03] Protocol from 06:
  MAX_PREFIX_LEN: 20
  CAP_ENABLED: True
  CAP_SESSION_LEN: 200
  CAP_STRATEGY: take_last
  K_LIST: [5, 10, 20]
[07-03] SOURCE: missing pad_token, fallback id=0
[07-03] SOURCE: missing unk_token, fallback id=1
[07-03] PAD_ID_TARGET: 0 | UNK_ID_TARGET: 1
[07-03] PAD_ID_SOURCE: 0 | UNK_ID_SOURCE: 1
[07-03] ✅ Metric functions ready (HR/MRR/NDCG @ K={5,10,20}, PAD excluded via ranking construction)


PyTorch 2.6+ compatibility: load project-generated .pt artifacts

In [13]:
# [CELL 07-03A] PyTorch 2.6+ compatibility: load project-generated .pt artifacts
# Our .pt artifacts are produced inside this repo, so we can safely set weights_only=False.

import pickle

def torch_load_repo_artifact(path, map_location="cpu"):
    """Load a ``.pt`` artifact with ``torch.load``, allowing full unpickling.

    ``weights_only=False`` is requested first. If ``torch.load`` rejects the
    call with a TypeError (older torch without that argument), the load is
    retried without it. A confirmation line is printed for whichever call
    succeeded.

    NOTE: full unpickling can execute arbitrary code; use this only on
    artifacts produced by this project, never on untrusted files.
    """
    artifact = str(path)
    try:
        loaded = torch.load(artifact, map_location=map_location, weights_only=False)
    except TypeError:
        # Older torch versions don't have weights_only argument
        loaded = torch.load(artifact, map_location=map_location)
        print(f"[07-03A] torch.load OK (no weights_only arg): {artifact}")
        return loaded
    print(f"[07-03A] torch.load OK (weights_only=False): {artifact}")
    return loaded

print("[07-03A] ✅ Loader ready")
print("[07-03A] CHECKPOINT: rerun from CELL 07-04 after this.")


[07-03A] ✅ Loader ready
[07-03A] CHECKPOINT: rerun from CELL 07-04 after this.


Robust vocab loader: infer vocab_size from stored structures

In [14]:
# [CELL 07-03B] Robust vocab loader: infer vocab_size from stored structures (includes 'vocab' key)

def infer_vocab_size(vocab: dict, name: str) -> int:
    """Work out the vocabulary size from whichever structure the vocab JSON carries.

    Checked in this order; the first that applies wins and is reported via print:
      1. an explicit size field: vocab_size / n_items / num_items / size
      2. a "vocab" dict: token->id (max value + 1), id->token (max key + 1),
         otherwise its number of entries; an empty dict gives 0
      3. "id2item" as a list (its length) or a dict (max key + 1)
      4. "items" as a list (its length)
      5. "item2id" as a dict (max value + 1)

    Args:
        vocab: decoded vocab JSON.
        name: label used in the printed diagnostics.

    Raises:
        KeyError: when none of the structures above is present.
    """
    # 1) explicit size field
    k = next((key for key in ("vocab_size", "n_items", "num_items", "size") if key in vocab), None)
    if k is not None:
        vs = int(vocab[k])
        print(f"[07-03B] {name}: vocab_size from key '{k}' = {vs}")
        return vs

    # 2) "vocab" mapping; its direction is judged from the first entry only
    table = vocab.get("vocab")
    if isinstance(table, dict):
        first_key = next(iter(table), None)
        if first_key is None:
            print(f"[07-03B] {name}: vocab empty -> vocab_size=0")
            return 0

        if isinstance(table[first_key], int):
            # token -> id
            vs = max(table.values()) + 1
            print(f"[07-03B] {name}: vocab_size from max(vocab values)+1 (token->id) = {vs}")
            return vs

        # id(str/int) -> token; any key that is not int-convertible rules this out
        try:
            numeric_keys = [int(key) for key in table]
        except Exception:
            numeric_keys = None
        if numeric_keys is not None:
            vs = max(numeric_keys) + 1
            print(f"[07-03B] {name}: vocab_size from max(vocab keys)+1 (id->token) = {vs}")
            return vs

        # fallback: just number of entries
        vs = len(table)
        print(f"[07-03B] {name}: vocab_size fallback len(vocab) = {vs}")
        return vs

    # 3) id2item
    id2item = vocab.get("id2item")
    if isinstance(id2item, list):
        vs = len(id2item)
        print(f"[07-03B] {name}: vocab_size from len(id2item list) = {vs}")
        return vs
    if isinstance(id2item, dict):
        id_keys = [int(key) for key in id2item]
        vs = (max(id_keys) + 1) if id_keys else 0
        print(f"[07-03B] {name}: vocab_size from max(id2item dict keys)+1 = {vs}")
        return vs

    # 4) items list
    items = vocab.get("items")
    if isinstance(items, list):
        vs = len(items)
        print(f"[07-03B] {name}: vocab_size from len(items) = {vs}")
        return vs

    # 5) item2id
    item2id = vocab.get("item2id")
    if isinstance(item2id, dict):
        id_values = list(item2id.values())
        vs = (max(id_values) + 1) if id_values else 0
        print(f"[07-03B] {name}: vocab_size from max(item2id values)+1 = {vs}")
        return vs

    raise KeyError(f"[07-03B] {name}: Could not infer vocab_size. Keys={list(vocab.keys())}")

# infer sizes for target + source
# These sizes set the length of the count arrays and bound the label range checks in later cells.
vocab_size_target = infer_vocab_size(target_vocab, "TARGET")
vocab_size_source = infer_vocab_size(source_vocab, "SOURCE")

print("[07-03B] ✅ vocab sizes inferred:",
      "vocab_size_target=", vocab_size_target,
      "| vocab_size_source=", vocab_size_source)

print("\n[07-03B] CHECKPOINT: paste these two inferred sizes + the 'token->id' or 'id->token' message.")


[07-03B] TARGET: vocab_size from max(vocab values)+1 (token->id) = 747
[07-03B] SOURCE: vocab_size from key 'vocab_size' = 1620
[07-03B] ✅ vocab sizes inferred: vocab_size_target= 747 | vocab_size_source= 1620

[07-03B] CHECKPOINT: paste these two inferred sizes + the 'token->id' or 'id->token' message.


TARGET MostPop (A): popularity from TARGET train split

In [None]:
# [CELL 07-04] TARGET MostPop (A): popularity from TARGET train split
# Popularity definition (target): count next-item labels in target TRAIN split.

train_obj = torch_load_repo_artifact(target_train_pt, map_location="cpu")
print("[07-04] Loaded target train .pt type:", type(train_obj))

# Robustly find labels tensor
# Accepts a dict with either a "labels" or a "y" entry; anything else is a hard error.
if isinstance(train_obj, dict):
    keys = list(train_obj.keys())
    print("[07-04] Keys:", keys)
    if "labels" in train_obj:
        train_labels = train_obj["labels"]
    elif "y" in train_obj:
        train_labels = train_obj["y"]
    else:
        raise KeyError(f"Could not find labels in target train object keys={keys}")
else:
    raise TypeError("Unexpected target train object type; expected dict with labels.")

train_labels = torch.as_tensor(train_labels).detach().cpu().long()
print("[07-04] train_labels shape:", tuple(train_labels.shape), "dtype:", train_labels.dtype)

# Count label frequencies
# vocab_size_target comes from [CELL 07-03B]; it is not read from a "vocab_size" key here.
counts = np.zeros(vocab_size_target, dtype=np.int64)

labels_np = train_labels.numpy()
# Range check runs before PAD filtering; note .min()/.max() raise on an empty label array.
if labels_np.min() < 0 or labels_np.max() >= vocab_size_target:
    raise ValueError(f"Target labels out of range: min={labels_np.min()} max={labels_np.max()} vocab={vocab_size_target}")

# exclude PAD if it ever appears (shouldn't)
labels_np = labels_np[labels_np != PAD_ID_TARGET]
# np.add.at accumulates correctly when the same id occurs many times in labels_np.
np.add.at(counts, labels_np, 1)

# Build MostPop ranking (top MAX_K), excluding PAD
# Only PAD is forbidden; UNK_ID_TARGET stays eligible for the ranking.
forbidden = {PAD_ID_TARGET}

all_items = np.arange(vocab_size_target, dtype=np.int64)
valid_mask = np.ones(vocab_size_target, dtype=bool)
for fid in forbidden:
    if 0 <= fid < vocab_size_target:
        valid_mask[fid] = False

valid_items = all_items[valid_mask]
valid_counts = counts[valid_mask]

# argsort descending by count then item_id for stability
# np.lexsort treats the LAST key as primary: -valid_counts first, then valid_items breaks ties.
order = np.lexsort((valid_items, -valid_counts))
ranked_items = valid_items[order]
top_items_target = ranked_items[:MAX_K].tolist()

# The "top20" label is literal text; the list actually holds MAX_K items.
print("[07-04] Target MostPop top20:", top_items_target)
print("[07-04] Target MostPop top20 counts:", [int(counts[i]) for i in top_items_target])

print("\n[07-04] CHECKPOINT C")
print("Paste the printed top20 + counts if you want me to sanity-check distribution before evaluation.")


[07-03A] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_train_20251229_163357.pt
[07-04] Loaded target train .pt type: <class 'dict'>
[07-04] Keys: ['input_ids', 'attn_mask', 'labels', 'session_id', 'user_id', 't', 'split']
[07-04] train_labels shape: (1944,) dtype: torch.int64
[07-04] Target MostPop top20: [410, 408, 308, 85, 230, 309, 562, 532, 310, 533, 84, 231, 563, 229, 536, 126, 151, 311, 531, 534]
[07-04] Target MostPop top20 counts: [30, 23, 20, 19, 18, 18, 18, 17, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11]

[07-04] CHECKPOINT C
Paste the printed top20 + counts if you want me to sanity-check distribution before evaluation.


TARGET eval on VAL + TEST

In [17]:
# [CELL 07-05] TARGET eval on VAL + TEST

def eval_target_split(pt_path: Path, top_items: list[int], split_name: str) -> dict:
    """Score a fixed ranked list against every label of one target split.

    The ``.pt`` file must load to a dict carrying the labels under "labels"
    or "y". Labels equal to PAD_ID_TARGET are dropped. Every remaining label
    counts as one example and is looked up in ``top_items``.

    Args:
        pt_path: path of the split's tensor file.
        top_items: ranked item ids, best first.
        split_name: label used in error messages.

    Returns:
        Averaged HR/MRR/NDCG per cut-off plus "_n_examples".

    Raises:
        TypeError: if the file does not load to a dict.
        KeyError: if neither "labels" nor "y" is present.
    """
    payload = torch_load_repo_artifact(pt_path, map_location="cpu")
    if not isinstance(payload, dict):
        raise TypeError(f"{split_name}: expected dict .pt, got {type(payload)}")

    label_key = next((key for key in ("labels", "y") if key in payload), None)
    if label_key is None:
        raise KeyError(f"{split_name}: could not find labels key")

    targets = torch.as_tensor(payload[label_key]).detach().cpu().long().numpy()
    targets = targets[targets != PAD_ID_TARGET]

    # item id -> 0-based position in the ranked list
    position_of = dict(zip(top_items, range(len(top_items))))
    running = init_metrics()

    for target_id in targets.tolist():
        update_metrics_from_rank(running, position_of.get(target_id))

    n_examples = len(targets)
    result = finalize_metrics(running, n_examples)
    result["_n_examples"] = int(n_examples)
    return result

# The same ranked list (built from the target TRAIN labels) is scored on both held-out splits.
t_val = eval_target_split(target_val_pt, top_items_target, "target_val")
t_test = eval_target_split(target_test_pt, top_items_target, "target_test")

print("[07-05] TARGET VAL:", t_val)
print("[07-05] TARGET TEST:", t_test)

print("\n[07-05] CHECKPOINT D")
print("Paste TARGET VAL/TEST metrics here so we lock them in before source.")


[07-03A] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_val_20251229_163357.pt
[07-03A] torch.load OK (weights_only=False): C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_test_20251229_163357.pt
[07-05] TARGET VAL: {'HR@5': 0.07407407407407407, 'HR@10': 0.13227513227513227, 'HR@20': 0.1746031746031746, 'MRR@5': 0.05026455026455026, 'MRR@10': 0.058704963466868224, 'MRR@20': 0.06130174366942758, 'NDCG@5': 0.05620747419730002, 'NDCG@10': 0.07570020394614688, 'NDCG@20': 0.08597813493276235, '_n_examples': 189}
[07-05] TARGET TEST: {'HR@5': 0.075, 'HR@10': 0.125, 'HR@20': 0.185, 'MRR@5': 0.045, 'MRR@10': 0.05177579365079365, 'MRR@20': 0.0553870221840423, 'NDCG@5': 0.05258000942002037, 'NDCG@10': 0.06885437485718231, 'NDCG@20': 0.08332921918090552, '_n_examples': 200}

[07-05] CHECKPOINT D
Paste TARGET VAL/TEST metrics here so we lock them in before source.


SOURCE MostPop (B): popularity from SOURCE train session sequences (streaming / memory-safe)
- Handles source sequences whose `items` are strings (course IDs), by mapping via source_vocab.
- Must reuse protocol from 06: cap long sessions if enabled, PAD excluded from counting/ranking.


In [20]:
# [CELL 07-06] SOURCE MostPop (B): popularity from SOURCE train session sequences (streaming / memory-safe)
# Handles source sequences whose `items` are strings (course IDs), by mapping via source_vocab.
# Must reuse protocol from 06: cap long sessions if enabled, PAD excluded from counting/ranking.

import glob
import time
import pandas as pd
import numpy as np

def list_parquet_shards(dir_path: Path) -> list[Path]:
    """Return the ``sessions_b*.parquet`` shard files directly under ``dir_path``, sorted.

    The pattern is applied with ``Path.glob`` so that it is matched only
    against file names inside ``dir_path``. Glob metacharacters in the
    directory path itself (e.g. ``[`` or ``]``) are therefore taken literally
    instead of silently producing an empty match.

    Args:
        dir_path: directory holding the shard files.

    Returns:
        The matching paths in sorted order.

    Raises:
        FileNotFoundError: when no shard matches.
    """
    files = sorted(Path(dir_path).glob("sessions_b*.parquet"))
    if len(files) == 0:
        raise FileNotFoundError(f"No shards found under {dir_path} (expected sessions_b*.parquet)")
    return files

# Each call raises FileNotFoundError if its split directory contains no shard.
train_shards = list_parquet_shards(source_train_dir)
val_shards   = list_parquet_shards(source_val_dir)
test_shards  = list_parquet_shards(source_test_dir)

print("[07-06] Source shards counts:", "train=", len(train_shards), "val=", len(val_shards), "test=", len(test_shards))

# Detect sequence column from first shard
# Only the first train shard is read in full (all columns) to inspect its schema.
probe = pd.read_parquet(train_shards[0])
print("[07-06] Probe columns:", list(probe.columns))

def detect_sequence_column(df: pd.DataFrame) -> str:
    """Pick the column of ``df`` that holds the per-session item sequence.

    A well-known column name wins if present (tried in the order items,
    item_ids, sequence, seq, course_ids). Otherwise the first column whose
    first non-null value is a list, tuple or numpy array is returned.

    Raises:
        KeyError: when no column qualifies.
    """
    well_known = ("items", "item_ids", "sequence", "seq", "course_ids")
    by_name = next((candidate for candidate in well_known if candidate in df.columns), None)
    if by_name is not None:
        return by_name

    for column in df.columns:
        non_null = df[column].dropna()
        # Columns that are entirely null cannot be judged and are skipped.
        if len(non_null) and isinstance(non_null.iloc[0], (list, tuple, np.ndarray)):
            return column

    raise KeyError(f"Could not detect sequence column from columns={list(df.columns)}")

# seq_col is reused for every shard read in the counting and evaluation loops.
seq_col = detect_sequence_column(probe)
print("[07-06] Detected source sequence column:", seq_col)
if len(probe) > 0:
    first_seq = probe[seq_col].iloc[0]
    print("[07-06] Probe first seq type:", type(first_seq), "| len:", (len(first_seq) if first_seq is not None else None))
    # The element-type line is printed only when the first sequence is a non-empty numpy array.
    if isinstance(first_seq, np.ndarray) and len(first_seq) > 0:
        print("[07-06] Probe first element type:", type(first_seq[0]), "| value sample:", first_seq[0])

# ---- Build mapping: token(string) -> id(int) from source_vocab ----
print("[07-06] source_vocab keys:", list(source_vocab.keys()))

def build_token_to_id(vocab_obj: dict) -> dict:
    """Derive a token -> integer-id mapping from a vocab object.

    Sources are tried in this order:
      1. ``vocab_obj["vocab"]``: returned as-is when its first value is an
         int (token -> id); inverted when its first key converts to int
         (id -> token). If inversion fails for any reason, fall through.
      2. ``vocab_obj["item2id"]``: returned as-is.
      3. ``vocab_obj["items"]``: a list whose positions serve as ids.

    Note that for sources 1 (token -> id) and 2 the stored dict itself is
    returned, not a copy.

    Raises:
        KeyError: when none of the sources is usable.
    """
    # 1) preferred: vocab_obj["vocab"]
    table = vocab_obj.get("vocab")
    if isinstance(table, dict):
        first_key = next(iter(table), None)
        if first_key is not None and isinstance(table[first_key], int):
            return table
        try:
            int(first_key)  # probe only: is the first key id-like?
            return {token: int(raw_id) for raw_id, token in table.items()}
        except Exception:
            # Not an id->token table (or it is empty); try the other sources.
            pass

    # 2) fallback: item2id
    item2id = vocab_obj.get("item2id")
    if isinstance(item2id, dict):
        return item2id

    # 3) fallback: items list (position is id)
    item_list = vocab_obj.get("items")
    if isinstance(item_list, list):
        return dict(zip(item_list, range(len(item_list))))

    raise KeyError(f"[07-06] Could not build token_to_id from source_vocab. Keys={list(vocab_obj.keys())}")

# Global lookup table used by map_seq_to_ids for every source sequence.
source_token_to_id = build_token_to_id(source_vocab)
print("[07-06] Built source_token_to_id size:", len(source_token_to_id))

# Inspect id range to confirm PAD/UNK reservation (protocol requires PAD excluded)
# min_id/max_id are None for an empty mapping; both are read again by resolve_pad_unk_source.
ids = list(source_token_to_id.values())
min_id = int(min(ids)) if len(ids) else None
max_id = int(max(ids)) if len(ids) else None
print("[07-06] source_token_to_id id-range:", {"min_id": min_id, "max_id": max_id, "vocab_size_source": vocab_size_source})

# Resolve PAD/UNK for source robustly:
# - If explicit pad_id/unk_id exist, use them.
# - Else if min_id >= 2, assume PAD=0 UNK=1 (reserved).
# - Else STOP (cannot guarantee protocol correctness).
def resolve_pad_unk_source(vocab_obj: dict, min_id: int | None) -> tuple[int, int]:
    if "pad_id" in vocab_obj and "unk_id" in vocab_obj:
        return int(vocab_obj["pad_id"]), int(vocab_obj["unk_id"])
    if min_id is not None and min_id >= 2:
        return 0, 1
    raise ValueError(
        "[07-06] CHECKPOINT STOP: Source vocab ids appear to start at 0/1 (min_id < 2) "
        "but pad_id/unk_id are not explicitly stored. We cannot safely exclude PAD per protocol 06.\n"
        f"source_vocab keys={list(vocab_obj.keys())}\n"
        f"min_id={min_id}, max_id={max_id}, vocab_size_source={vocab_size_source}\n"
        "Paste this error + the printed keys/range."
    )

# Rebinds the PAD_ID_SOURCE / UNK_ID_SOURCE globals first set in CELL 07-03.
PAD_ID_SOURCE, UNK_ID_SOURCE = resolve_pad_unk_source(source_vocab, min_id)
print("[07-06] ✅ Resolved SOURCE PAD/UNK:", {"PAD_ID_SOURCE": PAD_ID_SOURCE, "UNK_ID_SOURCE": UNK_ID_SOURCE})

# Mapping function (handles numpy arrays of strings)
def map_seq_to_ids(seq) -> np.ndarray:
    """Convert one session sequence into an int64 array of item ids.

    Args:
        seq: numpy array or other iterable of tokens, or None.

    Returns:
        An empty int64 array for None or an empty sequence. If the first
        element is already an integer, the sequence is returned as int64
        unchanged. Otherwise every token is looked up in source_token_to_id,
        with unknown tokens mapped to UNK_ID_SOURCE.
    """
    if seq is None:
        return np.array([], dtype=np.int64)

    tokens = seq.tolist() if isinstance(seq, np.ndarray) else list(seq)
    if not tokens:
        return np.array([], dtype=np.int64)

    # Only the first element is inspected to decide whether mapping is needed.
    already_ids = isinstance(tokens[0], (int, np.integer))
    if already_ids:
        return np.asarray(tokens, dtype=np.int64)

    lookup = source_token_to_id.get
    return np.fromiter(map(lambda tok: lookup(tok, UNK_ID_SOURCE), tokens), dtype=np.int64)

# Count popularity from TRAIN shards
# Every item occurrence in a (capped) train session counts once, first item included.
src_counts = np.zeros(vocab_size_source, dtype=np.int64)
t0 = time.time()
n_sessions = 0
n_events = 0
n_unk_mapped = 0  # diagnostic

for i, fp in enumerate(train_shards, 1):
    # Read only the sequence column to keep per-shard memory small.
    df = pd.read_parquet(fp, columns=[seq_col])

    for seq in df[seq_col].values:
        if seq is None:
            continue

        # cap long sessions before mapping (slice works for arrays/lists)
        # Only the "take_last" strategy is applied; with any other CAP_STRATEGY value
        # the session is left uncapped.
        if CAP_ENABLED and len(seq) > CAP_SESSION_LEN and CAP_STRATEGY == "take_last":
            seq = seq[-CAP_SESSION_LEN:]

        arr = map_seq_to_ids(seq)

        # exclude PAD
        arr = arr[arr != PAD_ID_SOURCE]
        if arr.size == 0:
            continue

        # range check
        if arr.min() < 0 or arr.max() >= vocab_size_source:
            raise ValueError(f"[07-06] Source mapped id out of range in {fp}: min={arr.min()} max={arr.max()} vocab={vocab_size_source}")

        n_sessions += 1
        n_events += int(arr.size)
        # Counts ids equal to UNK_ID_SOURCE, whether stored as such or produced by a failed lookup.
        n_unk_mapped += int((arr == UNK_ID_SOURCE).sum())

        # np.add.at accumulates correctly when an id repeats within one session.
        np.add.at(src_counts, arr, 1)

    # Progress is reported every 50 shards only, so the final shards may go unreported.
    if i % 50 == 0:
        dt = time.time() - t0
        print(f"[07-06] Counted shards {i}/{len(train_shards)} | sessions={n_sessions:,} events={n_events:,} unk={n_unk_mapped:,} | elapsed={dt:.1f}s")

# Build MostPop top items (exclude PAD)
# Only PAD is forbidden; UNK_ID_SOURCE stays eligible for the ranking.
forbidden_src = {PAD_ID_SOURCE}

all_src_items = np.arange(vocab_size_source, dtype=np.int64)
# NOTE: valid_mask / valid_items / valid_counts / order / ranked_items reuse
# (and overwrite) the names set by the target cell 07-04.
valid_mask = np.ones(vocab_size_source, dtype=bool)
for fid in forbidden_src:
    if 0 <= fid < vocab_size_source:
        valid_mask[fid] = False

valid_items = all_src_items[valid_mask]
valid_counts = src_counts[valid_mask]

order = np.lexsort((valid_items, -valid_counts))  # count desc, id asc
ranked_items = valid_items[order]
top_items_source = ranked_items[:MAX_K].tolist()

print("[07-06] Source MostPop top20:", top_items_source[:20])
print("[07-06] Source MostPop top20 counts:", [int(src_counts[i]) for i in top_items_source[:20]])
print("[07-06] sessions_counted:", f"{n_sessions:,}", "| events_counted:", f"{n_events:,}", "| unk_mapped:", f"{n_unk_mapped:,}")

print("\n[07-06] CHECKPOINT E")
print("Paste: seq_col, PAD/UNK resolved, top20+counts, sessions/events/unk_mapped.")


[07-06] Source shards counts: train= 1024 val= 1024 test= 1024
[07-06] Probe columns: ['domain', 'user_id', 'session_id', 'session_length', 'start_ts', 'end_ts', 'items', 'split']
[07-06] Detected source sequence column: items
[07-06] Probe first seq type: <class 'numpy.ndarray'> | len: 71
[07-06] Probe first element type: <class 'str'> | value sample: course-v1:TsinghuaX+00690212X+sp
[07-06] source_vocab keys: ['run_tag_source', 'built_from', 'vocab_size', 'pad_id', 'unk_id', 'item2id']
[07-06] Built source_token_to_id size: 1620
[07-06] source_token_to_id id-range: {'min_id': 0, 'max_id': 1619, 'vocab_size_source': 1620}
[07-06] ✅ Resolved SOURCE PAD/UNK: {'PAD_ID_SOURCE': 0, 'UNK_ID_SOURCE': 1}
[07-06] Counted shards 50/1024 | sessions=325,575 events=5,721,409 unk=0 | elapsed=8.8s
[07-06] Counted shards 100/1024 | sessions=650,531 events=11,461,691 unk=0 | elapsed=17.7s
[07-06] Counted shards 150/1024 | sessions=977,651 events=17,233,757 unk=0 | elapsed=26.5s
[07-06] Counted shards 

SOURCE eval on VAL + TEST (streaming transitions)

In [21]:
# [CELL 07-07] SOURCE eval on VAL + TEST (streaming transitions)
# Match Notebook 06 protocol:
# for each session seq length L: pairs t=1..L-1, label = seq[t]
# For MostPop, input unused but label counting must be exact.
# Uses the SAME map_seq_to_ids() and PAD/UNK resolved in 07-06.

def eval_source_split(shards: list[Path], top_items: list[int], split_name: str) -> dict:
    """Stream one source split and score a fixed ranked list on every next-item pair.

    For each session (capped with the "take_last" rule when capping is
    enabled, mapped to ids, PAD removed) every item after the first is a
    label. Sessions with fewer than two items are skipped. Each label is
    looked up in ``top_items`` and contributes to HR/MRR/NDCG.

    Args:
        shards: parquet shard paths of the split.
        top_items: ranked item ids, best first.
        split_name: label used in the progress lines.

    Returns:
        Metrics averaged over all pairs, plus "_n_pairs", "_n_sessions_seen"
        and "_n_unk_labels".
    """
    position_of = dict(zip(top_items, range(len(top_items))))
    metrics = init_metrics()

    n_pairs = n_sessions_seen = n_unk_labels = 0

    t0 = time.time()
    for i, fp in enumerate(shards, start=1):
        # Read only the sequence column of this shard.
        shard_sequences = pd.read_parquet(fp, columns=[seq_col])[seq_col].values

        for seq in shard_sequences:
            if seq is None:
                continue

            if CAP_ENABLED and len(seq) > CAP_SESSION_LEN:
                if CAP_STRATEGY == "take_last":
                    seq = seq[-CAP_SESSION_LEN:]

            item_ids = map_seq_to_ids(seq)

            # exclude PAD anywhere (shouldn't appear in raw, but protocol says exclude)
            item_ids = item_ids[item_ids != PAD_ID_SOURCE]
            if int(item_ids.size) < 2:
                continue

            n_sessions_seen += 1

            # Every item except the first is a next-item label.
            next_items = item_ids[1:]
            n_pairs += int(next_items.size)
            n_unk_labels += int((next_items == UNK_ID_SOURCE).sum())

            # MostPop ranking check
            for label_id in next_items.tolist():
                update_metrics_from_rank(metrics, position_of.get(label_id))

        if i % 50 == 0:
            dt = time.time() - t0
            print(f"[07-07] {split_name}: shards {i}/{len(shards)} | sessions={n_sessions_seen:,} pairs={n_pairs:,} unk_labels={n_unk_labels:,} | elapsed={dt:.1f}s")

    summary = finalize_metrics(metrics, n_pairs)
    summary["_n_pairs"] = int(n_pairs)
    summary["_n_sessions_seen"] = int(n_sessions_seen)
    summary["_n_unk_labels"] = int(n_unk_labels)
    return summary

# The same ranked list (built from the source TRAIN shards) is scored on both held-out splits.
s_val = eval_source_split(val_shards, top_items_source, "source_val")
s_test = eval_source_split(test_shards, top_items_source, "source_test")

print("[07-07] SOURCE VAL:", s_val)
print("[07-07] SOURCE TEST:", s_test)

print("\n[07-07] CHECKPOINT F")
print("Paste SOURCE VAL/TEST metrics (incl _n_pairs/_n_sessions_seen/_n_unk_labels) before writing reports.")


[07-07] source_val: shards 50/1024 | sessions=40,753 pairs=674,824 unk_labels=2 | elapsed=1.8s
[07-07] source_val: shards 100/1024 | sessions=81,398 pairs=1,338,427 unk_labels=2 | elapsed=3.6s
[07-07] source_val: shards 150/1024 | sessions=122,208 pairs=2,013,484 unk_labels=2 | elapsed=5.7s
[07-07] source_val: shards 200/1024 | sessions=163,159 pairs=2,698,196 unk_labels=2 | elapsed=8.2s
[07-07] source_val: shards 250/1024 | sessions=203,996 pairs=3,374,411 unk_labels=2 | elapsed=10.2s
[07-07] source_val: shards 300/1024 | sessions=244,632 pairs=4,051,466 unk_labels=2 | elapsed=11.9s
[07-07] source_val: shards 350/1024 | sessions=285,035 pairs=4,725,515 unk_labels=2 | elapsed=13.7s
[07-07] source_val: shards 400/1024 | sessions=325,724 pairs=5,406,295 unk_labels=2 | elapsed=15.4s
[07-07] source_val: shards 450/1024 | sessions=366,361 pairs=6,081,977 unk_labels=2 | elapsed=17.9s
[07-07] source_val: shards 500/1024 | sessions=407,196 pairs=6,756,214 unk_labels=4 | elapsed=20.2s
[07-07] s

Write report artifacts to reports/07_mostpop_baseline/<RUN_TAG>/ + update meta.json

In [22]:
# [CELL 07-08] Write report artifacts to reports/07_mostpop_baseline/<RUN_TAG>/ + update meta.json
#
# Files produced inside the report directory:
#   run_meta.json             - inputs, protocol settings, source-mapping diagnostics
#   results.json              - target/source metrics plus the first 20 MostPop item ids
#   target_mostpop_top20.csv  - top MAX_K target items (despite the "top20" in the name)
#   source_mostpop_top20.csv  - top MAX_K source items (despite the "top20" in the name)
# The repo-level meta.json additionally receives a record keyed by this RUN_TAG.

# NOTE: datetime is already imported in cell 07-00; this repeat is harmless.
from datetime import datetime

# One directory per run tag, so earlier runs are never overwritten.
REPORT_DIR = REPO_ROOT.joinpath("reports", "07_mostpop_baseline", RUN_TAG)
REPORT_DIR.mkdir(parents=True, exist_ok=True)

def save_json(obj: dict, path: Path):
    """Write ``obj`` to ``path`` as pretty-printed UTF-8 JSON without risking a corrupt file.

    The object is serialized in memory first, written to a sibling temporary
    file, and then moved over ``path`` with ``os.replace``. If serialization
    fails (e.g. a non-JSON-serializable value) or the write is interrupted,
    any existing file at ``path`` keeps its previous content instead of being
    left truncated. This matters for the repo-level meta.json, which is
    rewritten in place on every run.

    Args:
        obj: JSON-serializable object to write.
        path: Destination file; missing parent directories are created.

    Raises:
        TypeError / ValueError: if ``obj`` cannot be serialized by ``json``.
        OSError: if the file cannot be written or replaced.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Serialize before touching the filesystem so a bad object cannot clobber
    # a previously valid file.
    payload = json.dumps(obj, indent=2, ensure_ascii=False)

    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(payload)
        # Same-directory rename: replaces the destination in a single step.
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up when the write or the replace failed.
        tmp_path.unlink(missing_ok=True)

# --- run_meta: provenance + protocol record for this run ---------------------
# Built from named pieces so each section can be inspected on its own; the key
# order inside every dict matches the layout written to run_meta.json.

_created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Upstream artifact locations, stringified so they are JSON-serializable.
_input_paths = {
    "target_train_pt": target_train_pt,
    "target_val_pt": target_val_pt,
    "target_test_pt": target_test_pt,
    "target_vocab_json": target_vocab_json,
    "target_tensor_metadata_json": target_tensor_meta_json,
    "source_train_dir": source_train_dir,
    "source_val_dir": source_val_dir,
    "source_test_dir": source_test_dir,
    "source_vocab_json": source_vocab_json,
    "dataloader_config": cfg_path_repo,
    "sanity_metrics": sanity_path_repo,
    "session_gap_thresholds": gaps_path_repo,
}
_run_inputs = {
    "target_run_tag": TARGET_TAG,
    "source_run_tag": SOURCE_TAG,
    **{input_name: str(input_path) for input_name, input_path in _input_paths.items()},
}

# Evaluation protocol constants as used by this notebook.
_long_session_policy = {
    "enabled": CAP_ENABLED,
    "cap_session_len": CAP_SESSION_LEN,
    "cap_strategy": CAP_STRATEGY,
}
_popularity_definition = {
    "target": "count next-item labels in target TRAIN split",
    "source": "count all item occurrences in source TRAIN session sequences (mapped via source_vocab)",
}
_run_protocol = {
    "K_LIST": K_LIST,
    "MAX_PREFIX_LEN": MAX_PREFIX_LEN,
    "PAD_ID_TARGET": PAD_ID_TARGET,
    "PAD_ID_SOURCE": PAD_ID_SOURCE,
    "UNK_ID_TARGET": UNK_ID_TARGET,
    "UNK_ID_SOURCE": UNK_ID_SOURCE,
    "vocab_size_target": int(vocab_size_target),
    "vocab_size_source": int(vocab_size_source),
    "source_long_session_policy": _long_session_policy,
    "popularity_definition": _popularity_definition,
    "pad_excluded_from_ranking": True,
}

# min_id / max_id may be None; keep None as-is rather than casting it.
_source_id_range = {
    "min_id": None if min_id is None else int(min_id),
    "max_id": None if max_id is None else int(max_id),
}
_source_mapping_diagnostics = {
    "seq_col": seq_col,
    "source_vocab_keys": list(source_vocab),
    "source_token_to_id_size": int(len(source_token_to_id)),
    "source_id_range": _source_id_range,
    "unk_mapped_in_train_counts": int(n_unk_mapped),
}

run_meta = {
    "run_tag": RUN_TAG,
    "created_at": _created_at,
    "inputs": _run_inputs,
    "protocol": _run_protocol,
    "source_mapping_diagnostics": _source_mapping_diagnostics,
}

# --- results: metrics + leading MostPop items per domain ---------------------

# Bookkeeping from the source TRAIN counting pass.
_source_train_counts_summary = {
    "sessions_counted": int(n_sessions),
    "events_counted": int(n_events),
    "unk_mapped": int(n_unk_mapped),
    "seq_col": seq_col,
    "n_train_shards": int(len(train_shards)),
}

_target_results = {
    "mostpop_top_items@20": top_items_target[:20],
    "val": t_val,
    "test": t_test,
}

_source_results = {
    "mostpop_top_items@20": top_items_source[:20],
    "val": s_val,
    "test": s_test,
    "train_counts_summary": _source_train_counts_summary,
}

results = {
    "target": _target_results,
    "source": _source_results,
}

save_json(run_meta, REPORT_DIR / "run_meta.json")
save_json(results, REPORT_DIR / "results.json")

# Save top items CSVs (at most MAX_K rows each; the "top20" in the file names is kept
# for compatibility with existing consumers of these reports).
#
# The rank column is derived from the actual number of items rather than from MAX_K:
# pandas requires every column passed to the DataFrame constructor to have the same
# length, so a top-items list shorter than MAX_K would otherwise raise a ValueError.
# When a list has exactly MAX_K entries the output is unchanged.
target_top_items = top_items_target[:MAX_K]
pd.DataFrame({
    "rank": np.arange(1, len(target_top_items) + 1),
    "item_id": target_top_items,
    "train_label_count": [int(counts[i]) for i in target_top_items],
}).to_csv(REPORT_DIR / "target_mostpop_top20.csv", index=False)

source_top_items = top_items_source[:MAX_K]
pd.DataFrame({
    "rank": np.arange(1, len(source_top_items) + 1),
    "item_id": source_top_items,
    "train_event_count": [int(src_counts[i]) for i in source_top_items],
}).to_csv(REPORT_DIR / "source_mostpop_top20.csv", index=False)

print("[07-08] ✅ Wrote report files under:", REPORT_DIR)

# Register this run in the repo-level meta.json.
# Only the entry for this RUN_TAG under artifacts -> mostpop_baseline is written;
# records for other run tags and other artifact families are carried over as loaded.
meta_path = REPO_ROOT / "meta.json"
if not meta_path.exists():
    # No registry yet: start from an empty artifacts map.
    meta = {"artifacts": {}}
else:
    meta = json.loads(meta_path.read_text(encoding="utf-8"))

# Create the nested containers on demand, then attach this run's record.
mostpop_runs = meta.setdefault("artifacts", {}).setdefault("mostpop_baseline", {})
mostpop_runs[RUN_TAG] = {
    "target_run_tag": TARGET_TAG,
    "source_run_tag": SOURCE_TAG,
    "report_dir": str(REPORT_DIR),
    "results_json": str(REPORT_DIR / "results.json"),
    "run_meta_json": str(REPORT_DIR / "run_meta.json"),
}
meta["updated_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

save_json(meta, meta_path)
print("[07-08] ✅ Updated meta.json:", meta_path)

for checkpoint_line in (
    "\n[07-08] CHECKPOINT G",
    "Paste: report_dir path + confirm meta.json updated.",
):
    print(checkpoint_line)


[07-08] ✅ Wrote report files under: C:\mooc-coldstart-session-meta\reports\07_mostpop_baseline\20260102_133019
[07-08] ✅ Updated meta.json: C:\mooc-coldstart-session-meta\meta.json

[07-08] CHECKPOINT G
Paste: report_dir path + confirm meta.json updated.


Footer summary

In [23]:
# [CELL 07-09] Footer summary
# Recap of the run: run tag, validation/test metrics for both domains, and
# the directory that holds the written report files.

print("========== 07 MostPop Baseline Summary ==========")
print("RUN_TAG:", RUN_TAG)

# Same three-line layout for each domain: banner, then VAL, then TEST.
for domain_banner, val_metrics, test_metrics in (
    ("--- TARGET ---", t_val, t_test),
    ("--- SOURCE ---", s_val, s_test),
):
    print(domain_banner)
    print("VAL :", val_metrics)
    print("TEST:", test_metrics)

print("Report dir:", REPORT_DIR)
print("===============================================")


RUN_TAG: 20260102_133019
--- TARGET ---
VAL : {'HR@5': 0.07407407407407407, 'HR@10': 0.13227513227513227, 'HR@20': 0.1746031746031746, 'MRR@5': 0.05026455026455026, 'MRR@10': 0.058704963466868224, 'MRR@20': 0.06130174366942758, 'NDCG@5': 0.05620747419730002, 'NDCG@10': 0.07570020394614688, 'NDCG@20': 0.08597813493276235, '_n_examples': 189}
TEST: {'HR@5': 0.075, 'HR@10': 0.125, 'HR@20': 0.185, 'MRR@5': 0.045, 'MRR@10': 0.05177579365079365, 'MRR@20': 0.0553870221840423, 'NDCG@5': 0.05258000942002037, 'NDCG@10': 0.06885437485718231, 'NDCG@20': 0.08332921918090552, '_n_examples': 200}
--- SOURCE ---
VAL : {'HR@5': 0.11741418178221939, 'HR@10': 0.18701743950367639, 'HR@20': 0.27324388679120887, 'MRR@5': 0.06704973488080263, 'MRR@10': 0.07616040761226384, 'MRR@20': 0.08207514133001786, 'NDCG@5': 0.0794066625164993, 'NDCG@10': 0.10173664076730178, 'NDCG@20': 0.1234496055743828, '_n_pairs': 13802285, '_n_sessions_seen': 833042, '_n_unk_labels': 9}
TEST: {'HR@5': 0.11623230488663819, 'HR@10': 