# Notebook 03: Build Vocabulary and Pairs (MARS)

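**Purpose:** Build the item vocabulary (ID 0 is reserved for padding) and create per-user prefix->label pairs for next-item prediction: each pair uses a user's first *i* items, ordered by `ts_epoch`, as the prefix and the item that follows as the label.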

**Input:** `data/processed/mars/sessions/sessions.parquet`

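**Output:**
- `data/processed/mars/vocab/item2id.json`
- `data/processed/mars/vocab/id2item.json`
- `data/processed/mars/pairs/pairs_all.parquet`
- `reports/03_build_vocab_and_pairs_mars/<RUN_TAG>/report.json`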

In [1]:
# [CELL 03-00] Bootstrap

import os
import json
import time
import uuid
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List
from collections import Counter

import numpy as np
import pandas as pd

# Wall-clock start of the bootstrap cell, echoed at second resolution for the
# run log. NOTE: this `t0` is a datetime; later cells rebind `t0` to the float
# returned by cell_start().
t0 = datetime.now()
print(f"[CELL 03-00] start={t0.isoformat(timespec='seconds')}")

def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* containing PROJECT_STATE.md.

    *start* is resolved first; the search then checks the resolved path itself
    followed by each of its parents, nearest first.

    Raises:
        RuntimeError: if none of the candidates contains PROJECT_STATE.md.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    hit = next(
        (d for d in candidates if (d / "PROJECT_STATE.md").exists()),
        None,
    )
    if hit is None:
        raise RuntimeError("Could not find PROJECT_STATE.md")
    return hit

# Locate the repository from the current working directory so the notebook
# can be launched from any sub-folder of the project.
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 03-00] REPO_ROOT:", REPO_ROOT)

# Base directories used by the cells below, all anchored at REPO_ROOT.
PATHS = {
    "DATA_PROCESSED": REPO_ROOT.joinpath("data", "processed"),
    "REPORTS": REPO_ROOT.joinpath("reports"),
}

def cell_start(cell_id: str, title: str, **kwargs) -> float:
    """Print a header for a notebook cell and return its start time.

    Emits a blank line, then "[cell_id] title", then one "[cell_id] key=value"
    line per keyword argument in the order given.

    Returns:
        The time.time() reading taken before anything is printed; pass it to
        cell_end() to report the elapsed time.
    """
    started_at = time.time()
    print(f"\n[{cell_id}] {title}")
    for k in kwargs:
        v = kwargs[k]
        print(f"[{cell_id}] {k}={v}")
    return started_at

def cell_end(cell_id: str, t0: float, **kwargs) -> None:
    """Print closing lines for a notebook cell.

    Emits one "[cell_id] key=value" line per keyword argument, followed by the
    seconds elapsed since *t0* (a time.time() reading, e.g. from cell_start)
    formatted with two decimals.
    """
    for k in kwargs:
        v = kwargs[k]
        print(f"[{cell_id}] {k}={v}")
    # Elapsed time is sampled last so it includes the printing above.
    print(f"[{cell_id}] elapsed={time.time()-t0:.2f}s")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize *obj* as UTF-8 JSON to *path* via a temp file plus rename.

    The JSON is first written to a uniquely named sibling of *path* and then
    moved over *path* with Path.replace, so an existing file at *path* is only
    ever swapped for a completely written one. Missing parent directories are
    created. Non-ASCII characters are written as-is (ensure_ascii=False).

    Args:
        path: Destination file.
        obj: JSON-serializable object.
        indent: Indentation passed to json.dump.

    Raises:
        TypeError, ValueError: propagated from json.dump when *obj* cannot be
            serialized.
        OSError: propagated from the file-system operations.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # A failed dump or rename would otherwise leave a partial
        # "*.tmp_<hex>" file next to the target on every failed run.
        # Cleanup is best-effort so it can never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

# Identity of this run: a fixed notebook name, a random id, and a
# second-resolution timestamp tag that names the report folder.
NOTEBOOK_NAME = "03_build_vocab_and_pairs_mars"
RUN_ID = uuid.uuid4().hex
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")

# Per-run report location: <REPORTS>/<NOTEBOOK_NAME>/<RUN_TAG>/report.json
OUT_DIR = PATHS["REPORTS"].joinpath(NOTEBOOK_NAME, RUN_TAG)
OUT_DIR.mkdir(parents=True, exist_ok=True)
REPORT_PATH = OUT_DIR.joinpath("report.json")

# Skeleton of the report; later cells add entries under "metrics".
report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "metrics": {},
    "key_findings": [],
}

print("[CELL 03-00] done")

[CELL 03-00] start=2026-01-12T22:20:17
[CELL 03-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 03-00] done


In [2]:
# [CELL 03-01] Load sessionized data

t0 = cell_start("CELL 03-01", "Load sessionized data")

# Sessionized interactions for MARS; the summary below reads the
# `user_id` and `item_id` columns.
INPUT_PATH = PATHS["DATA_PROCESSED"].joinpath("mars", "sessions", "sessions.parquet")
df = pd.read_parquet(INPUT_PATH)

# Quick sanity summary: file, frame shape, distinct users and items.
print(f"[CELL 03-01] Loaded: {INPUT_PATH}")
print(f"[CELL 03-01] Shape: {df.shape}")
print(f"[CELL 03-01] Users: {df['user_id'].nunique():,}")
print(f"[CELL 03-01] Items: {df['item_id'].nunique():,}")

cell_end("CELL 03-01", t0)


[CELL 03-01] Load sessionized data
[CELL 03-01] Loaded: C:\anonymous-users-mooc-session-meta\data\processed\mars\sessions\sessions.parquet
[CELL 03-01] Shape: (3655, 6)
[CELL 03-01] Users: 822
[CELL 03-01] Items: 776
[CELL 03-01] elapsed=0.05s


In [3]:
# [CELL 03-02] Build vocabulary (item2id mapping)

t0 = cell_start("CELL 03-02", "Build vocabulary")

# Tally item occurrences; the distinct keys define the vocabulary.
item_counts = Counter(df["item_id"])

# Number the items in sorted order starting at 1, leaving 0 free for padding.
sorted_items = sorted(item_counts)
id2item = dict(enumerate(sorted_items, start=1))
item2id = {item: idx for idx, item in id2item.items()}

# Vocabulary size counts the padding slot in addition to the real items.
n_items = 1 + len(item2id)

print(f"[CELL 03-02] Vocabulary size: {n_items} (including padding)")
print(f"[CELL 03-02] Unique items: {len(item2id)}")

# Persist both directions of the mapping.
VOCAB_DIR = PATHS["DATA_PROCESSED"].joinpath("mars", "vocab")
VOCAB_DIR.mkdir(parents=True, exist_ok=True)

write_json_atomic(VOCAB_DIR / "item2id.json", item2id)
# JSON object keys must be strings, so the integer ids are stringified.
write_json_atomic(
    VOCAB_DIR / "id2item.json",
    {str(idx): item for idx, item in id2item.items()},
)

print(f"[CELL 03-02] Saved: {VOCAB_DIR / 'item2id.json'}")

report["metrics"]["vocab_size"] = n_items

cell_end("CELL 03-02", t0)


[CELL 03-02] Build vocabulary
[CELL 03-02] Vocabulary size: 777 (including padding)
[CELL 03-02] Unique items: 776
[CELL 03-02] Saved: C:\anonymous-users-mooc-session-meta\data\processed\mars\vocab\item2id.json
[CELL 03-02] elapsed=0.01s


In [4]:
# [CELL 03-03] Create prefix->label pairs

t0 = cell_start("CELL 03-03", "Create prefix->label pairs")

# Map items to their integer vocabulary ids (adds an `item_idx` column to df).
df["item_idx"] = df["item_id"].map(item2id)

# Output schema, in column order. Passing it to the DataFrame constructor
# keeps the columns present even when no user has two or more interactions,
# so the summary below does not fail on an empty result.
PAIR_COLUMNS = [
    "pair_id",
    "user_id",
    "prefix",
    "label",
    "label_ts_epoch",
    "prefix_len",
]

# Group by user and create pairs
pairs = []
pair_id = 0

for user_id, user_df in df.groupby("user_id"):
    # Stable sort: interactions that share a timestamp keep their order from
    # the input frame. The default quicksort gives no such guarantee, so ties
    # could otherwise be reordered arbitrarily.
    user_df = user_df.sort_values("ts_epoch", kind="mergesort")
    items = user_df["item_idx"].tolist()
    timestamps = user_df["ts_epoch"].tolist()

    # One pair per position i >= 1: the first i items form the prefix and
    # item i is the label, stamped with that interaction's timestamp.
    for i in range(1, len(items)):
        prefix = items[:i]

        pairs.append({
            "pair_id": pair_id,
            "user_id": user_id,
            "prefix": prefix,
            "label": items[i],
            "label_ts_epoch": timestamps[i],
            "prefix_len": len(prefix),
        })
        pair_id += 1

df_pairs = pd.DataFrame(pairs, columns=PAIR_COLUMNS)

print(f"[CELL 03-03] Total pairs: {len(df_pairs):,}")
print(f"[CELL 03-03] Prefix length distribution:")
print(df_pairs["prefix_len"].describe())

report["metrics"]["n_pairs"] = len(df_pairs)

cell_end("CELL 03-03", t0)


[CELL 03-03] Create prefix->label pairs
[CELL 03-03] Total pairs: 2,833
[CELL 03-03] Prefix length distribution:
count    2833.000000
mean       18.144370
std        24.546638
min         1.000000
25%         2.000000
50%         8.000000
75%        24.000000
max       133.000000
Name: prefix_len, dtype: float64
[CELL 03-03] elapsed=0.32s


In [5]:
# [CELL 03-04] Save pairs

t0 = cell_start("CELL 03-04", "Save pairs")

PAIRS_DIR = PATHS["DATA_PROCESSED"] / "mars" / "pairs"
PAIRS_DIR.mkdir(parents=True, exist_ok=True)

out_file = PAIRS_DIR / "pairs_all.parquet"

# Write to a uniquely named sibling file and rename it into place, the same
# pattern write_json_atomic uses for the JSON outputs. An interrupted write
# then cannot leave a truncated pairs_all.parquet for downstream readers.
tmp_file = out_file.with_suffix(out_file.suffix + f".tmp_{uuid.uuid4().hex}")
try:
    df_pairs.to_parquet(tmp_file, index=False)
    tmp_file.replace(out_file)
except BaseException:
    # Best-effort cleanup of the partial temp file; never mask the real error.
    try:
        tmp_file.unlink()
    except OSError:
        pass
    raise

print(f"[CELL 03-04] Saved: {out_file}")

# Persist the metrics collected by the earlier cells.
write_json_atomic(REPORT_PATH, report)

print("\n" + "="*60)
print("NOTEBOOK 03 COMPLETE: MARS Vocab & Pairs")
print("="*60)
print(f"Vocabulary: {n_items} items")
print(f"Pairs: {len(df_pairs):,}")

cell_end("CELL 03-04", t0)


[CELL 03-04] Save pairs
[CELL 03-04] Saved: C:\anonymous-users-mooc-session-meta\data\processed\mars\pairs\pairs_all.parquet

NOTEBOOK 03 COMPLETE: MARS Vocab & Pairs
Vocabulary: 777 items
Pairs: 2,833
[CELL 03-04] elapsed=0.03s
