# Notebook 05: Episode Index (MARS)

**Purpose:** Create episodic meta-learning indices (K-shot support + Q-shot query).

**Input:** `data/processed/mars/pairs/pairs_{train,val,test}.parquet`

**Output:** `data/processed/mars/episodes/episodes_{train,val,test}_K{K}_Q{Q}.parquet`

**Episode Structure:**
- K=5 support pairs (for adaptation)
- Q=10 query pairs (for evaluation)
- Chronological: support_max_ts < query_min_ts (strict; candidate windows that violate this, including ties, are skipped)

In [1]:
# [CELL 05-00] Bootstrap

import os
import json
import time
import uuid
from pathlib import Path
from datetime import datetime
from typing import Any, List

import numpy as np
import pandas as pd

# Record and print the wall-clock time at which the bootstrap cell began.
# NOTE: `t0` is rebound to a float by cell_start() in later cells.
t0 = datetime.now()
print(f"[CELL 05-00] start={t0.isoformat(timespec='seconds')}")

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* holding PROJECT_STATE.md.

    Args:
        start: Directory from which the upward search begins; it is resolved
            to an absolute path first.

    Returns:
        The first directory (checking *start* itself, then each parent in
        turn) that contains a ``PROJECT_STATE.md`` entry.

    Raises:
        RuntimeError: If no directory on the way up contains the marker file.
    """
    origin = start.resolve()
    candidates = (origin, *origin.parents)
    marker_dir = next(
        (folder for folder in candidates if (folder / "PROJECT_STATE.md").exists()),
        None,
    )
    if marker_dir is None:
        raise RuntimeError("Could not find PROJECT_STATE.md")
    return marker_dir

# Locate the repository root by searching upward from the current working
# directory, then derive the project directories from it. Later cells read
# PATHS["DATA_PROCESSED"] for both the input pairs and the output episodes.
REPO_ROOT = find_repo_root(Path.cwd())
PATHS = {
    "DATA_PROCESSED": REPO_ROOT / "data" / "processed",
    "REPORTS": REPO_ROOT / "reports",
}

def cell_start(cell_id: str, title: str, **kwargs) -> float:
    """Print a cell banner and return the start time for later timing.

    Args:
        cell_id: Tag placed in square brackets at the start of every line.
        title: Human-readable cell title printed after a leading blank line.
        **kwargs: Extra ``name=value`` details, printed one per line.

    Returns:
        ``time.time()`` sampled before anything is printed; pass it to
        ``cell_end`` to report the elapsed time.
    """
    started_at = time.time()
    tag = f"[{cell_id}]"
    print(f"\n{tag} {title}")
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    return started_at

def cell_end(cell_id: str, t0: float, **kwargs) -> None:
    """Print closing details for a cell followed by its elapsed time.

    Args:
        cell_id: Tag placed in square brackets at the start of every line.
        t0: Start time as returned by ``cell_start`` (seconds since epoch).
        **kwargs: Extra ``name=value`` details, printed one per line before
            the elapsed-time line.
    """
    tag = f"[{cell_id}]"
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    # Sample the clock only after the detail lines, as the timing line is last.
    elapsed = time.time() - t0
    print(f"{tag} elapsed={elapsed:.2f}s")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize *obj* as JSON to *path* via a temporary sibling file.

    The JSON is first written to ``<path><suffix>.tmp_<uuid>`` in the same
    directory and then moved over *path* with ``Path.replace``, so readers
    never observe a partially written target file.

    Args:
        path: Destination file; missing parent directories are created.
        obj: JSON-serializable object.
        indent: Indentation passed to ``json.dump``.

    Raises:
        TypeError, ValueError, OSError: Whatever ``json.dump`` or the file
            operations raise. The temporary file is removed before the
            exception propagates, and an existing *path* is left untouched.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # Do not leave a stray .tmp_* file behind when serialization or the
        # final rename fails; the original error is re-raised unchanged.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise

# Seed NumPy's global random number generator with a fixed value.
SEED = 20260112
np.random.seed(SEED)

# Episode config
K = 5   # Support set size (pairs per episode used for adaptation)
Q = 10  # Query set size (pairs per episode used for evaluation)
# A user needs at least one full support + query window to yield an episode.
MIN_PAIRS = K + Q  # Minimum pairs per user

print(f"[CELL 05-00] K={K}, Q={Q}, MIN_PAIRS={MIN_PAIRS}")
print("[CELL 05-00] done")

[CELL 05-00] start=2026-01-12T22:21:05
[CELL 05-00] K=5, Q=10, MIN_PAIRS=15
[CELL 05-00] done


In [2]:
# [CELL 05-01] Load pairs

# Start the cell timer and print the banner.
t0 = cell_start("CELL 05-01", "Load pairs")

# All three splits are stored side by side in the MARS pairs directory.
PAIRS_DIR = PATHS["DATA_PROCESSED"] / "mars" / "pairs"

# Read train, val and test (in that order) before reporting anything.
df_train, df_val, df_test = (
    pd.read_parquet(PAIRS_DIR / file_name)
    for file_name in ("pairs_train.parquet", "pairs_val.parquet", "pairs_test.parquet")
)

# One summary line per split: number of pairs and number of distinct users.
print("\n".join(
    f"[CELL 05-01] {split_label} pairs: {len(split_df):,} ({split_df['user_id'].nunique():,} users)"
    for split_label, split_df in (("Train", df_train), ("Val", df_val), ("Test", df_test))
))

cell_end("CELL 05-01", t0)


[CELL 05-01] Load pairs
[CELL 05-01] Train pairs: 2,388 (340 users)
[CELL 05-01] Val pairs: 204 (42 users)
[CELL 05-01] Test pairs: 241 (44 users)
[CELL 05-01] elapsed=0.04s


In [3]:
# [CELL 05-02] Filter users with enough pairs

# Print the cell banner and keep the start time for the cell_end() call below.
t0 = cell_start("CELL 05-02", "Filter users with enough pairs")

def filter_eligible_users(df, min_pairs):
    """Return the set of user ids that have at least *min_pairs* rows in *df*.

    Args:
        df: DataFrame with a ``user_id`` column; each row counts as one pair.
        min_pairs: Inclusive lower bound on the number of rows per user.

    Returns:
        A ``set`` of ``user_id`` values whose row count is >= *min_pairs*.
    """
    pairs_per_user = df.groupby("user_id").size()
    return {
        user_id
        for user_id, n_pairs in pairs_per_user.items()
        if n_pairs >= min_pairs
    }

# Apply the same MIN_PAIRS threshold to each split independently.
train_eligible, val_eligible, test_eligible = (
    filter_eligible_users(split_df, MIN_PAIRS)
    for split_df in (df_train, df_val, df_test)
)

# Report how many users per split can yield at least one episode.
print("\n".join((
    f"[CELL 05-02] Train eligible: {len(train_eligible):,} users",
    f"[CELL 05-02] Val eligible: {len(val_eligible):,} users",
    f"[CELL 05-02] Test eligible: {len(test_eligible):,} users (cold-start)",
)))

cell_end("CELL 05-02", t0)


[CELL 05-02] Filter users with enough pairs
[CELL 05-02] Train eligible: 36 users
[CELL 05-02] Val eligible: 2 users
[CELL 05-02] Test eligible: 4 users (cold-start)
[CELL 05-02] elapsed=0.01s


In [4]:
# [CELL 05-03] Create episodes function

# Print the cell banner and keep the start time for the cell_end() call below.
t0 = cell_start("CELL 05-03", "Define episode creation function")

def create_episodes(df, eligible_users, K, Q, max_episodes_per_user=None):
    """Create K-shot support + Q-shot query episodes.

    For every eligible user the pairs are ordered by ``label_ts_epoch`` and a
    window of ``K + Q`` consecutive pairs is slid over them with stride ``Q``.
    The first ``K`` pairs of a window form the support set and the following
    ``Q`` pairs the query set. A window is kept only when every support
    timestamp is strictly earlier than every query timestamp.

    Args:
        df: Pairs DataFrame with ``user_id``, ``pair_id`` and
            ``label_ts_epoch`` columns.
        eligible_users: User ids to build episodes for. A ``set`` or
            ``frozenset`` is processed in sorted order so that ``episode_id``
            assignment is reproducible; any other iterable keeps its order.
        K: Number of support pairs per episode.
        Q: Number of query pairs per episode (also the window stride).
        max_episodes_per_user: Upper bound on kept episodes per user, or
            ``None`` for no limit. ``0`` yields no episodes.

    Returns:
        DataFrame with one row per episode and the columns ``episode_id``,
        ``user_id``, ``support_pair_ids``, ``query_pair_ids``,
        ``support_max_ts`` and ``query_min_ts``. The columns are present even
        when no episode was produced.
    """
    columns = [
        "episode_id",
        "user_id",
        "support_pair_ids",
        "query_pair_ids",
        "support_max_ts",
        "query_min_ts",
    ]
    window = K + Q

    # Set iteration order is not defined (and varies between runs for string
    # ids), which would make episode_id assignment non-reproducible.
    if isinstance(eligible_users, (set, frozenset)):
        try:
            users = sorted(eligible_users)
        except TypeError:
            # Ids of mixed, non-comparable types: fall back to a textual order.
            users = sorted(eligible_users, key=repr)
    else:
        users = list(eligible_users)

    episodes = []
    for user_id in users:
        # A stable sort keeps the input order of pairs with equal timestamps,
        # so the window contents do not depend on the sort implementation.
        user_pairs = df[df["user_id"] == user_id].sort_values(
            "label_ts_epoch", kind="mergesort"
        )

        if len(user_pairs) < window:
            continue

        # Sliding window to create multiple episodes per user
        n_episodes = 0
        for start in range(0, len(user_pairs) - window + 1, Q):
            # Compare against None explicitly so that a limit of 0 means
            # "no episodes" rather than "unlimited".
            if max_episodes_per_user is not None and n_episodes >= max_episodes_per_user:
                break

            support_pairs = user_pairs.iloc[start:start + K]
            query_pairs = user_pairs.iloc[start + K:start + window]

            # Verify chronological order
            support_max_ts = support_pairs["label_ts_epoch"].max()
            query_min_ts = query_pairs["label_ts_epoch"].min()

            if support_max_ts >= query_min_ts:
                continue  # Skip if chronological constraint violated

            episodes.append({
                # Ids are consecutive across all users, starting at 0.
                "episode_id": len(episodes),
                "user_id": user_id,
                "support_pair_ids": support_pairs["pair_id"].tolist(),
                "query_pair_ids": query_pairs["pair_id"].tolist(),
                "support_max_ts": int(support_max_ts),
                "query_min_ts": int(query_min_ts),
            })
            n_episodes += 1

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(episodes, columns=columns)

# Confirm the definition ran and report the elapsed time for this cell.
print("[CELL 05-03] Episode creation function defined")
cell_end("CELL 05-03", t0)


[CELL 05-03] Define episode creation function
[CELL 05-03] Episode creation function defined
[CELL 05-03] elapsed=0.00s


In [5]:
# [CELL 05-04] Create episodes for train/val/test

# Print the cell banner and keep the start time for the cell_end() call below.
t0 = cell_start("CELL 05-04", "Create episodes")

# Train: multiple episodes per user (sliding window)
# max_episodes_per_user=None places no cap on the number of windows kept.
episodes_train = create_episodes(df_train, train_eligible, K, Q, max_episodes_per_user=None)
print(f"[CELL 05-04] Train episodes: {len(episodes_train):,}")

# Val/Test: one episode per user
# With a cap of 1, the earliest window that passes the chronological check
# is the one kept for each user.
episodes_val = create_episodes(df_val, val_eligible, K, Q, max_episodes_per_user=1)
print(f"[CELL 05-04] Val episodes: {len(episodes_val):,}")

episodes_test = create_episodes(df_test, test_eligible, K, Q, max_episodes_per_user=1)
print(f"[CELL 05-04] Test episodes: {len(episodes_test):,} (cold-start)")

cell_end("CELL 05-04", t0)


[CELL 05-04] Create episodes
[CELL 05-04] Train episodes: 106
[CELL 05-04] Val episodes: 2
[CELL 05-04] Test episodes: 4 (cold-start)
[CELL 05-04] elapsed=0.04s


In [6]:
# [CELL 05-05] Save episodes

# Print the cell banner and keep the start time for the cell_end() call below.
t0 = cell_start("CELL 05-05", "Save episodes")

# Output directory for the episode indices; created if it does not exist.
EPISODES_DIR = PATHS["DATA_PROCESSED"] / "mars" / "episodes"
EPISODES_DIR.mkdir(parents=True, exist_ok=True)

# One parquet file per split; K and Q are embedded in the file name so that
# indices built with different episode sizes do not overwrite each other.
episodes_train.to_parquet(EPISODES_DIR / f"episodes_train_K{K}_Q{Q}.parquet", index=False)
episodes_val.to_parquet(EPISODES_DIR / f"episodes_val_K{K}_Q{Q}.parquet", index=False)
episodes_test.to_parquet(EPISODES_DIR / f"episodes_test_K{K}_Q{Q}.parquet", index=False)

print(f"[CELL 05-05] Saved episodes to: {EPISODES_DIR}")

# Final notebook summary: episode configuration and per-split episode counts.
print("\n" + "="*60)
print("NOTEBOOK 05 COMPLETE: MARS Episode Index")
print("="*60)
print(f"K={K}, Q={Q}")
print(f"Train episodes: {len(episodes_train):,}")
print(f"Val episodes: {len(episodes_val):,}")
print(f"Test episodes: {len(episodes_test):,} (cold-start)")

cell_end("CELL 05-05", t0)


[CELL 05-05] Save episodes
[CELL 05-05] Saved episodes to: C:\anonymous-users-mooc-session-meta\data\processed\mars\episodes

NOTEBOOK 05 COMPLETE: MARS Episode Index
K=5, Q=10
Train episodes: 106
Val episodes: 2
Test episodes: 4 (cold-start)
[CELL 05-05] elapsed=0.02s
