# Notebook 04: User Split (MARS)

**Purpose:** Create a user-level train/val/test split for cold-start evaluation.

**Input:** `data/processed/mars/pairs/pairs_all.parquet`

**Output:**
- `data/processed/mars/user_splits/user_split.json`
- `data/processed/mars/pairs/pairs_{train,val,test}.parquet`

**Split:** 80% train / 10% val / 10% test (user-level, disjoint). Train and val sizes are rounded down; the test set receives the remaining users.

In [1]:
# [CELL 04-00] Bootstrap

import os
import json
import time
import uuid
from pathlib import Path
from datetime import datetime
from typing import Any

import numpy as np
import pandas as pd

# Wall-clock start of the bootstrap cell, logged at one-second resolution.
t0 = datetime.now()
print("[CELL 04-00] start=" + t0.isoformat(timespec="seconds"))

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* holding PROJECT_STATE.md.

    *start* is resolved first, then each ancestor is checked in turn,
    beginning with *start* itself.

    Raises:
        RuntimeError: if no directory up to the filesystem root contains
            a ``PROJECT_STATE.md`` entry.
    """
    candidate = start.resolve()
    while True:
        if (candidate / "PROJECT_STATE.md").exists():
            return candidate
        parent = candidate.parent
        if parent == candidate:
            # The filesystem root is its own parent: nothing left to check.
            raise RuntimeError("Could not find PROJECT_STATE.md")
        candidate = parent

# Anchor every project path at the repository root located from the
# current working directory.
REPO_ROOT = find_repo_root(Path.cwd())
PATHS = dict(
    DATA_PROCESSED=REPO_ROOT.joinpath("data", "processed"),
    REPORTS=REPO_ROOT.joinpath("reports"),
)

def cell_start(cell_id: str, title: str, **kwargs) -> float:
    """Print a cell banner followed by one ``key=value`` line per keyword.

    Returns:
        The ``time.time()`` timestamp taken before anything is printed,
        meant to be handed to ``cell_end`` for the elapsed-time report.
    """
    started = time.time()
    tag = f"[{cell_id}]"
    print(f"\n{tag} {title}")
    for name in kwargs:
        print(f"{tag} {name}={kwargs[name]}")
    return started

def cell_end(cell_id: str, t0: float, **kwargs) -> None:
    """Print one ``key=value`` line per keyword, then the elapsed time.

    Args:
        cell_id: Label placed in square brackets at the start of each line.
        t0: Start timestamp in ``time.time()`` seconds (as returned by
            ``cell_start``).
        **kwargs: Extra values to report before the elapsed-time line.
    """
    tag = f"[{cell_id}]"
    for name in kwargs:
        print(f"{tag} {name}={kwargs[name]}")
    elapsed = time.time() - t0
    print(f"{tag} elapsed={elapsed:.2f}s")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize *obj* as JSON to *path* via a temporary sibling file.

    The data is first written to a uniquely named temporary file in the
    same directory and then moved over *path* with ``Path.replace``, so
    *path* is only touched once serialization has fully succeeded.

    Args:
        path: Destination file; missing parent directories are created.
        obj: JSON-serializable object to write.
        indent: Indentation passed to ``json.dump``.

    Raises:
        TypeError, ValueError, OSError: whatever ``json.dump`` or the
            filesystem raises; the temporary file is removed first.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # Do not leave a half-written temp file behind on failure
        # (e.g. a non-serializable object or an interrupted write).
        if tmp.exists():
            tmp.unlink()
        raise

# Fixed seed for NumPy's global random state; the same value is later
# stored alongside the user split.
SEED = 20260112
np.random.seed(seed=SEED)

print("[CELL 04-00] done")

[CELL 04-00] start=2026-01-12T22:20:45
[CELL 04-00] done


In [2]:
# [CELL 04-01] Load pairs

t0 = cell_start("CELL 04-01", "Load pairs")

# Read the full pairs table produced upstream; it is split by user below.
INPUT_PATH = PATHS["DATA_PROCESSED"].joinpath("mars", "pairs", "pairs_all.parquet")
df = pd.read_parquet(INPUT_PATH)

# Report where the data came from and its basic size.
print(f"[CELL 04-01] Loaded: {INPUT_PATH}")
print(f"[CELL 04-01] Total pairs: {df.shape[0]:,}")
print(f"[CELL 04-01] Unique users: {df['user_id'].nunique():,}")

cell_end("CELL 04-01", t0)


[CELL 04-01] Load pairs
[CELL 04-01] Loaded: C:\anonymous-users-mooc-session-meta\data\processed\mars\pairs\pairs_all.parquet
[CELL 04-01] Total pairs: 2,833
[CELL 04-01] Unique users: 426
[CELL 04-01] elapsed=0.73s


In [3]:
# [CELL 04-02] Create user-level split (80/10/10)

t0 = cell_start("CELL 04-02", "Create user-level split")

# Get all unique users (pandas returns them in order of first appearance).
all_users = df["user_id"].unique()

# Shuffle with a generator seeded locally from SEED instead of the global
# NumPy state. The global state advances on every draw, so re-running this
# cell without re-running the bootstrap cell would silently produce a split
# that no longer matches the seed recorded in user_split.json.
# RandomState(SEED) yields the same permutation as the first shuffle after
# np.random.seed(SEED), so a fresh top-to-bottom run is unchanged.
np.random.RandomState(SEED).shuffle(all_users)

# Train and val sizes are rounded down; test takes whatever remains.
n_users = len(all_users)
n_train = int(n_users * 0.8)
n_val = int(n_users * 0.1)

train_users = set(all_users[:n_train])
val_users = set(all_users[n_train:n_train + n_val])
test_users = set(all_users[n_train + n_val:])

print(f"[CELL 04-02] Train users: {len(train_users):,}")
print(f"[CELL 04-02] Val users: {len(val_users):,}")
print(f"[CELL 04-02] Test users: {len(test_users):,}")

# Verify disjoint (sanity check: no user may appear in two splits).
assert len(train_users & val_users) == 0, "Train/Val overlap!"
assert len(train_users & test_users) == 0, "Train/Test overlap!"
assert len(val_users & test_users) == 0, "Val/Test overlap!"
print("[CELL 04-02] Verified: splits are disjoint")

cell_end("CELL 04-02", t0)


[CELL 04-02] Create user-level split
[CELL 04-02] Train users: 340
[CELL 04-02] Val users: 42
[CELL 04-02] Test users: 44
[CELL 04-02] Verified: splits are disjoint
[CELL 04-02] elapsed=0.00s


In [4]:
# [CELL 04-03] Split pairs by user

t0 = cell_start("CELL 04-03", "Split pairs by user")

# Each pair goes to the split that owns its user; copies are taken so the
# three frames are independent of the source table.
df_train = df.loc[df["user_id"].isin(train_users)].copy()
df_val = df.loc[df["user_id"].isin(val_users)].copy()
df_test = df.loc[df["user_id"].isin(test_users)].copy()

print(f"[CELL 04-03] Train pairs: {df_train.shape[0]:,}")
print(f"[CELL 04-03] Val pairs: {df_val.shape[0]:,}")
print(f"[CELL 04-03] Test pairs: {df_test.shape[0]:,}")

cell_end("CELL 04-03", t0)


[CELL 04-03] Split pairs by user
[CELL 04-03] Train pairs: 2,388
[CELL 04-03] Val pairs: 204
[CELL 04-03] Test pairs: 241
[CELL 04-03] elapsed=0.01s


In [5]:
# [CELL 04-04] Save splits

t0 = cell_start("CELL 04-04", "Save splits")

# Output locations: per-split pair tables sit next to the input table,
# the user assignment goes into its own directory.
PAIRS_DIR = PATHS["DATA_PROCESSED"].joinpath("mars", "pairs")
SPLIT_DIR = PATHS["DATA_PROCESSED"].joinpath("mars", "user_splits")
SPLIT_DIR.mkdir(parents=True, exist_ok=True)

# Persist the pair tables without the DataFrame index.
df_train.to_parquet(PAIRS_DIR.joinpath("pairs_train.parquet"), index=False)
df_val.to_parquet(PAIRS_DIR.joinpath("pairs_val.parquet"), index=False)
df_test.to_parquet(PAIRS_DIR.joinpath("pairs_test.parquet"), index=False)

# Persist the user assignment together with the seed. IDs are cast to
# built-in ints before serialization (presumably they are NumPy integers,
# which the json module does not accept).
user_split = {
    "seed": SEED,
    "train_users": list(map(int, train_users)),
    "val_users": list(map(int, val_users)),
    "test_users": list(map(int, test_users)),
}
write_json_atomic(SPLIT_DIR.joinpath("user_split.json"), user_split)

print(f"[CELL 04-04] Saved pairs to: {PAIRS_DIR}")
print(f"[CELL 04-04] Saved split to: {SPLIT_DIR / 'user_split.json'}")

# Final summary banner.
print("\n" + "=" * 60)
print("NOTEBOOK 04 COMPLETE: MARS User Split")
print("=" * 60)
print(f"Train: {len(train_users):,} users, {df_train.shape[0]:,} pairs")
print(f"Val: {len(val_users):,} users, {df_val.shape[0]:,} pairs")
print(f"Test: {len(test_users):,} users, {df_test.shape[0]:,} pairs (cold-start)")

cell_end("CELL 04-04", t0)


[CELL 04-04] Save splits
[CELL 04-04] Saved pairs to: C:\anonymous-users-mooc-session-meta\data\processed\mars\pairs
[CELL 04-04] Saved split to: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\user_split.json

NOTEBOOK 04 COMPLETE: MARS User Split
Train: 340 users, 2,388 pairs
Val: 42 users, 204 pairs
Test: 44 users, 241 pairs (cold-start)
[CELL 04-04] elapsed=0.03s
