Goal (only): create a deterministic user-level split (train/val/test users are disjoint) and save the split files under:

data/processed/mars/user_splits/

users_train.json

users_val.json

users_test.json

user_split_map.parquet

pairs_train.parquet, pairs_val.parquet, pairs_test.parquet (optional but recommended for baselines)

events_train.parquet, events_val.parquet, events_test.parquet (optional but recommended)

No sessionization here (already done). No episodes here (Notebook 05).

Bootstrap + logger

In [1]:
# [CELL 04-00] Bootstrap: repo root + paths + logger

import json
import time
import uuid
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Log when the bootstrap cell started and which directory the kernel is running from.
print(f"[CELL 04-00] start={datetime.now().isoformat(timespec='seconds')}")
print("[CELL 04-00] CWD:", Path.cwd().resolve())

def find_repo_root(start: Path) -> Path:
    """Return the closest directory at or above *start* that holds PROJECT_STATE.md.

    *start* is resolved first, then the directory itself and each of its
    ancestors are checked from nearest to farthest.

    Raises:
        RuntimeError: if no checked directory contains the marker file.
    """
    origin = start.resolve()
    marker = "PROJECT_STATE.md"
    candidates = (origin, *origin.parents)
    found = next((folder for folder in candidates if (folder / marker).exists()), None)
    if found is None:
        raise RuntimeError("Could not find PROJECT_STATE.md. Open notebook from within the repo.")
    return found

# Resolve the repository root once; every path below is derived from it.
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 04-00] REPO_ROOT:", REPO_ROOT)

# Well-known project locations used by the rest of the notebook.
PATHS = dict(
    META_REGISTRY=REPO_ROOT / "meta.json",
    DATA_INTERIM=REPO_ROOT / "data" / "interim",
    DATA_PROCESSED=REPO_ROOT / "data" / "processed",
    REPORTS=REPO_ROOT / "reports",
)
for k in PATHS:
    v = PATHS[k]
    print(f"[CELL 04-00] {k}={v}")

def cell_start(cell_id: str, title: str, **kwargs: Any) -> float:
    """Print the opening log lines for a notebook cell and return its start time.

    Emits a blank line plus ``[cell_id] title``, the local start timestamp, and
    one ``[cell_id] key=value`` line per keyword argument.

    Returns:
        The ``time.time()`` reading taken on entry (pass it to ``cell_end``).
    """
    started_at = time.time()
    tag = f"[{cell_id}]"
    print(f"\n{tag} {title}")
    print(f"{tag} start={datetime.now().isoformat(timespec='seconds')}")
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    return started_at

def cell_end(cell_id: str, t0: float, **kwargs: Any) -> None:
    """Print the closing log lines for a notebook cell.

    Emits one ``[cell_id] key=value`` line per keyword argument, then the
    seconds elapsed since *t0* (a ``time.time()`` reading) and a ``done`` marker.
    """
    tag = f"[{cell_id}]"
    for name, value in kwargs.items():
        print(f"{tag} {name}={value}")
    print(f"{tag} elapsed={time.time()-t0:.2f}s")
    print(f"{tag} done")

# Marks the end of the bootstrap cell in the log output.
print("[CELL 04-00] done")


[CELL 04-00] start=2026-01-06T22:30:03
[CELL 04-00] CWD: C:\anonymous-users-mooc-session-meta\notebooks
[CELL 04-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 04-00] META_REGISTRY=C:\anonymous-users-mooc-session-meta\meta.json
[CELL 04-00] DATA_INTERIM=C:\anonymous-users-mooc-session-meta\data\interim
[CELL 04-00] DATA_PROCESSED=C:\anonymous-users-mooc-session-meta\data\processed
[CELL 04-00] REPORTS=C:\anonymous-users-mooc-session-meta\reports
[CELL 04-00] done


JSON IO (Timestamp-safe) + hashing

In [2]:
# [CELL 04-01] JSON IO + hashing (Timestamp-safe)

# Open the timing/log scope for this cell; t0 is handed to cell_end at the end of the cell.
t0 = cell_start("CELL 04-01", "JSON IO + hashing (Timestamp-safe)")

def _json_default(o):
    try:
        import pandas as pd
        if isinstance(o, (pd.Timestamp,)):
            return o.isoformat()
    except Exception:
        pass
    try:
        import numpy as np
        if isinstance(o, (np.integer,)):
            return int(o)
        if isinstance(o, (np.floating,)):
            return float(o)
        if isinstance(o, (np.bool_,)):
            return bool(o)
    except Exception:
        pass
    try:
        from datetime import datetime, date
        if isinstance(o, (datetime, date)):
            return o.isoformat()
    except Exception:
        pass
    return str(o)

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Serialize *obj* as JSON to *path* via a temporary sibling file.

    The JSON is written to ``<path><suffix>.tmp_<random hex>`` in the same
    directory and then moved over *path* with ``Path.replace``, so readers
    never observe a half-written target file.

    Args:
        path: Destination file; missing parent directories are created.
        obj: Object to serialize. Values ``json`` cannot encode natively are
            passed through ``_json_default``.
        indent: Indentation width forwarded to ``json.dump``.

    Raises:
        Whatever ``json.dump`` or the file operations raise. On any failure
        the temporary file is removed (best effort) before re-raising, so
        failed writes do not leave ``*.tmp_*`` files behind.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent, default=_json_default)
        tmp.replace(path)
    except BaseException:
        # Clean up the partial temp file; never mask the original error.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise

def read_json(path: Path) -> Any:
    """Load and return the JSON document stored at *path* (read as UTF-8).

    Raises:
        RuntimeError: if *path* does not exist.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    raise RuntimeError(f"Missing JSON file: {path}")

def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in binary mode, *chunk_size* bytes at a time, so it is
    never loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter(callable, sentinel) stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

def safe_artifact_record(path: Path) -> Dict[str, Any]:
    """Build a manifest entry (path, size in bytes, SHA-256) for the file at *path*.

    If hashing fails with ``PermissionError``, the entry is still returned with
    ``sha256`` left as ``None`` and the error text stored under
    ``sha256_error``; a warning line is printed. Other errors propagate.
    """
    record: Dict[str, Any] = {
        "path": str(path),
        "bytes": int(path.stat().st_size),
        "sha256": None,
        "sha256_error": None,
    }
    try:
        digest = sha256_file(path)
    except PermissionError as exc:
        record["sha256_error"] = f"PermissionError: {exc}"
        print("[CELL 04-01] WARN: locked, cannot hash now:", path)
    else:
        record["sha256"] = digest
    return record

# Close the timing/log scope opened at the top of this cell.
cell_end("CELL 04-01", t0)



[CELL 04-01] JSON IO + hashing (Timestamp-safe)
[CELL 04-01] start=2026-01-06T22:30:03
[CELL 04-01] elapsed=0.00s
[CELL 04-01] done


Start run + init report/config/manifest + meta.json append

In [3]:
# [CELL 04-02] Start run + init files + meta.json append-only

# Open the timing/log scope for this cell.
t0 = cell_start("CELL 04-02", "Start run")

# Run identity. RUN_TAG is a local timestamp with one-second resolution and is
# used as the report folder name; RUN_ID is a random hex id stored in every
# file written below.
NOTEBOOK_NAME = "04_user_split_mars"
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID = uuid.uuid4().hex

# Per-run report folder: <reports>/<notebook>/<run_tag>/.
# exist_ok=True means two runs started within the same second share a folder.
OUT_DIR = PATHS["REPORTS"] / NOTEBOOK_NAME / RUN_TAG
OUT_DIR.mkdir(parents=True, exist_ok=True)

REPORT_PATH = OUT_DIR / "report.json"
CONFIG_PATH = OUT_DIR / "config.json"
MANIFEST_PATH = OUT_DIR / "manifest.json"

# Inputs: the DuckDB file and the two views that later cells query.
DUCKDB_PATH = PATHS["DATA_INTERIM"] / "mars.duckdb"
EVENTS_VIEW = "mars_events_sessionized"
PAIRS_VIEW = "mars_pairs"

# Output folder for the split files (not per-run: re-running writes into the same folder).
OUT_SPLIT_DIR = PATHS["DATA_PROCESSED"] / "mars" / "user_splits"
OUT_SPLIT_DIR.mkdir(parents=True, exist_ok=True)

# Run configuration, persisted as config.json. The "split" section is read by
# the split cell (seed and the three fractions).
CFG = {
    "notebook": NOTEBOOK_NAME,
    "run_id": RUN_ID,
    "run_tag": RUN_TAG,
    "inputs": {"duckdb_path": str(DUCKDB_PATH), "events_view": EVENTS_VIEW, "pairs_view": PAIRS_VIEW},
    "outputs": {"user_splits_dir": str(OUT_SPLIT_DIR), "reports_out_dir": str(OUT_DIR)},
    "split": {
        "seed": 20260106,
        "train_frac": 0.80,
        "val_frac": 0.10,
        "test_frac": 0.10,
        "strategy": "random_user_split",   # shuffle is seeded, so the split is reproducible
    }
}
write_json_atomic(CONFIG_PATH, CFG)

# Report skeleton; the final cell re-reads this file and fills in the empty sections.
report = {
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "repo_root": str(REPO_ROOT),
    "metrics": {},
    "key_findings": [],
    "sanity_samples": {},
    "data_fingerprints": {},
    "notes": [],
}
write_json_atomic(REPORT_PATH, report)

# Manifest skeleton; artifact records are appended by the final cell.
manifest = {"run_id": RUN_ID, "notebook": NOTEBOOK_NAME, "run_tag": RUN_TAG, "artifacts": []}
write_json_atomic(MANIFEST_PATH, manifest)

# meta.json append-only: create the registry if it is missing, then read it,
# append this run's entry and write the whole file back.
META_PATH = PATHS["META_REGISTRY"]
if not META_PATH.exists():
    write_json_atomic(META_PATH, {"schema_version": 1, "runs": []})
meta = read_json(META_PATH)
meta["runs"].append({
    "run_id": RUN_ID,
    "notebook": NOTEBOOK_NAME,
    "run_tag": RUN_TAG,
    "out_dir": str(OUT_DIR),
    "created_at": datetime.now().isoformat(timespec="seconds"),
})
write_json_atomic(META_PATH, meta)

cell_end("CELL 04-02", t0, out_dir=str(OUT_DIR))



[CELL 04-02] Start run
[CELL 04-02] start=2026-01-06T22:30:25
[CELL 04-02] out_dir=C:\anonymous-users-mooc-session-meta\reports\04_user_split_mars\20260106_223025
[CELL 04-02] elapsed=0.01s
[CELL 04-02] done


Load user stats from DuckDB (events + pairs)

In [4]:
# [CELL 04-03] Load user stats (events + sessions + pairs)

# Open the timing/log scope for this cell.
t0 = cell_start("CELL 04-03", "Load user stats from DuckDB")

import duckdb

# Fail early with an actionable message if the database file is absent.
if not Path(DUCKDB_PATH).exists():
    raise RuntimeError(f"Missing DuckDB: {DUCKDB_PATH}. Run Notebooks 01-03 first.")

# Read-only connection. It is intentionally left open: the export and
# leakage-check cells below reuse `con`.
con = duckdb.connect(str(DUCKDB_PATH), read_only=True)

# basic existence checks: any failure querying a view is reported as "missing view",
# with the original error chained as the cause.
for view in [EVENTS_VIEW, PAIRS_VIEW]:
    try:
        con.execute(f"SELECT COUNT(*) FROM {view}").fetchone()
    except Exception as e:
        raise RuntimeError(f"Missing view {view}. Ensure Notebook 02/03 completed.") from e

# Per-user aggregates over the events view: event count, distinct session count,
# and pair count (0 for users absent from the pairs view, via LEFT JOIN + COALESCE).
# ORDER BY user_id fixes the row order; the split cell shuffles this order with a
# seeded RNG, so a stable order here is part of what makes the split reproducible.
user_stats = con.execute(f"""
SELECT
  e.user_id AS user_id,
  COUNT(*) AS n_events,
  COUNT(DISTINCT e.session_id) AS n_sessions,
  COALESCE(p.n_pairs, 0) AS n_pairs
FROM {EVENTS_VIEW} e
LEFT JOIN (
  SELECT user_id, COUNT(*) AS n_pairs
  FROM {PAIRS_VIEW}
  GROUP BY 1
) p
ON e.user_id = p.user_id
GROUP BY 1, p.n_pairs
ORDER BY user_id
""").fetchdf()

# An empty result cannot be split; stop here.
if user_stats.shape[0] == 0:
    raise RuntimeError("No users found in events view. Cannot split.")

print("[CELL 04-03] user_stats shape:", user_stats.shape)
print("[CELL 04-03] head(5):")
print(user_stats.head(5).to_string(index=False))

# quick quantiles for report
def q(series: pd.Series) -> Dict[str, float]:
    """Summarize a numeric series as min / p50 / p90 / p95 / p99 / max / mean.

    The series is cast to float first; every value in the returned dict is a
    plain Python ``float``.
    """
    values = series.astype(float)
    summary: Dict[str, float] = {"min": float(values.min())}
    for label, level in (("p50", 0.50), ("p90", 0.90), ("p95", 0.95), ("p99", 0.99)):
        summary[label] = float(values.quantile(level))
    summary["max"] = float(values.max())
    summary["mean"] = float(values.mean())
    return summary

# Distribution summary of per-user events / sessions / pairs; stored in the
# run report by the final cell.
stats_summary = {"n_users": len(user_stats)}
stats_summary.update(
    (label, q(user_stats[column]))
    for label, column in (("events", "n_events"), ("sessions", "n_sessions"), ("pairs", "n_pairs"))
)
print("[CELL 04-03] stats_summary:", stats_summary)

cell_end("CELL 04-03", t0, n_users=len(user_stats))



[CELL 04-03] Load user stats from DuckDB
[CELL 04-03] start=2026-01-06T22:30:42
[CELL 04-03] user_stats shape: (822, 4)
[CELL 04-03] head(5):
 user_id  n_events  n_sessions  n_pairs
     672         3           1        2
     856         1           1        0
    3928         1           1        0
    4160         1           1        0
    4448         2           1        1
[CELL 04-03] stats_summary: {'n_users': 822, 'events': {'min': 1.0, 'p50': 2.0, 'p90': 8.0, 'p95': 16.0, 'p99': 59.6899999999996, 'max': 134.0, 'mean': 4.451338199513382}, 'sessions': {'min': 1.0, 'p50': 1.0, 'p90': 3.0, 'p95': 4.0, 'p99': 8.789999999999964, 'max': 39.0, 'mean': 1.608272506082725}, 'pairs': {'min': 0.0, 'p50': 0.0, 'p90': 5.899999999999977, 'p95': 11.949999999999932, 'p99': 49.789999999999964, 'max': 123.0, 'mean': 2.843065693430657}}
[CELL 04-03] n_users=822
[CELL 04-03] elapsed=0.15s
[CELL 04-03] done


Deterministic user split (train/val/test disjoint)

In [5]:
# [CELL 04-04] Deterministic user split (disjoint users)

t0 = cell_start("CELL 04-04", "Create deterministic user split")

# Split parameters come from the run config.
split_cfg = CFG["split"]
seed = int(split_cfg["seed"])
train_frac, val_frac, test_frac = (
    float(split_cfg[key]) for key in ("train_frac", "val_frac", "test_frac")
)

if not np.isclose(train_frac + val_frac + test_frac, 1.0):
    raise RuntimeError("train/val/test fractions must sum to 1.0")

# Shuffle user ids (as strings) with a seeded generator: same input order and
# same seed give the same permutation.
users = user_stats["user_id"].astype(str).tolist()
rng = np.random.default_rng(seed)
perm = rng.permutation(len(users))
users_shuf = [users[position] for position in perm]

# Train and val sizes are rounded and clamped; test takes whatever remains,
# so the three sizes always add up to n.
n = len(users_shuf)
n_train = min(int(round(n * train_frac)), n)
n_val = min(int(round(n * val_frac)), n - n_train)
n_test = n - n_train - n_val

val_stop = n_train + n_val
train_users = users_shuf[:n_train]
val_users = users_shuf[n_train:val_stop]
test_users = users_shuf[val_stop:]

# Sanity checks: pairwise disjoint, and together covering every user exactly once.
a, b, c = set(train_users), set(val_users), set(test_users)
if any(left & right for left, right in ((a, b), (a, c), (b, c))):
    raise RuntimeError("User split overlap detected (should be impossible).")
if len(a) + len(b) + len(c) != n:
    raise RuntimeError("User split sizes do not sum to total users.")

print("[CELL 04-04] n_users_total:", n)
print("[CELL 04-04] n_train:", len(train_users))
print("[CELL 04-04] n_val:", len(val_users))
print("[CELL 04-04] n_test:", len(test_users))

cell_end("CELL 04-04", t0, n_train=len(train_users), n_val=len(val_users), n_test=len(test_users))



[CELL 04-04] Create deterministic user split
[CELL 04-04] start=2026-01-06T22:31:03
[CELL 04-04] n_users_total: 822
[CELL 04-04] n_train: 658
[CELL 04-04] n_val: 82
[CELL 04-04] n_test: 82
[CELL 04-04] n_train=658
[CELL 04-04] n_val=82
[CELL 04-04] n_test=82
[CELL 04-04] elapsed=0.00s
[CELL 04-04] done


Save split files + split map parquet

In [6]:
# [CELL 04-05] Save split files + split map

t0 = cell_start("CELL 04-05", "Write user split artifacts")

train_path = OUT_SPLIT_DIR / "users_train.json"
val_path = OUT_SPLIT_DIR / "users_val.json"
test_path = OUT_SPLIT_DIR / "users_test.json"
split_map_path = OUT_SPLIT_DIR / "user_split_map.parquet"

# One JSON list of user ids per split.
for target, members in ((train_path, train_users), (val_path, val_users), (test_path, test_users)):
    write_json_atomic(target, members)

# Long-format lookup table: one row per user with its split label,
# in train -> val -> test order.
labelled = (("train", train_users), ("val", val_users), ("test", test_users))
split_map = pd.DataFrame({
    "user_id": [uid for _, members in labelled for uid in members],
    "split": [label for label, members in labelled for _ in members],
})
split_map.to_parquet(split_map_path, index=False, compression="zstd")

for written in (train_path, val_path, test_path, split_map_path):
    print("[CELL 04-05] wrote:", written)

cell_end("CELL 04-05", t0)



[CELL 04-05] Write user split artifacts
[CELL 04-05] start=2026-01-06T22:31:22
[CELL 04-05] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\users_train.json
[CELL 04-05] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\users_val.json
[CELL 04-05] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\users_test.json
[CELL 04-05] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\user_split_map.parquet
[CELL 04-05] elapsed=0.20s
[CELL 04-05] done


Materialize split-specific Parquets (events + pairs)

In [9]:
# [CELL 04-06] Write split-specific Parquets (events + pairs)  ✅ FIXED (no f-string backslash)

t0 = cell_start("CELL 04-06", "Write split Parquets for events + pairs")

# Output locations: one Parquet file per (table, split) combination.
events_train_out, events_val_out, events_test_out = (
    OUT_SPLIT_DIR / "events_train.parquet",
    OUT_SPLIT_DIR / "events_val.parquet",
    OUT_SPLIT_DIR / "events_test.parquet",
)

pairs_train_out, pairs_val_out, pairs_test_out = (
    OUT_SPLIT_DIR / "pairs_train.parquet",
    OUT_SPLIT_DIR / "pairs_val.parquet",
    OUT_SPLIT_DIR / "pairs_test.parquet",
)

def esc_sql_str(s: str) -> str:
    """Double every single quote in *s* so it can sit inside a '...' SQL literal."""
    return "''".join(s.split("'"))

def users_values_sql(users_list: List[str]) -> str:
    """
    Build a SQL subquery exposing *users_list* as a one-column relation:
      SELECT * FROM (VALUES ('u1'),('u2'),...) AS t(user_id)
    Each id is emitted as a quoted string literal with single quotes escaped.
    For an empty list the result is a query that yields no rows:
      SELECT NULL AS user_id WHERE FALSE
    """
    if not users_list:
        return "SELECT NULL AS user_id WHERE FALSE"
    literals = [f"('{esc_sql_str(user)}')" for user in users_list]
    return "SELECT * FROM (VALUES " + ",".join(literals) + ") AS t(user_id)"

def esc_path(p: Path) -> str:
    """Return *p* as text with single quotes doubled, for use inside a quoted SQL literal."""
    path_text = str(p)
    return esc_sql_str(path_text)

# Inline relations holding the user ids of each split.
train_users_sql = users_values_sql(train_users)
val_users_sql   = users_values_sql(val_users)
test_users_sql  = users_values_sql(test_users)

# Export from DuckDB (fast, consistent).
# Each job: (source view, table alias, user-id relation, target Parquet file).
# Rows are kept when the view's user_id, cast to VARCHAR, matches a split user id.
export_jobs = (
    (EVENTS_VIEW, "e", train_users_sql, events_train_out),
    (EVENTS_VIEW, "e", val_users_sql, events_val_out),
    (EVENTS_VIEW, "e", test_users_sql, events_test_out),
    (PAIRS_VIEW, "p", train_users_sql, pairs_train_out),
    (PAIRS_VIEW, "p", val_users_sql, pairs_val_out),
    (PAIRS_VIEW, "p", test_users_sql, pairs_test_out),
)

for source_view, alias, members_sql, target in export_jobs:
    con.execute(f"""
COPY (
  SELECT {alias}.*
  FROM {source_view} {alias}
  JOIN ({members_sql}) u ON CAST({alias}.user_id AS VARCHAR) = u.user_id
) TO '{esc_path(target)}' (FORMAT PARQUET);
""")

for _, _, _, target in export_jobs:
    print("[CELL 04-06] wrote:", target)

cell_end("CELL 04-06", t0)



[CELL 04-06] Write split Parquets for events + pairs
[CELL 04-06] start=2026-01-06T22:33:15
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\events_train.parquet
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\events_val.parquet
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\events_test.parquet
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\pairs_train.parquet
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\pairs_val.parquet
[CELL 04-06] wrote: C:\anonymous-users-mooc-session-meta\data\processed\mars\user_splits\pairs_test.parquet
[CELL 04-06] elapsed=0.08s
[CELL 04-06] done


Leakage checks (users disjoint + row counts)

In [12]:
# [CELL 04-07] Leakage checks: disjoint users + counts  ✅ FIXED (reuse existing con; no new connection)

# Open the timing/log scope for this cell; t0 is handed to cell_end at the end of the cell.
t0 = cell_start("CELL 04-07", "Leakage checks (disjoint users)")

def esc_sql_str(s: str) -> str:
    """Escape *s* for a single-quoted SQL string literal by doubling each single quote."""
    return s.translate({ord("'"): "''"})

def esc_path(p: Path) -> str:
    """Convert *p* to a string and escape its single quotes for use in a quoted SQL literal."""
    raw = str(p)
    return esc_sql_str(raw)

# Ensure split_map has unique users.
# Raised explicitly instead of via `assert`: asserts are skipped when Python
# runs with -O, which would silently disable this guard, and every other check
# in this notebook reports failures as RuntimeError.
if split_map["user_id"].nunique() != split_map.shape[0]:
    raise RuntimeError("Duplicate user_id in split_map")

# Reuse existing connection `con` if it exists and is open; otherwise open read-only.
# Any failure of the probe query (closed or undefined connection) triggers a reconnect.
try:
    con.execute("SELECT 1").fetchone()
    print("[CELL 04-07] using existing DuckDB connection: con")
except Exception:
    import duckdb
    con = duckdb.connect(str(DUCKDB_PATH), read_only=True)
    print("[CELL 04-07] opened DuckDB connection: con (read_only=True)")

ev_tr = esc_path(events_train_out)
ev_va = esc_path(events_val_out)
ev_te = esc_path(events_test_out)

pr_tr = esc_path(pairs_train_out)
pr_va = esc_path(pairs_val_out)
pr_te = esc_path(pairs_test_out)


def _user_overlaps(path_tr: str, path_va: str, path_te: str) -> Dict[str, int]:
    """Count user_ids shared between each pair of split Parquet files.

    Arguments are already-escaped file paths. Returns a dict with the keys
    over_tr_va, over_tr_te and over_va_te (plain ints).
    """
    # Compute overlaps directly from Parquet (no CREATE VIEW needed)
    row = con.execute(f"""
WITH
tr AS (SELECT DISTINCT CAST(user_id AS VARCHAR) AS user_id FROM read_parquet('{path_tr}')),
va AS (SELECT DISTINCT CAST(user_id AS VARCHAR) AS user_id FROM read_parquet('{path_va}')),
te AS (SELECT DISTINCT CAST(user_id AS VARCHAR) AS user_id FROM read_parquet('{path_te}'))
SELECT
  (SELECT COUNT(*) FROM tr JOIN va USING(user_id)) AS over_tr_va,
  (SELECT COUNT(*) FROM tr JOIN te USING(user_id)) AS over_tr_te,
  (SELECT COUNT(*) FROM va JOIN te USING(user_id)) AS over_va_te
""").fetchdf().iloc[0].to_dict()
    return {key: int(value) for key, value in row.items()}


# Events files.
overlaps = _user_overlaps(ev_tr, ev_va, ev_te)
over_tr_va = overlaps["over_tr_va"]
over_tr_te = overlaps["over_tr_te"]
over_va_te = overlaps["over_va_te"]

print("[CELL 04-07] overlap train-val:", over_tr_va)
print("[CELL 04-07] overlap train-test:", over_tr_te)
print("[CELL 04-07] overlap val-test:", over_va_te)

# Pairs files are exported alongside the events files, so verify them as well.
pairs_overlaps = _user_overlaps(pr_tr, pr_va, pr_te)
print("[CELL 04-07] pairs overlaps:", pairs_overlaps)

if any(overlaps.values()) or any(pairs_overlaps.values()):
    raise RuntimeError("User leakage detected across splits (should be 0).")

# Row counts
n_ev_tr = int(con.execute(f"SELECT COUNT(*) FROM read_parquet('{ev_tr}')").fetchone()[0])
n_ev_va = int(con.execute(f"SELECT COUNT(*) FROM read_parquet('{ev_va}')").fetchone()[0])
n_ev_te = int(con.execute(f"SELECT COUNT(*) FROM read_parquet('{ev_te}')").fetchone()[0])

print("[CELL 04-07] events counts:", {"train": n_ev_tr, "val": n_ev_va, "test": n_ev_te})

cell_end("CELL 04-07", t0, n_ev_tr=n_ev_tr, n_ev_va=n_ev_va, n_ev_te=n_ev_te)



[CELL 04-07] Leakage checks (disjoint users)
[CELL 04-07] start=2026-01-06T22:35:32
[CELL 04-07] using existing DuckDB connection: con
[CELL 04-07] overlap train-val: 0
[CELL 04-07] overlap train-test: 0
[CELL 04-07] overlap val-test: 0
[CELL 04-07] events counts: {'train': 3016, 'val': 300, 'test': 343}
[CELL 04-07] n_ev_tr=3016
[CELL 04-07] n_ev_va=300
[CELL 04-07] n_ev_te=343
[CELL 04-07] elapsed=0.04s
[CELL 04-07] done


Register DuckDB views for split datasets + close connections

In [13]:
# [CELL 04-08] Register DuckDB views for split datasets

# Open the timing/log scope for this cell.
t0 = cell_start("CELL 04-08", "Register split views in DuckDB")

# Close the read-only con from earlier (avoid lock).
# Best effort: any error from close() (including `con` not being defined) is
# deliberately ignored.
try:
    con.close()
except Exception:
    pass

# Reopen the same database read-write so the DROP/CREATE VIEW statements later in
# this cell can modify it.
import duckdb
conw = duckdb.connect(str(DUCKDB_PATH), read_only=False)

def esc(p: Path) -> str:
    """Return *p* as text with each single quote doubled, for a quoted SQL literal."""
    return "''".join(str(p).split("'"))

# View name -> Parquet file that backs it.
split_view_files = {
    "mars_events_train": events_train_out,
    "mars_events_val": events_val_out,
    "mars_events_test": events_test_out,
    "mars_pairs_train": pairs_train_out,
    "mars_pairs_val": pairs_val_out,
    "mars_pairs_test": pairs_test_out,
}

# try/finally guarantees the read-write connection is closed even when a
# statement fails. Without it a failure leaves `conw` open in the kernel, which
# can block later attempts to reopen the database (for example the read-only
# connect performed when the notebook is re-run).
try:
    # Drop and re-create stable views
    for v in split_view_files:
        conw.execute(f"DROP VIEW IF EXISTS {v};")

    for v, parquet_file in split_view_files.items():
        conw.execute(f"CREATE VIEW {v} AS SELECT * FROM read_parquet('{esc(parquet_file)}');")

    # quick check: each pairs view must be queryable; row counts are logged.
    chk = {
        "pairs_train": int(conw.execute("SELECT COUNT(*) FROM mars_pairs_train").fetchone()[0]),
        "pairs_val": int(conw.execute("SELECT COUNT(*) FROM mars_pairs_val").fetchone()[0]),
        "pairs_test": int(conw.execute("SELECT COUNT(*) FROM mars_pairs_test").fetchone()[0]),
    }
    print("[CELL 04-08] view row checks:", chk)
finally:
    conw.close()
    print("[CELL 04-08] closed DuckDB connection")

cell_end("CELL 04-08", t0)



[CELL 04-08] Register split views in DuckDB
[CELL 04-08] start=2026-01-06T22:35:51
[CELL 04-08] view row checks: {'pairs_train': 1932, 'pairs_val': 191, 'pairs_test': 214}
[CELL 04-08] closed DuckDB connection
[CELL 04-08] elapsed=0.09s
[CELL 04-08] done


Update report + manifest

In [14]:
# [CELL 04-09] Write report + manifest

t0 = cell_start("CELL 04-09", "Write report + manifest")

# Reload the skeleton files written at run start and fill them in.
report = read_json(REPORT_PATH)
manifest = read_json(MANIFEST_PATH)

# artifacts: one record (path, size, hash) per file produced by this notebook
produced_files = (
    train_path, val_path, test_path, split_map_path,
    events_train_out, events_val_out, events_test_out,
    pairs_train_out, pairs_val_out, pairs_test_out,
)
manifest["artifacts"].extend(safe_artifact_record(artifact) for artifact in produced_files)

users_by_split = {"train": train_users, "val": val_users, "test": test_users}

report["key_findings"].append("Created deterministic user-level train/val/test splits with disjoint users and exported split-specific events/pairs parquets.")

samples = report["sanity_samples"]
samples["user_stats_summary"] = stats_summary
samples["split_sizes"] = {name: len(members) for name, members in users_by_split.items()}
samples["split_users_head"] = {name: members[:5] for name, members in users_by_split.items()}

report["notes"].extend([
    "User-level split is mandatory: train/val/test users are disjoint. Item identities are not treated as cold-start.",
    "Episode eligibility (min interactions for K-shot + query) will be enforced in Notebook 05, not during splitting.",
])

write_json_atomic(REPORT_PATH, report)
write_json_atomic(MANIFEST_PATH, manifest)

for updated in (REPORT_PATH, MANIFEST_PATH):
    print("[CELL 04-09] updated:", updated)

cell_end("CELL 04-09", t0, n_artifacts=len(manifest["artifacts"]))



[CELL 04-09] Write report + manifest
[CELL 04-09] start=2026-01-06T22:36:25
[CELL 04-09] updated: C:\anonymous-users-mooc-session-meta\reports\04_user_split_mars\20260106_223025\report.json
[CELL 04-09] updated: C:\anonymous-users-mooc-session-meta\reports\04_user_split_mars\20260106_223025\manifest.json
[CELL 04-09] n_artifacts=10
[CELL 04-09] elapsed=0.06s
[CELL 04-09] done
