# Notebook 02: Sessionize MARS Dataset

**Purpose:** Create sessions from user interactions using time-based splitting.

**Input:** `data/interim/mars/interactions.parquet`

**Output:** `data/processed/mars/sessions/sessions.parquet`

**Session Definition:**
- Gap threshold: 30 minutes (same as XuetangX)
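- Consecutive interactions by the same user that are at most 30 minutes apart are grouped into the same session; a gap of more than 30 minutes (or a user's first interaction) starts a new session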

In [1]:
# [CELL 02-00] Bootstrap

import os
import sys
import json
import time
import uuid
from pathlib import Path
from datetime import datetime
from typing import Any, Dict, List

import numpy as np
import pandas as pd

# Capture when the bootstrap cell began and log it at one-second resolution.
t0 = datetime.now()
start_iso = t0.isoformat(timespec="seconds")
print(f"[CELL 02-00] start={start_iso}")

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by searching upward for PROJECT_STATE.md.

    Args:
        start: Directory to begin the search from; it is resolved first.

    Returns:
        The nearest directory (``start`` itself or one of its ancestors)
        that contains a ``PROJECT_STATE.md`` entry.

    Raises:
        RuntimeError: If no directory on the path to the filesystem root
            contains ``PROJECT_STATE.md``.
    """
    origin = start.resolve()
    # Nearest directory first: the origin itself, then each ancestor in turn.
    candidates = (origin, *origin.parents)
    root = next(
        (folder for folder in candidates if (folder / "PROJECT_STATE.md").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Could not find PROJECT_STATE.md")
    return root

# Anchor every project path on the repository root discovered from the
# current working directory, so the notebook does not depend on where
# inside the repo it is launched.
REPO_ROOT = find_repo_root(Path.cwd())
print("[CELL 02-00] REPO_ROOT:", REPO_ROOT)

# Central registry of the project locations used by this notebook.
PATHS = {
    "META_REGISTRY": REPO_ROOT.joinpath("meta.json"),
    "DATA_INTERIM": REPO_ROOT.joinpath("data", "interim"),
    "DATA_PROCESSED": REPO_ROOT.joinpath("data", "processed"),
    "REPORTS": REPO_ROOT.joinpath("reports"),
}

def cell_start(cell_id: str, title: str, **kwargs) -> float:
    """Log a cell's opening banner and return its start time.

    Args:
        cell_id: Label printed in square brackets at the start of each line.
        title: Human-readable description printed on the banner line.
        **kwargs: Extra ``key=value`` pairs, each printed on its own line.

    Returns:
        The ``time.time()`` value taken before anything is printed; pass it
        to ``cell_end`` to report the elapsed time.
    """
    started = time.time()
    prefix = f"[{cell_id}]"
    print(f"\n{prefix} {title}")
    for key in kwargs:
        print(f"{prefix} {key}={kwargs[key]}")
    return started

def cell_end(cell_id: str, t0: float, **kwargs) -> None:
    """Log a cell's closing lines, ending with the elapsed wall-clock time.

    Args:
        cell_id: Label printed in square brackets at the start of each line.
        t0: Start time as returned by ``cell_start`` (a ``time.time()`` value).
        **kwargs: Extra ``key=value`` pairs, each printed on its own line
            before the elapsed-time line.
    """
    prefix = f"[{cell_id}]"
    for key in kwargs:
        print(f"{prefix} {key}={kwargs[key]}")
    # Measured after the key/value lines are printed, just before reporting.
    elapsed = time.time() - t0
    print(f"{prefix} elapsed={elapsed:.2f}s")

def write_json_atomic(path: Path, obj: Any, indent: int = 2) -> None:
    """Write ``obj`` as JSON to ``path`` via a temporary file and a rename.

    The JSON is first written to a uniquely named sibling file and then moved
    over ``path`` with ``Path.replace``, so ``path`` is only touched once
    serialization has fully succeeded. If anything fails along the way, the
    temporary file is removed so that failed runs do not leave stray
    ``*.tmp_<hex>`` files in the report directory.

    Args:
        path: Destination file. Missing parent directories are created.
        obj: JSON-serializable object to write.
        indent: Indentation width passed to ``json.dump``.

    Raises:
        TypeError: If ``obj`` contains values ``json.dump`` cannot serialize.
        OSError: If the temporary file cannot be written or moved into place.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_suffix(path.suffix + f".tmp_{uuid.uuid4().hex}")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=indent)
        tmp.replace(path)
    except BaseException:
        # Clean up the partial temp file; the existing target stays untouched.
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise

# Marks the end of the bootstrap cell in the log output.
print("[CELL 02-00] done")

[CELL 02-00] start=2026-01-12T22:17:51
[CELL 02-00] REPO_ROOT: C:\anonymous-users-mooc-session-meta
[CELL 02-00] done


In [2]:
# [CELL 02-01] Setup run tagging

t0 = cell_start("CELL 02-01", "Setup run tagging")

# Identity of this execution: a fixed notebook name, a random run id, and a
# timestamp tag that also names the per-run report directory.
NOTEBOOK_NAME = "02_sessionize_mars"
RUN_ID = uuid.uuid4().hex
RUN_TAG = f"{datetime.now():%Y%m%d_%H%M%S}"

# reports/<notebook>/<run_tag>/report.json
OUT_DIR = PATHS["REPORTS"].joinpath(NOTEBOOK_NAME, RUN_TAG)
OUT_DIR.mkdir(parents=True, exist_ok=True)
REPORT_PATH = OUT_DIR.joinpath("report.json")

# Report skeleton; later cells add entries under "metrics" before it is saved.
report = dict(
    run_id=RUN_ID,
    notebook=NOTEBOOK_NAME,
    run_tag=RUN_TAG,
    metrics={},
    key_findings=[],
)

cell_end("CELL 02-01", t0)


[CELL 02-01] Setup run tagging
[CELL 02-01] elapsed=0.00s


In [3]:
# [CELL 02-02] Load interactions

t0 = cell_start("CELL 02-02", "Load interactions")

INPUT_PATH = PATHS["DATA_INTERIM"] / "mars" / "interactions.parquet"
df = pd.read_parquet(INPUT_PATH)

# Fail fast on schema drift: the sessionization cell reads user_id and
# ts_epoch, and the save cell selects the remaining columns. Without this
# check a missing column only surfaces later as a KeyError that does not
# name the input file.
REQUIRED_COLUMNS = ["user_id", "item_id", "ts_epoch", "watch_percentage", "rating"]
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{INPUT_PATH} is missing required columns: {missing_columns}")

print(f"[CELL 02-02] Loaded: {INPUT_PATH}")
print(f"[CELL 02-02] Shape: {df.shape}")
print(f"[CELL 02-02] Users: {df['user_id'].nunique():,}")

cell_end("CELL 02-02", t0)


[CELL 02-02] Load interactions
[CELL 02-02] Loaded: C:\anonymous-users-mooc-session-meta\data\interim\mars\interactions.parquet
[CELL 02-02] Shape: (3655, 5)
[CELL 02-02] Users: 822
[CELL 02-02] elapsed=0.05s


In [4]:
# [CELL 02-03] Create sessions using 30-minute gap threshold

t0 = cell_start("CELL 02-03", "Create sessions")

# NOTE(review): the threshold is expressed in seconds, so ts_epoch is assumed
# to be epoch seconds as well — confirm against the interim dataset.
GAP_THRESHOLD = 30 * 60  # 30 minutes in seconds

# Put each user's interactions in chronological order so that every row can be
# compared with the same user's previous interaction.
df = df.sort_values(by=["user_id", "ts_epoch"]).reset_index(drop=True)

# Previous timestamp within the same user (NaN on each user's first row) and
# the time elapsed since it.
df["prev_ts"] = df.groupby("user_id")["ts_epoch"].shift()
df["gap"] = df["ts_epoch"].sub(df["prev_ts"])

# A session boundary is a user's first interaction (no previous timestamp)
# or a pause strictly longer than the threshold.
df["new_session"] = df["gap"].isna() | df["gap"].gt(GAP_THRESHOLD)

# A running count of boundaries yields session ids that are unique across
# users and increase with row order.
df["session_id"] = df["new_session"].cumsum()

n_sessions = df["session_id"].nunique()
print(f"[CELL 02-03] Total sessions: {n_sessions:,}")

# Distribution of interactions per session
session_sizes = df.groupby("session_id").size()
print("[CELL 02-03] Session sizes:")
print(f"  - Min: {session_sizes.min()}")
print(f"  - Max: {session_sizes.max()}")
print(f"  - Mean: {session_sizes.mean():.2f}")
print(f"  - Median: {session_sizes.median():.2f}")

report["metrics"]["n_sessions"] = int(n_sessions)
report["metrics"]["gap_threshold_minutes"] = GAP_THRESHOLD // 60

cell_end("CELL 02-03", t0)


[CELL 02-03] Create sessions
[CELL 02-03] Total sessions: 1,322
[CELL 02-03] Session sizes:
  - Min: 1
  - Max: 50
  - Mean: 2.76
  - Median: 1.00
[CELL 02-03] elapsed=0.01s


In [5]:
# [CELL 02-04] Save sessionized data

t0 = cell_start("CELL 02-04", "Save sessionized data")

# Keep only the published columns; the helper columns created during
# sessionization (prev_ts, gap, new_session) are left out of the output.
df_out = df.loc[
    :,
    ["user_id", "item_id", "ts_epoch", "session_id", "watch_percentage", "rating"],
].copy()

# Write the sessions table under data/processed/mars/sessions/
OUT_PATH = PATHS["DATA_PROCESSED"].joinpath("mars", "sessions")
OUT_PATH.mkdir(parents=True, exist_ok=True)

out_file = OUT_PATH.joinpath("sessions.parquet")
df_out.to_parquet(out_file, index=False)

print(f"[CELL 02-04] Saved: {out_file}")
print(f"[CELL 02-04] Shape: {df_out.shape}")

# Persist the run report collected so far.
write_json_atomic(REPORT_PATH, report)

# Closing banner
banner = "=" * 60
print("\n" + banner)
print("NOTEBOOK 02 COMPLETE: MARS Sessionization")
print(banner)
print(f"Sessions: {df['session_id'].nunique():,}")
print(f"Output: {out_file}")

cell_end("CELL 02-04", t0)


[CELL 02-04] Save sessionized data
[CELL 02-04] Saved: C:\anonymous-users-mooc-session-meta\data\processed\mars\sessions\sessions.parquet
[CELL 02-04] Shape: (3655, 6)

NOTEBOOK 02 COMPLETE: MARS Sessionization
Sessions: 1,322
Output: C:\anonymous-users-mooc-session-meta\data\processed\mars\sessions\sessions.parquet
[CELL 02-04] elapsed=0.01s
