Bootstrap: locate repo root (Windows-safe) + basic env info

In [1]:
# [CELL 05A-00] Bootstrap: locate repo root (Windows-safe) + basic env info

import os
import sys
import platform
from pathlib import Path
import pandas as pd

# Absolute working directory of the kernel, followed by a short environment
# report (interpreter version, OS platform string, pandas version).
CWD = Path.cwd().resolve()
for _label, _value in (
    ("CWD:", CWD),
    ("Python:", sys.version.split()[0]),
    ("Platform:", platform.platform()),
    ("pandas:", pd.__version__),
):
    print(_label, _value)

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that marks the repo root.

    Markers are tried in priority order: a ``PROJECT_STATE.md`` entry anywhere
    on the path from ``start`` upwards wins over a ``.git`` entry, even when
    the ``.git`` entry is closer to ``start``.

    Args:
        start: Directory to begin the upward search from.

    Returns:
        The closest directory containing the highest-priority marker found,
        or ``start`` itself when neither marker exists on the way up.
    """
    candidates = (start, *start.parents)
    for marker in ("PROJECT_STATE.md", ".git"):
        hit = next((d for d in candidates if (d / marker).exists()), None)
        if hit is not None:
            return hit
    # No marker anywhere above: fall back to the starting directory.
    return start

# Resolve the repository root from the working directory and report it.
REPO_ROOT = find_repo_root(CWD)
print(f"REPO_ROOT: {REPO_ROOT}")

# Data layout below the root: data/processed/supervised.
# The deepest directory is created (with parents) when it does not exist yet.
SUP_DIR = REPO_ROOT / "data" / "processed" / "supervised"
PROC_DIR = SUP_DIR.parent
DATA_DIR = PROC_DIR.parent
SUP_DIR.mkdir(parents=True, exist_ok=True)

print(f"DATA_DIR: {DATA_DIR}")
print(f"SUP_DIR: {SUP_DIR}")


CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks
Python: 3.11.14
Platform: Windows-10-10.0.22621-SP0
pandas: 2.3.3
REPO_ROOT: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
DATA_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data
SUP_DIR: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised


Config: input prefix-samples file + derive RUN_TAG + split params

In [2]:
# [CELL 05A-01] Config: input prefix-samples file + derive RUN_TAG + split params

from pathlib import Path
import re
import numpy as np

# Input: the prefix-samples parquet to split (file name pinned by hand).
IN_PREFIX = SUP_DIR / "target_prefix_samples_20251229_163357.parquet"

# Fail early with the absolute path when the input is not on disk.
if not IN_PREFIX.exists():
    raise FileNotFoundError(f"Missing IN_PREFIX: {IN_PREFIX.resolve()}")

# RUN_TAG is the trailing YYYYMMDD_HHMMSS stamp of the input file stem;
# when the stem does not end in such a stamp the literal "run" is used.
m = re.search(r"target_prefix_samples_(\d{8}_\d{6})$", IN_PREFIX.stem)
if m:
    RUN_TAG = m.group(1)
else:
    RUN_TAG = "run"
print("IN_PREFIX:", IN_PREFIX.resolve())
print("RUN_TAG:", RUN_TAG)

# Split ratios (applied per session so a session never straddles two splits).
SPLIT = dict(train=0.80, val=0.10, test=0.10)
assert abs(sum(SPLIT.values()) - 1.0) < 1e-9

# Seed mixed into the per-session hash that drives the split assignment.
SEED = 20251229

# Output files: one parquet per split, tagged with the same RUN_TAG as the input.
OUT_TRAIN, OUT_VAL, OUT_TEST = (
    SUP_DIR / f"target_prefix_{_part}_{RUN_TAG}.parquet"
    for _part in ("train", "val", "test")
)

for _label, _out_path in (
    ("OUT_TRAIN:", OUT_TRAIN),
    ("OUT_VAL:", OUT_VAL),
    ("OUT_TEST:", OUT_TEST),
):
    print(_label, _out_path.name)


IN_PREFIX: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised\target_prefix_samples_20251229_163357.parquet
RUN_TAG: 20251229_163357
OUT_TRAIN: target_prefix_train_20251229_163357.parquet
OUT_VAL: target_prefix_val_20251229_163357.parquet
OUT_TEST: target_prefix_test_20251229_163357.parquet


Load prefix samples + schema checks + quick profiling  (CHECKPOINT 1)

In [4]:
# [CELL 05A-02] Load prefix samples + schema checks + quick profiling  (CHECKPOINT 1) — FIXED

import time
import pandas as pd

# Time the whole load + profiling step (reported at the end of the cell).
t0 = time.time()
df = pd.read_parquet(IN_PREFIX)
print("Loaded:", df.shape)
print("Columns:", df.columns.tolist())

# Required columns (minimum viable for split + training later).
# Abort immediately if any is absent rather than failing later with a KeyError.
required = [
    "domain", "user_id", "session_id",
    "prefix_items", "label_item", "prefix_len"
]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Optional columns we may have; only their presence is reported here.
optional_cols = ["session_length", "t", "start_ts", "end_ts"]
print("\nOptional present:", {c: (c in df.columns) for c in optional_cols})

# Parse timestamps if present (for duration logs).
# errors="coerce" turns unparseable values into NaT instead of raising.
if "start_ts" in df.columns and "end_ts" in df.columns:
    df["start_ts"] = pd.to_datetime(df["start_ts"], utc=True, errors="coerce")
    df["end_ts"]   = pd.to_datetime(df["end_ts"], utc=True, errors="coerce")
    # Number of NaT cells across both columns (a row with two NaT counts twice).
    bad = int(df["start_ts"].isna().sum() + df["end_ts"].isna().sum())
    if bad:
        print(f"⚠️  Warning: {bad} bad timestamps in start_ts/end_ts (NaT).")
    # Always derive the duration column, even when some timestamps are NaT:
    # rows with a missing endpoint simply get NaN. This keeps the frame's schema
    # independent of data quality, and the quantiles below are still reported
    # over the rows that do have both endpoints.
    df["session_duration_sec"] = (df["end_ts"] - df["start_ts"]).dt.total_seconds()
    dur = df["session_duration_sec"].dropna()
    if len(dur):
        print("\nSession duration (sec) quantiles:",
              dur.quantile([0.5, 0.9, 0.99]).to_dict())

# High-level stats
n_rows = len(df)
n_sessions = df["session_id"].nunique()
n_users = df["user_id"].nunique()
n_items_label = df["label_item"].nunique()

print("\nStats:")
print("  rows:", f"{n_rows:,}")
print("  sessions:", f"{n_sessions:,}")
print("  users:", f"{n_users:,}")
print("  unique label items:", f"{n_items_label:,}")

# Domain distribution (dropna=False so a missing domain shows up as its own bucket)
print("\nDomain counts:", df["domain"].value_counts(dropna=False).to_dict())

# Extra debug: peek at first 3 rows, restricted to the columns that exist
peek_cols = [c for c in ["domain","user_id","session_id","t","prefix_len","label_item","start_ts","end_ts"] if c in df.columns]
print("\nHead (selected cols):")
print(df[peek_cols].head(3))

print("\nLoad+checks seconds:", round(time.time() - t0, 2))

print("\nCHECKPOINT 1 ✅")
print("Please paste the output of this cell (Loaded/Columns/Stats/Domain counts + Optional present).")


Loaded: (2333, 9)
Columns: ['domain', 'user_id', 'session_id', 't', 'prefix_items', 'prefix_len', 'label_item', 'start_ts', 'end_ts']

Optional present: {'session_length': False, 't': True, 'start_ts': True, 'end_ts': True}

Session duration (sec) quantiles: {0.5: 1814.0, 0.9: 6018.0, 0.99: 13098.0}

Stats:
  rows: 2,333
  sessions: 561
  users: 378
  unique label items: 704

Domain counts: {'target': 2333}

Head (selected cols):
   domain  user_id  session_id  t  prefix_len  label_item  \
0  target   104074  t_104074_2  1           1       52616   
1  target   104074  t_104074_2  2           2       52615   
2  target   104074  t_104074_2  3           3       52610   

                   start_ts                    end_ts  
0 2019-07-22 15:27:08+00:00 2019-07-22 15:52:02+00:00  
1 2019-07-22 15:27:08+00:00 2019-07-22 15:52:02+00:00  
2 2019-07-22 15:27:08+00:00 2019-07-22 15:52:02+00:00  

Load+checks seconds: 0.17

CHECKPOINT 1 ✅
Please paste the output of this cell (Loaded/Columns/S

Deterministic session-level split assignment (no leakage)

In [5]:
# [CELL 05A-03] Deterministic session-level split assignment (no leakage)

import hashlib

# The split unit is the session: every prefix row of a session must land in
# the same split, so assignment is done once per distinct session_id (as str).
_session_ids_as_str = df["session_id"].astype(str)
sessions = _session_ids_as_str.unique()
print(f"Unique sessions: {len(sessions)}")

# Deterministic hash -> uniform [0,1)
def session_u01(s: str, seed=None) -> float:
    """Map a session id to a reproducible pseudo-uniform value in ``[0, 1)``.

    The value is derived from the MD5 digest of ``"<seed>||<s>"``, so the same
    (seed, session id) pair always yields the same number across runs and
    machines.

    Args:
        s: Session identifier (must be a ``str``).
        seed: Value mixed into the hash. Defaults to the notebook-level
            ``SEED`` when omitted, which preserves the original call style
            ``session_u01(s)``.

    Returns:
        A float ``u`` with ``0.0 <= u < 1.0``.
    """
    if seed is None:
        seed = SEED
    digest = hashlib.md5((str(seed) + "||" + s).encode("utf-8")).hexdigest()
    # First 8 hex chars -> 32-bit int in [0, 2**32 - 1].
    v = int(digest[:8], 16)
    # Divide by 2**32 (not 2**32 - 1) so the upper bound is exclusive; dividing
    # by 0xFFFFFFFF would return exactly 1.0 for the all-ones prefix.
    return v / 2**32

# One hash-derived value per distinct session, in the same order as `sessions`.
u = np.array([session_u01(sid) for sid in sessions], dtype=np.float64)

# Cumulative thresholds: below train_cut -> train, below val_cut -> val, rest -> test.
train_cut = SPLIT["train"]
val_cut = train_cut + SPLIT["val"]

# session_id -> split name
sess_split = {
    sid: ("train" if r < train_cut else "val" if r < val_cut else "test")
    for sid, r in zip(sessions, u)
}

# Broadcast the per-session decision to every prefix row of that session.
_row_session_ids = df["session_id"].astype(str)
df["split"] = _row_session_ids.map(sess_split)

# Every row's session was in `sessions`, so an unmapped row signals a bug.
null_split = df["split"].isna().sum()
if null_split:
    raise ValueError(f"Found {null_split} rows with NULL split (unexpected).")

_rows_per_split = df["split"].value_counts().to_dict()
print("Split counts (rows):", _rows_per_split)

# One representative row per session gives the session-level counts.
_one_row_per_session = df.drop_duplicates("session_id")
_sessions_per_split = _one_row_per_session["split"].value_counts().to_dict()
print("Split counts (sessions):", _sessions_per_split)


Unique sessions: 561
Split counts (rows): {'train': 1944, 'test': 200, 'val': 189}
Split counts (sessions): {'train': 446, 'val': 60, 'test': 55}


Split QA: overlaps, per-split stats, label coverage  (CHECKPOINT 2)

In [7]:
# [CELL 05A-04] Split QA: overlaps, per-split stats, label coverage  (CHECKPOINT 2) — FIXED

import pandas as pd

def split_stats(name: str, part: pd.DataFrame):
    """Summarise one split as a plain dict suitable for printing.

    Args:
        name: Split label; accepted for call-site readability, not used in
            the computation.
        part: Rows of the split. Must contain ``session_id``, ``user_id``,
            ``label_item``, ``domain`` and ``prefix_len``; ``start_ts`` and
            ``end_ts`` are optional.

    Returns:
        Dict with row/session/user/label counts, per-domain row counts and
        mean/max ``prefix_len`` (both reported as 0 for an empty split). When
        both timestamp columns exist and at least one row has a valid
        duration, the median and 90th-percentile duration in seconds are
        added as well.
    """
    non_empty = bool(len(part))

    stats = {}
    stats["rows"] = len(part)
    stats["sessions"] = part["session_id"].nunique()
    stats["users"] = part["user_id"].nunique()
    stats["label_items"] = part["label_item"].nunique()
    stats["domain_counts"] = part["domain"].value_counts().to_dict()
    stats["avg_prefix_len"] = float(part["prefix_len"].mean()) if non_empty else 0.0
    stats["max_prefix_len"] = int(part["prefix_len"].max()) if non_empty else 0

    # Duration figures need both endpoints; without them the summary is done.
    if "start_ts" not in part.columns or "end_ts" not in part.columns:
        return stats

    # Coerce to UTC datetimes; unparseable values become NaT and are dropped
    # from the duration sample below.
    started = pd.to_datetime(part["start_ts"], utc=True, errors="coerce")
    ended = pd.to_datetime(part["end_ts"], utc=True, errors="coerce")
    seconds = (ended - started).dt.total_seconds().dropna()
    if len(seconds) > 0:
        stats["median_session_duration_sec"] = float(seconds.median())
        stats["p90_session_duration_sec"] = float(seconds.quantile(0.9))
    return stats

# Materialise each split as an independent copy of the matching rows.
train_df, val_df, test_df = (
    df[df["split"] == _split_name].copy()
    for _split_name in ("train", "val", "test")
)

for _title, _key, _frame in (
    ("Train:", "train", train_df),
    ("Val:", "val", val_df),
    ("Test:", "test", test_df),
):
    print(_title, split_stats(_key, _frame))

# No leakage check: the session-id sets of the three splits must be disjoint.
s_tr, s_va, s_te = (
    set(_frame_i["session_id"].astype(str).unique())
    for _frame_i in (train_df, val_df, test_df)
)
_shared_sessions = {
    "train∩val": s_tr & s_va,
    "train∩test": s_tr & s_te,
    "val∩test": s_va & s_te,
}

print("\nLeakage checks (sessions):")
for _pair, _shared in _shared_sessions.items():
    print(f"  {_pair}:", len(_shared))

# Report all three overlaps first, then abort if any of them is non-empty.
if any(_shared_sessions.values()):
    raise AssertionError("Session leakage detected: splits share session_id(s).")

# Label coverage gaps: labels that occur in val/test but never in train.
train_labels, val_labels, test_labels = (
    set(_frame_i["label_item"].astype(str).unique())
    for _frame_i in (train_df, val_df, test_df)
)

print("\nLabel coverage:")
print("  train labels:", len(train_labels))
for _name, _labels in (("val", val_labels), ("test", test_labels)):
    print(f"  {_name} labels:", len(_labels), "| not in train:", len(_labels - train_labels))

print("\nCHECKPOINT 2 ✅")
print("Please paste the output of this cell (stats + leakage checks + label coverage).")


Train: {'rows': 1944, 'sessions': 446, 'users': 307, 'label_items': 650, 'domain_counts': {'target': 1944}, 'avg_prefix_len': 6.370884773662551, 'max_prefix_len': 20, 'median_session_duration_sec': 1785.0, 'p90_session_duration_sec': 5873.0}
Val: {'rows': 189, 'sessions': 60, 'users': 58, 'label_items': 153, 'domain_counts': {'target': 189}, 'avg_prefix_len': 5.243386243386244, 'max_prefix_len': 20, 'median_session_duration_sec': 2146.0, 'p90_session_duration_sec': 6448.0}
Test: {'rows': 200, 'sessions': 55, 'users': 51, 'label_items': 158, 'domain_counts': {'target': 200}, 'avg_prefix_len': 7.355, 'max_prefix_len': 20, 'median_session_duration_sec': 1934.0, 'p90_session_duration_sec': 13098.0}

Leakage checks (sessions):
  train∩val: 0
  train∩test: 0
  val∩test: 0

Label coverage:
  train labels: 650
  val labels: 153 | not in train: 23
  test labels: 158 | not in train: 31

CHECKPOINT 2 ✅
Please paste the output of this cell (stats + leakage checks + label coverage).


Write split parquets (train/val/test) + final confirmations

In [8]:
# [CELL 05A-05] Write split parquets (train/val/test) + final confirmations

# Drop helper column before writing if you want; I keep it because it helps debugging.
# If you prefer clean schema, uncomment the next 3 lines.
# train_df = train_df.drop(columns=["split"])
# val_df   = val_df.drop(columns=["split"])
# test_df  = test_df.drop(columns=["split"])

# Persist each split; index=False keeps the row index out of the files.
train_df.to_parquet(OUT_TRAIN, index=False)
val_df.to_parquet(OUT_VAL, index=False)
test_df.to_parquet(OUT_TEST, index=False)

print("Wrote:")
print(" ", OUT_TRAIN.resolve())
print(" ", OUT_VAL.resolve())
print(" ", OUT_TEST.resolve())

# Confirm readable and counts match: re-read every file and compare it with
# the in-memory frame it was written from.
rt = pd.read_parquet(OUT_TRAIN)
rv = pd.read_parquet(OUT_VAL)
rs = pd.read_parquet(OUT_TEST)

print("\nRe-load counts:")
for _name, _written, _reloaded in (
    ("train", train_df, rt),
    ("val", val_df, rv),
    ("test", test_df, rs),
):
    _n_sessions = _reloaded["session_id"].nunique()
    print(f"  {_name}:", _reloaded.shape, "sessions:", _n_sessions)
    # Printing alone would let a truncated or stale file pass unnoticed, so
    # fail loudly when the round trip changed the shape or the session count.
    if _reloaded.shape != _written.shape:
        raise ValueError(
            f"Re-loaded {_name} split has shape {_reloaded.shape}, "
            f"expected {_written.shape}."
        )
    if _n_sessions != _written["session_id"].nunique():
        raise ValueError(
            f"Re-loaded {_name} split has {_n_sessions} sessions, "
            f"expected {_written['session_id'].nunique()}."
        )

print("\nDone ✅ 05A split created.")


Wrote:
  D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised\target_prefix_train_20251229_163357.parquet
  D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised\target_prefix_val_20251229_163357.parquet
  D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised\target_prefix_test_20251229_163357.parquet

Re-load counts:
  train: (1944, 11) sessions: 446
  val: (189, 11) sessions: 60
  test: (200, 11) sessions: 55

Done ✅ 05A split created.
