# Sessionize (target + source) and build prefix→label samples (target)
**Repo:** `mooc-coldstart-session-meta`  
**Strict order:** this notebook must run after 04.  
**Decisions (locked):** `target_gap = 30m (1800s)`, `source_gap = 10m (600s)`.

This notebook:
1) Validates `session_gap_thresholds.json` matches the locked decisions.  
2) Sessionizes **target** (MARS explicit-only variant) and writes sessionized events + session sequences.  
3) Builds **target** supervised samples (prefix → next-item label).  
4) Sessionizes **source** (XuetangX) using DuckDB (scales to large data) and writes sessionized events.

> Guardrail: **no toy/synthetic data**; everything is computed from the real parquet inputs.

In [1]:
# [CELL 05-00] Imports + versions

import os
import sys
import json
import time
import math
import hashlib
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import duckdb
import pyarrow as pa
import pyarrow.parquet as pq

# Record the interpreter and key library versions alongside the run output.
for _lib_name, _lib_version in (
    ("python", sys.version),
    ("pandas", pd.__version__),
    ("duckdb", duckdb.__version__),
    ("pyarrow", pa.__version__),
):
    print(f"{_lib_name}:", _lib_version)

python: 3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]
pandas: 2.3.3
duckdb: 1.4.3
pyarrow: 22.0.0


In [2]:
# [CELL 05-01] Bootstrap: locate repo root reliably (Windows-safe)

from pathlib import Path

# Absolute, symlink-resolved working directory; the upward search for the
# repository root starts from here.
CWD = Path(os.getcwd()).resolve()
print("Initial CWD:", CWD)

def find_repo_root(start: Path) -> Path:
    """Walk upward from ``start`` and return the first directory holding a repo marker.

    Markers are tried in priority order: ``start`` and all of its ancestors are
    checked for ``PROJECT_STATE.md`` first; only if none of them has it are the
    same directories checked for ``.git``. The match is reported on stdout.

    Raises:
        FileNotFoundError: neither marker exists in ``start`` or any ancestor.
    """
    candidates = (start, *start.parents)
    for marker in ("PROJECT_STATE.md", ".git"):
        hit = next((d for d in candidates if (d / marker).exists()), None)
        if hit is not None:
            print(f"  Found {marker} in: {hit}")
            return hit
    raise FileNotFoundError("Could not locate repo root (PROJECT_STATE.md or .git).")

# Every path below is anchored on the detected repository root.
REPO_ROOT = find_repo_root(CWD)
print("REPO_ROOT:", REPO_ROOT)

# Inputs: normalized events for both domains + the session-gap thresholds file.
DATA_DIR = REPO_ROOT / "data"
PROC_DIR = DATA_DIR / "processed"
NORM_DIR = PROC_DIR / "normalized_events"
IN_TARGET, IN_SOURCE, IN_THRESH = (
    NORM_DIR / fname
    for fname in (
        "events_target_norm.parquet",
        "events_source_norm.parquet",
        "session_gap_thresholds.json",
    )
)

# Outputs: created up front so later cells can write without checking.
OUT_SESS_DIR = PROC_DIR / "sessionized"
OUT_SUP_DIR = PROC_DIR / "supervised"
for _out_dir in (OUT_SESS_DIR, OUT_SUP_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Timestamp suffix shared by every artifact written in this run.
RUN_TAG = f"{datetime.now():%Y%m%d_%H%M%S}"
print("RUN_TAG:", RUN_TAG)

for _label, _in_path in (
    ("IN_TARGET", IN_TARGET),
    ("IN_SOURCE", IN_SOURCE),
    ("IN_THRESH", IN_THRESH),
):
    print(f"{_label}:", _in_path)

Initial CWD: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\notebooks
  Found PROJECT_STATE.md in: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
REPO_ROOT: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta
RUN_TAG: 20251229_163357
IN_TARGET: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\events_target_norm.parquet
IN_SOURCE: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\events_source_norm.parquet
IN_THRESH: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\session_gap_thresholds.json


In [3]:
# [CELL 05-02] Validate inputs exist + validate locked thresholds

# Fail fast on the first missing input (checked in this order).
for required_input in (IN_TARGET, IN_SOURCE, IN_THRESH):
    if required_input.exists():
        continue
    raise FileNotFoundError(f"Missing required input: {required_input}")

with IN_THRESH.open(encoding="utf-8") as fh:
    thresholds = json.load(fh)
print("Loaded thresholds file:", IN_THRESH)
# Echo at most the first 1000 characters of the pretty-printed file.
print(json.dumps(thresholds, indent=2)[:1000])

# Locked decisions for this project step:
LOCK_TARGET_S = 1800  # 30m
LOCK_SOURCE_S = 600   # 10m

t = thresholds.get("target", {})
s = thresholds.get("source", {})

# A missing section or key yields -1, which can never equal a locked value.
t_s, s_s = (int(section.get("primary_threshold_seconds", -1)) for section in (t, s))

if (t_s, s_s) != (LOCK_TARGET_S, LOCK_SOURCE_S):
    raise ValueError(
        "session_gap_thresholds.json does NOT match locked decisions for Notebook 05.\n"
        f"Expected target={LOCK_TARGET_S}s, source={LOCK_SOURCE_S}s; got target={t_s}s, source={s_s}s.\n"
        "Fix: update data/processed/normalized_events/session_gap_thresholds.json then re-run."
    )

print("✅ Thresholds validated:", {"target_s": t_s, "source_s": s_s})

Loaded thresholds file: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\session_gap_thresholds.json
{
  "generated_from_run_tag": "20251229_154018",
  "generated_at": "2025-12-29T08:20:10",
  "target": {
    "primary_threshold_seconds": 1800,
    "primary_threshold_label": "30m"
  },
  "source": {
    "primary_threshold_seconds": 600,
    "primary_threshold_label": "10m",
    "sampling": {
      "SAMPLE_MOD": 100,
      "MIN_EVENTS_PER_USER": 2,
      "sample_users": 6891,
      "sample_counts": {
        "n_events": 1698038,
        "n_users": 6891,
        "n_items": 1476,
        "min_ts": "2015-07-31T23:59:21",
        "max_ts": "2017-07-31T23:51:34"
      }
    }
  },
  "decision_notes": {
    "source_threshold_override": "Override source primary gap to 10m (600s) to reduce session explosion; coverage-based recommendation remains 5m."
  }
}
✅ Thresholds validated: {'target_s': 1800, 'source_s': 600}


In [5]:
# [CELL 05-03] Load target events (explicit-only variant) + basic checks (robust)

import time
import pandas as pd
import numpy as np

# Start the wall-clock timer reported at the end of this cell.
t0 = time.time()

# --- Load ---
df_t = pd.read_parquet(IN_TARGET)
print("Loaded target events:", df_t.shape)

# --- Normalize column names (safe) ---
# Keep the untouched names around for the error message further down.
orig_cols = df_t.columns.to_list()
df_t = df_t.set_axis([str(name).strip() for name in orig_cols], axis="columns")
print("Columns:", df_t.columns.to_list())

# --- Helper: rename from possible alternatives ---
def _rename_first_match(df, target, candidates):
    if target in df.columns:
        return
    for c in candidates:
        if c in df.columns:
            df.rename(columns={c: target}, inplace=True)
            return

# Try to map typical variants (in case the parquet isn't perfectly standardized)
_rename_first_match(df_t, "user_id", ["user", "userid", "learner_id", "student_id", "uid"])
_rename_first_match(df_t, "item_id", ["item", "itemid", "course_id", "resource_id", "object_id", "iid"])
_rename_first_match(df_t, "timestamp", ["ts", "time", "event_time", "datetime", "created_at"])

# Domain is often missing in target (single domain). If missing, create it.
# TARGET_DOMAIN is optional: it is only used when some earlier code defined it.
if "domain" not in df_t.columns:
    df_t["domain"] = TARGET_DOMAIN if "TARGET_DOMAIN" in globals() else "target"

# --- Required columns check ---
required_cols = ["domain", "user_id", "item_id", "timestamp"]
missing = [c for c in required_cols if c not in df_t.columns]
if missing:
    raise ValueError(
        f"Target missing required columns: {missing}\n"
        f"Existing columns: {df_t.columns.tolist()}\n"
        f"Original columns: {orig_cols}"
    )

# --- Timestamp parsing (handles int epochs in s/ms/us/ns + strings) ---
ts = df_t["timestamp"]

if pd.api.types.is_datetime64_any_dtype(ts):
    # already datetime: just make it tz-aware UTC
    df_t["timestamp"] = pd.to_datetime(ts, utc=True, errors="coerce")
elif pd.api.types.is_numeric_dtype(ts):
    # numeric epoch: infer the unit from the magnitude of the largest value
    vals = ts.dropna().astype("int64")
    if len(vals) == 0:
        df_t["timestamp"] = pd.NaT
    else:
        mx = int(vals.max())
        # Present-day epochs are ~1e9 (s), ~1e12 (ms), ~1e15 (us), ~1e18 (ns).
        # The s/ms boundary (1e12) is unchanged; the us/ns tiers only affect
        # values that previously overflowed as "ms" and were coerced to NaT.
        if mx > 100_000_000_000_000_000:    # > 1e17
            unit = "ns"
        elif mx > 100_000_000_000_000:      # > 1e14
            unit = "us"
        elif mx > 1_000_000_000_000:        # > 1e12
            unit = "ms"
        else:
            unit = "s"
        df_t["timestamp"] = pd.to_datetime(ts, unit=unit, utc=True, errors="coerce")
else:
    # strings / objects
    df_t["timestamp"] = pd.to_datetime(ts, utc=True, errors="coerce")

bad_ts = int(df_t["timestamp"].isna().sum())
if bad_ts:
    # show a few bad rows to debug quickly
    bad_rows = df_t.loc[df_t["timestamp"].isna(), ["user_id", "item_id", "timestamp"]].head(10)
    raise ValueError(f"Target has {bad_ts} rows with invalid timestamp. نمونه:\n{bad_rows}")

# --- Reject null ids before stringifying ---
# astype(str) would turn a missing id into the literal "nan"/"None", which
# would then be sessionized as if it were a real user or item.
null_ids = {c: int(df_t[c].isna().sum()) for c in ("user_id", "item_id")}
if any(null_ids.values()):
    raise ValueError(f"Target has rows with null ids (count per column): {null_ids}")

# --- Type cleanup for ids (avoid mixed types) ---
df_t["user_id"] = df_t["user_id"].astype(str)
df_t["item_id"] = df_t["item_id"].astype(str)
df_t["domain"]  = df_t["domain"].astype(str)

# --- Sort for sessionization correctness ---
# Per-user chronological order; item_id (as a string) breaks timestamp ties so
# the order is deterministic.
df_t = df_t.sort_values(["user_id", "timestamp", "item_id"]).reset_index(drop=True)

print("Target domain counts:", df_t["domain"].value_counts(dropna=False).to_dict())
print("n_users:", df_t["user_id"].nunique(), "n_items:", df_t["item_id"].nunique())
print("time range:", df_t["timestamp"].min(), "→", df_t["timestamp"].max())
print("Load+checks seconds:", round(time.time() - t0, 2))


Loaded target events: (3655, 5)
Columns: ['user_id', 'item_id', 'timestamp', 'signal_type', 'value']
Target domain counts: {'target': 3655}
n_users: 822 n_items: 776
time range: 2018-09-28 14:38:15+00:00 → 2021-09-20 16:26:06+00:00
Load+checks seconds: 0.1


In [6]:
# [CELL 05-04] Sessionize target in pandas (fast: target is small)
# Relies on df_t being ordered by (user_id, timestamp), as done in CELL 05-03.

TARGET_GAP_S = LOCK_TARGET_S  # already validated

# Time elapsed since the same user's previous event (NaN for a user's first event)
df_t["prev_ts"] = df_t.groupby("user_id")["timestamp"].shift(1)
df_t["gap_s"] = df_t["timestamp"].sub(df_t["prev_ts"]).dt.total_seconds()

# A session starts at a user's first event or after a gap strictly above the threshold
is_first_event = df_t["prev_ts"].isna()
exceeds_gap = df_t["gap_s"].gt(TARGET_GAP_S)
df_t["new_session"] = is_first_event | exceeds_gap

# Running count of session starts per user -> 1-based session index
df_t["session_idx"] = (
    df_t["new_session"].groupby(df_t["user_id"]).cumsum().astype("int64")
)

# Stable string id: t_<user_id>_<session_idx>
df_t["session_id"] = "t_" + df_t["user_id"].astype(str).str.cat(
    df_t["session_idx"].astype(str), sep="_"
)

# Sanity checks
sess_len = df_t.groupby("session_id").size()
n_sessions = df_t["session_id"].nunique()
print("Target sessions:", n_sessions)
print("Session length quantiles:", sess_len.quantile([0.5, 0.9, 0.99]).to_dict())
print("Min/Max session length:", int(sess_len.min()), int(sess_len.max()))

# Persist the sessionized events (the helper column prev_ts is not exported)
OUT_T_EVENTS = OUT_SESS_DIR / f"target_events_sessionized_{RUN_TAG}.parquet"
df_t_out = df_t.drop(columns="prev_ts")
df_t_out.to_parquet(OUT_T_EVENTS, index=False)
print("Wrote:", OUT_T_EVENTS, "| rows:", len(df_t_out))

Target sessions: 1322
Session length quantiles: {0.5: 1.0, 0.9: 6.0, 0.99: 24.789999999999964}
Min/Max session length: 1 50
Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\sessionized\target_events_sessionized_20251229_163357.parquet | rows: 3655


In [7]:
# [CELL 05-05] Build target session sequences (one row per session)

# Order events inside each session, then collapse every session to one row
# that carries only the fields needed by the sequence export.
events_by_session = (
    df_t_out.sort_values(["session_id", "timestamp"]).groupby("session_id")
)
seq_df = events_by_session.agg(
    domain=("domain", "first"),
    user_id=("user_id", "first"),
    start_ts=("timestamp", "first"),
    end_ts=("timestamp", "last"),
    # item ids are cast to int here; this raises if an id is not numeric
    items=("item_id", lambda ids: [int(v) for v in ids]),
).reset_index()

seq_df["session_len"] = seq_df["items"].map(len)

OUT_T_SEQS = OUT_SESS_DIR / f"target_sessions_{RUN_TAG}.parquet"
seq_df.to_parquet(OUT_T_SEQS, index=False)
print("Wrote:", OUT_T_SEQS, "| sessions:", len(seq_df))

seq_len_quantiles = seq_df["session_len"].quantile([0.5,0.9,0.99]).to_dict()
print("Sequence length quantiles:", seq_len_quantiles)

Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\sessionized\target_sessions_20251229_163357.parquet | sessions: 1322
Sequence length quantiles: {0.5: 1.0, 0.9: 6.0, 0.99: 24.789999999999964}


In [9]:
# [CELL 05-06] Build target supervised samples (prefix → next item)
#
# For a session [i0, i1, ..., i(L-1)] one sample is emitted per step in 1..L-1:
# the prefix is the (at most MAX_PREFIX_LEN) items before that step and the
# label is the item at that step. Sessions with fewer than 2 items yield nothing.

# Config: max prefix length (used later by tensor builder)
MAX_PREFIX_LEN = 20

# Explicit schema so the frame has these columns even when no row is produced.
SAMPLE_COLUMNS = [
    "domain", "user_id", "session_id", "t",
    "prefix_items", "prefix_len", "label_item",
    "start_ts", "end_ts",
]

rows = []
session_fields = seq_df[["session_id", "domain", "user_id", "items", "start_ts", "end_ts"]]
for session_id, domain, user_id, items, start_ts, end_ts in session_fields.itertuples(index=False, name=None):
    n_items = len(items)
    if n_items < 2:
        continue
    # `step` (not `t`) so the thresholds dict `t` from CELL 05-02 is not overwritten;
    # the exported column is still named "t".
    for step in range(1, n_items):
        prefix = items[max(0, step - MAX_PREFIX_LEN):step]
        rows.append({
            "domain": domain,
            "user_id": int(user_id),
            "session_id": session_id,
            "t": int(step),
            "prefix_items": prefix,
            "prefix_len": int(len(prefix)),
            "label_item": int(items[step]),
            "start_ts": start_ts,
            "end_ts": end_ts,
        })

samples_t = pd.DataFrame(rows, columns=SAMPLE_COLUMNS)
if samples_t.empty:
    # Without this guard the quantile line below would fail with an opaque error.
    raise ValueError(
        "No target session has >= 2 events; cannot build prefix → next-item samples."
    )

print("Target samples:", samples_t.shape)
print("prefix_len quantiles:", samples_t["prefix_len"].quantile([0.5,0.9,0.99]).to_dict())

OUT_T_SAMPLES = OUT_SUP_DIR / f"target_prefix_samples_{RUN_TAG}.parquet"
samples_t.to_parquet(OUT_T_SAMPLES, index=False)
print("Wrote:", OUT_T_SAMPLES)

Target samples: (2333, 9)
prefix_len quantiles: {0.5: 4.0, 0.9: 18.0, 0.99: 20.0}
Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\supervised\target_prefix_samples_20251229_163357.parquet


In [21]:
# [CELL 05-07] Source sessionize in DuckDB (memory-safe: bucket by user hash and append)
#
# Users are split into N_BUCKETS groups by hash(user_id) % N_BUCKETS, so all
# events of one user are processed in the same bucket query. Each bucket is
# written to its own temporary parquet file; the files are then concatenated
# into OUT_S_EVENTS. A new session starts at a user's first event or when the
# gap to the previous event is strictly greater than the source threshold.

import json
import duckdb
from pathlib import Path
import os


def _sql_path(p) -> str:
    """Return ``p`` as a POSIX path with single quotes doubled, for use inside a SQL '...' literal."""
    return Path(p).as_posix().replace("'", "''")


# --- Output path (defined here so the cell works after Restart & Run All) ---
OUT_S_EVENTS = OUT_SESS_DIR / f"source_events_sessionized_{RUN_TAG}.parquet"

# --- Resolve gaps path and load source gap seconds ---
if "SESSION_GAPS_PATH" in globals():
    gaps_path = Path(SESSION_GAPS_PATH)
else:
    gaps_path = (Path(PROC_DIR) / "normalized_events" / "session_gap_thresholds.json") if "PROC_DIR" in globals() \
        else Path("data/processed/normalized_events/session_gap_thresholds.json")

with open(gaps_path, "r", encoding="utf-8") as f:
    gaps = json.load(f)

gap_s = int(gaps["source"]["primary_threshold_seconds"])
print("Using session gaps file:", gaps_path.resolve())
print("Using source gap seconds:", gap_s)

# The file read here may differ from the one validated in CELL 05-02
# (SESSION_GAPS_PATH override), so re-check against the locked value.
if "LOCK_SOURCE_S" in globals() and gap_s != LOCK_SOURCE_S:
    raise ValueError(
        f"Source gap from {gaps_path} is {gap_s}s but the locked decision is {LOCK_SOURCE_S}s."
    )

# --- Config ---
SRC_DOMAIN = "source"   # constant domain label injected into every row
N_BUCKETS = 64          # increase if still OOM; 64 is usually safe
CLEANUP_TMP = False     # set True to delete the per-bucket files after stitching
TMP_PATTERN = "src_events_sessionized_b*.parquet"

# Ensure OUT path parent exists
OUT_S_EVENTS.parent.mkdir(parents=True, exist_ok=True)

# We'll write chunk outputs then stitch into final parquet
tmp_dir = OUT_S_EVENTS.parent / "_tmp_src_sessionize"
tmp_dir.mkdir(parents=True, exist_ok=True)

# The stitch step globs TMP_PATTERN, so bucket files left behind by an earlier
# run (e.g. one with a larger N_BUCKETS) must go or their rows get duplicated.
for stale in tmp_dir.glob(TMP_PATTERN):
    stale.unlink()

src_path = _sql_path(IN_SOURCE)
out_final = _sql_path(OUT_S_EVENTS)
tmp_glob = _sql_path(tmp_dir / TMP_PATTERN)
tmp_files = []

# --- DuckDB setup (tuned for memory) ---
con = duckdb.connect(database=":memory:")
try:
    con.execute("SET threads=1;")  # big win for memory
    con.execute("SET preserve_insertion_order=false;")
    con.execute("PRAGMA memory_limit='7GB';")  # keep slightly under the max to avoid pin failures
    con.execute("PRAGMA enable_object_cache=false;")

    con.execute(f"""
    CREATE OR REPLACE VIEW src_raw AS
    SELECT
      user_id,
      item_id,
      timestamp
    FROM read_parquet('{src_path}');
    """)

    print(f"Sessionizing source in {N_BUCKETS} buckets into: {tmp_dir}")

    for b in range(N_BUCKETS):
        tmp_out = tmp_dir / f"src_events_sessionized_b{b:03d}.parquet"
        tmp_files.append(tmp_out)

        # Note: hash(user_id) works for strings too.
        # CTEs: src = this bucket's rows; x = previous timestamp per user;
        # y = session-start flag; z = running count of starts = session number.
        # session_id format: <user_id>::<session_num>
        sql_bucket = f"""
        COPY (
          WITH src AS (
            SELECT
              '{SRC_DOMAIN}' AS domain,
              user_id,
              item_id,
              timestamp
            FROM src_raw
            WHERE (hash(user_id) % {N_BUCKETS}) = {b}
          ),
          x AS (
            SELECT
              domain,
              user_id,
              item_id,
              timestamp,
              LAG(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp, item_id) AS prev_ts
            FROM src
          ),
          y AS (
            SELECT
              *,
              CASE
                WHEN prev_ts IS NULL THEN 1
                WHEN (epoch(timestamp) - epoch(prev_ts)) > {gap_s} THEN 1
                ELSE 0
              END AS new_sess
            FROM x
          ),
          z AS (
            SELECT
              *,
              SUM(new_sess) OVER (
                PARTITION BY user_id
                ORDER BY timestamp, item_id
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
              ) AS session_num
            FROM y
          )
          SELECT
            domain,
            user_id,
            item_id,
            timestamp,
            CONCAT(user_id, '::', CAST(session_num AS VARCHAR)) AS session_id
          FROM z
        ) TO '{_sql_path(tmp_out)}' (FORMAT PARQUET);
        """

        con.execute(sql_bucket)

        if (b + 1) % 8 == 0:
            print(f"  done {b+1}/{N_BUCKETS}")

    # Stitch: read all tmp parquet and write final OUT_S_EVENTS
    # Remove old final file if exists to avoid confusion
    if OUT_S_EVENTS.exists():
        OUT_S_EVENTS.unlink()

    con.execute(f"""
    COPY (
      SELECT * FROM read_parquet('{tmp_glob}')
    ) TO '{out_final}' (FORMAT PARQUET);
    """)

    # Sanity check: sessionization must neither drop nor duplicate events.
    n_in = con.execute("SELECT COUNT(*) FROM src_raw").fetchone()[0]
    n_out = con.execute(f"SELECT COUNT(*) FROM read_parquet('{out_final}')").fetchone()[0]
    if n_in != n_out:
        print(f"⚠️ Row count mismatch: input={n_in}, sessionized output={n_out}")
finally:
    # Release DuckDB memory even when a bucket query fails.
    con.close()

print("Wrote:", OUT_S_EVENTS)

# Optional cleanup of the per-bucket files (kept by default for inspection)
if CLEANUP_TMP:
    for tmp_file in tmp_files:
        tmp_file.unlink(missing_ok=True)
    if not any(tmp_dir.iterdir()):
        tmp_dir.rmdir()


Using session gaps file: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\normalized_events\session_gap_thresholds.json
Using source gap seconds: 600
Sessionizing source in 64 buckets into: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\sessionized\_tmp_src_sessionize


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 8/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 16/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 24/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 32/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 40/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 48/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 56/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  done 64/64


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Wrote: D:\00_DS-ML-Workspace\mooc-coldstart-session-meta\data\processed\sessionized\source_events_sessionized_20251229_163357.parquet


## Outputs produced by this notebook (local)
- `data/processed/sessionized/target_events_sessionized_<RUN_TAG>.parquet`
- `data/processed/sessionized/target_sessions_<RUN_TAG>.parquet`
- `data/processed/supervised/target_prefix_samples_<RUN_TAG>.parquet`
- `data/processed/sessionized/source_events_sessionized_<RUN_TAG>.parquet`

Next notebook (strict order): **05A_split_prefix_target.ipynb**