Bootstrap: locate repo root (Windows-safe) + derive data paths

In [1]:
# [CELL 05C-00] Bootstrap (Windows-safe) + locate repo root

import os, sys, json, time
from pathlib import Path

# Resolve the notebook's working directory once; the repo-root search starts from here.
CWD = Path(os.getcwd()).resolve()
print(f"CWD: {CWD}")

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that looks like the repo root.

    A directory qualifies when it contains either a ``PROJECT_STATE.md`` entry
    or a ``.git`` entry. ``start`` itself is checked first, then each parent in
    turn, so the closest match wins.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first qualifying directory.

    Raises:
        FileNotFoundError: If neither marker is found in ``start`` or any parent.
    """
    markers = ("PROJECT_STATE.md", ".git")
    lineage = (start, *start.parents)
    hit = next(
        (folder for folder in lineage if any((folder / m).exists() for m in markers)),
        None,
    )
    if hit is None:
        raise FileNotFoundError("Could not find repo root (PROJECT_STATE.md or .git)")
    return hit

# Anchor every path on the detected repo root so the notebook does not depend
# on the directory it happens to be launched from.
REPO_ROOT = find_repo_root(CWD)
DATA_DIR = REPO_ROOT / "data"
PROC_DIR = DATA_DIR / "processed"

# One line per path, in the order REPO_ROOT, DATA_DIR, PROC_DIR.
print(
    f"REPO_ROOT: {REPO_ROOT}",
    f"DATA_DIR: {DATA_DIR}",
    f"PROC_DIR: {PROC_DIR}",
    sep="\n",
)


CWD: C:\mooc-coldstart-session-meta\notebooks
REPO_ROOT: C:\mooc-coldstart-session-meta
DATA_DIR: C:\mooc-coldstart-session-meta\data
PROC_DIR: C:\mooc-coldstart-session-meta\data\processed


Config: input sessionized source + outputs + split params

In [2]:
# [CELL 05C-01] Config: run tag + input sessionized parquet + output session sequences dataset

from pathlib import Path
import time

RUN_TAG = "20251229_232834"  # tag of the source sessionization run to consume

# Input: sessionized source events. The file is looked up under
# data/processed/sessionized first, then under data/sessionized as a fallback.
sess_filename = f"source_events_sessionized_{RUN_TAG}.parquet"
primary_src = REPO_ROOT / "data" / "processed" / "sessionized" / sess_filename
fallback_src = REPO_ROOT / "data" / "sessionized" / sess_filename

if primary_src.exists():
    IN_SRC_SESS = primary_src
elif fallback_src.exists():
    IN_SRC_SESS = fallback_src
else:
    raise FileNotFoundError(f"Missing: {primary_src.resolve()} (and fallback: {fallback_src.resolve()})")

# Output: session sequences (one row per session), with one sub-directory per split.
OUT_SEQ_DIR = REPO_ROOT / "data" / "processed" / "session_sequences" / f"source_sessions_{RUN_TAG}"
OUT_SEQ_TRAIN, OUT_SEQ_VAL, OUT_SEQ_TEST = (
    OUT_SEQ_DIR / split_dirname for split_dirname in ("train", "val", "test")
)

# Create the root first, then the split directories; all calls are idempotent.
for split_dir in (OUT_SEQ_DIR, OUT_SEQ_TRAIN, OUT_SEQ_VAL, OUT_SEQ_TEST):
    split_dir.mkdir(parents=True, exist_ok=True)

print(f"IN_SRC_SESS: {IN_SRC_SESS.resolve()}")
print(f"OUT_SEQ_DIR: {OUT_SEQ_DIR.resolve()}")


IN_SRC_SESS: C:\mooc-coldstart-session-meta\data\processed\sessionized\source_events_sessionized_20251229_232834.parquet
OUT_SEQ_DIR: C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834


DuckDB setup (spill to disk) + input sanity (REPO_ROOT-safe) (CHECKPOINT 1)

In [3]:
# [CELL 05C-02] DuckDB setup (spill to disk) + view src

import duckdb
from pathlib import Path

# Spill directory for DuckDB, kept next to the outputs it belongs to.
duckdb_tmp = OUT_SEQ_DIR / "_duckdb_tmp"
duckdb_tmp.mkdir(parents=True, exist_ok=True)

con = duckdb.connect(database=":memory:")

# Notebook-stability settings, applied in the same order as before.
for stability_sql in (
    "SET enable_progress_bar=false;",
    "SET preserve_insertion_order=false;",
    "PRAGMA enable_object_cache=false;",
):
    con.execute(stability_sql)
con.execute("SET temp_directory = ?;", [duckdb_tmp.as_posix()])

# Resource limits. Per the original author's note these are sized for a 32GB-RAM
# machine: lower threads to 2 if unstable, and the memory limit if RAM is smaller.
for resource_sql in ("SET threads=4;", "PRAGMA memory_limit='20GB';"):
    con.execute(resource_sql)

# The path is embedded as a SQL string literal, so single quotes are doubled.
src_path = IN_SRC_SESS.as_posix().replace("'", "''")

# `src` exposes the parquet file with every column cast to a fixed type and
# `timestamp` renamed to `ts`.
create_view_sql = f"""
CREATE OR REPLACE VIEW src AS
SELECT
  CAST(domain AS VARCHAR) AS domain,
  CAST(user_id AS VARCHAR) AS user_id,
  CAST(item_id AS VARCHAR) AS item_id,
  CAST(timestamp AS TIMESTAMP) AS ts,
  CAST(session_id AS VARCHAR) AS session_id
FROM read_parquet('{src_path}');
"""
con.execute(create_view_sql)

# Input sanity: global event/user/item/session counts and the covered time range.
stats_sql = """
SELECT
  COUNT(*) AS n_events,
  COUNT(DISTINCT user_id) AS n_users,
  COUNT(DISTINCT item_id) AS n_items,
  COUNT(DISTINCT session_id) AS n_sessions,
  MIN(ts) AS min_ts,
  MAX(ts) AS max_ts
FROM src;
"""
stats = con.execute(stats_sql).df()

print(stats)
print("duckdb_tmp:", duckdb_tmp.resolve())
print("\nCHECKPOINT 1 ✅ Paste the stats table.")


    n_events  n_users  n_items  n_sessions              min_ts  \
0  154817413   770283     1628     9996057 2015-07-31 23:59:15   

               max_ts  
0 2017-07-31 23:59:09  
duckdb_tmp: C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834\_duckdb_tmp

CHECKPOINT 1 ✅ Paste the stats table.


Build session_lengths + deterministic splits (no leakage)

In [4]:
# [CELL 05C-03] Build session_lengths + deterministic splits (no leakage)

t0 = time.time()

# One row per session: its length (event count) and first/last event timestamps.
# domain/user_id are taken with ANY_VALUE, i.e. they are assumed constant within
# a session_id — TODO confirm against the sessionization step.
con.execute("DROP TABLE IF EXISTS sess_len;")
con.execute("""
CREATE TABLE sess_len AS
SELECT
  session_id,
  ANY_VALUE(domain) AS domain,
  ANY_VALUE(user_id) AS user_id,
  COUNT(*) AS session_length,
  MIN(ts) AS start_ts,
  MAX(ts) AS end_ts
FROM src
GROUP BY session_id;
""")

# Overview: how many sessions are singletons (dropped below) vs. eligible (length >= 2),
# plus approximate 50/90/99th percentiles of session length.
summary = con.execute("""
SELECT
  COUNT(*) AS total_sessions,
  SUM(CASE WHEN session_length=1 THEN 1 ELSE 0 END) AS singleton_sessions,
  SUM(CASE WHEN session_length>=2 THEN 1 ELSE 0 END) AS eligible_sessions,
  approx_quantile(session_length, [0.5,0.9,0.99]) AS q
FROM sess_len;
""").df()
print(summary)

VAL_FRAC = 0.10
TEST_FRAC = 0.10

# Sessions are assigned by hash(session_id) modulo SPLIT_MOD, so the fractions
# are converted to integer cut points on a 0..SPLIT_MOD scale.
SPLIT_MOD = 1000
# round() instead of int(): a float product that lands just below an integer
# would otherwise be truncated to one less than the intended cut.
val_cut = round(VAL_FRAC * SPLIT_MOD)
test_cut = round(TEST_FRAC * SPLIT_MOD)
if val_cut < 0 or test_cut < 0 or val_cut + test_cut > SPLIT_MOD:
    raise ValueError(
        f"VAL_FRAC={VAL_FRAC} and TEST_FRAC={TEST_FRAC} must be non-negative and sum to at most 1"
    )

# Split assignment: singletons are tagged 'drop_singleton'; remaining sessions go
# to test / val / train by hash bucket, so re-running gives the same assignment
# for the same session_id values.
# NOTE(review): the split key is session_id, so different sessions of one user
# can land in different splits — confirm session-level separation is what
# "no leakage" is meant to guarantee here.
# NOTE(review): this relies on DuckDB's hash() returning the same value across
# runs; verify before changing DuckDB versions.
con.execute("DROP TABLE IF EXISTS session_splits;")
con.execute(f"""
CREATE TABLE session_splits AS
SELECT
  session_id,
  domain,
  user_id,
  session_length,
  start_ts,
  end_ts,
  CASE
    WHEN session_length < 2 THEN 'drop_singleton'
    WHEN (abs(hash(session_id)) % {SPLIT_MOD}) < {test_cut} THEN 'test'
    WHEN (abs(hash(session_id)) % {SPLIT_MOD}) < {test_cut + val_cut} THEN 'val'
    ELSE 'train'
  END AS split
FROM sess_len;
""")

# Sessions per split, for the checkpoint printout.
counts = con.execute("""
SELECT split, COUNT(*) AS n_sessions
FROM session_splits
GROUP BY split
ORDER BY split;
""").df()
print(counts)

print("Seconds:", round(time.time() - t0, 2))
print("\nCHECKPOINT 2 ✅ Paste summary + split counts.")


   total_sessions  singleton_sessions  eligible_sessions             q
0         9996057           1655500.0          8340557.0  [7, 36, 127]
            split  n_sessions
0  drop_singleton     1655500
1            test      835233
2           train     6672282
3             val      833042
Seconds: 15.11

CHECKPOINT 2 ✅ Paste summary + split counts.


Write session sequences (bucketed + resume)
This writes one row per session, with the session's items stored as a list ordered by timestamp (ties broken by item_id).

In [5]:
# [CELL 05C-04] Write SOURCE session sequences (1 row/session) — bucketed + resume

import time
from pathlib import Path

# Wall-clock start for the whole sequence-writing step.
t0 = time.time()

# Sessions are written in hash buckets, one output file per bucket and split.
N_BUCKETS = 1024            # 512–2048 ok; 1024 is a good balance
START_BUCKET = 0            # first bucket to process; raise it to resume after an interruption
END_BUCKET = N_BUCKETS - 1  # last bucket to process (inclusive)
PROGRESS_EVERY = 64         # progress line every this many bucket indices

print(f"N_BUCKETS: {N_BUCKETS} | bucket range: {START_BUCKET} → {END_BUCKET}")

def write_sequences(split_name: str, out_dir: Path):
    """Write one parquet file per hash bucket for the sessions of ``split_name``.

    For every bucket ``b`` in ``START_BUCKET..END_BUCKET`` (inclusive), the rows
    of ``session_splits`` with ``split = split_name`` and
    ``abs(hash(session_id)) % N_BUCKETS = b`` are joined to the ``src`` view and
    aggregated to one row per session, with ``items`` ordered by ``ts, item_id``.
    The result is stored as ``out_dir/sessions_b{b:04d}.parquet``.

    Resume behaviour: a bucket whose final file already exists and is non-empty
    is skipped. Each bucket is first written to ``<final name>.tmp`` and only
    renamed to the final name after the COPY succeeded, so an interrupted write
    never leaves a partial file that a later run would mistake for a finished
    bucket.

    Uses the module-level names ``con``, ``N_BUCKETS``, ``START_BUCKET``,
    ``END_BUCKET`` and ``PROGRESS_EVERY``.

    Args:
        split_name: Value of the ``split`` column to export (e.g. ``"val"``).
        out_dir: Directory receiving the bucket files; created if missing.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        Whatever ``con.execute`` raises; the temporary file of the failing
        bucket is removed (best effort) before the exception propagates.
    """
    print("\n" + "="*70)
    print("WRITING SEQUENCES:", split_name)
    print("="*70)

    out_dir.mkdir(parents=True, exist_ok=True)

    # The split name is embedded as a SQL string literal, so double any quotes.
    split_sql = split_name.replace("'", "''")

    written = 0
    skipped_exists = 0
    skipped_empty = 0
    t_split = time.time()

    for b in range(START_BUCKET, END_BUCKET + 1):
        out_file = out_dir / f"sessions_b{b:04d}.parquet"
        if out_file.exists() and out_file.stat().st_size > 0:
            skipped_exists += 1
            continue

        # quick skip empty bucket (cheap: only touches the session_splits table)
        n_sess = con.execute(f"""
            SELECT COUNT(*)
            FROM session_splits
            WHERE split='{split_sql}'
              AND (abs(hash(session_id)) % {N_BUCKETS}) = {b};
        """).fetchone()[0]

        if n_sess == 0:
            skipped_empty += 1
            continue

        # Write under a temporary name that does not match 'sessions_b*.parquet',
        # then rename on success. Remove any leftover from a previous crashed run.
        tmp_file = out_file.with_name(out_file.name + ".tmp")
        if tmp_file.exists():
            tmp_file.unlink()
        tmp_sql = tmp_file.as_posix().replace("'", "''")

        sql = f"""
        COPY (
          WITH eligible AS (
            SELECT session_id, domain, user_id, start_ts, end_ts, session_length
            FROM session_splits
            WHERE split='{split_sql}'
              AND (abs(hash(session_id)) % {N_BUCKETS}) = {b}
          ),
          ev AS (
            SELECT
              e.domain,
              e.user_id,
              s.session_id,
              s.ts,
              s.item_id,
              e.start_ts,
              e.end_ts,
              e.session_length
            FROM src s
            JOIN eligible e USING(session_id)
          )
          SELECT
            domain,
            user_id,
            session_id,
            session_length,
            start_ts,
            end_ts,
            list(item_id ORDER BY ts, item_id) AS items,
            '{split_sql}' AS split
          FROM ev
          GROUP BY domain, user_id, session_id, session_length, start_ts, end_ts
        ) TO '{tmp_sql}' (FORMAT PARQUET, COMPRESSION 'SNAPPY');
        """

        try:
            con.execute(sql)
        except BaseException:
            # Includes KeyboardInterrupt: drop the partial file, then re-raise.
            try:
                if tmp_file.exists():
                    tmp_file.unlink()
            except OSError:
                # Cleanup is best effort; the stale .tmp is removed on the next attempt.
                pass
            raise

        # Publish the finished bucket under its final name.
        tmp_file.replace(out_file)
        written += 1

        if (b + 1) % PROGRESS_EVERY == 0:
            elapsed = (time.time() - t_split) / 60
            print(f"  {split_name}: {b+1}/{N_BUCKETS} | wrote={written} | empty={skipped_empty} | exists={skipped_exists} | {elapsed:.1f}m")

    print(f"✅ Finished {split_name}: wrote={written} | empty={skipped_empty} | exists={skipped_exists}")
    print("out_dir:", out_dir.resolve())

# Export the two small splits before the large train split.
for split_label, split_out_dir in (
    ("val", OUT_SEQ_VAL),
    ("test", OUT_SEQ_TEST),
    ("train", OUT_SEQ_TRAIN),
):
    write_sequences(split_label, split_out_dir)

total_seconds = round(time.time() - t0, 2)
print("\nAll sequences written.")
print("Total seconds:", total_seconds)
print("\nCHECKPOINT 3 ✅ Paste the Finished val/test/train lines.")


N_BUCKETS: 1024 | bucket range: 0 → 1023

WRITING SEQUENCES: val
  val: 128/1024 | wrote=6 | empty=0 | exists=122 | 1.2m
  val: 192/1024 | wrote=70 | empty=0 | exists=122 | 13.7m
  val: 256/1024 | wrote=134 | empty=0 | exists=122 | 26.0m
  val: 320/1024 | wrote=198 | empty=0 | exists=122 | 38.3m
  val: 384/1024 | wrote=262 | empty=0 | exists=122 | 50.6m
  val: 448/1024 | wrote=326 | empty=0 | exists=122 | 63.0m
  val: 512/1024 | wrote=390 | empty=0 | exists=122 | 75.7m
  val: 576/1024 | wrote=454 | empty=0 | exists=122 | 89.1m
  val: 640/1024 | wrote=518 | empty=0 | exists=122 | 101.5m
  val: 704/1024 | wrote=582 | empty=0 | exists=122 | 114.1m
  val: 768/1024 | wrote=646 | empty=0 | exists=122 | 126.3m
  val: 832/1024 | wrote=710 | empty=0 | exists=122 | 138.2m
  val: 896/1024 | wrote=774 | empty=0 | exists=122 | 150.1m
  val: 960/1024 | wrote=838 | empty=0 | exists=122 | 161.9m
  val: 1024/1024 | wrote=902 | empty=0 | exists=122 | 173.9m
✅ Finished val: wrote=902 | empty=0 | exists=1

Verify output counts (sessions) + quick sanity

In [None]:
# [CELL 05C-05] Verify output counts (sessions) + quick sanity

# Glob patterns over each split's bucket files, escaped for use inside a SQL literal.
val_glob, test_glob, train_glob = (
    (split_dir / "sessions_b*.parquet").as_posix().replace("'", "''")
    for split_dir in (OUT_SEQ_VAL, OUT_SEQ_TEST, OUT_SEQ_TRAIN)
)

# Row count per split (one row per session); queried in the order val, test, train.
val_n, test_n, train_n = (
    con.execute(f"SELECT COUNT(*) FROM read_parquet('{pattern}');").fetchone()[0]
    for pattern in (val_glob, test_glob, train_glob)
)

print("Session rows written:")
print(f"  val:   {val_n:,}")
print(f"  test:  {test_n:,}")
print(f"  train: {train_n:,}")

# optional: inspect a few rows and compare the stored length with the list length
sample_sql = f"""
SELECT session_id, user_id, session_length, list_count(items) AS items_len
FROM read_parquet('{val_glob}')
LIMIT 5;
"""
sample = con.execute(sample_sql).df()
print("\nSample rows:")
print(sample)



Session rows written:
  val:   833,042
  test:  835,233
  train: 6,672,282

Sample rows:
     session_id  user_id  session_length  items_len
0  3071230::792  3071230             113        113
1   1510255::23  1510255               9          9
2    646033::12   646033              12         12
3   2686301::18  2686301               5          5
4   3111881::85  3111881              10         10

CHECKPOINT 4 ✅ Paste session row counts + sample rows.


In [8]:
# [CELL 05C-META] Write meta entry for source session sequences

import json, time
from pathlib import Path

# Describe the split rule from the configured fractions rather than a
# hand-written copy, so the metadata cannot drift from VAL_FRAC / TEST_FRAC.
test_pct = TEST_FRAC * 100
val_pct = VAL_FRAC * 100
train_pct = 100 - test_pct - val_pct
split_rule = (
    f"hash(session_id) modulo 1000; test={test_pct:g}%, val={val_pct:g}%, "
    f"train={train_pct:g}%; singletons dropped"
)

# Metadata describing the written session-sequence artifact. The counts come
# from the files on disk (train_n / val_n / test_n from the verification cell).
meta = {
  "run_tag": RUN_TAG,
  "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),  # local time, no timezone
  "artifact_type": "session_sequences",
  "dataset": "source",
  "paths": {
    "root": str(OUT_SEQ_DIR.resolve()),
    "train_glob": str((OUT_SEQ_TRAIN / "sessions_b*.parquet").resolve()),
    "val_glob": str((OUT_SEQ_VAL / "sessions_b*.parquet").resolve()),
    "test_glob": str((OUT_SEQ_TEST / "sessions_b*.parquet").resolve()),
  },
  "counts": {
    "train_sessions": int(train_n),
    "val_sessions": int(val_n),
    "test_sessions": int(test_n),
    "eligible_sessions_total": int(train_n + val_n + test_n),
  },
  "notes": {
    # NOTE(review): 600 is not derived from anything in this notebook —
    # confirm it matches the gap used by the sessionization run RUN_TAG.
    "source_gap_seconds": 600,
    "split_rule": split_rule,
    "items_list_order": "ORDER BY ts, item_id"
  }
}

OUT_META = OUT_SEQ_DIR / "meta_source_session_sequences.json"
OUT_META.write_text(json.dumps(meta, indent=2), encoding="utf-8")
print("Wrote:", OUT_META.resolve())
print(json.dumps(meta["counts"], indent=2))


Wrote: C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834\meta_source_session_sequences.json
{
  "train_sessions": 6672282,
  "val_sessions": 833042,
  "test_sessions": 835233,
  "eligible_sessions_total": 8340557
}
