This notebook generates session-based prefix–target training data for three datasets:
Amazon Books 2023, MARS (MOOC), and YooChoose.
It produces dataset-specific prefix–target artifacts and writes a clean sessionization manifest
that serves as the single source of truth for downstream splitting and tensor construction.

⚠️ This notebook does NOT perform train/val/test splitting.

Imports & global configuration

In [4]:
# CELL 1 — Imports & Global Config

import pandas as pd
import numpy as np
from pathlib import Path
import json
from datetime import timedelta

pd.set_option("display.max_columns", 50)

DATA_DIR = Path("../data/processed")
DATA_DIR.mkdir(exist_ok=True)

MAX_PREFIX_LEN = 20          # unified cap across datasets
ROWS_PER_PART = 1_000_000    # for chunked writing

print("[INIT] Data directory:", DATA_DIR.resolve())
print("[INIT] MAX_PREFIX_LEN:", MAX_PREFIX_LEN)


[INIT] Data directory: C:\Users\User\Documents\ml-workspace\session-transfer-mooc\data\processed
[INIT] MAX_PREFIX_LEN: 20


Initialize manifest (IN MEMORY ONLY)

In [5]:
# CELL 2 — Initialize clean manifest (in memory)

manifest = {
    "amazon_prefix_parts": [],
    "yoochoose_prefix_parts": [],
    "mars_prefix_target": None,
    "stats": {}
}

print("[MANIFEST] Initialized empty manifest")


[MANIFEST] Initialized empty manifest


AMAZON BOOKS

Load Amazon interactions

In [9]:
# CELL 3 — Load Amazon interactions

amazon_path = DATA_DIR / "amazon_books_2023_interactions.parquet"
assert amazon_path.exists(), f"Missing file: {amazon_path}"

amazon_df = pd.read_parquet(amazon_path)

required_cols = {"user_id", "item_id", "timestamp"}
assert required_cols.issubset(amazon_df.columns), amazon_df.columns

amazon_df["timestamp"] = pd.to_datetime(amazon_df["timestamp"])
amazon_df = amazon_df.sort_values(["user_id", "timestamp"])

print("[AMAZON] Interactions:", amazon_df.shape)
print("[AMAZON] Columns:", amazon_df.columns.tolist())


[AMAZON] Interactions: (27078467, 6)
[AMAZON] Columns: ['dataset', 'user_id', 'session_id', 'item_id', 'timestamp', 'interaction_type']


Sliding-window sessionization (Amazon)

In [13]:
# CELL 4 — Amazon sliding-window sessionization

WINDOW_SIZE = 50
STRIDE = 1

out_dir = DATA_DIR / "amazon_prefix_parts"
out_dir.mkdir(exist_ok=True)

rows = []
part_id = 0
total_pairs = 0

for uid, g in amazon_df.groupby("user_id"):
    items = g["item_id"].astype(str).tolist()
    if len(items) < 2:
        continue

    for i in range(1, len(items)):
        start = max(0, i - MAX_PREFIX_LEN)
        prefix = items[start:i]
        target = items[i]

        rows.append({
            "prefix": " ".join(prefix),
            "target": target,
            "prefix_len": len(prefix)
        })
        total_pairs += 1

        if len(rows) >= ROWS_PER_PART:
            out_path = out_dir / f"amazon_prefix_target_part{part_id:04d}.parquet"
            pd.DataFrame(rows).to_parquet(out_path, index=False)
            manifest["amazon_prefix_parts"].append(str(out_path))
            print(f"[AMAZON] Wrote {out_path} ({len(rows)} rows)")
            rows.clear()
            part_id += 1

# flush remainder
if rows:
    out_path = out_dir / f"amazon_prefix_target_part{part_id:04d}.parquet"
    pd.DataFrame(rows).to_parquet(out_path, index=False)
    manifest["amazon_prefix_parts"].append(str(out_path))
    print(f"[AMAZON] Wrote {out_path} ({len(rows)} rows)")


[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0000.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0001.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0002.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0003.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0004.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0005.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0006.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0007.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0008.parquet (1000000 rows)
[AMAZON] Wrote ..\data\processed\amazon_prefix

Amazon diagnostics  

In [14]:
# CELL 5 — Amazon diagnostics

manifest["stats"]["amazon_prefix_pairs"] = total_pairs
manifest["stats"]["amazon_prefix_parts"] = len(manifest["amazon_prefix_parts"])

print("[AMAZON] Total prefix-target pairs:", total_pairs)
print("[AMAZON] Total parts:", len(manifest["amazon_prefix_parts"]))


[AMAZON] Total prefix-target pairs: 17464455
[AMAZON] Total parts: 45


MARS (MOOC)

Load MARS interactions

In [15]:
# CELL 6 — Load MARS interactions

mars_path = DATA_DIR / "mars_interactions.parquet"
assert mars_path.exists(), f"Missing file: {mars_path}"

mars_df = pd.read_parquet(mars_path)

mars_df["timestamp"] = pd.to_datetime(
    mars_df.get("timestamp", mars_df.get("created_at"))
)

mars_df = mars_df.sort_values(["user_id", "timestamp"])

print("[MARS] Interactions:", mars_df.shape)
print("[MARS] Columns:", mars_df.columns.tolist())


[MARS] Interactions: (3659, 6)
[MARS] Columns: ['dataset', 'user_id', 'session_id', 'item_id', 'timestamp', 'interaction_type']


Sessionization by 1-hour gap

In [16]:
# CELL 7 — MARS sessionization (1-hour gap)

MARS_GAP = timedelta(hours=1)

mars_df["prev_ts"] = mars_df.groupby("user_id")["timestamp"].shift(1)
mars_df["gap"] = mars_df["timestamp"] - mars_df["prev_ts"]
mars_df["new_session"] = mars_df["gap"].isna() | (mars_df["gap"] > MARS_GAP)
mars_df["sess_idx"] = mars_df.groupby("user_id")["new_session"].cumsum()
mars_df["session_id_real"] = (
    mars_df["user_id"].astype(str) + "__s" + mars_df["sess_idx"].astype(str)
)

print("[MARS] Unique sessions:", mars_df["session_id_real"].nunique())


[MARS] Unique sessions: 1275


Prefix–target generation (MARS)

In [17]:
# CELL 8 — MARS prefix-target generation

rows = []
total_pairs = 0

for sid, g in mars_df.groupby("session_id_real"):
    items = g.sort_values("timestamp")["item_id"].astype(str).tolist()
    if len(items) < 2:
        continue

    for i in range(1, len(items)):
        prefix = items[max(0, i - MAX_PREFIX_LEN):i]
        rows.append({
            "dataset": "mars",
            "user_id": g["user_id"].iloc[0],
            "session_id_real": sid,
            "prefix": " ".join(prefix),
            "prefix_len": len(prefix),
            "target": items[i]
        })
        total_pairs += 1

mars_pairs_df = pd.DataFrame(rows)
out_path = DATA_DIR / "mars_prefix_target.parquet"
mars_pairs_df.to_parquet(out_path, index=False)

manifest["mars_prefix_target"] = str(out_path)
manifest["stats"]["mars_prefix_pairs"] = total_pairs

print("[MARS] Prefix-target pairs:", total_pairs)
print("[MARS] Saved to:", out_path)


[MARS] Prefix-target pairs: 2384
[MARS] Saved to: ..\data\processed\mars_prefix_target.parquet


YOOCHOOSE

Load YooChoose interactions

In [18]:
# CELL 9 — Load YooChoose interactions

yoo_path = DATA_DIR / "yoochoose_interactions.parquet"
assert yoo_path.exists(), f"Missing file: {yoo_path}"

yoo_df = pd.read_parquet(yoo_path)

required_cols = {"session_id", "item_id", "timestamp"}
assert required_cols.issubset(yoo_df.columns), yoo_df.columns

yoo_df = yoo_df.sort_values(["session_id", "timestamp"])

print("[YOO] Interactions:", yoo_df.shape)


[YOO] Interactions: (31744233, 6)


Prefix–target generation (YooChoose)

In [19]:
# CELL 10 — YooChoose prefix-target generation

out_dir = DATA_DIR / "yoochoose_prefix_parts"
out_dir.mkdir(exist_ok=True)

rows = []
part_id = 0
total_pairs = 0

for sid, g in yoo_df.groupby("session_id"):
    items = g["item_id"].astype(str).tolist()
    if len(items) < 2:
        continue

    for i in range(1, len(items)):
        prefix = items[max(0, i - MAX_PREFIX_LEN):i]
        rows.append({
            "dataset": "yoochoose",
            "session_id": sid,
            "prefix": " ".join(prefix),
            "prefix_len": len(prefix),
            "target": items[i]
        })
        total_pairs += 1

        if len(rows) >= ROWS_PER_PART:
            out_path = out_dir / f"yoochoose_prefix_target_part{part_id:04d}.parquet"
            pd.DataFrame(rows).to_parquet(out_path, index=False)
            manifest["yoochoose_prefix_parts"].append(str(out_path))
            print(f"[YOO] Wrote {out_path}")
            rows.clear()
            part_id += 1

# flush remainder
if rows:
    out_path = out_dir / f"yoochoose_prefix_target_part{part_id:04d}.parquet"
    pd.DataFrame(rows).to_parquet(out_path, index=False)
    manifest["yoochoose_prefix_parts"].append(str(out_path))
    print(f"[YOO] Wrote {out_path}")

manifest["stats"]["yoochoose_prefix_pairs"] = total_pairs
manifest["stats"]["yoochoose_prefix_parts"] = len(manifest["yoochoose_prefix_parts"])


[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0000.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0001.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0002.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0003.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0004.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0005.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0006.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0007.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0008.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoose_prefix_target_part0009.parquet
[YOO] Wrote ..\data\processed\yoochoose_prefix_parts\yoochoo

In [21]:
# CELL 10.5 — Deduplicate manifest paths (CRITICAL FIX)

manifest["amazon_prefix_parts"] = sorted(set(manifest["amazon_prefix_parts"]))
manifest["yoochoose_prefix_parts"] = sorted(set(manifest["yoochoose_prefix_parts"]))

print("[MANIFEST FIX] Amazon prefix parts:", len(manifest["amazon_prefix_parts"]))
print("[MANIFEST FIX] YooChoose prefix parts:", len(manifest["yoochoose_prefix_parts"]))


[MANIFEST FIX] Amazon prefix parts: 18
[MANIFEST FIX] YooChoose prefix parts: 24


FINAL MANIFEST WRITE

Write manifest ONCE

In [23]:
# CELL 11 — Write clean sessionization manifest

manifest_path = DATA_DIR / "sessionization_manifest.json"
with open(manifest_path, "w") as f:
    json.dump(manifest, f, indent=2)

print("[MANIFEST] Written to:", manifest_path)
print(json.dumps(manifest, indent=2))


[MANIFEST] Written to: ..\data\processed\sessionization_manifest.json
{
  "amazon_prefix_parts": [
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0000.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0001.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0002.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0003.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0004.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0005.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0006.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0007.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0008.parquet",
    "..\\data\\processed\\amazon_prefix_parts\\amazon_prefix_target_part0009.parquet",
    "..\\data\\processed\\amazo