In [1]:
import json
from pathlib import Path
import pandas as pd
import numpy as np

DATA_DIR = Path("../data/processed")
MANIFEST_PATH = DATA_DIR / "sessionization_manifest.json"

with open(MANIFEST_PATH, "r") as f:
    manifest = json.load(f)


In [2]:
amazon_parts = sorted(manifest["amazon_prefix_parts"])

n = len(amazon_parts)
train_end = int(0.8 * n)
val_end = int(0.9 * n)

amazon_splits = {
    "train": amazon_parts[:train_end],
    "val": amazon_parts[train_end:val_end],
    "test": amazon_parts[val_end:]
}

print({k: len(v) for k, v in amazon_splits.items()})


{'train': 14, 'val': 2, 'test': 2}


In [3]:
# CELL [05A-AMAZON-01] — Register Amazon splits

if "splits" not in manifest:
    manifest["splits"] = {}

manifest["splits"]["amazon"] = amazon_splits

print("[05A][AMAZON] Amazon splits registered:",
      {k: len(v) for k, v in amazon_splits.items()})


[05A][AMAZON] Amazon splits registered: {'train': 14, 'val': 2, 'test': 2}


In [5]:
# CELL [05A-YOO-01] — Build YooChoose splits

assert "yoochoose_prefix_parts" in manifest, \
    "yoochoose_prefix_parts not found in manifest"

yoo_parts = sorted(manifest["yoochoose_prefix_parts"])

n = len(yoo_parts)
train_end = int(0.8 * n)
val_end = int(0.9 * n)

yoo_splits = {
    "train": yoo_parts[:train_end],
    "val": yoo_parts[train_end:val_end],
    "test": yoo_parts[val_end:]
}

manifest["splits"]["yoochoose"] = yoo_splits

print("[05A][YOO] YooChoose splits registered:",
      {k: len(v) for k, v in yoo_splits.items()})


[05A][YOO] YooChoose splits registered: {'train': 19, 'val': 2, 'test': 3}


In [6]:
# CELL [05A-MARS-01]
from pathlib import Path
import pandas as pd

mars_path = Path(manifest["mars_prefix_target"])
mars_df = pd.read_parquet(mars_path)

print("[05A][MARS] Loaded:", mars_path)
print("[05A][MARS] Shape:", mars_df.shape)
print("[05A][MARS] Columns:", mars_df.columns.tolist())


[05A][MARS] Loaded: ..\data\processed\mars_prefix_target.parquet
[05A][MARS] Shape: (2384, 6)
[05A][MARS] Columns: ['dataset', 'user_id', 'session_id_real', 'prefix', 'prefix_len', 'target']


In [7]:
# CELL [05A-MARS-02]

if "session_id" in mars_df.columns:
    SID_COL = "session_id"
elif "session_id_real" in mars_df.columns:
    SID_COL = "session_id_real"
else:
    raise ValueError(
        "No session identifier found. Expected 'session_id' or 'session_id_real'."
    )

print(f"[05A][MARS] Using session id column: {SID_COL}")


[05A][MARS] Using session id column: session_id_real


In [8]:
# CELL [05A-MARS-03]
import numpy as np

rng = np.random.default_rng(42)

sessions = mars_df[SID_COL].unique()
print("[05A][MARS] Unique sessions:", len(sessions))

rng.shuffle(sessions)

n = len(sessions)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

train_sess = set(sessions[:train_end])
val_sess   = set(sessions[train_end:val_end])
test_sess  = set(sessions[val_end:])

print("[05A][MARS] Split sizes:",
      len(train_sess), len(val_sess), len(test_sess))


[05A][MARS] Unique sessions: 549
[05A][MARS] Split sizes: 384 82 83


In [9]:
# CELL [05A-MARS-04]
from pathlib import Path

mars_out = DATA_DIR / "mars_splits"
mars_out.mkdir(exist_ok=True)

splits = {
    "train": mars_df[mars_df[SID_COL].isin(train_sess)],
    "val":   mars_df[mars_df[SID_COL].isin(val_sess)],
    "test":  mars_df[mars_df[SID_COL].isin(test_sess)],
}

for split, df_split in splits.items():
    out_path = mars_out / f"mars_prefix_target_{split}.parquet"
    df_split.to_parquet(out_path, index=False)
    print(f"[05A][MARS] Saved {split}: {df_split.shape} → {out_path}")


[05A][MARS] Saved train: (1744, 6) → ..\data\processed\mars_splits\mars_prefix_target_train.parquet
[05A][MARS] Saved val: (282, 6) → ..\data\processed\mars_splits\mars_prefix_target_val.parquet
[05A][MARS] Saved test: (358, 6) → ..\data\processed\mars_splits\mars_prefix_target_test.parquet


In [10]:
# CELL [05A-MARS-05] — Register MARS splits

if "splits" not in manifest:
    manifest["splits"] = {}

manifest["splits"]["mars"] = {
    "train": str(mars_out / "mars_prefix_target_train.parquet"),
    "val":   str(mars_out / "mars_prefix_target_val.parquet"),
    "test":  str(mars_out / "mars_prefix_target_test.parquet"),
}

print("[05A][MARS] MARS splits registered:",
      manifest["splits"]["mars"])


[05A][MARS] MARS splits registered: {'train': '..\\data\\processed\\mars_splits\\mars_prefix_target_train.parquet', 'val': '..\\data\\processed\\mars_splits\\mars_prefix_target_val.parquet', 'test': '..\\data\\processed\\mars_splits\\mars_prefix_target_test.parquet'}


In [11]:
# CELL [05A-MARS-06]

total = sum(df.shape[0] for df in splits.values())
assert total == mars_df.shape[0]

assert train_sess.isdisjoint(val_sess)
assert train_sess.isdisjoint(test_sess)
assert val_sess.isdisjoint(test_sess)

print("[05A][MARS] Sanity checks PASSED")


[05A][MARS] Sanity checks PASSED


In [12]:
# CELL [05A-FINAL] — Save manifest with ALL splits

with open(MANIFEST_PATH, "w") as f:
    json.dump(manifest, f, indent=2)

print("[05A] Manifest updated with splits for:",
      list(manifest["splits"].keys()))


[05A] Manifest updated with splits for: ['amazon', 'yoochoose', 'mars']


“no overlap” check

In [13]:
assert set(amazon_splits["train"]).isdisjoint(amazon_splits["val"])
assert set(amazon_splits["train"]).isdisjoint(amazon_splits["test"])
assert set(amazon_splits["val"]).isdisjoint(amazon_splits["test"])

assert set(yoo_splits["train"]).isdisjoint(yoo_splits["val"])
assert set(yoo_splits["train"]).isdisjoint(yoo_splits["test"])
assert set(yoo_splits["val"]).isdisjoint(yoo_splits["test"])


In [14]:
print(manifest["splits"].keys())

dict_keys(['amazon', 'yoochoose', 'mars'])
