Imports & environment notes

In [1]:
import os
# Set in its own first cell so the variable is already in the environment when
# the next cell imports torch.
# NOTE(review): KMP_DUPLICATE_LIB_OK is presumably here to silence the
# "multiple OpenMP runtimes loaded" abort on this machine — confirm it is still
# needed, since it masks a library conflict rather than resolving it.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
# CELL [05B-01] — Imports & environment notes

import os
import json
from pathlib import Path
from collections import Counter

import pandas as pd
import torch

# Log the notebook name and the pandas/torch versions so the saved output
# records which library versions produced the shards.
print("[05B-01] Starting 05B_build_tensor_dataset.ipynb")
print("[05B-01] pandas:", pd.__version__)
print("[05B-01] torch:", torch.__version__)


[05B-01] Starting 05B_build_tensor_dataset.ipynb
[05B-01] pandas: 2.3.3
[05B-01] torch: 2.9.1+cpu


Config, paths, and output folders

In [3]:
# CELL [05B-02] — Config & paths

# --- Locations (all relative to the processed-data folder) -------------------
DATA_DIR = Path("../data/processed")
MANIFEST_PATH = DATA_DIR.joinpath("sessionization_manifest.json")
OUT_DIR = DATA_DIR.joinpath("tensor_shards_v2")
VOCAB_DIR = DATA_DIR.joinpath("vocab_topn")

# Output folders are created on demand; DATA_DIR itself must already exist.
for _folder in (OUT_DIR, VOCAB_DIR):
    _folder.mkdir(exist_ok=True)

# --- Tensorization constants --------------------------------------------------
MAX_PREFIX_LEN = 20          # prefixes are cut/padded to this many tokens
PAD_ID, UNK_ID = 0, 1        # reserved vocabulary ids
SHARD_SIZE = 250_000         # number of examples per .pt shard

# Echo the effective configuration into the cell output.
for _tag, _value in (
    ("[05B-02] DATA_DIR:", DATA_DIR.resolve()),
    ("[05B-02] OUT_DIR:", OUT_DIR.resolve()),
    ("[05B-02] VOCAB_DIR:", VOCAB_DIR.resolve()),
    ("[05B-02] MAX_PREFIX_LEN:", MAX_PREFIX_LEN),
    ("[05B-02] SHARD_SIZE:", SHARD_SIZE),
):
    print(_tag, _value)


[05B-02] DATA_DIR: C:\Users\User\Documents\ml-workspace\session-transfer-mooc\data\processed
[05B-02] OUT_DIR: C:\Users\User\Documents\ml-workspace\session-transfer-mooc\data\processed\tensor_shards_v2
[05B-02] VOCAB_DIR: C:\Users\User\Documents\ml-workspace\session-transfer-mooc\data\processed\vocab_topn
[05B-02] MAX_PREFIX_LEN: 20
[05B-02] SHARD_SIZE: 250000


Load manifest and verify splits exist

In [4]:
# CELL [05B-03] — Load manifest & validate

# Read the manifest explicitly as UTF-8 (the standard JSON encoding) instead of
# relying on the platform default, which on Windows is a legacy code page and
# would mis-decode any non-ASCII characters in the stored paths.
with open(MANIFEST_PATH, "r", encoding="utf-8") as f:
    manifest = json.load(f)

# The manifest must carry a "splits" mapping: domain -> split name -> file(s).
assert "splits" in manifest, "[05B-03] ERROR: manifest has no 'splits'. Run 05A again."

splits = manifest["splits"]

print("[05B-03] Splits domains:", list(splits.keys()))
# Every domain used later in this notebook has to be present before we start.
for d in ["amazon", "yoochoose", "mars"]:
    assert d in splits, f"[05B-03] ERROR: Missing splits for domain: {d}"
    print(f"[05B-03] {d} splits keys:", list(splits[d].keys()))


[05B-03] Splits domains: ['amazon', 'yoochoose', 'mars']
[05B-03] amazon splits keys: ['train', 'val', 'test']
[05B-03] yoochoose splits keys: ['train', 'val', 'test']
[05B-03] mars splits keys: ['train', 'val', 'test']


Utility: normalize split file lists

In [5]:
# CELL [05B-04] — Helper to normalize split file lists

def as_file_list(x):
    """Return ``x`` itself when it is already a list, otherwise a one-item list holding ``x``."""
    return x if isinstance(x, list) else [x]

def assert_files_exist(files, label):
    """
    Raise FileNotFoundError when any path in ``files`` is absent on disk.

    ``label`` identifies the file group in the error message, which shows up to
    three of the missing paths together with the total number missing.
    Returns None when every path exists (or ``files`` is empty).
    """
    absent = []
    for candidate in files:
        if not Path(candidate).exists():
            absent.append(candidate)

    if not absent:
        return

    raise FileNotFoundError(f"[05B-04] Missing files for {label}: {absent[:3]} ... total missing={len(absent)}")


Build vocab helper (fast, deterministic, with logs)

In [6]:
# CELL [05B-05] — Vocabulary builder (TRAIN only)

def build_vocab_from_prefix_parts(files, top_k=None, log_every=2):
    """
    Count item tokens across parquet files and number them by frequency.

    Only the "prefix" and "target" columns are read. A prefix that is a
    non-empty string is split on whitespace and every token is counted; each
    target is counted once under its ``str()`` form. The mapping starts with
    the reserved entries "<PAD>" -> PAD_ID and "<UNK>" -> UNK_ID, after which
    tokens receive the next free index in ``Counter.most_common`` order.

    Args:
        files: iterable of parquet file paths.
        top_k: keep only the ``top_k`` most frequent tokens; None keeps all.
        log_every: print a progress line after every ``log_every`` files and
            after the last file.

    Returns:
        dict mapping token string -> integer id.
    """
    paths = list(files)
    n_paths = len(paths)
    freq = Counter()

    for idx, path in enumerate(paths, start=1):
        frame = pd.read_parquet(path, columns=["prefix", "target"])

        # Walk both columns in lockstep, one (prefix, target) pair per row.
        for prefix, target in zip(frame["prefix"], frame["target"]):
            if isinstance(prefix, str) and prefix:
                freq.update(prefix.split())
            freq[str(target)] += 1

        if (idx % log_every == 0) or (idx == n_paths):
            print(f"[05B-05][VOCAB] {idx}/{n_paths} files | unique_items_so_far={len(freq)}")

    item2id = {"<PAD>": PAD_ID, "<UNK>": UNK_ID}

    # setdefault leaves the reserved tokens untouched if they also occur as items.
    for token, _count in freq.most_common(top_k):
        item2id.setdefault(token, len(item2id))

    return item2id


Build and save source vocabulary (Amazon+YooChoose TRAIN)

In [7]:
# CELL [05B-06] — Build SOURCE vocab (amazon+yoochoose TRAIN)

# Collect the TRAIN files of both source domains and verify them on disk
# before any counting starts.
amazon_train_files = as_file_list(splits["amazon"]["train"])
yoo_train_files = as_file_list(splits["yoochoose"]["train"])

for _label, _paths in (
    ("amazon/train", amazon_train_files),
    ("yoochoose/train", yoo_train_files),
):
    assert_files_exist(_paths, _label)

source_train_files = [*amazon_train_files, *yoo_train_files]

print("[05B-06] Source train files:", len(source_train_files))
print("[05B-06] Example file:", source_train_files[0])

# One shared vocabulary for both source domains, capped at the 200k most
# frequent items.
item2id_source = build_vocab_from_prefix_parts(source_train_files, top_k=200_000, log_every=2)

# Persist the mapping as JSON next to the other vocab files.
source_vocab_path = VOCAB_DIR.joinpath("item2id_source.json")
with source_vocab_path.open("w") as vocab_file:
    json.dump(item2id_source, vocab_file)

print("[05B-06] Saved source vocab:", source_vocab_path)
print("[05B-06] Source vocab size:", len(item2id_source))


[05B-06] Source train files: 33
[05B-06] Example file: ..\data\processed\amazon_prefix_parts\amazon_prefix_target_part0000.parquet
[05B-05][VOCAB] 2/33 files | unique_items_so_far=1058402
[05B-05][VOCAB] 4/33 files | unique_items_so_far=1655641
[05B-05][VOCAB] 6/33 files | unique_items_so_far=2109510
[05B-05][VOCAB] 8/33 files | unique_items_so_far=2485306
[05B-05][VOCAB] 10/33 files | unique_items_so_far=2810132
[05B-05][VOCAB] 12/33 files | unique_items_so_far=3094795
[05B-05][VOCAB] 14/33 files | unique_items_so_far=3354653
[05B-05][VOCAB] 16/33 files | unique_items_so_far=3381969
[05B-05][VOCAB] 18/33 files | unique_items_so_far=3387102
[05B-05][VOCAB] 20/33 files | unique_items_so_far=3390741
[05B-05][VOCAB] 22/33 files | unique_items_so_far=3393720
[05B-05][VOCAB] 24/33 files | unique_items_so_far=3396216
[05B-05][VOCAB] 26/33 files | unique_items_so_far=3398247
[05B-05][VOCAB] 28/33 files | unique_items_so_far=3399956
[05B-05][VOCAB] 30/33 files | unique_items_so_far=3401302
[05

Build and save target vocabulary (MARS TRAIN only)

In [8]:
# CELL [05B-07] — Build TARGET vocab (mars TRAIN)

# Normalise the mars/train manifest entry to a list and verify it on disk.
mars_train_file = splits["mars"]["train"]
mars_train_files = as_file_list(mars_train_file)
assert_files_exist(mars_train_files, "mars/train")

print("[05B-07] Mars train file:", mars_train_files[0])

# top_k=None: every item seen in the mars training data gets its own id.
item2id_target = build_vocab_from_prefix_parts(mars_train_files, top_k=None, log_every=1)

# Persist the mapping as JSON next to the source vocabulary.
target_vocab_path = VOCAB_DIR.joinpath("item2id_target.json")
with target_vocab_path.open("w") as vocab_file:
    json.dump(item2id_target, vocab_file)

print("[05B-07] Saved target vocab:", target_vocab_path)
print("[05B-07] Target vocab size:", len(item2id_target))


[05B-07] Mars train file: ..\data\processed\mars_splits\mars_prefix_target_train.parquet
[05B-05][VOCAB] 1/1 files | unique_items_so_far=700
[05B-07] Saved target vocab: ..\data\processed\vocab_topn\item2id_target.json
[05B-07] Target vocab size: 702


Tensorization core (writes shards with logs)

In [9]:
# CELL [05B-08] — Tensorization core

def tensorize_split(domain, split_name, files, item2id, max_len=MAX_PREFIX_LEN):
    """
    Convert prefix->target pairs in parquet files into tensor shards.

    For every row, the whitespace-separated ``prefix`` is mapped through
    ``item2id`` (tokens not in the vocab become UNK_ID), cut down to its LAST
    ``max_len`` tokens and left-padded with PAD_ID. The ``target`` is looked up
    under its ``str()`` form, again falling back to UNK_ID.

    Rows are buffered and written to
    ``OUT_DIR / "{domain}_{split_name}_shard_{NNN}.pt"`` every SHARD_SIZE rows,
    plus one final partial shard. Each shard is a dict of int64 tensors:
    ``input_ids``, ``attention_mask``, ``pos_ids`` of shape (rows, max_len) and
    ``labels``, ``lengths`` of shape (rows,).

    NOTE(review): shards left in OUT_DIR by an earlier run are overwritten by
    index but never deleted, so a re-run that produces fewer shards leaves the
    old higher-numbered files behind.

    Args:
        domain: domain name, used in shard file names and log lines.
        split_name: split name, used in shard file names and log lines.
        files: iterable of parquet paths holding "prefix" and "target" columns.
        item2id: dict mapping item token -> integer id.
        max_len: number of prefix positions per example (must be >= 1).

    Returns:
        dict with "total_rows" (rows read) and "shards" (shard files written).

    Raises:
        ValueError: if ``max_len`` is smaller than 1.
        FileNotFoundError: if any input file is missing.
    """
    # ids[-0:] would return the whole list instead of an empty one, producing
    # ragged rows, so a non-positive length is rejected up front.
    if max_len < 1:
        raise ValueError(f"[05B-08] max_len must be >= 1, got {max_len}")

    files = list(files)
    assert_files_exist(files, f"{domain}/{split_name}")

    shard_id = 0
    n_rows_total = 0

    # Position ids are the same 0..max_len-1 row for every example, so they are
    # expanded once per shard in flush() rather than buffered per row.
    pos_row = torch.arange(max_len, dtype=torch.long)

    buffer = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
        "lengths": []
    }

    def flush():
        """Write the buffered rows as the next shard and empty the buffer."""
        nonlocal shard_id
        n_buffered = len(buffer["input_ids"])
        if n_buffered == 0:
            return

        pt = {
            "input_ids": torch.tensor(buffer["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(buffer["attention_mask"], dtype=torch.long),
            "pos_ids": pos_row.repeat(n_buffered, 1),
            "labels": torch.tensor(buffer["labels"], dtype=torch.long),
            "lengths": torch.tensor(buffer["lengths"], dtype=torch.long),
        }
        out_path = OUT_DIR / f"{domain}_{split_name}_shard_{shard_id:03d}.pt"
        torch.save(pt, out_path)

        print(f"[05B-08][SAVE] {domain}/{split_name} shard={shard_id:03d} "
              f"rows={n_buffered} -> {out_path.name}")

        shard_id += 1
        for rows in buffer.values():
            rows.clear()

    for fi, p in enumerate(files, 1):
        df = pd.read_parquet(p, columns=["prefix", "target"])
        n_rows_total += len(df)

        for row in df.itertuples(index=False):
            pref_tokens = row.prefix.split() if isinstance(row.prefix, str) and row.prefix else []
            ids = [item2id.get(t, UNK_ID) for t in pref_tokens]

            # Keep only the most recent max_len items (the oldest are dropped).
            if len(ids) > max_len:
                ids = ids[-max_len:]

            length = len(ids)
            pad_len = max_len - length

            # Left-pad so the real tokens sit at the end of the sequence.
            input_ids = [PAD_ID] * pad_len + ids
            attention_mask = [0] * pad_len + [1] * length
            label = item2id.get(str(row.target), UNK_ID)

            buffer["input_ids"].append(input_ids)
            buffer["attention_mask"].append(attention_mask)
            buffer["labels"].append(label)
            buffer["lengths"].append(length)

            if len(buffer["input_ids"]) >= SHARD_SIZE:
                flush()

        print(f"[05B-08][READ] {domain}/{split_name} file {fi}/{len(files)} "
              f"rows={len(df)} | total_rows_seen={n_rows_total}")

    # Write whatever is left as the final (possibly partial) shard.
    flush()
    print(f"[05B-08][DONE] {domain}/{split_name} total_rows={n_rows_total} shards={shard_id}")
    return {"total_rows": n_rows_total, "shards": shard_id}


Run tensorization for ALL domains/splits

In [10]:
# CELL [05B-09] — Run tensorization (ALL domains & splits)

# The two SOURCE domains share the source vocabulary; the TARGET domain (mars,
# one parquet path per split) uses its own vocabulary.
_domain_vocabs = (
    ("amazon", item2id_source),
    ("yoochoose", item2id_source),
    ("mars", item2id_target),
)

results = {}
for domain, _vocab in _domain_vocabs:
    results[domain] = {}
    for split_name in ("train", "val", "test"):
        files = as_file_list(splits[domain][split_name])
        print(f"[05B-09] Tensorizing {domain}/{split_name} | files={len(files)}")
        results[domain][split_name] = tensorize_split(domain, split_name, files, _vocab)


[05B-09] Tensorizing amazon/train | files=14
[05B-08][SAVE] amazon/train shard=000 rows=250000 -> amazon_train_shard_000.pt
[05B-08][SAVE] amazon/train shard=001 rows=250000 -> amazon_train_shard_001.pt
[05B-08][SAVE] amazon/train shard=002 rows=250000 -> amazon_train_shard_002.pt
[05B-08][SAVE] amazon/train shard=003 rows=250000 -> amazon_train_shard_003.pt
[05B-08][READ] amazon/train file 1/14 rows=1000000 | total_rows_seen=1000000
[05B-08][SAVE] amazon/train shard=004 rows=250000 -> amazon_train_shard_004.pt
[05B-08][SAVE] amazon/train shard=005 rows=250000 -> amazon_train_shard_005.pt
[05B-08][SAVE] amazon/train shard=006 rows=250000 -> amazon_train_shard_006.pt
[05B-08][SAVE] amazon/train shard=007 rows=250000 -> amazon_train_shard_007.pt
[05B-08][READ] amazon/train file 2/14 rows=1000000 | total_rows_seen=2000000
[05B-08][SAVE] amazon/train shard=008 rows=250000 -> amazon_train_shard_008.pt
[05B-08][SAVE] amazon/train shard=009 rows=250000 -> amazon_train_shard_009.pt
[05B-08][SA

Save metadata (single source of truth)

In [11]:
# CELL [05B-10] — Save metadata.json

# Where each vocabulary was saved and how many entries it holds.
_vocab_info = {
    "source": {"path": str(source_vocab_path), "size": len(item2id_source)},
    "target": {"path": str(target_vocab_path), "size": len(item2id_target)},
}

# Everything needed to interpret the shards: constants, vocab locations,
# tensor field names, and the per-domain/split row and shard counts.
metadata = {
    "max_prefix_len": MAX_PREFIX_LEN,
    "pad_id": PAD_ID,
    "unk_id": UNK_ID,
    "shard_size": SHARD_SIZE,
    "vocab": _vocab_info,
    "tensor_output_dir": str(OUT_DIR),
    "tensor_fields": ["input_ids", "attention_mask", "pos_ids", "labels", "lengths"],
    "results": results,
}

meta_path = OUT_DIR.joinpath("metadata.json")
with meta_path.open("w") as meta_file:
    json.dump(metadata, meta_file, indent=2)

print("[05B-10] Saved metadata:", meta_path)

# Condensed view: number of shards written per domain and split.
_shard_counts = {}
for _dom, _per_split in results.items():
    _shard_counts[_dom] = {_split: _info["shards"] for _split, _info in _per_split.items()}
print("[05B-10] Summary:", _shard_counts)


[05B-10] Saved metadata: ..\data\processed\tensor_shards_v2\metadata.json
[05B-10] Summary: {'amazon': {'train': 56, 'val': 8, 'test': 6}, 'yoochoose': {'train': 76, 'val': 8, 'test': 12}, 'mars': {'train': 1, 'val': 1, 'test': 1}}


Quick sanity check (loads one shard)

In [12]:
# CELL [05B-11] — Sanity check: load one shard

example = OUT_DIR / "mars_train_shard_000.pt"
if example.exists():
    # A shard is a plain dict of tensors, so the restricted unpickler is enough.
    # Passing weights_only=True explicitly keeps the load safe on torch versions
    # whose default still unpickles arbitrary objects.
    batch = torch.load(example, weights_only=True)
    print("[05B-11] Loaded:", example.name)
    # Shape and dtype of every tensor stored in the shard.
    for k, v in batch.items():
        print("   ", k, v.shape, v.dtype)
    # First row: padding comes first, real tokens last.
    print("[05B-11] First example input_ids:", batch["input_ids"][0].tolist())
    print("[05B-11] First example attention_mask:", batch["attention_mask"][0].tolist())
    print("[05B-11] First example label:", int(batch["labels"][0]))
else:
    print("[05B-11] Example shard not found yet:", example)


[05B-11] Loaded: mars_train_shard_000.pt
    input_ids torch.Size([1744, 20]) torch.int64
    attention_mask torch.Size([1744, 20]) torch.int64
    pos_ids torch.Size([1744, 20]) torch.int64
    labels torch.Size([1744]) torch.int64
    lengths torch.Size([1744]) torch.int64
[05B-11] First example input_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 124]
[05B-11] First example attention_mask: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
[05B-11] First example label: 226
