In [1]:
# Stop-gap for the libiomp5md.dll crash: set KMP_DUPLICATE_LIB_OK in this
# process's environment. This is an unsafe shortcut meant only to keep the
# notebook usable; it is not a real fix for the underlying library conflict.
# NOTE(review): the variable lives only in the current kernel process, so this
# cell presumably has to run again (before the heavy imports) after a restart.
import os

os.environ.update(KMP_DUPLICATE_LIB_OK="TRUE")
print("Set KMP_DUPLICATE_LIB_OK=TRUE — restart kernel and re-run cells now.")

Set KMP_DUPLICATE_LIB_OK=TRUE — restart kernel and re-run cells now.


In [2]:

import pandas as pd
import torch
from pathlib import Path
import json

# --- Configuration and shared state for shard creation ---

# Maximum number of item ids kept per prefix; shorter prefixes are left-padded
# with 0 and longer ones are truncated to their last MAX_PREFIX_LEN entries by
# the shard-building loop further down in this cell.
MAX_PREFIX_LEN = 20
DATA_DIR = Path("../data/processed")

# Path.read_text() opens, reads and closes the file, so no file handle is left
# dangling the way `json.load(open(...))` would leave one.
manifest = json.loads((DATA_DIR / "sessionization_manifest.json").read_text())
item2id = json.loads((DATA_DIR / "vocab_topn/item2id_top200000.json").read_text())

# Entries are handed to pd.read_parquet one at a time by the loop below.
pair_parts = manifest["amazon_prefix_parts"]

# Output directory for the saved tensor shards (DATA_DIR itself must already
# exist, since the manifest was just read from it).
OUT = DATA_DIR / "tensor_shards"
OUT.mkdir(exist_ok=True)

# Number of examples accumulated in `buffer` before a shard is written.
shard_size = 250_000
# Parallel lists: buffer["prefix"][i], buffer["target"][i] and
# buffer["length"][i] all describe the same example.
buffer = {"prefix": [], "target": [], "length": []}
# Index of the next shard file to write; incremented by flush().
shard_id = 0

def flush():
    """Write the buffered examples to the next shard file and reset the buffer.

    Does nothing when the buffer holds no examples. Otherwise the ``prefix``,
    ``target`` and ``length`` lists of the module-level ``buffer`` are turned
    into ``torch.LongTensor`` objects, saved together as one dict to
    ``OUT / "shard_<shard_id>.pt"`` (shard id zero-padded to three digits),
    the module-level ``shard_id`` is incremented, and the three lists are
    emptied in place so the same ``buffer`` dict can keep being filled.
    """
    global shard_id
    if len(buffer["prefix"]) == 0:
        return

    fields = ("prefix", "target", "length")
    tensors = {field: torch.LongTensor(buffer[field]) for field in fields}
    shard_path = OUT / f"shard_{shard_id:03d}.pt"
    torch.save(tensors, shard_path)
    shard_id += 1

    # Clear in place (rather than rebinding) so other references to the
    # lists/dict stay valid.
    for field in fields:
        buffer[field].clear()

# Convert every (prefix, target) row of each parquet part into fixed-length id
# lists, buffering them and writing a shard whenever `shard_size` is reached.
for part in pair_parts:
    df = pd.read_parquet(part, columns=["prefix", "target"])
    # Iterate the two columns directly: DataFrame.iterrows() constructs a
    # Series for every row, which is needlessly slow when only two scalar
    # values per row are needed. The values seen here are the same.
    for prefix_text, target_item in zip(df["prefix"], df["target"]):
        # Non-string prefixes (e.g. missing values) are treated as empty.
        tokens = prefix_text.split() if isinstance(prefix_text, str) else []
        # Keep only the last MAX_PREFIX_LEN tokens, then map them to ids;
        # tokens missing from the vocabulary map to 0.
        # NOTE(review): 0 is also the padding value below, so unknown items
        # and padding are indistinguishable — confirm this is intended.
        if len(tokens) > MAX_PREFIX_LEN:
            tokens = tokens[-MAX_PREFIX_LEN:]
        ids = [item2id.get(tok, 0) for tok in tokens]
        # Left-pad with zeros so every prefix has exactly MAX_PREFIX_LEN entries.
        padded = [0] * (MAX_PREFIX_LEN - len(ids)) + ids

        buffer["prefix"].append(padded)
        buffer["target"].append(item2id.get(str(target_item), 0))
        # Number of real (unpadded) positions in this prefix.
        buffer["length"].append(len(ids))

        if len(buffer["prefix"]) >= shard_size:
            flush()

# Write whatever is left over as a final, possibly smaller, shard.
flush()
print("Finished creating tensor shards.")


Finished creating tensor shards.
