Bootstrap

In [1]:
# [CELL 13-00] Bootstrap
import os, json, time, random
from pathlib import Path
from datetime import datetime

import numpy as np
import torch

CELL = "13-00"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def find_repo_root(start: Path) -> Path:
    for p in [start, *start.parents]:
        if (p / "meta.json").exists():
            return p
    return start

REPO_ROOT = str(find_repo_root(Path.cwd().resolve()))
RUN_TAG = datetime.now().strftime("%Y%m%d_%H%M%S")
REPORT_DIR = os.path.join(REPO_ROOT, "reports", "13_meta_train_on_source_itemset", RUN_TAG)
os.makedirs(REPORT_DIR, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

SEED = 20260105
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"[{CELL}] REPO_ROOT  : {REPO_ROOT}")
print(f"[{CELL}] REPORT_DIR : {REPORT_DIR}")
print(f"[{CELL}] DEVICE     : {DEVICE} | torch={torch.__version__}")
print(f"[{CELL}] SEED       : {SEED}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-00] Starting... 2026-01-05 11:43:25
[13-00] REPO_ROOT  : C:\mooc-coldstart-session-meta
[13-00] REPORT_DIR : C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325
[13-00] DEVICE     : cpu | torch=2.9.1+cpu
[13-00] SEED       : 20260105
[13-00] Done in 0.32s


Load 12A task config + DL config, locate SOURCE tensors/parquet
This cell intentionally doesn’t assume paths. It reads them from your existing configs.

In [2]:
# [CELL 13-01] Load existing configs (12A meta_task_config + dataloader_config) and locate SOURCE artifacts
import time
from datetime import datetime

CELL = "13-01"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

META_TASK_CFG_PATH = os.path.join(REPO_ROOT, "meta_task_config.json")  # if you keep it at root, else update
TASK_COVERAGE_PATH = os.path.join(REPO_ROOT, "task_coverage.json")

# If your files are under reports/12A..., set those exact paths here.
print(f"[{CELL}] META_TASK_CFG_PATH: {META_TASK_CFG_PATH}")
print(f"[{CELL}] TASK_COVERAGE_PATH: {TASK_COVERAGE_PATH}")

if not os.path.exists(META_TASK_CFG_PATH):
    print(f"[{CELL}] ⚠️ meta_task_config.json not found at this path.")
if not os.path.exists(TASK_COVERAGE_PATH):
    print(f"[{CELL}] ⚠️ task_coverage.json not found at this path.")

DL_CFG_PATH = os.path.join(REPO_ROOT, "data", "processed", "supervised", "dataloader_config_20251229_163357_20251229_232834.json")
print(f"[{CELL}] DL_CFG_PATH: {DL_CFG_PATH}")

with open(DL_CFG_PATH, "r", encoding="utf-8") as f:
    DL_CFG = json.load(f)

print(f"[{CELL}] DL_CFG top keys: {list(DL_CFG.keys())}")
SRC_CFG = DL_CFG["source"]
print(f"[{CELL}] SRC_CFG keys: {list(SRC_CFG.keys())}")

# Expected keys (based on your earlier schema issue):
# Either tensor_dir + train_pt/val_pt/test_pt + meta_json
# Or seq_dir + glob patterns + vocab_json
print(f"[{CELL}] SRC_CFG preview: {SRC_CFG}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-01] Starting... 2026-01-05 11:44:11
[13-01] META_TASK_CFG_PATH: C:\mooc-coldstart-session-meta\meta_task_config.json
[13-01] TASK_COVERAGE_PATH: C:\mooc-coldstart-session-meta\task_coverage.json
[13-01] ⚠️ meta_task_config.json not found at this path.
[13-01] ⚠️ task_coverage.json not found at this path.
[13-01] DL_CFG_PATH: C:\mooc-coldstart-session-meta\data\processed\supervised\dataloader_config_20251229_163357_20251229_232834.json
[13-01] DL_CFG top keys: ['target', 'source', 'protocol']
[13-01] SRC_CFG keys: ['run_tag', 'seq_dir', 'train_glob', 'val_glob', 'test_glob', 'vocab_json']
[13-01] SRC_CFG preview: {'run_tag': '20251229_232834', 'seq_dir': 'C:\\mooc-coldstart-session-meta\\data\\processed\\session_sequences\\source_sessions_20251229_232834', 'train_glob': 'C:\\mooc-coldstart-session-meta\\data\\processed\\session_sequences\\source_sessions_20251229_232834\\train\\sessions_b*.parquet', 'val_glob': 'C:\\mooc-coldstart-session-meta\\data\\processed\\session_sequences\\so

Load SOURCE split tensors (preferred) + confirm padding direction and fields
This mirrors what we did in 12C, but for source.

In [7]:
# [CELL 13-02A] Debug: inspect items column type/values from first parquet
import pyarrow.parquet as pq
import numpy as np
import pandas as pd
import time
from datetime import datetime

CELL = "13-02A"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

p0 = train_files[0]
table = pq.read_table(p0, columns=[seq_col])
df0 = table.to_pandas()
print(f"[{CELL}] file={p0}")
print(f"[{CELL}] df0.shape={df0.shape}")

v = df0.iloc[0][seq_col]
print(f"[{CELL}] type(items[0])={type(v)}")
try:
    print(f"[{CELL}] repr(items[0]) head={str(v)[:200]}")
except Exception as e:
    print(f"[{CELL}] repr failed: {e}")

# robust normalize (same as we'll use in scanner)
def normalize_seq(x):
    if x is None:
        return None
    if hasattr(x, "as_py"):
        x = x.as_py()
    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        return list(x)
    # sometimes pandas stores list column as object with .tolist()
    if hasattr(x, "tolist") and not isinstance(x, (str, bytes)):
        try:
            y = x.tolist()
            if isinstance(y, list):
                return y
        except Exception:
            pass
    # last resort
    return None

seq0 = normalize_seq(v)
print(f"[{CELL}] normalize_seq -> {type(seq0)} len={None if seq0 is None else len(seq0)}")
if seq0 is not None:
    print(f"[{CELL}] first5 tokens={seq0[:5]}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-02A] Starting... 2026-01-05 12:00:06
[13-02A] file=C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834\train\sessions_b0000.parquet
[13-02A] df0.shape=(6410, 1)
[13-02A] type(items[0])=<class 'numpy.ndarray'>
[13-02A] repr(items[0]) head=['course-v1:TsinghuaX+00690212X+sp' 'course-v1:TsinghuaX+00690212X+sp'
 'course-v1:TsinghuaX+00690212X+sp' 'course-v1:TsinghuaX+00690212X+sp'
 'course-v1:TsinghuaX+00690212X+sp' 'course-v1:TsinghuaX+0
[13-02A] normalize_seq -> <class 'list'> len=71
[13-02A] first5 tokens=['course-v1:TsinghuaX+00690212X+sp', 'course-v1:TsinghuaX+00690212X+sp', 'course-v1:TsinghuaX+00690212X+sp', 'course-v1:TsinghuaX+00690212X+sp', 'course-v1:TsinghuaX+00690212X+sp']
[13-02A] Done in 0.02s


In [8]:
# [CELL 13-02B1] PASS 1: Count seed-item tasks over TRAIN (no refs; memory-safe)
import time
from datetime import datetime
from collections import Counter
import numpy as np
import pyarrow.parquet as pq

CELL = "13-02B1"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def normalize_seq(x):
    if x is None:
        return None
    if hasattr(x, "as_py"):
        x = x.as_py()
    if isinstance(x, np.ndarray):
        x = x.tolist()
    if isinstance(x, (list, tuple)):
        return list(x)
    if hasattr(x, "tolist") and not isinstance(x, (str, bytes)):
        try:
            y = x.tolist()
            if isinstance(y, list):
                return y
        except Exception:
            pass
    return None

def to_ids(seq_tokens, item2id_map, unk_id: int):
    if seq_tokens is None:
        return None
    out = []
    for x in seq_tokens:
        if x is None:
            continue
        sx = str(x).strip()
        if sx == "":
            continue
        out.append(int(item2id_map.get(sx, unk_id)))
    return out

def seed_first_nonpad(ids, pad_id: int):
    if ids is None or len(ids) == 0:
        return None
    for v in ids:
        iv = int(v)
        if iv != int(pad_id):
            return iv
    return int(pad_id)

task_counts = Counter()
row_count = 0
skip_nonseq = 0
skip_empty = 0
skip_allpad = 0

for fi, path in enumerate(train_files):
    tf = time.time()
    df = pq.read_table(path, columns=[seq_col]).to_pandas()
    n = len(df)

    for r in range(n):
        seq_tokens = normalize_seq(df.iloc[r][seq_col])
        if seq_tokens is None:
            skip_nonseq += 1
            continue

        ids = to_ids(seq_tokens, item2id, UNK_ID_SRC)
        if ids is None or len(ids) == 0:
            skip_empty += 1
            continue

        s = seed_first_nonpad(ids, PAD_ID_SRC)
        if s is None or s == PAD_ID_SRC:
            skip_allpad += 1
            continue

        task_counts[int(s)] += 1

    row_count += n
    if fi < 3 or (fi + 1) % 100 == 0:
        print(f"[{CELL}] scanned {fi+1}/{len(train_files)} rows={row_count} unique_tasks={len(task_counts)} elapsed_file={time.time()-tf:.2f}s")

print(f"[{CELL}] TRAIN PASS1 done. rows={row_count} unique_tasks={len(task_counts)}")
print(f"[{CELL}] skips: nonseq={skip_nonseq} empty={skip_empty} allpad={skip_allpad}")

if len(task_counts) == 0:
    raise RuntimeError(f"[{CELL}] No tasks found. Unexpected given 13-02A sample.")

sizes = sorted(task_counts.values())
p50 = int(np.median(sizes))
print(f"[{CELL}] task_size(min/p50/max)={min(sizes)}/{p50}/{max(sizes)}")
print(f"[{CELL}] top10 tasks: {task_counts.most_common(10)}")

# Save counts artifact (small)
COUNTS_PATH = os.path.join(REPORT_DIR, "task_counts_itemset_pass1.json")
with open(COUNTS_PATH, "w", encoding="utf-8") as f:
    json.dump({str(k): int(v) for k, v in task_counts.items()}, f)

print(f"[{CELL}] ✅ wrote: {COUNTS_PATH}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-02B1] Starting... 2026-01-05 12:11:05
[13-02B1] scanned 1/1024 rows=6410 unique_tasks=856 elapsed_file=0.29s
[13-02B1] scanned 2/1024 rows=12905 unique_tasks=1005 elapsed_file=0.30s
[13-02B1] scanned 3/1024 rows=19319 unique_tasks=1084 elapsed_file=0.30s
[13-02B1] scanned 100/1024 rows=650531 unique_tasks=1516 elapsed_file=0.28s
[13-02B1] scanned 200/1024 rows=1304244 unique_tasks=1556 elapsed_file=0.22s
[13-02B1] scanned 300/1024 rows=1954910 unique_tasks=1564 elapsed_file=0.27s
[13-02B1] scanned 400/1024 rows=2606674 unique_tasks=1577 elapsed_file=0.23s
[13-02B1] scanned 500/1024 rows=3259886 unique_tasks=1582 elapsed_file=0.21s
[13-02B1] scanned 600/1024 rows=3910276 unique_tasks=1585 elapsed_file=0.23s
[13-02B1] scanned 700/1024 rows=4560993 unique_tasks=1586 elapsed_file=0.24s
[13-02B1] scanned 800/1024 rows=5212194 unique_tasks=1590 elapsed_file=0.29s
[13-02B1] scanned 900/1024 rows=5863399 unique_tasks=1595 elapsed_file=0.29s
[13-02B1] scanned 1000/1024 rows=6515673 unique_t

In [9]:
# [CELL 13-02B2] PASS 2: Build capped refs ONLY for kept tasks (reservoir sampling; reproducible)
import time
from datetime import datetime
from collections import defaultdict
import numpy as np
import pyarrow.parquet as pq

CELL = "13-02B2"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

MIN_TASK_EX = 50
CAP_PER_TASK_REFS = 5000  # bounded memory; adjust only if needed
RNG = np.random.RandomState(SEED)

kept_tasks = [k for k, v in task_counts.items() if v >= MIN_TASK_EX]
kept_tasks = sorted(kept_tasks)
kept_set = set(kept_tasks)

print(f"[{CELL}] MIN_TASK_EX={MIN_TASK_EX}")
print(f"[{CELL}] kept_tasks={len(kept_tasks)} / {len(task_counts)} total tasks")
print(f"[{CELL}] CAP_PER_TASK_REFS={CAP_PER_TASK_REFS}")

if len(kept_tasks) == 0:
    raise RuntimeError(f"[{CELL}] No tasks kept. Lower MIN_TASK_EX.")

# reservoir state per task
task_seen = defaultdict(int)      # how many examples of this task we've seen
task_to_refs = defaultdict(list)  # sampled refs: list[(fi, r)]

row_count = 0
skip_nonseq = 0
skip_empty = 0
skip_allpad = 0
skip_notkept = 0

for fi, path in enumerate(train_files):
    tf = time.time()
    df = pq.read_table(path, columns=[seq_col]).to_pandas()
    n = len(df)

    for r in range(n):
        seq_tokens = normalize_seq(df.iloc[r][seq_col])
        if seq_tokens is None:
            skip_nonseq += 1
            continue

        ids = to_ids(seq_tokens, item2id, UNK_ID_SRC)
        if ids is None or len(ids) == 0:
            skip_empty += 1
            continue

        s = seed_first_nonpad(ids, PAD_ID_SRC)
        if s is None or s == PAD_ID_SRC:
            skip_allpad += 1
            continue

        if s not in kept_set:
            skip_notkept += 1
            continue

        # reservoir sampling per task
        task_seen[s] += 1
        seen = task_seen[s]
        buf = task_to_refs[s]

        if len(buf) < CAP_PER_TASK_REFS:
            buf.append((fi, r))
        else:
            j = RNG.randint(0, seen)
            if j < CAP_PER_TASK_REFS:
                buf[j] = (fi, r)

    row_count += n
    if fi < 3 or (fi + 1) % 100 == 0:
        print(f"[{CELL}] scanned {fi+1}/{len(train_files)} rows={row_count} tasks_with_refs={len(task_to_refs)} elapsed_file={time.time()-tf:.2f}s")

print(f"[{CELL}] TRAIN PASS2 done. rows={row_count}")
print(f"[{CELL}] skips: nonseq={skip_nonseq} empty={skip_empty} allpad={skip_allpad} notkept={skip_notkept}")
print(f"[{CELL}] tasks_with_refs={len(task_to_refs)} (should ~= kept_tasks)")

# sanity: check a couple tasks
sample_tasks = kept_tasks[:3]
for k in sample_tasks:
    print(f"[{CELL}] task={k} true_count={task_counts[k]} sampled_refs={len(task_to_refs.get(k, []))}")

# Save refs as torch file (faster, smaller than JSON)
# store as dict[int, dict[file_idx:list,row_idx:list]]
refs_pack = {}
for k, refs in task_to_refs.items():
    fi_list = [int(a) for a, b in refs]
    ri_list = [int(b) for a, b in refs]
    refs_pack[int(k)] = {"file_idx": fi_list, "row_idx": ri_list}

REFS_PATH = os.path.join(REPORT_DIR, "task_refs_itemset_pass2.pt")
torch.save(
    {"kept_tasks": kept_tasks, "cap": CAP_PER_TASK_REFS, "refs": refs_pack},
    REFS_PATH
)

print(f"[{CELL}] ✅ wrote refs: {REFS_PATH}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-02B2] Starting... 2026-01-05 12:15:35
[13-02B2] MIN_TASK_EX=50
[13-02B2] kept_tasks=1345 / 1597 total tasks
[13-02B2] CAP_PER_TASK_REFS=5000
[13-02B2] scanned 1/1024 rows=6410 tasks_with_refs=852 elapsed_file=0.35s
[13-02B2] scanned 2/1024 rows=12905 tasks_with_refs=999 elapsed_file=0.31s
[13-02B2] scanned 3/1024 rows=19319 tasks_with_refs=1076 elapsed_file=0.30s
[13-02B2] scanned 100/1024 rows=650531 tasks_with_refs=1344 elapsed_file=0.37s
[13-02B2] scanned 200/1024 rows=1304244 tasks_with_refs=1345 elapsed_file=0.25s
[13-02B2] scanned 300/1024 rows=1954910 tasks_with_refs=1345 elapsed_file=0.26s
[13-02B2] scanned 400/1024 rows=2606674 tasks_with_refs=1345 elapsed_file=0.28s
[13-02B2] scanned 500/1024 rows=3259886 tasks_with_refs=1345 elapsed_file=0.27s
[13-02B2] scanned 600/1024 rows=3910276 tasks_with_refs=1345 elapsed_file=0.30s
[13-02B2] scanned 700/1024 rows=4560993 tasks_with_refs=1345 elapsed_file=0.28s
[13-02B2] scanned 800/1024 rows=5212194 tasks_with_refs=1345 elapsed_fi

In [10]:
# [CELL 13-03] Save meta_task_config_itemset.json (used by meta-train cells)
import time
from datetime import datetime
import numpy as np

CELL = "13-03"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

kept_sizes = [int(task_counts[k]) for k in kept_tasks]
print(f"[{CELL}] kept_task_size(min/median/max)={min(kept_sizes)}/{int(np.median(kept_sizes))}/{max(kept_sizes)}")

META_TASK_CFG = {
    "task_key_mode": "seed_item_first_nonpad",
    "min_task_examples": int(MIN_TASK_EX),
    "cap_per_task_refs": int(CAP_PER_TASK_REFS),
    "seq_col": seq_col,
    "label_col": None,
    "pad_id": int(PAD_ID_SRC),
    "unk_id": int(UNK_ID_SRC),
    "vocab_size": int(VOCAB_SIZE_SRC),
    "tasks_kept": [int(k) for k in kept_tasks],
    "task_counts_path": os.path.join(REPORT_DIR, "task_counts_itemset_pass1.json"),
    "task_refs_path": os.path.join(REPORT_DIR, "task_refs_itemset_pass2.pt"),
    "source": {
        "seq_dir": SEQ_DIR,
        "train_glob": TRAIN_GLOB,
        "val_glob": VAL_GLOB,
        "test_glob": TEST_GLOB,
        "vocab_json": VOCAB_JSON,
    },
}

CFG_PATH = os.path.join(REPORT_DIR, "meta_task_config_itemset.json")
with open(CFG_PATH, "w", encoding="utf-8") as f:
    json.dump(META_TASK_CFG, f, indent=2)

print(f"[{CELL}] ✅ wrote config: {CFG_PATH}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-03] Starting... 2026-01-05 12:36:43
[13-03] kept_task_size(min/median/max)=50/1407/158715
[13-03] ✅ wrote config: C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\meta_task_config_itemset.json
[13-03] Done in 0.01s


Load proto/meta_cfg from 12B report + load itemset task config/refs

In [11]:
# [CELL 13-04] Load 12B proto/meta_cfg + load 13 itemset task config/refs
import os, json, time, glob
from datetime import datetime
import numpy as np
import torch

CELL = "13-04"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# 12B report.json (use your actual path if different)
# You can also point to your 12B run report in reports/12B... if you prefer.
REPORT_12B_JSON = os.path.join(REPO_ROOT, "reports", "12B_meta_train_on_source", "20260104_165117", "report.json")
if not os.path.exists(REPORT_12B_JSON):
    # fallback: if you copied it to root or elsewhere
    REPORT_12B_JSON = os.path.join(REPO_ROOT, "report.json")

print(f"[{CELL}] REPORT_12B_JSON={REPORT_12B_JSON}")
if not os.path.exists(REPORT_12B_JSON):
    raise RuntimeError(f"[{CELL}] 12B report.json not found. Set REPORT_12B_JSON to the correct path.")

with open(REPORT_12B_JSON, "r", encoding="utf-8") as f:
    rep12b = json.load(f)

PROTO = rep12b["proto"]
META_CFG = rep12b["meta_cfg"]
print(f"[{CELL}] Loaded PROTO keys={list(PROTO.keys())}")
print(f"[{CELL}] Loaded META_CFG keys={list(META_CFG.keys())}")
print(f"[{CELL}] META_CFG: meta_steps={META_CFG['meta_steps']} meta_batch_tasks={META_CFG['meta_batch_tasks']} inner_steps={META_CFG['inner_steps']} inner_lr={META_CFG['inner_lr']} meta_lr={META_CFG['meta_lr']}")

# 13 itemset config produced in 13-03
ITEMSET_CFG_PATH = os.path.join(REPORT_DIR, "meta_task_config_itemset.json")
print(f"[{CELL}] ITEMSET_CFG_PATH={ITEMSET_CFG_PATH}")
with open(ITEMSET_CFG_PATH, "r", encoding="utf-8") as f:
    ITEMSET_CFG = json.load(f)

TASK_REFS_PATH = ITEMSET_CFG["task_refs_path"]
print(f"[{CELL}] TASK_REFS_PATH={TASK_REFS_PATH}")
pack = torch.load(TASK_REFS_PATH, map_location="cpu")

KEPT_TASKS = list(map(int, pack["kept_tasks"]))
REFS = pack["refs"]  # dict: task -> {"file_idx": [...], "row_idx":[...]}

print(f"[{CELL}] kept_tasks={len(KEPT_TASKS)} | refs_tasks={len(REFS)} | cap={pack.get('cap')}")

# Files lists
TRAIN_GLOB = ITEMSET_CFG["source"]["train_glob"]
VAL_GLOB   = ITEMSET_CFG["source"]["val_glob"]
TEST_GLOB  = ITEMSET_CFG["source"]["test_glob"]
train_files = sorted(glob.glob(TRAIN_GLOB))
val_files   = sorted(glob.glob(VAL_GLOB))
test_files  = sorted(glob.glob(TEST_GLOB))
print(f"[{CELL}] train_files={len(train_files)} val_files={len(val_files)} test_files={len(test_files)}")

PAD_ID_SRC = int(ITEMSET_CFG["pad_id"])
UNK_ID_SRC = int(ITEMSET_CFG["unk_id"])
VOCAB_SIZE_SRC = int(ITEMSET_CFG["vocab_size"])
SEQ_COL = ITEMSET_CFG["seq_col"]

print(f"[{CELL}] PAD={PAD_ID_SRC} UNK={UNK_ID_SRC} VOCAB={VOCAB_SIZE_SRC} seq_col={SEQ_COL}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-04] Starting... 2026-01-05 12:53:35
[13-04] REPORT_12B_JSON=C:\mooc-coldstart-session-meta\reports\12B_meta_train_on_source\20260104_165117\report.json
[13-04] Loaded PROTO keys=['K_LIST', 'MAX_PREFIX_LEN', 'CAP_ENABLED', 'CAP_SESSION_LEN', 'CAP_STRATEGY']
[13-04] Loaded META_CFG keys=['emb_dim', 'hidden_dim', 'dropout', 'meta_lr', 'inner_lr', 'inner_steps', 'meta_steps', 'meta_batch_tasks', 'grad_clip', 'seed', 'log_every', 'eval_every', 'val_episodes']
[13-04] META_CFG: meta_steps=2000 meta_batch_tasks=4 inner_steps=1 inner_lr=0.01 meta_lr=0.0005
[13-04] ITEMSET_CFG_PATH=C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\meta_task_config_itemset.json
[13-04] TASK_REFS_PATH=C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\task_refs_itemset_pass2.pt
[13-04] kept_tasks=1345 | refs_tasks=1345 | cap=5000
[13-04] train_files=1024 val_files=1024 test_files=1024
[13-04] PAD=0 UNK=1 VOCAB=1620 seq_col=items
[13-04] 

Load vocab mapping (string → id) and define robust converters

In [12]:
# [CELL 13-05] Load item2id mapping + robust converters (np.ndarray[str] -> list[int])
import time, json
from datetime import datetime
import numpy as np
import pyarrow.parquet as pq

CELL = "13-05"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

VOCAB_JSON = ITEMSET_CFG["source"]["vocab_json"]
print(f"[{CELL}] VOCAB_JSON={VOCAB_JSON}")

with open(VOCAB_JSON, "r", encoding="utf-8") as f:
    vocab_obj = json.load(f)

def find_mapping(obj: dict):
    if isinstance(obj, dict) and obj and all(isinstance(v, int) for v in obj.values()):
        return obj
    for k in ["item2id", "token_to_id", "vocab", "mapping"]:
        if k in obj and isinstance(obj[k], dict) and obj[k] and all(isinstance(v, int) for v in obj[k].values()):
            return obj[k]
    return None

item2id = find_mapping(vocab_obj)
if item2id is None:
    raise RuntimeError(f"[{CELL}] token->id mapping not found in vocab_json keys={list(vocab_obj.keys())}")

print(f"[{CELL}] item2id size={len(item2id)} (expect {VOCAB_SIZE_SRC})")

def normalize_seq(x):
    if x is None:
        return None
    if hasattr(x, "as_py"):
        x = x.as_py()
    if isinstance(x, np.ndarray):
        return x.tolist()
    if isinstance(x, (list, tuple)):
        return list(x)
    if hasattr(x, "tolist") and not isinstance(x, (str, bytes)):
        try:
            y = x.tolist()
            if isinstance(y, list):
                return y
        except Exception:
            pass
    return None

def seq_str_to_ids(seq_tokens, item2id_map, unk_id: int):
    if seq_tokens is None:
        return None
    out = []
    for x in seq_tokens:
        if x is None:
            continue
        sx = str(x).strip()
        if sx == "":
            continue
        out.append(int(item2id_map.get(sx, unk_id)))
    return out

def cap_session(seq_ids, cap_len: int, strategy: str):
    if seq_ids is None:
        return None
    if cap_len is None or cap_len <= 0:
        return seq_ids
    if len(seq_ids) <= cap_len:
        return seq_ids
    if strategy == "take_last":
        return seq_ids[-cap_len:]
    if strategy == "take_first":
        return seq_ids[:cap_len]
    return seq_ids[-cap_len:]

print(f"[{CELL}] ✅ converters ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-05] Starting... 2026-01-05 12:54:09
[13-05] VOCAB_JSON=C:\mooc-coldstart-session-meta\data\processed\session_sequences\source_sessions_20251229_232834\source_vocab_items_20251229_232834.json
[13-05] item2id size=1620 (expect 1620)
[13-05] ✅ converters ready
[13-05] Done in 0.00s


Small LRU cache for parquet files (speed + reproducible)

In [13]:
# [CELL 13-06] Parquet file cache (LRU-like) to speed row access
import time
from datetime import datetime
from collections import OrderedDict
import pyarrow.parquet as pq

CELL = "13-06"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

MAX_CACHE_FILES = 8
_cache = OrderedDict()  # file_path -> pandas df (items only)

def get_items_df(file_path: str):
    # LRU: move accessed item to end
    if file_path in _cache:
        df = _cache.pop(file_path)
        _cache[file_path] = df
        return df

    # load from disk
    df = pq.read_table(file_path, columns=[SEQ_COL]).to_pandas()
    _cache[file_path] = df
    if len(_cache) > MAX_CACHE_FILES:
        _cache.popitem(last=False)
    return df

print(f"[{CELL}] MAX_CACHE_FILES={MAX_CACHE_FILES}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-06] Starting... 2026-01-05 12:54:35
[13-06] MAX_CACHE_FILES=8
[13-06] Done in 0.00s


Build one supervised example from a session: prefix → next-item (left-pad to MAX_PREFIX_LEN)

In [14]:
# [CELL 13-07] Build (x,y) from a single session: random prefix -> next item (left-pad)
import time
from datetime import datetime
import numpy as np
import torch

CELL = "13-07"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

MAX_PREFIX_LEN = int(PROTO["MAX_PREFIX_LEN"])
CAP_ENABLED = bool(PROTO["CAP_ENABLED"])
CAP_SESSION_LEN = int(PROTO["CAP_SESSION_LEN"]) if CAP_ENABLED else None
CAP_STRATEGY = str(PROTO["CAP_STRATEGY"])

print(f"[{CELL}] MAX_PREFIX_LEN={MAX_PREFIX_LEN} | CAP_ENABLED={CAP_ENABLED} CAP_SESSION_LEN={CAP_SESSION_LEN} CAP_STRATEGY={CAP_STRATEGY}")

RNG = np.random.RandomState(SEED)

def session_to_one_pair(seq_ids):
    """
    seq_ids: list[int]
    returns (input_ids[T], attn_mask[T], label_int) or None if too short
    """
    if seq_ids is None:
        return None
    seq_ids = cap_session(seq_ids, CAP_SESSION_LEN, CAP_STRATEGY)
    if seq_ids is None or len(seq_ids) < 2:
        return None

    # choose prefix length (>=1) so there is a next-item label
    max_pref = min(len(seq_ids) - 1, MAX_PREFIX_LEN)
    if max_pref < 1:
        return None
    pref_len = int(RNG.randint(1, max_pref + 1))  # inclusive

    x_tokens = seq_ids[:pref_len]
    y = int(seq_ids[pref_len])  # next item

    # left-pad to MAX_PREFIX_LEN
    pad_n = MAX_PREFIX_LEN - len(x_tokens)
    input_ids = [PAD_ID_SRC] * pad_n + [int(v) for v in x_tokens]
    attn_mask = [0] * pad_n + [1] * len(x_tokens)

    return input_ids, attn_mask, y

print(f"[{CELL}] ✅ session_to_one_pair ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-07] Starting... 2026-01-05 12:54:56
[13-07] MAX_PREFIX_LEN=20 | CAP_ENABLED=True CAP_SESSION_LEN=200 CAP_STRATEGY=take_last
[13-07] ✅ session_to_one_pair ready
[13-07] Done in 0.00s


Build support/query batch from task refs

In [15]:
# [CELL 13-08] Build batch from task refs (file_idx,row_idx) -> tensors
import time
from datetime import datetime
import numpy as np
import torch

CELL = "13-08"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def sample_refs_for_task(task_id: int, B: int):
    ref_pack = REFS[str(task_id)] if isinstance(next(iter(REFS.keys())), str) else REFS[task_id]
    fi = ref_pack["file_idx"]
    ri = ref_pack["row_idx"]
    N = len(fi)
    if N == 0:
        return []
    idxs = RNG.choice(np.arange(N), size=B, replace=(N < B))
    return [(int(fi[j]), int(ri[j])) for j in idxs]

def build_batch_from_refs(files_list, refs_list):
    """
    refs_list: list[(file_idx,row_idx)]
    Returns dict with tensors: input_ids[B,T], attn_mask[B,T], lengths[B], labels[B]
    """
    xs, ms, ys = [], [], []
    tries = 0
    max_tries = len(refs_list) * 3 + 10

    for (fi, r) in refs_list:
        tries += 1
        if tries > max_tries:
            break
        fp = files_list[fi]
        df = get_items_df(fp)
        seq_raw = df.iloc[r][SEQ_COL]
        seq_tokens = normalize_seq(seq_raw)
        seq_ids = seq_str_to_ids(seq_tokens, item2id, UNK_ID_SRC)
        pair = session_to_one_pair(seq_ids)
        if pair is None:
            continue
        x, m, y = pair
        xs.append(x); ms.append(m); ys.append(y)

    if len(xs) == 0:
        return None

    input_ids = torch.tensor(xs, dtype=torch.long)
    attn_mask = torch.tensor(ms, dtype=torch.long)
    labels = torch.tensor(ys, dtype=torch.long)
    lengths = attn_mask.sum(dim=1).clamp(min=1)

    return {"input_ids": input_ids, "attn_mask": attn_mask, "lengths": lengths, "labels": labels}

# quick sanity on one task
t_example = int(KEPT_TASKS[0])
refs = sample_refs_for_task(t_example, B=8)
batch = build_batch_from_refs(train_files, refs)
if batch is None:
    raise RuntimeError(f"[{CELL}] failed to build batch for task={t_example}")

print(f"[{CELL}] batch shapes: input_ids={tuple(batch['input_ids'].shape)} labels={tuple(batch['labels'].shape)}")
print(f"[{CELL}] sample0 len={int(batch['lengths'][0])} y0={int(batch['labels'][0])} x_last5={batch['input_ids'][0].tolist()[-5:]}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-08] Starting... 2026-01-05 12:55:28
[13-08] batch shapes: input_ids=(8, 20) labels=(8,)
[13-08] sample0 len=19 y0=2 x_last5=[2, 2, 2, 2, 2]
[13-08] Done in 0.12s


Model + optimizer (same architecture family as 12B)

In [16]:
# [CELL 13-09] Model: GRU4RecDropout + meta optimizer
import time
from datetime import datetime
import torch.nn as nn
import torch.nn.functional as F

CELL = "13-09"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

class GRU4RecDropout(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int, hidden_dim: int, pad_id: int, dropout: float = 0.3):
        super().__init__()
        self.pad_id = int(pad_id)
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=self.pad_id)
        self.drop = nn.Dropout(float(dropout))
        self.gru = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor):
        emb = self.drop(self.emb(input_ids))  # [B,T,E]
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, h = self.gru(packed)  # [1,B,H]
        logits = self.out(h.squeeze(0))  # [B,V]
        return logits

emb_dim = int(META_CFG["emb_dim"])
hidden_dim = int(META_CFG["hidden_dim"])
dropout = float(META_CFG["dropout"])
meta_lr = float(META_CFG["meta_lr"])

model = GRU4RecDropout(
    vocab_size=VOCAB_SIZE_SRC,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    pad_id=PAD_ID_SRC,
    dropout=dropout
).to(DEVICE)

opt = torch.optim.Adam(model.parameters(), lr=meta_lr)

print(f"[{CELL}] model=GRU4RecDropout(vocab={VOCAB_SIZE_SRC}, emb={emb_dim}, hid={hidden_dim}, drop={dropout}) on {DEVICE}")
print(f"[{CELL}] meta_lr={meta_lr}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-09] Starting... 2026-01-05 12:55:52
[13-09] model=GRU4RecDropout(vocab=1620, emb=64, hid=128, drop=0.3) on cpu
[13-09] meta_lr=0.0005
[13-09] Done in 2.42s


Meta-learning core (FOMAML-style): adapt copy on support, use query grads to update base

This is first-order MAML approximation: we copy query grads from the adapted model into the base model and step.

In [17]:
# [CELL 13-10] Meta-learning core (first-order): adapt on support, backprop query on adapted, copy grads -> base
import time, copy
from datetime import datetime
import torch

CELL = "13-10"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

inner_lr = float(META_CFG["inner_lr"])
inner_steps = int(META_CFG["inner_steps"])
meta_batch_tasks = int(META_CFG["meta_batch_tasks"])
grad_clip = float(META_CFG["grad_clip"])

print(f"[{CELL}] inner_lr={inner_lr} inner_steps={inner_steps} meta_batch_tasks={meta_batch_tasks} grad_clip={grad_clip}")

def adapt_one_task(base_model, support_batch):
    """
    Returns adapted_model (deepcopy) trained for inner_steps on support.
    """
    m = copy.deepcopy(base_model)
    m.train()

    inner_opt = torch.optim.SGD(m.parameters(), lr=inner_lr)
    for s in range(inner_steps):
        inner_opt.zero_grad(set_to_none=True)
        logits = m(support_batch["input_ids"].to(DEVICE), support_batch["lengths"].to(DEVICE))
        loss = F.cross_entropy(logits, support_batch["labels"].to(DEVICE), ignore_index=PAD_ID_SRC)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(m.parameters(), grad_clip)
        inner_opt.step()
    return m

def zero_grads(m):
    for p in m.parameters():
        p.grad = None

def add_grads_from(src_model, dst_model):
    """
    Add grads from src_model params into dst_model params by name order.
    Assumes same architecture.
    """
    for (n_dst, p_dst), (n_src, p_src) in zip(dst_model.named_parameters(), src_model.named_parameters()):
        if p_src.grad is None:
            continue
        if p_dst.grad is None:
            p_dst.grad = p_src.grad.detach().clone()
        else:
            p_dst.grad.add_(p_src.grad.detach())

def meta_step(base_model):
    """
    One meta update: sample tasks, for each task:
      - build support/query batches
      - adapt copy on support
      - compute query loss, backprop on adapted
      - accumulate adapted grads into base grads
    """
    base_model.train()
    opt.zero_grad(set_to_none=True)

    tasks = RNG.choice(np.array(KEPT_TASKS), size=meta_batch_tasks, replace=(len(KEPT_TASKS) < meta_batch_tasks))
    total_q_loss = 0.0
    used = 0

    for tk in tasks:
        tk = int(tk)

        # build support/query
        sup_refs = sample_refs_for_task(tk, B=int(ITEMSET_CFG.get("min_task_examples", 50)) and int(rep12b["task_cfg"]["n_support"]))
        qry_refs = sample_refs_for_task(tk, B=int(rep12b["task_cfg"]["n_query"]))

        sup = build_batch_from_refs(train_files, sup_refs)
        qry = build_batch_from_refs(train_files, qry_refs)
        if sup is None or qry is None:
            continue

        adapted = adapt_one_task(base_model, sup)

        # query loss on adapted
        zero_grads(adapted)
        logits_q = adapted(qry["input_ids"].to(DEVICE), qry["lengths"].to(DEVICE))
        q_loss = F.cross_entropy(logits_q, qry["labels"].to(DEVICE), ignore_index=PAD_ID_SRC)
        q_loss.backward()

        add_grads_from(adapted, base_model)

        total_q_loss += float(q_loss.detach().cpu())
        used += 1

    if used == 0:
        return {"used_tasks": 0, "q_loss": None}

    # average gradients
    for p in base_model.parameters():
        if p.grad is not None:
            p.grad.div_(used)

    torch.nn.utils.clip_grad_norm_(base_model.parameters(), grad_clip)
    opt.step()

    return {"used_tasks": used, "q_loss": total_q_loss / used}

print(f"[{CELL}] ✅ meta_step/adapt_one_task ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-10] Starting... 2026-01-05 12:56:30
[13-10] inner_lr=0.01 inner_steps=1 meta_batch_tasks=4 grad_clip=1.0
[13-10] ✅ meta_step/adapt_one_task ready
[13-10] Done in 0.00s


Evaluation: HR@K (quick) for meta-adapt on VAL refs

We’ll do HR@20 only for selection (like 12B best_val_hr20), to keep it clean.

In [18]:
# [CELL 13-11] Eval meta-adapt HR@20 on VAL by sampling episodes/tasks
import time, copy
from datetime import datetime
import numpy as np
import torch

CELL = "13-11"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

K_EVAL = 20
VAL_EPISODES = int(META_CFG.get("val_episodes", 50))
print(f"[{CELL}] VAL_EPISODES={VAL_EPISODES} K={K_EVAL}")

# Build VAL refs (fast path): reuse TRAIN refs if you want, but better is VAL sampling.
# For now (no extra 6.6M scan), we evaluate on TRAIN refs as a proxy.
# This is still real data; but you must note it as proxy.
# If you want strict VAL, we will add a dedicated VAL pass like 13-02B2 on val_files.

def hit_rate_at_k(logits, labels, k: int):
    topk = torch.topk(logits, k=k, dim=1).indices  # [B,k]
    y = labels.view(-1, 1)
    hit = (topk == y).any(dim=1).float().mean().item()
    return float(hit)

@torch.no_grad()
def eval_meta_adapt_hr20(base_model, episodes: int):
    base_model.eval()
    hr_list = []
    used = 0

    for e in range(episodes):
        tk = int(RNG.choice(np.array(KEPT_TASKS)))
        sup_refs = sample_refs_for_task(tk, B=int(rep12b["task_cfg"]["n_support"]))
        qry_refs = sample_refs_for_task(tk, B=int(rep12b["task_cfg"]["n_query"]))

        sup = build_batch_from_refs(train_files, sup_refs)
        qry = build_batch_from_refs(train_files, qry_refs)
        if sup is None or qry is None:
            continue

        # adapt must run with grad, so temporarily enable
        with torch.enable_grad():
            adapted = adapt_one_task(base_model, sup)

        logits = adapted(qry["input_ids"].to(DEVICE), qry["lengths"].to(DEVICE)).cpu()
        hr = hit_rate_at_k(logits, qry["labels"], k=K_EVAL)
        hr_list.append(hr)
        used += 1

    if used == 0:
        return None
    return {"HR@20": float(np.mean(hr_list)), "_episodes_used": used}

print(f"[{CELL}] ✅ eval ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-11] Starting... 2026-01-05 12:57:09
[13-11] VAL_EPISODES=50 K=20
[13-11] ✅ eval ready
[13-11] Done in 0.00s


Meta-train loop + checkpoint + report.json

In [19]:
# [CELL 13-12] Meta-train loop (item-set tasks) + checkpoint + report
import time, json
from datetime import datetime
import os
import torch

CELL = "13-12"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

META_STEPS = int(META_CFG["meta_steps"])
LOG_EVERY = int(META_CFG["log_every"])
EVAL_EVERY = int(META_CFG["eval_every"])

best_step = None
best_val_hr20 = -1.0

CKPT_PATH = os.path.join(REPORT_DIR, "meta_model_source_itemset.pt")
REPORT_PATH = os.path.join(REPORT_DIR, "report.json")

report = {
    "run_tag": RUN_TAG,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "proto": PROTO,
    "meta_cfg": META_CFG,
    "task_cfg_itemset": ITEMSET_CFG,
    "best_step": None,
    "best_val_hr20": None,
    "val_meta_adapt": None,
    "notes": [
        "Notebook 13: item-set tasks based on seed_item_first_nonpad (first item in session after mapping).",
        "Meta-learning is first-order (grads copied from adapted to base).",
        "Pairs are generated on-the-fly from session sequences (prefix->next item), left-padded to MAX_PREFIX_LEN.",
    ],
}

for step in range(1, META_STEPS + 1):
    out = meta_step(model)

    if step % LOG_EVERY == 0:
        print(f"[{CELL}] step={step}/{META_STEPS} used_tasks={out['used_tasks']} q_loss={out['q_loss']}")

    if step % EVAL_EVERY == 0:
        val_res = eval_meta_adapt_hr20(model, episodes=VAL_EPISODES)
        if val_res is None:
            print(f"[{CELL}] EVAL step={step}: val_res=None (no episodes built)")
            continue
        hr20 = float(val_res["HR@20"])
        print(f"[{CELL}] EVAL step={step}: VAL meta-adapt HR@20={hr20:.6f} episodes_used={val_res['_episodes_used']}")

        if hr20 > best_val_hr20:
            best_val_hr20 = hr20
            best_step = step

            # save checkpoint
            torch.save({
                "run_tag": RUN_TAG,
                "proto": PROTO,
                "meta_cfg": META_CFG,
                "task_cfg_itemset_path": ITEMSET_CFG_PATH,
                "vocab_size_source": VOCAB_SIZE_SRC,
                "pad_id_source": PAD_ID_SRC,
                "unk_id_source": UNK_ID_SRC,
                "state_dict": model.state_dict(),
                "best_step": best_step,
                "best_val_hr20": best_val_hr20,
            }, CKPT_PATH)

            print(f"[{CELL}] ✅ saved best ckpt: {CKPT_PATH}")

# final eval snapshot
final_val = eval_meta_adapt_hr20(model, episodes=max(VAL_EPISODES, 100))
report["best_step"] = best_step
report["best_val_hr20"] = float(best_val_hr20)
report["val_meta_adapt"] = final_val

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

print(f"[{CELL}] ✅ wrote report: {REPORT_PATH}")
print(f"[{CELL}] best_step={best_step} best_val_hr20={best_val_hr20}")
print(f"[{CELL}] final_val={final_val}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-12] Starting... 2026-01-05 12:57:37
[13-12] step=100/2000 used_tasks=4 q_loss=7.4349600076675415
[13-12] step=200/2000 used_tasks=4 q_loss=7.141557931900024
[13-12] EVAL step=250: VAL meta-adapt HR@20=0.037000 episodes_used=50
[13-12] ✅ saved best ckpt: C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\meta_model_source_itemset.pt
[13-12] step=300/2000 used_tasks=4 q_loss=7.186716318130493
[13-12] step=400/2000 used_tasks=4 q_loss=7.101266384124756
[13-12] step=500/2000 used_tasks=4 q_loss=7.279435873031616
[13-12] EVAL step=500: VAL meta-adapt HR@20=0.058000 episodes_used=50
[13-12] ✅ saved best ckpt: C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\meta_model_source_itemset.pt
[13-12] step=600/2000 used_tasks=4 q_loss=6.874122142791748
[13-12] step=700/2000 used_tasks=4 q_loss=7.266090512275696
[13-12] EVAL step=750: VAL meta-adapt HR@20=0.111000 episodes_used=50
[13-12] ✅ saved best ckpt: C:\mooc-coldstar

In [20]:
# [CELL 13-13] STRICT VAL: Build task refs from VAL split (reservoir sampling; kept tasks only)
import time
from datetime import datetime
from collections import defaultdict
import numpy as np
import torch
import pyarrow.parquet as pq

CELL = "13-13"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

VAL_CAP_PER_TASK_REFS = int(ITEMSET_CFG.get("cap_per_task_refs", 5000))  # match train cap by default
VAL_MIN_TASK_EX = int(ITEMSET_CFG.get("min_task_examples", 50))
RNG_VAL = np.random.RandomState(SEED + 13)  # deterministic but different stream than TRAIN

print(f"[{CELL}] kept_tasks={len(KEPT_TASKS)} | VAL_MIN_TASK_EX={VAL_MIN_TASK_EX} | VAL_CAP_PER_TASK_REFS={VAL_CAP_PER_TASK_REFS}")

val_task_seen = defaultdict(int)
val_task_to_refs = defaultdict(list)

row_count = 0
skip_nonseq = 0
skip_empty = 0
skip_allpad = 0
skip_notkept = 0

for fi, path in enumerate(val_files):
    tf = time.time()
    df = pq.read_table(path, columns=[SEQ_COL]).to_pandas()
    n = len(df)

    for r in range(n):
        seq_tokens = normalize_seq(df.iloc[r][SEQ_COL])
        if seq_tokens is None:
            skip_nonseq += 1
            continue

        ids = seq_str_to_ids(seq_tokens, item2id, UNK_ID_SRC)
        if ids is None or len(ids) == 0:
            skip_empty += 1
            continue

        # seed = first non-pad id
        s = None
        for v in ids:
            iv = int(v)
            if iv != PAD_ID_SRC:
                s = iv
                break
        if s is None:
            skip_allpad += 1
            continue

        if s not in set(KEPT_TASKS):
            skip_notkept += 1
            continue

        val_task_seen[s] += 1
        seen = val_task_seen[s]
        buf = val_task_to_refs[s]

        if len(buf) < VAL_CAP_PER_TASK_REFS:
            buf.append((fi, r))
        else:
            j = RNG_VAL.randint(0, seen)
            if j < VAL_CAP_PER_TASK_REFS:
                buf[j] = (fi, r)

    row_count += n
    if fi < 3 or (fi + 1) % 100 == 0:
        print(f"[{CELL}] scanned {fi+1}/{len(val_files)} rows={row_count} tasks_with_refs={len(val_task_to_refs)} elapsed_file={time.time()-tf:.2f}s")

print(f"[{CELL}] VAL scan done. rows={row_count}")
print(f"[{CELL}] skips: nonseq={skip_nonseq} empty={skip_empty} allpad={skip_allpad} notkept={skip_notkept}")
print(f"[{CELL}] tasks_with_refs={len(val_task_to_refs)}")

if len(val_task_to_refs) == 0:
    raise RuntimeError(f"[{CELL}] No VAL refs built. Unexpected; check VAL parquet.")

# Pack and save
val_refs_pack = {}
for k, refs in val_task_to_refs.items():
    val_refs_pack[int(k)] = {
        "file_idx": [int(a) for a, b in refs],
        "row_idx": [int(b) for a, b in refs],
    }

VAL_REFS_PATH = os.path.join(REPORT_DIR, "val_task_refs_itemset.pt")
torch.save(
    {"kept_tasks": KEPT_TASKS, "cap": VAL_CAP_PER_TASK_REFS, "refs": val_refs_pack},
    VAL_REFS_PATH
)

print(f"[{CELL}] ✅ wrote VAL refs: {VAL_REFS_PATH}")

# Quick sanity
sk = KEPT_TASKS[0]
print(f"[{CELL}] sanity task={sk} val_sampled_refs={len(val_task_to_refs.get(sk, []))} val_seen={val_task_seen.get(sk, 0)}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-13] Starting... 2026-01-05 15:03:07
[13-13] kept_tasks=1345 | VAL_MIN_TASK_EX=50 | VAL_CAP_PER_TASK_REFS=5000
[13-13] scanned 1/1024 rows=769 tasks_with_refs=341 elapsed_file=0.08s
[13-13] scanned 2/1024 rows=1579 tasks_with_refs=492 elapsed_file=0.07s
[13-13] scanned 3/1024 rows=2349 tasks_with_refs=596 elapsed_file=0.06s
[13-13] scanned 100/1024 rows=81398 tasks_with_refs=1274 elapsed_file=0.08s
[13-13] scanned 200/1024 rows=163159 tasks_with_refs=1317 elapsed_file=0.06s
[13-13] scanned 300/1024 rows=244632 tasks_with_refs=1334 elapsed_file=0.07s
[13-13] scanned 400/1024 rows=325724 tasks_with_refs=1336 elapsed_file=0.06s
[13-13] scanned 500/1024 rows=407196 tasks_with_refs=1341 elapsed_file=0.06s
[13-13] scanned 600/1024 rows=488570 tasks_with_refs=1342 elapsed_file=0.06s
[13-13] scanned 700/1024 rows=569943 tasks_with_refs=1344 elapsed_file=0.06s
[13-13] scanned 800/1024 rows=651167 tasks_with_refs=1344 elapsed_file=0.06s
[13-13] scanned 900/1024 rows=732510 tasks_with_refs=134

In [21]:
# [CELL 13-14] STRICT VAL EVAL: use VAL refs (not TRAIN proxy) + update report.json
import time, json
from datetime import datetime
import numpy as np
import torch

CELL = "13-14"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

VAL_PACK = torch.load(os.path.join(REPORT_DIR, "val_task_refs_itemset.pt"), map_location="cpu")
VAL_REFS = VAL_PACK["refs"]
VAL_TASKS = list(map(int, VAL_PACK["kept_tasks"]))
print(f"[{CELL}] VAL_TASKS={len(VAL_TASKS)} | VAL_REFS tasks={len(VAL_REFS)} | cap={VAL_PACK.get('cap')}")

def sample_refs_for_task_from(pack_refs, task_id: int, B: int, rng: np.random.RandomState):
    ref_pack = pack_refs[str(task_id)] if isinstance(next(iter(pack_refs.keys())), str) else pack_refs[task_id]
    fi = ref_pack["file_idx"]
    ri = ref_pack["row_idx"]
    N = len(fi)
    if N == 0:
        return []
    idxs = rng.choice(np.arange(N), size=B, replace=(N < B))
    return [(int(fi[j]), int(ri[j])) for j in idxs]

@torch.no_grad()
def eval_meta_adapt_hr20_strict_val(base_model, episodes: int, seed_offset: int = 0):
    base_model.eval()
    hr_list = []
    used = 0
    rng = np.random.RandomState(SEED + 1000 + seed_offset)

    for e in range(episodes):
        tk = int(rng.choice(np.array(VAL_TASKS)))
        sup_refs = sample_refs_for_task_from(REFS, tk, B=int(rep12b["task_cfg"]["n_support"]), rng=rng)   # support from TRAIN refs
        qry_refs = sample_refs_for_task_from(VAL_REFS, tk, B=int(rep12b["task_cfg"]["n_query"]), rng=rng) # query from VAL refs

        sup = build_batch_from_refs(train_files, sup_refs)
        qry = build_batch_from_refs(val_files, qry_refs)
        if sup is None or qry is None:
            continue

        with torch.enable_grad():
            adapted = adapt_one_task(base_model, sup)

        logits = adapted(qry["input_ids"].to(DEVICE), qry["lengths"].to(DEVICE)).cpu()
        topk = torch.topk(logits, k=20, dim=1).indices
        y = qry["labels"].view(-1, 1)
        hr = (topk == y).any(dim=1).float().mean().item()

        hr_list.append(float(hr))
        used += 1

    if used == 0:
        return None
    return {"HR@20": float(np.mean(hr_list)), "_episodes_used": int(used)}

STRICT_VAL_EPISODES = 200  # stronger estimate than 50
strict_val = eval_meta_adapt_hr20_strict_val(model, episodes=STRICT_VAL_EPISODES, seed_offset=14)

print(f"[{CELL}] STRICT VAL meta-adapt: {strict_val}")

# Update report.json
REPORT_PATH = os.path.join(REPORT_DIR, "report.json")
with open(REPORT_PATH, "r", encoding="utf-8") as f:
    rep = json.load(f)

rep["val_meta_adapt_strict"] = strict_val
rep["notes"].append("STRICT VAL eval added: support sampled from TRAIN refs, query sampled from VAL refs (val_task_refs_itemset.pt).")

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(rep, f, indent=2)

print(f"[{CELL}] ✅ updated report.json: {REPORT_PATH}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[13-14] Starting... 2026-01-05 15:04:17
[13-14] VAL_TASKS=1345 | VAL_REFS tasks=1345 | cap=5000
[13-14] STRICT VAL meta-adapt: {'HR@20': 0.1387500020302832, '_episodes_used': 200}
[13-14] ✅ updated report.json: C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\report.json
[13-14] Done in 77.87s
