Bootstrap: paths, run_tag, device, seed

In [1]:
# [CELL 14-00] Bootstrap: repo root, report dir, device, seed
import os, sys, json, time, glob
from pathlib import Path
from datetime import datetime

import numpy as np
import torch

CELL = "14-00"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# --- locate repo root (Windows-safe) ---
CWD = Path.cwd().resolve()
def find_repo_root(start: Path) -> Path:
    for p in [start, *start.parents]:
        if (p / "PROJECT_STATE.md").exists() or (p / "PROJECT_STATE_UPDATED_20260104_CORRECTED.md").exists():
            return p
    # fallback: use current dir
    return start

REPO_ROOT = find_repo_root(CWD)
REPORT_DIR = REPO_ROOT / "reports" / "14_meta_adapt_and_eval_on_target_itemset" / datetime.now().strftime("%Y%m%d_%H%M%S")
REPORT_DIR.mkdir(parents=True, exist_ok=True)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 20260105

def set_seed(seed: int):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

set_seed(SEED)

print(f"[{CELL}] REPO_ROOT : {REPO_ROOT}")
print(f"[{CELL}] REPORT_DIR: {REPORT_DIR}")
print(f"[{CELL}] DEVICE   : {DEVICE} | torch={torch.__version__}")
print(f"[{CELL}] SEED     : {SEED}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-00] Starting... 2026-01-05 15:24:16
[14-00] REPO_ROOT : C:\mooc-coldstart-session-meta
[14-00] REPORT_DIR: C:\mooc-coldstart-session-meta\reports\14_meta_adapt_and_eval_on_target_itemset\20260105_152416
[14-00] DEVICE   : cpu | torch=2.9.1+cpu
[14-00] SEED     : 20260105
[14-00] Done in 0.02s


Point to Notebook 13 checkpoint + load proto/meta_cfg

In [2]:
# [CELL 14-01] Load Notebook 13 meta-init checkpoint + pull proto/meta_cfg
import time
from datetime import datetime

CELL = "14-01"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# ---- set your Notebook 13 run_tag ----
RUN_TAG_13 = "20260105_114325"
RUN_DIR_13 = REPO_ROOT / "reports" / "13_meta_train_on_source_itemset" / RUN_TAG_13
CKPT_13 = RUN_DIR_13 / "meta_model_source_itemset.pt"
REPORT_13 = RUN_DIR_13 / "report.json"

print(f"[{CELL}] RUN_DIR_13 : {RUN_DIR_13}")
print(f"[{CELL}] CKPT_13    : {CKPT_13}")
print(f"[{CELL}] REPORT_13  : {REPORT_13}")

if not CKPT_13.exists():
    raise RuntimeError(f"[{CELL}] Missing ckpt: {CKPT_13}")
if not REPORT_13.exists():
    raise RuntimeError(f"[{CELL}] Missing report.json: {REPORT_13}")

with open(REPORT_13, "r", encoding="utf-8") as f:
    rep13 = json.load(f)

PROTO = rep13["proto"]
META_CFG = rep13["meta_cfg"]

print(f"[{CELL}] Loaded PROTO keys: {list(PROTO.keys())}")
print(f"[{CELL}] Loaded META_CFG keys: {list(META_CFG.keys())}")
print(f"[{CELL}] META_CFG: inner_steps={META_CFG['inner_steps']} inner_lr={META_CFG['inner_lr']}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-01] Starting... 2026-01-05 15:25:42
[14-01] RUN_DIR_13 : C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325
[14-01] CKPT_13    : C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\meta_model_source_itemset.pt
[14-01] REPORT_13  : C:\mooc-coldstart-session-meta\reports\13_meta_train_on_source_itemset\20260105_114325\report.json
[14-01] Loaded PROTO keys: ['K_LIST', 'MAX_PREFIX_LEN', 'CAP_ENABLED', 'CAP_SESSION_LEN', 'CAP_STRATEGY']
[14-01] Loaded META_CFG keys: ['emb_dim', 'hidden_dim', 'dropout', 'meta_lr', 'inner_lr', 'inner_steps', 'meta_steps', 'meta_batch_tasks', 'grad_clip', 'seed', 'log_every', 'eval_every', 'val_episodes']
[14-01] META_CFG: inner_steps=1 inner_lr=0.01
[14-01] Done in 0.02s


Load target tensor config + metadata (PAD/UNK/VOCAB)
This matches what we already fixed in 12C: get PAD/UNK/VOCAB from target_tensor_metadata_*.json.

In [3]:
# [CELL 14-02] Load TARGET tensor splits + metadata (pad/unk/vocab)
import time
from datetime import datetime

CELL = "14-02"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

DL_CFG_PATH = REPO_ROOT / "data" / "processed" / "supervised" / "dataloader_config_20251229_163357_20251229_232834.json"
print(f"[{CELL}] DL_CFG_PATH: {DL_CFG_PATH}")

if not DL_CFG_PATH.exists():
    raise RuntimeError(f"[{CELL}] Missing DL config: {DL_CFG_PATH}")

with open(DL_CFG_PATH, "r", encoding="utf-8") as f:
    dl_cfg = json.load(f)

TARGET_CFG = dl_cfg["target"]
print(f"[{CELL}] TARGET_CFG keys: {list(TARGET_CFG.keys())}")

TENSOR_DIR = Path(TARGET_CFG["tensor_dir"])
TRAIN_PT = Path(TARGET_CFG["train_pt"])
VAL_PT   = Path(TARGET_CFG["val_pt"])
TEST_PT  = Path(TARGET_CFG["test_pt"])
META_JSON = Path(TARGET_CFG["meta_json"])

print(f"[{CELL}] tensor_dir: {TENSOR_DIR}")
print(f"[{CELL}] train_pt : {TRAIN_PT}")
print(f"[{CELL}] val_pt   : {VAL_PT}")
print(f"[{CELL}] test_pt  : {TEST_PT}")
print(f"[{CELL}] meta_json: {META_JSON}")

if not META_JSON.exists():
    raise RuntimeError(f"[{CELL}] Missing target meta_json: {META_JSON}")

with open(META_JSON, "r", encoding="utf-8") as f:
    target_meta = json.load(f)

PAD_ID_TGT = int(target_meta["pad_id"])
UNK_ID_TGT = int(target_meta["unk_id"])
VOCAB_SIZE_TGT = int(target_meta["vocab_size"])
MAX_LEN_TGT = int(target_meta["max_len"])

print(f"[{CELL}] Extracted PAD_ID_TGT={PAD_ID_TGT} UNK_ID_TGT={UNK_ID_TGT} VOCAB_SIZE_TGT={VOCAB_SIZE_TGT} MAX_LEN={MAX_LEN_TGT}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-02] Starting... 2026-01-05 15:26:14
[14-02] DL_CFG_PATH: C:\mooc-coldstart-session-meta\data\processed\supervised\dataloader_config_20251229_163357_20251229_232834.json
[14-02] TARGET_CFG keys: ['run_tag', 'tensor_dir', 'train_pt', 'val_pt', 'test_pt', 'meta_json']
[14-02] tensor_dir: C:\mooc-coldstart-session-meta\data\processed\tensor_target
[14-02] train_pt : C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_train_20251229_163357.pt
[14-02] val_pt   : C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_val_20251229_163357.pt
[14-02] test_pt  : C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_test_20251229_163357.pt
[14-02] meta_json: C:\mooc-coldstart-session-meta\data\processed\tensor_target\target_tensor_metadata_20251229_163357.json
[14-02] Extracted PAD_ID_TGT=0 UNK_ID_TGT=1 VOCAB_SIZE_TGT=747 MAX_LEN=20
[14-02] Done in 0.02s


Load target tensors (PyTorch 2.6+ safe: weights_only=False)

In [4]:
# [CELL 14-03] Load TARGET tensor splits (torch.load weights_only=False)
import time
from datetime import datetime

CELL = "14-03"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"[{CELL}] torch.__version__={torch.__version__}")
print(f"[{CELL}] Loading TARGET tensor splits with weights_only=False (trusted local files)")

def _standardize_loaded(raw, name: str, pad_id: int):
    """
    Accepts dict-like saved tensors. Ensures required keys exist:
      input_ids, attn_mask, labels
    """
    if isinstance(raw, dict):
        d = raw
    else:
        raise RuntimeError(f"[{CELL}] {name}: unexpected type {type(raw)} from torch.load")
    for k in ["input_ids", "attn_mask", "labels"]:
        if k not in d:
            raise RuntimeError(f"[{CELL}] {name}: missing key '{k}' in loaded dict keys={list(d.keys())}")
    # lengths from attn_mask
    lengths = d["attn_mask"].sum(dim=1).clamp(min=1)
    d["lengths"] = lengths
    return d

def load_split(pt_path: Path, name: str, pad_id: int):
    raw = torch.load(str(pt_path), map_location="cpu", weights_only=False)
    data = _standardize_loaded(raw, name, pad_id=pad_id)
    N, L = data["input_ids"].shape
    print(f"[{CELL}] {name}: keys={list(data.keys())}")
    print(f"[{CELL}] {name}: input_ids={tuple(data['input_ids'].shape)} lengths={tuple(data['lengths'].shape)} labels={tuple(data['labels'].shape)}")
    lens = data["lengths"].numpy()
    print(f"[{CELL}] {name}: lengths min={int(lens.min())} p50={int(np.median(lens))} max={int(lens.max())}")
    print(f"[{CELL}] {name}: sample0 len={int(data['lengths'][0])} y={int(data['labels'][0])} x_last5={data['input_ids'][0].tolist()[-5:]}")
    return data

TARGET_TRAIN = load_split(TRAIN_PT, "TARGET_TRAIN", pad_id=PAD_ID_TGT)
TARGET_VAL   = load_split(VAL_PT,   "TARGET_VAL",   pad_id=PAD_ID_TGT)
TARGET_TEST  = load_split(TEST_PT,  "TARGET_TEST",  pad_id=PAD_ID_TGT)

# sanity
max_token = int(torch.max(torch.cat([TARGET_TRAIN["input_ids"], TARGET_VAL["input_ids"], TARGET_TEST["input_ids"]])).item())
print(f"[{CELL}] max_token_id_seen={max_token} | VOCAB_SIZE_TGT={VOCAB_SIZE_TGT}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-03] Starting... 2026-01-05 15:26:41
[14-03] torch.__version__=2.9.1+cpu
[14-03] Loading TARGET tensor splits with weights_only=False (trusted local files)
[14-03] TARGET_TRAIN: keys=['input_ids', 'attn_mask', 'labels', 'session_id', 'user_id', 't', 'split', 'lengths']
[14-03] TARGET_TRAIN: input_ids=(1944, 20) lengths=(1944,) labels=(1944,)
[14-03] TARGET_TRAIN: lengths min=1 p50=4 max=20
[14-03] TARGET_TRAIN: sample0 len=1 y=423 x_last5=[0, 0, 0, 0, 416]
[14-03] TARGET_VAL: keys=['input_ids', 'attn_mask', 'labels', 'session_id', 'user_id', 't', 'split', 'lengths']
[14-03] TARGET_VAL: input_ids=(189, 20) lengths=(189,) labels=(189,)
[14-03] TARGET_VAL: lengths min=1 p50=3 max=20
[14-03] TARGET_VAL: sample0 len=1 y=380 x_last5=[0, 0, 0, 0, 383]
[14-03] TARGET_TEST: keys=['input_ids', 'attn_mask', 'labels', 'session_id', 'user_id', 't', 'split', 'lengths']
[14-03] TARGET_TEST: input_ids=(200, 20) lengths=(200,) labels=(200,)
[14-03] TARGET_TEST: lengths min=1 p50=3 max=20
[14-03] TAR

Metrics helpers (HR/MRR/NDCG) + exclude-seen

In [5]:
# [CELL 14-04] Metrics helpers (HR/MRR/NDCG) + exclude-seen masking
import time
from datetime import datetime
import torch

CELL = "14-04"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

K_LIST = list(map(int, PROTO["K_LIST"]))
print(f"[{CELL}] K_LIST={K_LIST}")

def mask_seen_logits_(logits: torch.Tensor, input_ids: torch.Tensor, lengths: torch.Tensor, pad_id: int):
    """
    In-place: set logits for seen tokens to -inf, excluding pad_id.
    seen tokens = tokens in the prefix positions (non-pad)
    """
    B, V = logits.shape
    changed = 0
    for i in range(B):
        L = int(lengths[i].item())
        if L <= 0:
            continue
        seen = input_ids[i, -L:].tolist()  # last L tokens are the actual prefix (left-pad)
        for tok in seen:
            if tok == pad_id:
                continue
            if 0 <= tok < V:
                logits[i, tok] = float("-inf")
                changed += 1
    return changed

def compute_metrics_from_logits(logits: torch.Tensor, labels: torch.Tensor, k_list):
    """
    logits: [B,V] (higher=better)
    labels: [B]
    returns dict HR@K, MRR@K, NDCG@K
    """
    out = {}
    B = logits.size(0)
    for k in k_list:
        topk = torch.topk(logits, k=k, dim=1).indices  # [B,k]
        y = labels.view(-1, 1)
        hit = (topk == y).any(dim=1).float()
        out[f"HR@{k}"] = float(hit.mean().item())

        # rank positions
        ranks = torch.full((B,), fill_value=0, dtype=torch.long)
        for i in range(B):
            m = (topk[i] == labels[i]).nonzero(as_tuple=False)
            if len(m) > 0:
                ranks[i] = int(m[0].item()) + 1  # 1-based rank
        rr = torch.where(ranks > 0, 1.0 / ranks.float(), torch.zeros_like(ranks, dtype=torch.float))
        out[f"MRR@{k}"] = float(rr.mean().item())

        ndcg = torch.where(ranks > 0, 1.0 / torch.log2(ranks.float() + 1.0), torch.zeros_like(ranks, dtype=torch.float))
        out[f"NDCG@{k}"] = float(ndcg.mean().item())
    return out

print(f"[{CELL}] ✅ metrics ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-04] Starting... 2026-01-05 15:27:09
[14-04] K_LIST=[5, 10, 20]
[14-04] ✅ metrics ready
[14-04] Done in 0.00s


Model definition (must match checkpoint architecture) + make_lengths
We used the same GRU4RecDropout class (same as 13 and 12C).

In [6]:
# [CELL 14-05] Model: GRU4RecDropout (same family) + make_lengths
import time
from datetime import datetime
import torch.nn as nn
import torch.nn.functional as F
import copy

CELL = "14-05"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

class GRU4RecDropout(nn.Module):
    def __init__(self, vocab_size: int, emb_dim: int, hidden_dim: int, pad_id: int, dropout: float = 0.3):
        super().__init__()
        self.pad_id = int(pad_id)
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=self.pad_id)
        self.drop = nn.Dropout(float(dropout))
        self.gru = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor):
        emb = self.drop(self.emb(input_ids))
        packed = nn.utils.rnn.pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, h = self.gru(packed)
        logits = self.out(h.squeeze(0))
        return logits

def make_lengths(attn_mask: torch.Tensor) -> torch.Tensor:
    return attn_mask.sum(dim=1).clamp(min=1)

print(f"[{CELL}] ✅ GRU4RecDropout ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-05] Starting... 2026-01-05 15:27:47
[14-05] ✅ GRU4RecDropout ready
[14-05] Done in 0.00s


Load 13 checkpoint + transfer GRU weights into TARGET-sized model (vocab mismatch safe)

This is the critical part: load only GRU weights; emb/out are target-sized.

In [7]:
# [CELL 14-06] Load meta-init from 13 and transfer GRU weights into target-sized model
import time
from datetime import datetime

CELL = "14-06"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

ckpt = torch.load(str(CKPT_13), map_location="cpu", weights_only=False)
print(f"[{CELL}] Loaded ckpt type={type(ckpt)}")
print(f"[{CELL}] ckpt keys(head)={list(ckpt.keys())[:12]}")

state_dict_src = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
print(f"[{CELL}] state_dict_src keys(head)={list(state_dict_src.keys())[:10]}")

emb_dim = int(META_CFG["emb_dim"])
hidden_dim = int(META_CFG["hidden_dim"])
dropout = float(META_CFG["dropout"])

# instantiate TARGET vocab model
base = GRU4RecDropout(
    vocab_size=VOCAB_SIZE_TGT,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    pad_id=PAD_ID_TGT,
    dropout=dropout,
).to(DEVICE)

# transfer GRU weights only
dst_sd = base.state_dict()
copied = []
skipped = []

for k, v in state_dict_src.items():
    if k.startswith("gru."):
        if k in dst_sd and dst_sd[k].shape == v.shape:
            dst_sd[k] = v.clone()
            copied.append(k)
        else:
            skipped.append(k)

base.load_state_dict(dst_sd, strict=False)

print(f"[{CELL}] copied GRU params={len(copied)} (expected >0)")
print(f"[{CELL}] example copied: {copied[:4]}")
if skipped:
    print(f"[{CELL}] skipped GRU params (shape mismatch?) count={len(skipped)} head={skipped[:4]}")

print(f"[{CELL}] TARGET model ready with GRU transferred, emb/out randomly init for target vocab.")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-06] Starting... 2026-01-05 15:28:15
[14-06] Loaded ckpt type=<class 'dict'>
[14-06] ckpt keys(head)=['run_tag', 'proto', 'meta_cfg', 'task_cfg_itemset_path', 'vocab_size_source', 'pad_id_source', 'unk_id_source', 'state_dict', 'best_step', 'best_val_hr20']
[14-06] state_dict_src keys(head)=['emb.weight', 'gru.weight_ih_l0', 'gru.weight_hh_l0', 'gru.bias_ih_l0', 'gru.bias_hh_l0', 'out.weight', 'out.bias']
[14-06] copied GRU params=4 (expected >0)
[14-06] example copied: ['gru.weight_ih_l0', 'gru.weight_hh_l0', 'gru.bias_ih_l0', 'gru.bias_hh_l0']
[14-06] TARGET model ready with GRU transferred, emb/out randomly init for target vocab.
[14-06] Done in 0.03s


Evaluate META-INIT on TARGET (VAL + TEST), with/without exclude-seen

In [8]:
# [CELL 14-07] Evaluate META-INIT (no adaptation) on TARGET: VAL/TEST (with/without exclude-seen)
import time
from datetime import datetime
import torch

CELL = "14-07"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

@torch.no_grad()
def eval_split(model, split_data, split_name: str, exclude_seen: bool):
    model.eval()
    input_ids = split_data["input_ids"].to(DEVICE)
    lengths   = split_data["lengths"].to(DEVICE)
    labels    = split_data["labels"].to(DEVICE)

    logits = model(input_ids, lengths)  # [B,V]
    logits = logits.detach().cpu()

    if exclude_seen:
        _changed = mask_seen_logits_(
            logits,
            split_data["input_ids"].cpu(),
            split_data["lengths"].cpu(),
            pad_id=PAD_ID_TGT
        )
        print(f"[{CELL}] {split_name} exclude_seen changed_elements={_changed}")

    metrics = compute_metrics_from_logits(logits, labels.cpu(), K_LIST)
    metrics["_exclude_seen"] = bool(exclude_seen)
    metrics["_n"] = int(labels.numel())
    return metrics

# META-INIT eval
val_init_noex = eval_split(base, TARGET_VAL,  "VAL",  exclude_seen=False)
val_init_ex   = eval_split(base, TARGET_VAL,  "VAL",  exclude_seen=True)
test_init_noex= eval_split(base, TARGET_TEST, "TEST", exclude_seen=False)
test_init_ex  = eval_split(base, TARGET_TEST, "TEST", exclude_seen=True)

print(f"[{CELL}] META-INIT VAL  (no-exclude): HR@20={val_init_noex['HR@20']:.6f} full={val_init_noex}")
print(f"[{CELL}] META-INIT VAL  (exclude)   : HR@20={val_init_ex['HR@20']:.6f} full={val_init_ex}")
print(f"[{CELL}] META-INIT TEST (no-exclude): HR@20={test_init_noex['HR@20']:.6f} full={test_init_noex}")
print(f"[{CELL}] META-INIT TEST (exclude)   : HR@20={test_init_ex['HR@20']:.6f} full={test_init_ex}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-07] Starting... 2026-01-05 15:29:43
[14-07] VAL exclude_seen changed_elements=991
[14-07] TEST exclude_seen changed_elements=1471
[14-07] META-INIT VAL  (no-exclude): HR@20=0.021164 full={'HR@5': 0.005291005130857229, 'MRR@5': 0.005291005130857229, 'NDCG@5': 0.005291005130857229, 'HR@10': 0.010582010261714458, 'MRR@10': 0.0061728390865027905, 'NDCG@10': 0.007175699342042208, 'HR@20': 0.021164020523428917, 'MRR@20': 0.006892230827361345, 'NDCG@20': 0.009829754941165447, '_exclude_seen': False, '_n': 189}
[14-07] META-INIT VAL  (exclude)   : HR@20=0.021164 full={'HR@5': 0.005291005130857229, 'MRR@5': 0.005291005130857229, 'NDCG@5': 0.005291005130857229, 'HR@10': 0.010582010261714458, 'MRR@10': 0.0061728390865027905, 'NDCG@10': 0.007175699342042208, 'HR@20': 0.021164020523428917, 'MRR@20': 0.006892230827361345, 'NDCG@20': 0.009829754941165447, '_exclude_seen': True, '_n': 189}
[14-07] META-INIT TEST (no-exclude): HR@20=0.035000 full={'HR@5': 0.009999999776482582, 'MRR@5': 0.0020000000

Episodic sampler on TARGET (support from TRAIN, query from VAL/TEST)

Matches our 12C review: sampling may be with replacement due to small splits; we log it explicitly.

In [9]:
# [CELL 14-08] Episodic sampler for TARGET (support from TRAIN, query from VAL/TEST)
import time
from datetime import datetime
import numpy as np

CELL = "14-08"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

RNG14 = np.random.RandomState(SEED + 14)

N_SUPPORT = 20
N_QUERY = 20
EP_VAL = 200
EP_TEST = 200

print(f"[{CELL}] N_SUPPORT={N_SUPPORT} N_QUERY={N_QUERY} EP_VAL={EP_VAL} EP_TEST={EP_TEST}")
print(f"[{CELL}] TARGET sizes: train={TARGET_TRAIN['labels'].shape[0]} val={TARGET_VAL['labels'].shape[0]} test={TARGET_TEST['labels'].shape[0]}")

def sample_batch_from_split(split_data, B: int, rng: np.random.RandomState):
    N = split_data["labels"].shape[0]
    replace = N < B
    idx = rng.choice(np.arange(N), size=B, replace=replace)
    batch = {
        "input_ids": split_data["input_ids"][idx],
        "attn_mask": split_data["attn_mask"][idx],
        "lengths":   split_data["lengths"][idx],
        "labels":    split_data["labels"][idx],
    }
    return batch, bool(replace), int(N)

# quick sanity
sup, rep_sup, Ntr = sample_batch_from_split(TARGET_TRAIN, N_SUPPORT, RNG14)
qry, rep_q, Nvl = sample_batch_from_split(TARGET_VAL, N_QUERY, RNG14)
print(f"[{CELL}] sanity support: replace={rep_sup} N={Ntr} batch={sup['input_ids'].shape}")
print(f"[{CELL}] sanity query  : replace={rep_q} N={Nvl} batch={qry['input_ids'].shape}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-08] Starting... 2026-01-05 15:30:22
[14-08] N_SUPPORT=20 N_QUERY=20 EP_VAL=200 EP_TEST=200
[14-08] TARGET sizes: train=1944 val=189 test=200
[14-08] sanity support: replace=False N=1944 batch=torch.Size([20, 20])
[14-08] sanity query  : replace=False N=189 batch=torch.Size([20, 20])
[14-08] Done in 0.02s


Adaptation helpers: clone_for_adapt + head-adapt scopes

We keep the same idea as your 12C improvement: freeze GRU, adapt head (out) and optionally emb.

In [10]:
# [CELL 14-09] Adaptation helpers: clone_for_adapt + scopes (full vs head-only)
import time
from datetime import datetime
import copy
import torch.nn as nn

CELL = "14-09"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def clone_for_adapt(base_model, scope: str):
    """
    scope:
      - "full": all params trainable
      - "out": only out.* trainable
      - "out+emb": out.* and emb.* trainable
    Returns (model_clone, trainable_params)
    """
    m = copy.deepcopy(base_model).to(DEVICE)
    m.train()

    # freeze all by default
    for p in m.parameters():
        p.requires_grad = False

    train_params = []
    if scope == "full":
        for p in m.parameters():
            p.requires_grad = True
        train_params = [p for p in m.parameters() if p.requires_grad]

    elif scope == "out":
        for p in m.out.parameters():
            p.requires_grad = True
        train_params = [p for p in m.parameters() if p.requires_grad]

    elif scope == "out+emb":
        for p in m.out.parameters():
            p.requires_grad = True
        for p in m.emb.parameters():
            p.requires_grad = True
        train_params = [p for p in m.parameters() if p.requires_grad]
    else:
        raise ValueError(f"Unknown scope={scope}")

    trainable = sum(p.numel() for p in train_params)
    total = sum(p.numel() for p in m.parameters())
    print(f"[{CELL}] clone_for_adapt scope={scope}: trainable={trainable:,}/{total:,}")

    # name debug
    tn = []
    fn = []
    for n, p in m.named_parameters():
        (tn if p.requires_grad else fn).append(n)
    print(f"[{CELL}] trainable named params={len(tn)} frozen={len(fn)}")
    if len(tn) <= 10:
        for n in tn:
            print(f"  - trainable: {n}")

    return m, train_params

print(f"[{CELL}] ✅ clone_for_adapt ready")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-09] Starting... 2026-01-05 15:30:53
[14-09] ✅ clone_for_adapt ready
[14-09] Done in 0.00s


Episodic META-ADAPT evaluation (full vs head-adapt) on VAL/TEST

This is the core experiment. We compute HR@20 (and full metrics) with exclude-seen ON (recommended sanity).

In [11]:
# [CELL 14-10] Episodic evaluation: META-ADAPT(full) vs HEAD-ADAPT on TARGET VAL/TEST
import time
from datetime import datetime
import torch
import torch.nn.functional as F

CELL = "14-10"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

INNER_LR = float(META_CFG["inner_lr"])
INNER_STEPS = int(META_CFG["inner_steps"])
GRAD_CLIP = float(META_CFG["grad_clip"])

print(f"[{CELL}] INNER_LR={INNER_LR} INNER_STEPS={INNER_STEPS} GRAD_CLIP={GRAD_CLIP}")

def adapt_one_episode(base_model, support_batch, scope: str):
    m, train_params = clone_for_adapt(base_model, scope=scope)
    opt = torch.optim.SGD(train_params, lr=INNER_LR)

    for s in range(INNER_STEPS):
        opt.zero_grad(set_to_none=True)
        logits = m(support_batch["input_ids"].to(DEVICE), support_batch["lengths"].to(DEVICE))
        loss = F.cross_entropy(logits, support_batch["labels"].to(DEVICE), ignore_index=PAD_ID_TGT)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(train_params, GRAD_CLIP)
        opt.step()

    return m

@torch.no_grad()
def eval_episode_model(m, query_batch, exclude_seen: bool):
    logits = m(query_batch["input_ids"].to(DEVICE), query_batch["lengths"].to(DEVICE)).detach().cpu()
    if exclude_seen:
        mask_seen_logits_(
            logits,
            query_batch["input_ids"].cpu(),
            query_batch["lengths"].cpu(),
            pad_id=PAD_ID_TGT
        )
    metrics = compute_metrics_from_logits(logits, query_batch["labels"].cpu(), K_LIST)
    return metrics

def eval_meta_adapt(base_model, query_split, episodes: int, scope: str, exclude_seen: bool, rng: np.random.RandomState):
    hr20_list = []
    used = 0
    repl_support = 0
    repl_query = 0

    for e in range(episodes):
        support, rep_s, _ = sample_batch_from_split(TARGET_TRAIN, N_SUPPORT, rng)
        query,   rep_q, _ = sample_batch_from_split(query_split, N_QUERY, rng)
        repl_support += int(rep_s)
        repl_query   += int(rep_q)

        # adapt (needs grad)
        with torch.enable_grad():
            adapted = adapt_one_episode(base_model, support, scope=scope)

        # eval
        m = eval_episode_model(adapted, query, exclude_seen=exclude_seen)
        hr20_list.append(float(m["HR@20"]))
        used += 1

    return {
        "HR@20": float(np.mean(hr20_list)) if used else None,
        "_episodes": int(used),
        "_replace_support_frac": float(repl_support / max(1, used)),
        "_replace_query_frac": float(repl_query / max(1, used)),
        "_scope": scope,
        "_exclude_seen": bool(exclude_seen),
        "_n_support": int(N_SUPPORT),
        "_n_query": int(N_QUERY),
    }

# Evaluate
EXCLUDE_SEEN = True

val_full = eval_meta_adapt(base, TARGET_VAL,  episodes=EP_VAL,  scope="full",    exclude_seen=EXCLUDE_SEEN, rng=RNG14)
val_out  = eval_meta_adapt(base, TARGET_VAL,  episodes=EP_VAL,  scope="out",     exclude_seen=EXCLUDE_SEEN, rng=np.random.RandomState(SEED + 140))
val_outemb = eval_meta_adapt(base, TARGET_VAL, episodes=EP_VAL, scope="out+emb", exclude_seen=EXCLUDE_SEEN, rng=np.random.RandomState(SEED + 141))

test_full = eval_meta_adapt(base, TARGET_TEST, episodes=EP_TEST, scope="full",    exclude_seen=EXCLUDE_SEEN, rng=np.random.RandomState(SEED + 240))
test_out  = eval_meta_adapt(base, TARGET_TEST, episodes=EP_TEST, scope="out",     exclude_seen=EXCLUDE_SEEN, rng=np.random.RandomState(SEED + 241))
test_outemb = eval_meta_adapt(base, TARGET_TEST, episodes=EP_TEST, scope="out+emb", exclude_seen=EXCLUDE_SEEN, rng=np.random.RandomState(SEED + 242))

print(f"[{CELL}] VAL  full   : {val_full}")
print(f"[{CELL}] VAL  out    : {val_out}")
print(f"[{CELL}] VAL  out+emb: {val_outemb}")
print(f"[{CELL}] TEST full   : {test_full}")
print(f"[{CELL}] TEST out    : {test_out}")
print(f"[{CELL}] TEST out+emb: {test_outemb}")

print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-10] Starting... 2026-01-05 15:31:28
[14-10] INNER_LR=0.01 INNER_STEPS=1 GRAD_CLIP=1.0
[14-10] clone_for_adapt scope=full: trainable=218,667/218,667
[14-10] trainable named params=7 frozen=0
  - trainable: emb.weight
  - trainable: gru.weight_ih_l0
  - trainable: gru.weight_hh_l0
  - trainable: gru.bias_ih_l0
  - trainable: gru.bias_hh_l0
  - trainable: out.weight
  - trainable: out.bias
[14-10] clone_for_adapt scope=full: trainable=218,667/218,667
[14-10] trainable named params=7 frozen=0
  - trainable: emb.weight
  - trainable: gru.weight_ih_l0
  - trainable: gru.weight_hh_l0
  - trainable: gru.bias_ih_l0
  - trainable: gru.bias_hh_l0
  - trainable: out.weight
  - trainable: out.bias
[14-10] clone_for_adapt scope=full: trainable=218,667/218,667
[14-10] trainable named params=7 frozen=0
  - trainable: emb.weight
  - trainable: gru.weight_ih_l0
  - trainable: gru.weight_hh_l0
  - trainable: gru.bias_ih_l0
  - trainable: gru.bias_hh_l0
  - trainable: out.weight
  - trainable: out.bia

Save report.json + checkpoint + meta.json update hook (no auto-edit unless you want)

This writes a local report + checkpoint in the 14 report dir. We won’t touch global meta.json yet unless you ask.

In [12]:
# [CELL 14-11] Save artifacts: report.json + checkpoint (meta-init target-sized with GRU transferred)
import time, json
from datetime import datetime

CELL = "14-11"
t0 = time.time()
print(f"[{CELL}] Starting... {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

REPORT_PATH = REPORT_DIR / "report.json"
CKPT_PATH = REPORT_DIR / "meta_init_target_itemset_gru_only.pt"

report = {
    "run_tag": REPORT_DIR.name,
    "created_at": datetime.now().isoformat(timespec="seconds"),
    "source_meta_init": {
        "notebook": "13_meta_train_on_source_itemset",
        "run_tag": RUN_TAG_13,
        "ckpt": str(CKPT_13),
        "report": str(REPORT_13),
    },
    "target_data": {
        "run_tag": TARGET_CFG["run_tag"],
        "train_pt": str(TRAIN_PT),
        "val_pt": str(VAL_PT),
        "test_pt": str(TEST_PT),
        "meta_json": str(META_JSON),
        "pad_id": PAD_ID_TGT,
        "unk_id": UNK_ID_TGT,
        "vocab_size": VOCAB_SIZE_TGT,
        "max_len": MAX_LEN_TGT,
    },
    "proto": PROTO,
    "meta_cfg": META_CFG,
    "meta_init_eval": {
        "val_no_exclude": val_init_noex,
        "val_exclude": val_init_ex,
        "test_no_exclude": test_init_noex,
        "test_exclude": test_init_ex,
    },
    "meta_adapt_eval": {
        "val_full": val_full,
        "val_out": val_out,
        "val_out+emb": val_outemb,
        "test_full": test_full,
        "test_out": test_out,
        "test_out+emb": test_outemb,
    },
    "notes": [
        "Vocab mismatch (source!=target): transferred GRU weights only; emb/out are target-sized random init.",
        "Episodic eval: support from TARGET_TRAIN, query from TARGET_VAL/TEST.",
        "Sampling may be with replacement due to small target splits; see _replace_* fields.",
    ],
}

with open(REPORT_PATH, "w", encoding="utf-8") as f:
    json.dump(report, f, indent=2)

torch.save({
    "run_tag": REPORT_DIR.name,
    "from_13_ckpt": str(CKPT_13),
    "proto": PROTO,
    "meta_cfg": META_CFG,
    "pad_id_target": PAD_ID_TGT,
    "unk_id_target": UNK_ID_TGT,
    "vocab_size_target": VOCAB_SIZE_TGT,
    "state_dict": base.state_dict(),
    "gru_transferred_from_source_vocab": int(ckpt.get("vocab_size_source", -1)),
}, CKPT_PATH)

print(f"[{CELL}] ✅ wrote report: {REPORT_PATH}")
print(f"[{CELL}] ✅ wrote ckpt  : {CKPT_PATH}")
print(f"[{CELL}] Done in {time.time()-t0:.2f}s")


[14-11] Starting... 2026-01-05 15:35:28
[14-11] ✅ wrote report: C:\mooc-coldstart-session-meta\reports\14_meta_adapt_and_eval_on_target_itemset\20260105_152416\report.json
[14-11] ✅ wrote ckpt  : C:\mooc-coldstart-session-meta\reports\14_meta_adapt_and_eval_on_target_itemset\20260105_152416\meta_init_target_itemset_gru_only.pt
[14-11] Done in 0.00s
