### Goal

Reptile meta-learning starter for session-based recommendation. Uses tasks built from pretraining datasets (Yoochoose / Amazon categories). Produces meta-model that can be adapted quickly to MARS.


In [4]:
# Quick (unsafe) workaround to avoid the libiomp5md.dll crash.
# Use this only to continue working in the notebook quickly.
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
print("Set KMP_DUPLICATE_LIB_OK=TRUE — restart kernel and re-run cells now.")

Set KMP_DUPLICATE_LIB_OK=TRUE — restart kernel and re-run cells now.


In [7]:
# Robust task builder: create unified meta_item2id, map prefixes+targets to ints, then build tasks.
import pandas as pd
import numpy as np
import torch
import json
from pathlib import Path
from collections import defaultdict

ROOT = Path('..')
DATA_DIR = ROOT/'data'/'processed'
CKPT_DIR = ROOT/'models'
META_VOCAB_DIR = DATA_DIR/'meta_vocab'
META_VOCAB_DIR.mkdir(parents=True, exist_ok=True)

# find candidate prefix/pair files
candidates = list(DATA_DIR.glob('**/*prefix*part*.parquet')) + list(DATA_DIR.glob('**/*prefix_target*.parquet')) + list(DATA_DIR.glob('**/*prefix_target*.parquet'))
print("Found candidate files:", len(candidates))

# 1) Two-pass: collect all unique item ids (strings) from prefix and target columns
unique_items = set()
n_files = 0
for p in candidates:
    try:
        df = pd.read_parquet(p, columns=['prefix','target'] if 'prefix' in pd.read_parquet(p, nrows=1).columns else df.columns)
    except Exception as e:
        # try reading full file if subset failed
        try:
            df = pd.read_parquet(p)
        except Exception as e2:
            print("Skipping unreadable file:", p.name)
            continue
    n_files += 1
    # iterate rows (vectorize when possible)
    if 'prefix' in df.columns:
        # prefixes may be strings like "12 34 56" or lists; normalize to str tokens
        prefs = df['prefix'].dropna().astype(str)
        for s in prefs:
            if not s: continue
            # split on whitespace to get tokens (works for "a b c" or "['a','b']" crude)
            toks = s.strip().split()
            for t in toks:
                unique_items.add(t)
    # targets
    if 'target' in df.columns:
        for t in df['target'].dropna().astype(str):
            unique_items.add(t)
print(f"Scanned {n_files} files; unique item tokens found: {len(unique_items)}")

# build mapping
meta_item2id = {}
for i, item in enumerate(sorted(unique_items)):
    meta_item2id[item] = i+1   # reserve 0 for padding/OOV
meta_item2id['<OOV>'] = 0

# save mapping
with open(META_VOCAB_DIR/'meta_item2id.json','w') as fh:
    json.dump(meta_item2id, fh)
print("Saved meta_item2id.json with size", len(meta_item2id))

# 2) Second pass: build tasks using mapped integer ids
def str_prefix_to_id_list(pref_str, mapping, max_len=20):
    if not isinstance(pref_str, str): return []
    toks = pref_str.strip().split()
    ids = [ mapping.get(t, 0) for t in toks ]  # unknown -> 0
    if len(ids) > max_len: ids = ids[-max_len:]
    return ids

tasks = []
MAX_TASKS = 500
MIN_PAIRS = 50
max_per_file_tasks = 100

for p in candidates:
    try:
        df = pd.read_parquet(p)
    except Exception:
        continue
    # ensure prefix and target columns exist
    if 'prefix' not in df.columns or 'target' not in df.columns:
        # try to infer columns with similar names
        cols = df.columns
        if any('prefix' in c for c in cols) and any('target' in c for c in cols):
            # map columns
            pref_col = [c for c in cols if 'prefix' in c][0]
            targ_col = [c for c in cols if 'target' in c][0]
            df = df.rename(columns={pref_col: 'prefix', targ_col: 'target'})
        else:
            # skip
            continue

    # optional grouping column
    group_col = None
    for col in ['category','dataset','label','course_id']:
        if col in df.columns:
            group_col = col
            break

    if group_col:
        groups = df.groupby(group_col)
        cnt = 0
        for name, g in groups:
            # map and build (prefix, target) lists
            P=[]; T=[]
            for _, r in g.iterrows():
                pref = r['prefix'] if pd.notna(r['prefix']) else ''
                pref_ids = str_prefix_to_id_list(str(pref), meta_item2id)
                if len(pref_ids) == 0: continue
                padded = [0]*(20-len(pref_ids)) + pref_ids
                target_token = str(r['target'])
                tgt_id = meta_item2id.get(target_token, 0)
                P.append(padded); T.append(tgt_id)
            if len(P) >= MIN_PAIRS:
                tasks.append({'P': torch.LongTensor(P), 'T': torch.LongTensor(T), 'name': f"{p.stem}::{name}"})
                cnt += 1
            if cnt >= max_per_file_tasks: break
    else:
        # chunking fallback
        rows = df.to_dict(orient='records')
        # build prefix-target rows list first
        seqs = []
        for r in rows:
            pref = r.get('prefix','')
            if pref is None: pref = ''
            pref_ids = str_prefix_to_id_list(str(pref), meta_item2id)
            if len(pref_ids) == 0: continue
            padded = [0]*(20-len(pref_ids)) + pref_ids
            tgt_id = meta_item2id.get(str(r.get('target', '')), 0)
            seqs.append((padded, tgt_id))
        # chunk into tasks
        for start in range(0, len(seqs), MIN_PAIRS):
            sub = seqs[start:start+MIN_PAIRS*5]
            if len(sub) < MIN_PAIRS: break
            P = [x[0] for x in sub]; T = [x[1] for x in sub]
            tasks.append({'P': torch.LongTensor(P), 'T': torch.LongTensor(T), 'name': f"{p.stem}::chunk_{start}"})
            if len(tasks) >= MAX_TASKS:
                break
    if len(tasks) >= MAX_TASKS:
        break

print("Total tasks built:", len(tasks))

# Diagnostics: print first 10 tasks
for i, t in enumerate(tasks[:10]):
    nonzero = (t['P'][0] != 0).sum().item()
    print(f"task[{i}] name={t['name']} pairs={t['P'].size(0)} example_prefix_nonzero_len={nonzero}")

# Save a small summary file
summary = [{'name': t['name'], 'pairs': int(t['P'].size(0))} for t in tasks]
pd.DataFrame(summary).to_csv(META_VOCAB_DIR/'meta_tasks_summary.csv', index=False)
print("Saved tasks summary:", META_VOCAB_DIR/'meta_tasks_summary.csv')


Found candidate files: 491
Scanned 491 files; unique item tokens found: 3754480
Saved meta_item2id.json with size 3754481
Total tasks built: 491
task[0] name=amazon_prefix_target_part0000::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=1
task[1] name=amazon_prefix_target_part0001::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=7
task[2] name=amazon_prefix_target_part0002::amazon_books_2023 pairs=12544 example_prefix_nonzero_len=1
task[3] name=amazon_prefix_target_part0003::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=1
task[4] name=amazon_prefix_target_part0004::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=19
task[5] name=amazon_prefix_target_part0005::amazon_books_2023 pairs=10989 example_prefix_nonzero_len=1
task[6] name=amazon_prefix_target_part0006::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=1
task[7] name=amazon_prefix_target_part0007::amazon_books_2023 pairs=200000 example_prefix_nonzero_len=8
task[8] name=amazon_pref