In [1]:
# === Cell 0 (shared by notebooks 02 and later): resolve RUN_ID via the registry, then load paths ===
import run_id_registry as runreg
rid = runreg.bootstrap()  # resolution order: env -> file (artifacts/_current/run_id.txt) -> Part3 -> latest -> new

import importlib
import _compat.paths as paths
# Reload in case paths was already imported in this kernel -- presumably so that
# paths.RUN_ID reflects the run ID resolved above (TODO confirm against _compat.paths).
importlib.reload(paths)

# Print both values side by side so a mismatch between the registry and paths is visible.
print("[NX] RUN_ID =", rid, "| paths.RUN_ID =", paths.RUN_ID)
paths.ensure_roots()  # always creates artifacts/{RUN_ID}/...
BASE_DIRS = paths.compat_base_dirs  # {'raw','data','models','results','handoff','logs','traces'}


[NX] RUN_ID = 2025-10-22_074213 | paths.RUN_ID = 2025-10-22_074213


# 04-1_config_and_data_preparation (joblib loader fix)

- 目的: Part3 の handoff を読み込み、config と I/O を初期化し、04-1 の handoff を出力
- 修正点: **Part3 handoff のローダを `joblib.load` 優先**（`pickle.load` はフォールバック）に変更
- 前提: **Cell 0** で `run_id_registry` により RUN_ID 決定、`_compat.paths` を初期化済み


In [2]:
# === Cell 1: Config loader ===
from __future__ import annotations
import os, json
from copy import deepcopy
from pathlib import Path

try:
    import yaml  # optional
except Exception:
    yaml = None

_DEFAULTS = {
    "system": {
        "cert_only_mode": False,
        "seed": 42,
        "development_mode": False,
    },
    "model": {
        "n_estimators": 300,
        "max_depth": 8,
        "learning_rate": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "early_stopping_rounds": 20,
    },
    "engine": {
        "batch_size": 10_000,
        "gpu_auto_detect": True,
    },
    "llm": {
        "enabled": True,
        "provider": "vllm",
        "base_url": "http://127.0.0.1:30000/v1",
        "model": "Qwen/Qwen3-14B-FP8",
        "api_key": None,
    },
    "visualization": {"enabled": True},
    "db": {
        "dbname": None,
        "user": None,
        "password": None,
        "host": "localhost",
        "port": "5432",
    },
    "brand_keywords": {
        "dynamic_extraction": True,
        "min_count": 2,
        "max_brands": 100,
        "default_list": [],
    },
    "paths": {
        "models_dir": "models",
        "results_dir": "results",
    },
}

def _deep_update(dst: dict, src: dict) -> dict:
    for k, v in (src or {}).items():
        if isinstance(v, dict) and isinstance(dst.get(k), dict):
            dst[k] = _deep_update(dst[k], v)
        else:
            dst[k] = v
    return dst

def load_configuration(config_path: str | None = None, override_dict: dict | None = None) -> dict:
    """Build the effective configuration dict.

    Layers, from lowest to highest priority:

    1. a deep copy of ``_DEFAULTS``;
    2. the config file (``.json``, or ``.yml``/``.yaml`` when PyYAML is available);
    3. the ``DEV_MODE`` environment variable (sets ``system.development_mode``);
    4. ``override_dict``.

    Args:
        config_path: Path of the config file. Falls back to the ``CONFIG_PATH``
            environment variable and then to ``"config.json"``. A missing file
            is not an error; the defaults are used.
        override_dict: Optional overrides merged last. It is deep-copied first,
            so the caller's dict is never aliased into or mutated through the
            returned configuration.

    Returns:
        The merged configuration. ``db`` and ``paths`` are always dicts that
        contain at least their default keys.

    Raises:
        TypeError: If the config file's top-level document is not a mapping.
    """
    import warnings  # local import: the import lines of this cell do not provide it

    cfg = deepcopy(_DEFAULTS)
    config_path = config_path or os.getenv("CONFIG_PATH") or "config.json"
    p = Path(config_path)

    file_cfg = {}
    if p.exists():
        suffix = p.suffix.lower()
        if suffix == ".json":
            file_cfg = json.loads(p.read_text(encoding="utf-8"))
        elif suffix in {".yml", ".yaml"}:
            if yaml is not None:
                file_cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
            else:
                # Previously the file was skipped without any notice.
                warnings.warn(f"PyYAML is not installed; ignoring config file: {p}")
        else:
            warnings.warn(f"Unsupported config file type {p.suffix!r}; ignoring: {p}")
    if file_cfg and not isinstance(file_cfg, dict):
        raise TypeError(
            f"Config file {p} must contain a mapping at top level, got {type(file_cfg).__name__}"
        )
    cfg = _deep_update(cfg, file_cfg)

    def _section(name: str) -> dict:
        """Return cfg[name] as a dict, replacing a missing or non-dict value with {}."""
        section = cfg.get(name)
        if not isinstance(section, dict):
            if section is not None:
                warnings.warn(
                    f"Config section {name!r} is not a mapping "
                    f"({type(section).__name__}); using defaults instead."
                )
            section = {}
            cfg[name] = section
        return section

    # env overrides
    dev_env = os.getenv("DEV_MODE")
    if dev_env is not None:
        _section("system")["development_mode"] = str(dev_env).lower() in {"1", "true", "yes", "on"}

    if override_dict:
        # Copy so nested dicts of the caller are not shared with cfg.
        cfg = _deep_update(cfg, deepcopy(override_dict))

    # guard defaults: a section set to null in the file or override would
    # otherwise survive setdefault() as None and break the lookups below.
    db_section = _section("db")
    db_fallbacks = (
        ("dbname", None),
        ("user", None),
        ("password", None),
        ("host", "localhost"),
        ("port", "5432"),
    )
    for key, fallback in db_fallbacks:
        db_section.setdefault(key, fallback)
    paths_section = _section("paths")
    paths_section.setdefault("models_dir", "models")
    paths_section.setdefault("results_dir", "results")
    return cfg


In [3]:
# === Cell 2: Apply config / paths compat / DB guard ===
# NOTE: `os` and `load_configuration` are not imported/defined in this cell;
# they come from Cell 1, which must have been run first.
import uuid, warnings, importlib
from pathlib import Path
import _compat.paths as paths

# 1) Load cfg (CONFIG_PATH is optional; falls back to "config.json")
CFG_PATH = os.getenv("CONFIG_PATH") or "config.json"
cfg = load_configuration(CFG_PATH)

# 2) Take RUN_ID and the output directories from paths (assumes Cell 0 has run)
importlib.reload(paths)  # sync to the latest RUN_ID, just in case
paths.ensure_roots()     # create the unified directory tree
RUN_ID = paths.RUN_ID  # [PATCH: unify RUN_ID]
# SESSION_ID = RUN_ID plus the first 8 hex chars of a random UUID, so it differs on every execution.
SESSION_ID = f"{RUN_ID}-{uuid.uuid4().hex[:8]}"
# Shallow copy, so later changes to output_dirs do not touch paths.compat_base_dirs.
output_dirs = dict(paths.compat_base_dirs)  # raw/data/models/results/handoff/logs/traces

# 3) DEV_MODE
DEV_MODE = bool(cfg.get("system", {}).get("development_mode", False))

# 4) DB_CONFIG guard
_db = cfg.get("db") or {}
DB_CONFIG = {
    "dbname": _db.get("dbname"),
    "user": _db.get("user"),
    "password": _db.get("password"),
    "host": _db.get("host") or "localhost",
    "port": str(_db.get("port") or "5432"),
}
# host and port always fall back to a default above, so only dbname/user/password
# can end up in _missing. A missing key produces a warning, not an error.
_missing = [k for k,v in DB_CONFIG.items() if v in (None, "")]
if _missing:
    warnings.warn(f"DB_CONFIG の必須キーが未設定: {sorted(_missing)}  -> DB依存機能は適宜スキップしてください。")

# 5) Embed the run information into cfg
cfg["run_id"] = RUN_ID
cfg["session_id"] = SESSION_ID
cfg["output_dirs"] = output_dirs  # same dict object as output_dirs, not a copy

print(f"DEV_MODE={DEV_MODE}, RUN_ID={RUN_ID}, SESSION_ID={SESSION_ID}")
print("output_dirs:", output_dirs)


DEV_MODE=False, RUN_ID=2025-10-22_074213, SESSION_ID=2025-10-22_074213-21a0199e
output_dirs: {'raw': 'artifacts/2025-10-22_074213/raw', 'data': 'artifacts/2025-10-22_074213/processed', 'models': 'artifacts/2025-10-22_074213/models', 'results': 'artifacts/2025-10-22_074213/results', 'handoff': 'artifacts/2025-10-22_074213/handoff', 'logs': 'artifacts/2025-10-22_074213/logs', 'traces': 'artifacts/2025-10-22_074213/traces'}


In [4]:
# === Cell 3: Load, validate and re-save the Part3 handoff (joblib first) ===
import glob
import pickle
import warnings
from pathlib import Path

# Respect a handoff_in that was already resolved in this kernel; otherwise use the default path.
if 'handoff_in' not in globals():
    handoff_in = Path(output_dirs["handoff"]) / "03_ai_agent_analysis_part3.pkl"

# Loader: joblib first, plain pickle as the fallback.
# NOTE(security): both loaders unpickle the file, which can execute arbitrary code.
# Only load handoff files produced by this pipeline.
part3 = None
try:
    import joblib
    part3 = joblib.load(handoff_in)
except Exception as joblib_err:
    # Keep the joblib failure so it is still reported if the fallback fails too.
    joblib_failure = repr(joblib_err)
    try:
        with open(handoff_in, "rb") as f:
            part3 = pickle.load(f)
    except FileNotFoundError as e:
        # The file really is missing: list similarly named files as a hint.
        candidates = sorted(glob.glob(str(Path(output_dirs["handoff"]) / "03_ai_agent_analysis_part3*.pkl")))
        hint = "\n".join(f"  - {c}" for c in candidates[-30:])
        raise FileNotFoundError(
            f"Part3 handoff の読み込みに失敗しました: {handoff_in} (err={e})\n"
            f"候補（末尾30件）：\n{hint}"
        ) from e
    except Exception as e:
        # The file exists but cannot be read/unpickled; do not disguise this as "not found".
        raise RuntimeError(
            f"Part3 handoff の読み込みに失敗しました: {handoff_in} "
            f"(joblib={joblib_failure}, pickle={e!r})"
        ) from e

# Required-key check (only when the payload is a dict). Problems are reported as
# warnings so the cell still produces a handoff.
required_keys = ["false_negatives_df", "brand_keywords", "cert_full_info_map", "fn_features_df"]
if isinstance(part3, dict):
    missing = [k for k in required_keys if k not in part3]
    if missing:
        warnings.warn(f"Part3 handoff に必須キーが不足: {missing}")
else:
    warnings.warn(f"Part3 handoff が dict ではありません（type={type(part3).__name__}）。"
                  " 保存側(Part3)を確認してください。")

# === Save the 04-1 handoff ===
handoff_out = Path(output_dirs["handoff"]) / "04-1_config_and_data_preparation.pkl"
# Shallow-copy a dict payload so part3 itself is not modified; wrap anything else.
payload = dict(part3) if isinstance(part3, dict) else {"part3_payload": part3}
# These keys overwrite same-named keys coming from Part3.
# NOTE: DB_CONFIG carries the DB password, so the handoff file contains it in clear text.
payload.update({
    "cfg": cfg,
    "DB_CONFIG": DB_CONFIG,
    "RUN_ID": RUN_ID,
    "SESSION_ID": SESSION_ID,
    "output_dirs": output_dirs,
})
with open(handoff_out, "wb") as f:
    pickle.dump(payload, f)

print(f"[OK] 04-1 handoff を保存: {handoff_out}")


[OK] 04-1 handoff を保存: artifacts/2025-10-22_074213/handoff/04-1_config_and_data_preparation.pkl


In [5]:
# === Cell 4: Summary ===
from pathlib import Path
from pprint import pprint

# Default handoff locations, used only when Cell 3 has not defined handoff_in / handoff_out.
_handoff_dir = Path(output_dirs["handoff"])

print("=== SUMMARY ===")
print(f"RUN_ID       : {RUN_ID}")
print(f"SESSION_ID   : {SESSION_ID}")
print(f"DEV_MODE     : {DEV_MODE}")
# Report the paths that were actually used (a pre-resolved handoff_in may differ
# from the default), instead of recomputing the defaults.
print(f"handoff_in   : {globals().get('handoff_in', _handoff_dir / '03_ai_agent_analysis_part3.pkl')}")
print(f"handoff_out  : {globals().get('handoff_out', _handoff_dir / '04-1_config_and_data_preparation.pkl')}")
print("\n[DB_CONFIG]")
# Never echo the DB password into saved notebook output; an unset password
# still shows as None/"" so a missing value stays visible.
pprint({k: ("***" if k == "password" and v else v) for k, v in DB_CONFIG.items()})
print("\n[cfg keys]")
print(sorted(cfg))


=== SUMMARY ===
RUN_ID       : 2025-10-22_074213
SESSION_ID   : 2025-10-22_074213-21a0199e
DEV_MODE     : False
handoff_in   : artifacts/2025-10-22_074213/handoff/03_ai_agent_analysis_part3.pkl
handoff_out  : artifacts/2025-10-22_074213/handoff/04-1_config_and_data_preparation.pkl

[DB_CONFIG]
{'dbname': 'rapids_data',
 'host': 'localhost',
 'password': '***',
 'port': '5432',
 'user': 'postgres'}

[cfg keys]
['brand_keywords', 'db', 'engine', 'llm', 'model', 'output_dirs', 'paths', 'run_id', 'session_id', 'system', 'visualization']
