In [1]:
# === Cell 0 (02以降 共通): レジストリから解決して paths を読む ===
import run_id_registry as runreg
rid = runreg.bootstrap()  # env→ファイル(artifacts/_current/run_id.txt)→Part3→latest→新規 の順で解決

import importlib
import _compat.paths as paths
importlib.reload(paths)

print("[NX] RUN_ID =", rid, "| paths.RUN_ID =", paths.RUN_ID)
paths.ensure_roots()  # artifacts/{RUN_ID}/... を必ず作成
BASE_DIRS = paths.compat_base_dirs  # {'raw','data','models','results','handoff','logs','traces'}


[NX] RUN_ID = 2026-02-02_220431 | paths.RUN_ID = 2026-02-02_220431


# 04-1_config_and_data_preparation (joblib loader fix)

- 目的: Part3 の handoff を読み込み、config と I/O を初期化し、04-1 の handoff を出力
- 修正点: **Part3 handoff のローダを `joblib.load` 優先**（`pickle.load` はフォールバック）に変更
- 前提: **Cell 0** で `run_id_registry` により RUN_ID 決定、`_compat.paths` を初期化済み


In [2]:
# === Cell 1: Config loader (STRICT; no defaults) ===
# === CHANGELOG ===
# 2025-12-28: _DEFAULTS を全削除（暗黙のフォールバックを廃止）。_compat/config.json を既定パスとして厳格に読み込む。

from __future__ import annotations
import os, json
from pathlib import Path

try:
    import yaml  # optional
except Exception:
    yaml = None

def load_configuration(config_path: str | None = None) -> dict:
    """
    Strict configuration loader.
    - 既定: 環境変数 CONFIG_PATH があればそれを使用。無ければ `_compat/config.json` を使用。
    - ファイルが無ければ FileNotFoundError（デフォルト値での救済はしない）
    """
    config_path = config_path or os.getenv("CONFIG_PATH") or "_compat/config.json"
    p = Path(config_path)

    if not p.exists():
        raise FileNotFoundError(
            f"❌ 必須設定ファイルが見つかりません: {p.resolve()}\n"
            "   デフォルト設定は廃止されています。`_compat/config.json`（または CONFIG_PATH）を用意してください。"
        )

    try:
        if p.suffix.lower() == ".json":
            return json.loads(p.read_text(encoding="utf-8"))
        if p.suffix.lower() in {".yml", ".yaml"}:
            if yaml is None:
                raise ModuleNotFoundError("yaml loader is unavailable (PyYAML not installed)")
            return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
    except Exception as e:
        raise ValueError(f"❌ 設定ファイルの読み込みに失敗: {p.resolve()} ({e})")

    raise ValueError(f"❌ 未対応の設定ファイル拡張子です: {p.suffix} (path={p.resolve()})")


In [3]:
# === CHANGELOG ===
# 2025-12-28: DB_CONFIG guard を strict 化（不足キーは即エラー）

# === Cell 2: Apply config / paths compat / DB guard ===
import uuid, importlib
from pathlib import Path
import _compat.paths as paths

# 1) cfg の読込（CONFIG_PATH は任意）
CFG_PATH = os.getenv("CONFIG_PATH") or "_compat/config.json"
cfg = load_configuration(CFG_PATH)

# 2) paths から RUN_ID / 出力ディレクトリを受け取る（Cell 0 前提）
importlib.reload(paths)  # 念のため最新のRUN_IDに同期
paths.ensure_roots()     # 統一ツリーを作成
RUN_ID = paths.RUN_ID  # [PATCH: unify RUN_ID]
SESSION_ID = f"{RUN_ID}-{uuid.uuid4().hex[:8]}"
output_dirs = dict(paths.compat_base_dirs)  # raw/data/models/results/handoff/logs/traces

# 3) DEV_MODE
DEV_MODE = bool(cfg.get("system", {}).get("development_mode", False))

# 4) DB_CONFIG guard (STRICT)
if "db" not in cfg or not isinstance(cfg["db"], dict):
    raise KeyError("❌ config.json に 'db' セクションがありません。")

_db = cfg["db"]
required_keys = ["dbname", "user", "password", "host", "port"]
missing = [k for k in required_keys if _db.get(k) in (None, "", [])]
if missing:
    raise KeyError(f"❌ config.json の db セクションに必須キーが不足しています: {missing}")

DB_CONFIG = {
    "dbname": _db["dbname"],
    "user": _db["user"],
    "password": _db["password"],
    "host": _db["host"],
    "port": str(_db["port"]),
}

# 5) cfgへ実行情報を埋め込み
cfg["run_id"] = RUN_ID
cfg["session_id"] = SESSION_ID
cfg["output_dirs"] = output_dirs

print(f"DEV_MODE={DEV_MODE}, RUN_ID={RUN_ID}, SESSION_ID={SESSION_ID}")
print("output_dirs:", output_dirs)


DEV_MODE=False, RUN_ID=2026-02-02_220431, SESSION_ID=2026-02-02_220431-0956f082
output_dirs: {'raw': 'artifacts/2026-02-02_220431/raw', 'data': 'artifacts/2026-02-02_220431/processed', 'models': 'artifacts/2026-02-02_220431/models', 'results': 'artifacts/2026-02-02_220431/results', 'handoff': 'artifacts/2026-02-02_220431/handoff', 'logs': 'artifacts/2026-02-02_220431/logs', 'traces': 'artifacts/2026-02-02_220431/traces'}


In [4]:
# === Cell 3: Part3 handoff 読み込み・検証・保存（joblib優先） ===
import warnings, glob
from pathlib import Path

# pre-resolved handoff_in があれば尊重、無ければ既定パス
if 'handoff_in' not in globals():
    handoff_in = Path(output_dirs["handoff"]) / "03_ai_agent_analysis_part3.pkl"

# loader: joblib → pickle フォールバック
part3 = None
try:
    import joblib
    part3 = joblib.load(handoff_in)
except Exception:
    try:
        import pickle
        with open(handoff_in, "rb") as f:
            part3 = pickle.load(f)
    except Exception as e:
        candidates = sorted(glob.glob(str(Path(output_dirs["handoff"]) / "03_ai_agent_analysis_part3*.pkl")))
        hint = "\n".join(f"  - {c}" for c in candidates[-30:])
        raise FileNotFoundError(
            f"Part3 handoff の読み込みに失敗しました: {handoff_in} (err={e})\n"
            f"候補（末尾30件）：\n{hint}"
        )

# 必須キー検証（dict の場合のみ）
required_keys = ["false_negatives_df", "brand_keywords", "cert_full_info_map", "fn_features_df"]
if isinstance(part3, dict):
    missing = [k for k in required_keys if k not in part3]
    if missing:
        warnings.warn(f"Part3 handoff に必須キーが不足: {missing}")
else:
    warnings.warn(f"Part3 handoff が dict ではありません（type={type(part3).__name__}）。"
                  " 保存側(Part3)を確認してください。")

# === 04-1 handoff を保存 ===
handoff_out = Path(output_dirs["handoff"]) / "04-1_config_and_data_preparation.pkl"
payload = dict(part3) if isinstance(part3, dict) else {"part3_payload": part3}
payload.update({
    "cfg": cfg,
    "DB_CONFIG": DB_CONFIG,
    "RUN_ID": RUN_ID,
    "SESSION_ID": SESSION_ID,
    "output_dirs": output_dirs,
})
import pickle
with open(handoff_out, "wb") as f:
    pickle.dump(payload, f)

print(f"[OK] 04-1 handoff を保存: {handoff_out}")


[OK] 04-1 handoff を保存: artifacts/2026-02-02_220431/handoff/04-1_config_and_data_preparation.pkl


In [5]:
# === Cell 4: Summary ===
from pathlib import Path
from pprint import pprint
print("=== SUMMARY ===")
print(f"RUN_ID       : {RUN_ID}")
print(f"SESSION_ID   : {SESSION_ID}")
print(f"DEV_MODE     : {DEV_MODE}")
print(f"handoff_in   : {Path(output_dirs['handoff']) / '03_ai_agent_analysis_part3.pkl'}")
print(f"handoff_out  : {Path(output_dirs['handoff']) / '04-1_config_and_data_preparation.pkl'}")
print("\n[DB_CONFIG]")
pprint(DB_CONFIG)
print("\n[cfg keys]")
print(sorted(list(cfg.keys())))


=== SUMMARY ===
RUN_ID       : 2026-02-02_220431
SESSION_ID   : 2026-02-02_220431-0956f082
DEV_MODE     : False
handoff_in   : artifacts/2026-02-02_220431/handoff/03_ai_agent_analysis_part3.pkl
handoff_out  : artifacts/2026-02-02_220431/handoff/04-1_config_and_data_preparation.pkl

[DB_CONFIG]
{'dbname': 'rapids_data',
 'host': 'localhost',
 'password': 'asomura',
 'port': '5432',
 'user': 'postgres'}

[cfg keys]
['DB_CONFIG', 'analysis', 'brand_keywords', 'db', 'engine', 'free_ca_list', 'full_processing', 'handoff', 'llm', 'model', 'output_dirs', 'paths', 'run_id', 'session_id', 'system', 'tld_analysis', 'visualization']
