# 02 • Features Builder (оркестратор)

Цель: в 3 клика собрать фичи через `tools/run_features.py`, видеть план (Dry-run), логи, паспорт фич и базовые проверки.

Шаги:
1) Выбери профиль, поставь `RUN_TAG`, пути и столбцы  
2) Настрой сплит и долю данных  
3) Отметь блоки и их параметры  
4) Нажми **Dry-run** → проверь план  
5) Нажми **Build features** → смотри лог → открой паспорт  


In [None]:
from IPython.display import display, HTML
# Inject CSS so that long button captions wrap onto several lines instead of
# being clipped; one rule targets the ipywidgets v8 DOM, the other v7.
display(HTML("""
<style>
/* ipywidgets v8 (JupyterLab 4) */
.jp-OutputArea .widget-button .widget-label { 
  white-space: normal !important; 
  overflow: visible !important; 
  text-overflow: clip !important;
  line-height: 1.2 !important;
}
/* fallback для ipywidgets v7 */
.jupyter-widgets.widget-button .widget-label {
  white-space: normal !important; 
  overflow: visible !important; 
  text-overflow: clip !important;
  line-height: 1.2 !important;
}
</style>
"""))


In [None]:
import os, sys, json, time, math, subprocess, textwrap, shutil, pickle, gc, uuid, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Silence every warning for the session and set the default figure size.
warnings.filterwarnings("ignore")
plt.rcParams["figure.figsize"] = (10, 5)

# The whole UI below is built on ipywidgets, so fail early with an install hint.
try:
    import ipywidgets as w
    from IPython.display import display, clear_output, HTML
except Exception as e:
    raise RuntimeError("Нужен пакет ipywidgets (pip install ipywidgets)") from e

# ---------- Helpers
def now_tag(prefix="run"):
    """Return ``<prefix>_<YYYYmmdd_HHMMSS>`` built from the current local time."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{prefix}_{stamp}"

def mem_gb_df(df):
    """Return the deep memory footprint of ``df`` in GiB, or 0.0 if it cannot be measured."""
    bytes_per_gib = 1024 ** 3
    try:
        total_bytes = df.memory_usage(deep=True).sum()
        return float(total_bytes) / bytes_per_gib
    except Exception:
        # Best effort: anything that does not behave like a DataFrame counts as 0.
        return 0.0

def run_cmd(args, cwd=None, stream=False, log_file=None):
    """Run a command and return ``(exit_code, combined_output)``.

    Args:
        args: Command as a list of strings (executed without a shell).
        cwd: Working directory for the child process, or ``None``.
        stream: If true, echo the child's merged stdout/stderr line by line
            while it runs; otherwise capture everything and return at the end.
        log_file: Optional path; the output is appended to it and its parent
            directories are created if needed.

    Returns:
        Tuple of the process return code and its output as a single string
        (stdout followed by stderr when not streaming).
    """
    if not stream:
        # errors="replace": undecodable bytes must not abort the whole run.
        res = subprocess.run(args, cwd=cwd, capture_output=True, text=True, errors="replace")
        output = res.stdout + res.stderr
        if log_file:
            Path(log_file).parent.mkdir(parents=True, exist_ok=True)
            with open(log_file, "a", encoding="utf-8") as f:
                f.write(output)
        return res.returncode, output

    # Open the log before starting the child so that a failing open() cannot
    # leave an already started process behind.
    lf = None
    if log_file:
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)
        lf = open(log_file, "a", encoding="utf-8")
    lines = []
    try:
        with subprocess.Popen(
            args,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
            bufsize=1,
        ) as proc:
            try:
                for line in proc.stdout:
                    print(line, end="")
                    if lf:
                        lf.write(line)
                    lines.append(line)
            except BaseException:
                # Interrupt or I/O error while streaming: do not orphan the child.
                proc.kill()
                raise
            code = proc.wait()
    finally:
        if lf:
            lf.close()
    return code, "".join(lines)

def tail_log(path, n=120):
    """Return the last ``n`` lines of a text log as one string.

    Args:
        path: Log file location.
        n: Number of trailing lines to keep; ``n <= 0`` yields an empty string.

    Returns:
        The joined tail of the file, or a "[нет лога] <path>" marker when the
        file does not exist.
    """
    from collections import deque

    p = Path(path)
    if not p.exists():
        return f"[нет лога] {p}"
    if n <= 0:
        # data[-0:] would have returned the *whole* file; an empty tail is meant.
        return ""
    with p.open("r", encoding="utf-8", errors="ignore") as f:
        # A bounded deque keeps only the tail instead of loading the entire log.
        return "".join(deque(f, maxlen=n))

def read_json(path):
    """Load a UTF-8 JSON file; return ``None`` when it is missing or cannot be parsed."""
    src = Path(path)
    if src.exists():
        try:
            raw = src.read_text(encoding="utf-8")
            return json.loads(raw)
        except Exception:
            # Best effort: unreadable or malformed files are treated as absent.
            return None
    return None

def save_json(path, obj):
    """Write ``obj`` as indented UTF-8 JSON to ``path``, creating parent directories."""
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(obj, ensure_ascii=False, indent=2)
    target.write_text(payload, encoding="utf-8")

def try_read_parquet(path):
    """Read a parquet file into a DataFrame, or return ``None`` if the file is absent.

    The default pandas engine is tried first; if it fails, ``fastparquet`` is
    tried as a fallback. When the fallback fails as well, the error from the
    first attempt is the one that is raised.
    """
    src = Path(path)
    if not src.exists():
        return None
    try:
        return pd.read_parquet(src)
    except Exception as primary_error:
        # Default engine failed (e.g. pyarrow is missing): try fastparquet.
        try:
            import fastparquet  # noqa
            return pd.read_parquet(src, engine="fastparquet")
        except Exception:
            raise primary_error

UI_STATE_PATH = Path(".tmp/feature_ui_state.json")

def load_ui_state():
    """Return the persisted UI state dict, or an empty dict when none can be read."""
    state = read_json(UI_STATE_PATH)
    return state if state else {}

def save_ui_state(d):
    """Persist the UI state dict ``d`` to ``UI_STATE_PATH``."""
    save_json(UI_STATE_PATH, d)

def autoload_or(default, key, ui):
    """Return the saved value for ``key``, falling back to ``default``.

    ``ui`` is accepted for call-site compatibility but is not used. The state
    file is re-read on every call.
    """
    saved = load_ui_state()
    return saved.get(key, default)

def badge(text, color="#0a0"):
    """Build an inline HTML pill with bold white ``text`` on a ``color`` background.

    ``text`` and ``color`` are interpolated as-is (no HTML escaping).
    """
    style = f"display:inline-block;background:{color};color:#fff;padding:3px 8px;border-radius:7px;font-weight:600"
    return HTML(f"<span style='{style}'>{text}</span>")

print("Python:", sys.version.split()[0])

In [None]:
# Shared layouts: fixed-height buttons that may grow in width, wrapping rows
# with an 8px gap, and a three-column grid.
BTN_LAYOUT = w.Layout(min_width="220px", width="auto", height="36px", flex="0 0 auto")
ROW_LAYOUT = w.Layout(flex_flow="row wrap", grid_gap="8px")
GRID_LAYOUT = w.Layout(grid_template_columns="repeat(3, minmax(220px, 1fr))", grid_gap="8px")

In [None]:
# Base paths / profile. Initial values come from the saved UI state when present.
PROFILE = w.Dropdown(options=["scout","gate","tour","full","panic"], value=autoload_or("tour","PROFILE",None), description="PROFILE")
RUN_TAG  = w.Text(value=autoload_or(now_tag("s5e11"), "RUN_TAG", None), description="RUN_TAG", layout=w.Layout(width="60%"))

BASE     = w.Text(value=autoload_or(".", "BASE", None), description="BASE")
DATA_DIR = w.Text(value=autoload_or("./data", "DATA_DIR", None), description="DATA_DIR")

# HTML label filled in by refresh_out_hint with the artifact and log locations.
OUT_HINT = w.HTML()

def refresh_out_hint(*a):
    """Update the hint with the artifact directory and log path for the current RUN_TAG."""
    run_dir = Path("artifacts/sets") / RUN_TAG.value.strip()
    log_file = run_dir / "run_features.log"
    OUT_HINT.value = f"<b>Артефакты:</b> {run_dir.as_posix()}<br>Лог: {log_file.as_posix()}"

# Recompute the hint whenever one of the basic inputs changes.
for wid in (RUN_TAG, BASE, DATA_DIR, PROFILE):
    wid.observe(refresh_out_hint, names="value")

refresh_out_hint()
box0 = w.VBox([
    w.HBox([PROFILE, RUN_TAG], layout=ROW_LAYOUT),
    w.HBox([BASE, DATA_DIR], layout=ROW_LAYOUT),
    OUT_HINT,
])
display(box0)

In [None]:
# Column-name inputs; empty means "not set".
ID_COL     = w.Text(value=autoload_or("id","ID_COL",None), description="ID_COL")
TARGET_COL = w.Text(value=autoload_or("","TARGET_COL",None), description="TARGET_COL")
DATE_COL   = w.Text(value=autoload_or("","DATE_COL",None), description="DATE_COL")
GROUP_COL  = w.Text(value=autoload_or("","GROUP_COL",None), description="GROUP_COL")
LAT_COL    = w.Text(value=autoload_or("","LAT_COL",None), description="LAT_COL")
LON_COL    = w.Text(value=autoload_or("","LON_COL",None), description="LON_COL")

BTN_CHECK = w.Button(description="Проверить файлы и autodetect", button_style="info", layout=BTN_LAYOUT)
OUT_CHECK = w.Output(layout={'border':'1px solid #ccc'})

def _autodetect_column_kinds(frame, exclude):
    """Split the columns of ``frame`` (minus ``exclude``) into numeric, categorical and text lists."""
    num_cols, cat_cols, text_cols = [], [], []
    for c in frame.columns:
        if c in exclude:
            continue
        s = frame[c]
        if pd.api.types.is_numeric_dtype(s):
            num_cols.append(c)
        elif pd.api.types.is_string_dtype(s):
            # "Text" = more than 30% of the values are strings longer than 30 chars.
            is_text = s.map(lambda x: isinstance(x, str) and len(x) > 30).mean() > 0.3
            (text_cols if is_text else cat_cols).append(c)
        else:
            cat_cols.append(c)
    return num_cols, cat_cols, text_cols

def on_check_clicked(_):
    """Validate train/test CSVs and the ID/target columns, then print autodetected column kinds.

    Everything is printed into ``OUT_CHECK``; the function returns early with
    an error line when a file or a required column is missing.
    """
    OUT_CHECK.clear_output(wait=True)
    with OUT_CHECK:
        # Strip the inputs the same way assemble_base_args does, so the check
        # validates exactly what will be passed to the builder.
        data = Path(DATA_DIR.value.strip())
        id_col = ID_COL.value.strip()
        tr, te = data/"train.csv", data/"test.csv"
        if not tr.exists() or not te.exists():
            print("❌ Не найден train/test:", tr, te)
            return
        train = pd.read_csv(tr)
        test  = pd.read_csv(te)
        print("Train:", train.shape, "| Test:", test.shape)
        if id_col not in train.columns or id_col not in test.columns:
            print(f"❌ ID_COL '{id_col}' отсутствует")
            return
        tgt = TARGET_COL.value.strip() or None
        if tgt and tgt not in train.columns:
            print(f"❌ TARGET_COL '{tgt}' отсутствует в train")
            return
        # Autodetect feature kinds, ignoring the id and target columns.
        exclude = {id_col}
        if tgt:
            exclude.add(tgt)
        num_cols, cat_cols, text_cols = _autodetect_column_kinds(train, exclude)
        print(f"NUM: {len(num_cols)}, CAT: {len(cat_cols)}, TEXT: {len(text_cols)}")
        display(train.head(3))
        print("Память train:", round(mem_gb_df(train),3), "GB")

BTN_CHECK.on_click(on_check_clicked)
display(w.VBox([w.HBox([ID_COL, TARGET_COL, DATE_COL], layout=ROW_LAYOUT), w.HBox([GROUP_COL, LAT_COL, LON_COL], layout=ROW_LAYOUT), BTN_CHECK, OUT_CHECK]))

In [None]:
# Cross-validation split settings; values are restored from the saved UI state.
SPLIT_KIND   = w.Dropdown(options=["stratified","kfold","group","time"], value=autoload_or("stratified","SPLIT_KIND",None), description="SPLIT")
N_SPLITS     = w.IntText(value=int(autoload_or(5,"N_SPLITS",None)), description="N_SPLITS")
SEED         = w.IntText(value=int(autoload_or(42,"SEED",None)), description="SEED")
TIME_EMBARGO = w.Text(value=autoload_or("","TIME_EMBARGO",None), description="EMBARGO")

# Share of the data to use. NOTE(review): a saved FRAC that is not one of the
# three option values would presumably be rejected by the Dropdown — confirm.
FRAC = w.Dropdown(options=[("15% (scout)",0.15),("40% (gate)",0.4),("100% (full)",1.0)],
                  value=float(autoload_or(1.0,"FRAC",None)), description="FRAC")

display(w.HBox([SPLIT_KIND, N_SPLITS, SEED, TIME_EMBARGO, FRAC], layout=ROW_LAYOUT))

In [None]:
# Global flags (passed to the builder as --fast / --safe / --use-cache / --threads)
FAST      = w.Checkbox(value=bool(autoload_or(True,"FAST",None)), description="FAST")
SAFE      = w.Checkbox(value=bool(autoload_or(True,"SAFE",None)), description="SAFE")
USE_CACHE = w.Checkbox(value=bool(autoload_or(True,"USE_CACHE",None)), description="USE_CACHE")
THREADS   = w.IntText(value=int(autoload_or(-1,"THREADS",None)), description="THREADS")

# Feature-block checkboxes; each checked box becomes an enabled entry in the blocks YAML
B_num_basic   = w.Checkbox(value=bool(autoload_or(True,"B_num_basic",None)), description="num_basic")
B_cat_freq    = w.Checkbox(value=bool(autoload_or(True,"B_cat_freq",None)), description="cat_freq")
B_cat_te_oof  = w.Checkbox(value=bool(autoload_or(False,"B_cat_te_oof",None)), description="cat_te_oof")
B_text_tfidf  = w.Checkbox(value=bool(autoload_or(False,"B_text_tfidf",None)), description="text_tfidf")
B_geo_grid    = w.Checkbox(value=bool(autoload_or(False,"B_geo_grid",None)), description="geo_grid")
B_geo_nb      = w.Checkbox(value=bool(autoload_or(False,"B_geo_nb",None)), description="geo_neighbors")
B_time_agg    = w.Checkbox(value=bool(autoload_or(False,"B_time_agg",None)), description="time_agg")
B_crosses     = w.Checkbox(value=bool(autoload_or(False,"B_crosses",None)), description="crosses (whitelist)")
B_img_stats   = w.Checkbox(value=bool(autoload_or(False,"B_img_stats",None)), description="img_stats")
B_img_embed   = w.Checkbox(value=bool(autoload_or(False,"B_img_embed",None)), description="img_embed")

# Two wrapping rows of block checkboxes
row_blocks = w.HBox([B_num_basic, B_cat_freq, B_cat_te_oof, B_text_tfidf, B_crosses], layout=ROW_LAYOUT)
row_blocks2= w.HBox([B_geo_grid, B_geo_nb, B_time_agg, B_img_stats, B_img_embed], layout=ROW_LAYOUT)

# Block parameters (only the minimum needed)
# Target encoding
te_top_k    = w.IntText(value=int(autoload_or(3,"te_top_k",None)), description="TE top_k")
te_smooth   = w.Text(value=autoload_or("m-estimate","te_smooth",None), description="TE smoothing")

# TF-IDF: ngram is a comma-separated pair; svd_k is an integer or the text "None"
tfidf_col   = w.Text(value=autoload_or("","tfidf_col",None), description="text_col")
tfidf_min_df= w.IntText(value=int(autoload_or(5,"tfidf_min_df",None)), description="min_df")
tfidf_ngram = w.Text(value=autoload_or("1,2","tfidf_ngram",None), description="ngram")
tfidf_svd_k = w.Text(value=autoload_or("None","tfidf_svd_k",None), description="svd_k")

# Geo: comma-separated integer lists
geo_steps   = w.Text(value=autoload_or("1000","geo_steps",None), description="steps_m (csv)")
geo_radii   = w.Text(value=autoload_or("1000","geo_radii",None), description="radii_m (csv)")

# Time aggregates: comma-separated integer lists plus optional group columns
time_lags   = w.Text(value=autoload_or("1,7","time_lags",None), description="lags")
time_roll   = w.Text(value=autoload_or("7,30","time_roll",None), description="rollings")
time_groups = w.Text(value=autoload_or("", "time_groups", None), description="group_cols (csv)")

# Crosses: path to the whitelist YAML
cross_white = w.Text(value=autoload_or("configs/crosses_whitelist.yaml","cross_white",None), description="whitelist.yaml")

# One accordion section per parameter group; the titles below are assigned by
# child index, so they must stay in the same order as `children`.
accordion = w.Accordion(children=[
    w.VBox([w.HBox([FAST, SAFE, USE_CACHE, THREADS], layout=ROW_LAYOUT)]),
    w.VBox([w.HBox([te_top_k, te_smooth], layout=ROW_LAYOUT)]),
    w.VBox([w.HBox([tfidf_col, tfidf_min_df, tfidf_ngram, tfidf_svd_k], layout=ROW_LAYOUT)]),
    w.VBox([w.HBox([geo_steps, geo_radii], layout=ROW_LAYOUT)]),
    w.VBox([w.HBox([time_lags, time_roll, time_groups], layout=ROW_LAYOUT)]),
    w.VBox([w.HBox([cross_white], layout=ROW_LAYOUT)]),
])
accordion.set_title(0, "Глобальные флаги")
accordion.set_title(1, "Параметры TE")
accordion.set_title(2, "Параметры TF-IDF")
accordion.set_title(3, "Параметры GEO")
accordion.set_title(4, "Параметры TIME")
accordion.set_title(5, "Параметры CROSSES")

display(row_blocks, row_blocks2, accordion)

In [None]:
# Dry-run button and the output pane that receives the printed plan.
BTN_DRY   = w.Button(description="Dry-run (план)", button_style="warning", layout=BTN_LAYOUT)
OUT_DRY   = w.Output(layout={'border':'1px dashed #e0a'})

def _parse_int_csv(text):
    """Parse a comma-separated string into a list of ints, silently skipping other tokens."""
    # isdecimal() rather than isdigit(): every token it accepts is valid for
    # int(), whereas e.g. "²".isdigit() is True but int("²") raises ValueError.
    tokens = (tok.strip() for tok in text.split(","))
    return [int(tok) for tok in tokens if tok.isdecimal()]

def build_blocks_yaml_dict():
    """Collect the enabled feature blocks and their parameters from the widgets.

    Returns:
        ``{"blocks": {<block_name>: {"enabled": True, ...params}}}`` containing
        only the blocks whose checkbox is ticked.

    Raises:
        ValueError: if ``svd_k`` is neither empty, "None" nor an integer.
    """
    flags = {
        "num_basic": B_num_basic.value,
        "cat_freq": B_cat_freq.value,
        "cat_te_oof": B_cat_te_oof.value,
        "text_tfidf": B_text_tfidf.value,
        "geo_grid": B_geo_grid.value,
        "geo_neighbors": B_geo_nb.value,
        "time_agg": B_time_agg.value,
        "crosses": B_crosses.value,
        "img_stats": B_img_stats.value,
        "img_embed": B_img_embed.value,
    }
    blocks = {name: {"enabled": True} for name, enabled in flags.items() if enabled}

    # Per-block parameters
    if B_cat_te_oof.value:
        blocks["cat_te_oof"].update({"top_k": int(te_top_k.value), "smoothing": te_smooth.value})
    if B_text_tfidf.value:
        svd_raw = tfidf_svd_k.value.strip()
        # An empty field means "no SVD", same as the literal text "None".
        svd_k = None if svd_raw.lower() in ("", "none") else int(svd_raw)
        extra = {
            "min_df": int(tfidf_min_df.value),
            "ngram_range": _parse_int_csv(tfidf_ngram.value),
            "svd_k": svd_k,
        }
        text_col = tfidf_col.value.strip()
        if text_col:
            extra["text_cols"] = [text_col]
        blocks["text_tfidf"].update(extra)
    if B_geo_grid.value:
        blocks["geo_grid"].update({"steps_m": _parse_int_csv(geo_steps.value)})
    if B_geo_nb.value:
        blocks["geo_neighbors"].update({"radii_m": _parse_int_csv(geo_radii.value)})
    if B_time_agg.value:
        extra = {
            "lags": _parse_int_csv(time_lags.value),
            "rollings": _parse_int_csv(time_roll.value),
        }
        grp = [g.strip() for g in time_groups.value.split(",") if g.strip()]
        if grp:
            extra["group_cols"] = grp
        blocks["time_agg"].update(extra)
    if B_crosses.value:
        blocks["crosses"].update({"whitelist": {"path": cross_white.value}})
    return {"blocks": blocks}

def save_blocks_yaml_for_run(tag, blocks_dict):
    """Write ``blocks_dict`` to ``artifacts/sets/<tag>/blocks_override.yaml``.

    Args:
        tag: Run tag used as the sub-directory name.
        blocks_dict: Plain dict to serialize with ``yaml.safe_dump``.

    Returns:
        Path of the written YAML file.

    Raises:
        RuntimeError: if PyYAML is not installed.
    """
    # Check the dependency first so a missing PyYAML leaves no empty run directory.
    try:
        import yaml
    except ImportError as e:
        raise RuntimeError("Нужен PyYAML для генерации blocks yaml (pip install pyyaml)") from e
    run_dir = Path("artifacts/sets")/tag
    run_dir.mkdir(parents=True, exist_ok=True)
    yaml_path = run_dir/"blocks_override.yaml"
    yaml_path.write_text(yaml.safe_dump(blocks_dict, allow_unicode=True, sort_keys=False), encoding="utf-8")
    return yaml_path

def assemble_base_args(dry=False):
    """Build the ``tools/run_features.py`` command line from the current widget values.

    Args:
        dry: When true, ``--dry-run`` is appended.

    Returns:
        The command as a list of strings, starting with the current interpreter.
    """
    cmd = [
        sys.executable, "tools/run_features.py",
        "--profile", PROFILE.value,
        "--tag", RUN_TAG.value.strip(),
        "--id-col", ID_COL.value.strip(),
        "--base", BASE.value.strip(),
        "--data-dir", DATA_DIR.value.strip(),
    ]

    # Optional column names: emitted only when the field is non-blank.
    optional_columns = (
        ("--target-col", TARGET_COL),
        ("--date-col", DATE_COL),
        ("--group-col", GROUP_COL),
        ("--lat-col", LAT_COL),
        ("--lon-col", LON_COL),
    )
    for option, widget in optional_columns:
        text = widget.value.strip()
        if text:
            cmd.extend([option, text])

    # Split / dataset settings
    cmd.extend([
        "--split-kind", SPLIT_KIND.value,
        "--n-splits", str(N_SPLITS.value),
        "--seed", str(SEED.value),
        "--frac", str(FRAC.value),
    ])
    embargo = TIME_EMBARGO.value.strip()
    if embargo:
        cmd.extend(["--time-embargo", embargo])

    # Boolean switches
    for option, widget in (("--fast", FAST), ("--safe", SAFE), ("--use-cache", USE_CACHE)):
        if widget.value:
            cmd.append(option)
    if THREADS.value is not None:
        cmd.extend(["--threads", str(THREADS.value)])

    # --save-set is always passed for convenience
    cmd.append("--save-set")
    if dry:
        cmd.append("--dry-run")
    return cmd

def on_dry(_):
    """Persist the UI state, write the blocks YAML and run the builder with --dry-run.

    The command, its captured output and an OK/FAIL badge go to ``OUT_DRY``.
    """
    # Each state key is also the name of the global widget holding its value.
    state_keys = (
        "PROFILE", "RUN_TAG", "BASE", "DATA_DIR",
        "ID_COL", "TARGET_COL", "DATE_COL",
        "GROUP_COL", "LAT_COL", "LON_COL",
        "SPLIT_KIND", "N_SPLITS", "SEED", "TIME_EMBARGO",
        "FRAC", "FAST", "SAFE", "USE_CACHE", "THREADS",
        "B_num_basic", "B_cat_freq", "B_cat_te_oof",
        "B_text_tfidf", "B_geo_grid", "B_geo_nb",
        "B_time_agg", "B_crosses", "B_img_stats", "B_img_embed",
        "te_top_k", "te_smooth",
        "tfidf_col", "tfidf_min_df", "tfidf_ngram", "tfidf_svd_k",
        "geo_steps", "geo_radii",
        "time_lags", "time_roll", "time_groups",
        "cross_white",
    )
    OUT_DRY.clear_output(wait=True)
    with OUT_DRY:
        namespace = globals()
        save_ui_state({key: namespace[key].value for key in state_keys})

        yaml_path = save_blocks_yaml_for_run(RUN_TAG.value.strip(), build_blocks_yaml_dict())
        args = assemble_base_args(dry=True) + ["--blocks-yaml", str(yaml_path)]
        print("CMD:", " ".join(args))
        code, out = run_cmd(args, stream=False, log_file=None)
        print(out if out.strip() else f"[exit code {code}]")
        if code != 0:
            display(badge("DRY-RUN FAIL", "#c00"))
        else:
            display(badge("DRY-RUN OK", "#0a0"))

BTN_DRY.on_click(on_dry)
display(w.VBox([BTN_DRY, OUT_DRY]))

In [None]:
BTN_BUILD = w.Button(description="Build features", button_style="success", layout=BTN_LAYOUT)
BTN_REFRESH_LOG = w.Button(description="Обновить лог", button_style="", layout=BTN_LAYOUT)
# Fixed-height, scrollable pane shared by the build stream and the log tail.
OUT_LOG = w.Output(layout={'border':'1px solid #ccc', 'height':'260px', 'overflow_y':'auto'})

def on_build(_):
    """Write the blocks YAML and run the feature builder, streaming its output.

    The output is echoed into ``OUT_LOG`` and appended to
    ``artifacts/sets/<RUN_TAG>/run_features.log``; a BUILD OK/FAIL badge
    follows the exit code.
    """
    OUT_LOG.clear_output(wait=True)
    with OUT_LOG:
        tag = RUN_TAG.value.strip()
        blocks_dict = build_blocks_yaml_dict()
        yaml_path = save_blocks_yaml_for_run(tag, blocks_dict)
        args = assemble_base_args(dry=False) + ["--blocks-yaml", str(yaml_path)]
        log_path = Path("artifacts/sets")/tag/"run_features.log"
        print("CMD:", " ".join(args))
        # The newline must be the "\n" escape: a string literal cannot span
        # physical lines.
        print("Лог будет писаться в:", log_path.as_posix(), "\n")
        code, _output = run_cmd(args, stream=True, log_file=log_path)
        print("\n[EXIT CODE]", code)
        if code==0:
            display(badge("BUILD OK", "#0a0"))
        else:
            display(badge("BUILD FAIL", "#c00"))

def on_refresh(_):
    """Replace the log pane content with the last 200 lines of the run's log file."""
    OUT_LOG.clear_output(wait=True)
    with OUT_LOG:
        run_dir = Path("artifacts/sets") / RUN_TAG.value.strip()
        log_tail = tail_log(run_dir / "run_features.log", n=200)
        print(log_tail)

BTN_BUILD.on_click(on_build)
BTN_REFRESH_LOG.on_click(on_refresh)
display(w.HBox([BTN_BUILD, BTN_REFRESH_LOG], layout=ROW_LAYOUT), OUT_LOG)

In [None]:
BTN_CATALOG = w.Button(description="Открыть паспорт фич", button_style="info", layout=BTN_LAYOUT)
OUT_CAT = w.Output(layout={'border':'1px solid #ccc'})

def on_catalog(_):
    """Show the feature catalog, dense/sparse matrix availability and meta.json of the run.

    Reads ``catalog.json``, ``meta.json`` and the ``X_dense_*`` / ``X_sparse_*``
    files from ``artifacts/sets/<RUN_TAG>`` and prints everything into ``OUT_CAT``.
    """
    OUT_CAT.clear_output(wait=True)
    with OUT_CAT:
        tag = RUN_TAG.value.strip()
        base = Path("artifacts/sets")/tag
        cat = read_json(base/"catalog.json")
        meta = read_json(base/"meta.json")
        Xd_tr = base/"X_dense_train.parquet"
        Xd_te = base/"X_dense_test.parquet"
        Xs_tr = base/"X_sparse_train.npz"
        Xs_te = base/"X_sparse_test.npz"

        print("Каталог:", (base/"catalog.json").as_posix())
        if not cat:
            print("— не найден или пуст")
        else:
            rows = []
            for k, v in cat.items():
                v = v or {}  # tolerate null entries instead of failing on .get
                rows.append({
                    "package": k,
                    "kind": v.get("kind","?"),
                    "n_cols": v.get("n_cols","?"),
                    "secs": v.get("secs","?"),
                    "params": json.dumps(v.get("params",{}), ensure_ascii=False)
                })
            df = pd.DataFrame(rows)
            try:
                df = df.sort_values(["kind","n_cols"], ascending=[True,False])
            except TypeError:
                # Integer and "?" placeholders cannot be ordered against each
                # other; show the catalog unsorted rather than failing.
                pass
            display(df)

        # Shapes
        if Xd_tr.exists() and Xd_te.exists():
            xdtr = try_read_parquet(Xd_tr)
            xdte = try_read_parquet(Xd_te)
            print("Dense shapes:", xdtr.shape, xdte.shape, "| mem:", round(mem_gb_df(xdtr),3),"GB")
        else:
            print("Dense: —")
        if Xs_tr.exists() and Xs_te.exists():
            print("Sparse shapes:", "см. размеры в логе (npz)")
        else:
            print("Sparse: —")

        if meta:
            # "\n" escape: a string literal cannot span physical lines.
            print("\nMETA:")
            print(json.dumps(meta, ensure_ascii=False, indent=2))

BTN_CATALOG.on_click(on_catalog)
display(BTN_CATALOG, OUT_CAT)

In [None]:
BTN_CHECKS = w.Button(
    description="Интегритити-чеки",
    button_style="warning",
    layout=BTN_LAYOUT,
)
OUT_CHECKS = w.Output(layout={'border':'1px solid #ccc'})

def integrity_checks():
    """Run basic sanity checks on the artifacts of the current run.

    Checks that the required files exist, that the dense train/test frames have
    the same column set, that numeric dense train values are finite, and warns
    about target-encoding packages whose params say ``oof`` is false.

    Returns:
        ``(ok, msgs)``: ``ok`` is False if any hard check failed, ``msgs`` is
        the list of human-readable findings (warnings do not affect ``ok``).
    """
    run_dir = Path("artifacts/sets") / RUN_TAG.value.strip()

    # Required files
    required = ("ids_train.parquet", "ids_test.parquet", "folds.pkl", "meta.json", "catalog.json")
    msgs = [f"❌ нет {name}" for name in required if not (run_dir / name).exists()]
    ok = not msgs

    # Dense matrices
    dense_train = run_dir / "X_dense_train.parquet"
    dense_test = run_dir / "X_dense_test.parquet"
    if not (dense_train.exists() and dense_test.exists()):
        msgs.append("ℹ️ Dense не найден — ок, если строили только sparse")
    else:
        try:
            train_df = try_read_parquet(dense_train)
            test_df = try_read_parquet(dense_test)
            if set(train_df.columns) != set(test_df.columns):
                ok = False
                msgs.append("❌ разные колонки в dense train/test")
            # NaN/inf are looked for in the numeric columns of train only.
            numeric = train_df.select_dtypes(include=[np.number])
            finite_mask = np.isfinite(numeric.to_numpy(dtype=float))
            if not finite_mask.all():
                ok = False
                msgs.append("❌ NaN/inf в dense train")
        except Exception as e:
            ok = False
            msgs.append(f"❌ ошибка чтения dense: {e}")

    # Target-encoding anti-leak heuristic driven by meta.json / catalog.json.
    # NOTE(review): the "te" substring / prefix tests also match names such as
    # "text_tfidf" — confirm this is intended.
    meta = read_json(run_dir / "meta.json") or {}
    built = meta.get("built", [])
    if any("te" in b for b in built):
        cat = read_json(run_dir / "catalog.json") or {}
        for k in [name for name in cat.keys() if name.startswith("te")]:
            params = (cat[k] or {}).get("params", {})
            # Only an explicit oof == False triggers the warning.
            if params.get("oof", None) is False:
                msgs.append(f"⚠️ TE пакет {k}: нет oof=True в params → проверь anti-leak")

    return ok, msgs

def on_checks(_):
    """Run the integrity checks and show their messages plus an overall badge."""
    OUT_CHECKS.clear_output(wait=True)
    with OUT_CHECKS:
        passed, messages = integrity_checks()
        for message in messages:
            print(message)
        if passed:
            display(badge("OK", "#0a0"))
        else:
            display(badge("HAS ISSUES", "#c00"))

BTN_CHECKS.on_click(on_checks)
display(BTN_CHECKS, OUT_CHECKS)

In [None]:
REPORT = w.Checkbox(value=False, description="REPORT визуализации")
BTN_REPORT = w.Button(description="Сделать отчёт", button_style="", layout=BTN_LAYOUT)
OUT_REPORT = w.Output()

def on_report(_):
    """Plot variance and correlation charts for the dense train features.

    Does nothing beyond clearing the pane unless the REPORT checkbox is ticked.
    """
    OUT_REPORT.clear_output(wait=True)
    if REPORT.value:
        with OUT_REPORT:
            dense_path = Path("artifacts/sets") / RUN_TAG.value.strip() / "X_dense_train.parquet"
            if not dense_path.exists():
                print("Dense не найден — визуалки пропущены.")
                return
            features = try_read_parquet(dense_path)
            numeric = features.select_dtypes(include=[np.number])
            if numeric.shape[1] == 0:
                print("Числовых фич нет (dense) — пропуск.")
                return

            # Top-20 features by variance
            top_var = numeric.var().sort_values(ascending=False).head(20)
            plt.figure()
            top_var.plot(kind="bar")
            plt.title("Top-20 variance (dense)")
            plt.show()

            # Correlation heatmap for the 15 highest-variance features
            names = top_var.index[:15].tolist()
            ticks = range(len(names))
            corr_matrix = numeric[names].corr()
            plt.figure()
            plt.imshow(corr_matrix, interpolation="nearest")
            plt.colorbar()
            plt.title("Correlation (top-15 by variance)")
            plt.xticks(ticks, names, rotation=90)
            plt.yticks(ticks, names)
            plt.tight_layout()
            plt.show()

BTN_REPORT.on_click(on_report)
display(w.HBox([REPORT, BTN_REPORT], layout=ROW_LAYOUT), OUT_REPORT)

In [None]:
BTN_SCOUT = w.Button(description="SCOUT: 15% + num/cat_freq", button_style="", layout=BTN_LAYOUT)
BTN_GATE  = w.Button(description="GATE: 40% + TE top-3", button_style="", layout=BTN_LAYOUT)
BTN_TOUR  = w.Button(description="TOUR: 100% базовые", button_style="", layout=BTN_LAYOUT)

def set_scout(_):
    """Preset: scout profile on 15% of the data with num_basic + cat_freq only."""
    PROFILE.value = "scout"
    FRAC.value = 0.15
    B_num_basic.value = True
    B_cat_freq.value = True
    B_cat_te_oof.value = False
    B_text_tfidf.value = False

def set_gate(_):
    """Preset: gate profile on 40% of the data, adding target encoding with top_k=3."""
    PROFILE.value = "gate"
    FRAC.value = 0.4
    B_num_basic.value = True
    B_cat_freq.value = True
    B_cat_te_oof.value = True
    te_top_k.value = 3

def set_tour(_):
    """Preset: tour profile on 100% of the data with num_basic + cat_freq, no target encoding."""
    PROFILE.value = "tour"
    FRAC.value = 1.0
    B_num_basic.value = True
    B_cat_freq.value = True
    B_cat_te_oof.value = False

# Wire each preset button to its setter and show them in one row.
for preset_button, preset_handler in ((BTN_SCOUT, set_scout), (BTN_GATE, set_gate), (BTN_TOUR, set_tour)):
    preset_button.on_click(preset_handler)
display(w.HBox([BTN_SCOUT, BTN_GATE, BTN_TOUR], layout=ROW_LAYOUT))

In [None]:
BTN_SAVE = w.Button(description="Сохранить состояние UI", button_style="", layout=BTN_LAYOUT)
BTN_LOAD = w.Button(description="Загрузить состояние UI", button_style="", layout=BTN_LAYOUT)
# Badges are rendered into an Output widget, as every other handler in this
# notebook does; output produced by a callback outside of one may not appear
# under the cell (JupyterLab routes it to the log console).
OUT_STATE = w.Output()

# Names of the global widgets whose .value makes up the persisted UI state.
_UI_STATE_KEYS = (
    "PROFILE", "RUN_TAG", "BASE", "DATA_DIR",
    "ID_COL", "TARGET_COL", "DATE_COL",
    "GROUP_COL", "LAT_COL", "LON_COL",
    "SPLIT_KIND", "N_SPLITS", "SEED", "TIME_EMBARGO",
    "FRAC", "FAST", "SAFE", "USE_CACHE", "THREADS",
    "B_num_basic", "B_cat_freq", "B_cat_te_oof",
    "B_text_tfidf", "B_geo_grid", "B_geo_nb",
    "B_time_agg", "B_crosses", "B_img_stats", "B_img_embed",
    "te_top_k", "te_smooth",
    "tfidf_col", "tfidf_min_df", "tfidf_ngram", "tfidf_svd_k",
    "geo_steps", "geo_radii",
    "time_lags", "time_roll", "time_groups",
    "cross_white",
)

def on_save(_):
    """Persist the current value of every UI widget to the state file."""
    OUT_STATE.clear_output(wait=True)
    with OUT_STATE:
        namespace = globals()
        save_ui_state({key: namespace[key].value for key in _UI_STATE_KEYS})
        display(badge("SAVED", "#0a0"))

def on_load(_):
    """Restore widget values from the state file, reporting entries that could not be applied.

    Only known state keys are applied, so a stray key in the JSON file cannot
    overwrite ``.value`` on an unrelated global object.
    """
    OUT_STATE.clear_output(wait=True)
    with OUT_STATE:
        st = load_ui_state()
        namespace = globals()
        skipped = []
        for k, v in st.items():
            if k not in _UI_STATE_KEYS:
                skipped.append(f"{k}: неизвестный ключ")
                continue
            try:
                namespace[k].value = v
            except Exception as e:
                # Best effort: keep loading the rest, but tell the user.
                skipped.append(f"{k}: {e}")
        for line in skipped:
            print("⚠️ не применено —", line)
        display(badge("LOADED", "#0a0"))

BTN_SAVE.on_click(on_save)
BTN_LOAD.on_click(on_load)
display(w.HBox([BTN_SAVE, BTN_LOAD], layout=ROW_LAYOUT), OUT_STATE)

### Что дальше
Открой `03_model.ipynb`:
- там возьмёшь `artifacts/sets/<RUN_TAG>/X_*`, `y_train.parquet`, `folds.pkl`, `catalog.json`, `meta.json`;
- обучишь 1–3 кандидата, соберёшь бленд и сабмиты.

Удачи ✌️