In [None]:
import os
import numpy as np
import polars as pl
import pandas as pd

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import f1_score, make_scorer
from tqdm.auto import tqdm

# Run configuration.
#   extr_num    - tag of this extraction run; embedded in the output directory
#                 and in every output file name.
#   top_k_extra - how many of the extra features are kept.
#   data_dir    - directory holding the input parquet files.
#   out_dir     - directory the results are written to (derived below).
CFG = {
    "extr_num": 1,
    "top_k_extra": 100,
    "data_dir": "data",
}
# out_dir depends on extr_num. It has to be filled in after the literal:
# CFG does not exist yet while its own dict literal is being evaluated, and
# the nested double quotes in a plain string were a syntax error.
CFG["out_dir"] = f"modified_data/{CFG['extr_num']}"

# Key column used to join the main, extra and target tables.
id_col = "customer_id"

# Every input file is resolved relative to CFG["data_dir"]. The target file
# used to be hard-coded to "data/..."; it now follows the same setting so that
# changing data_dir relocates all five inputs together.
train_main_path  = os.path.join(CFG["data_dir"], "train_main_features.parquet")
test_main_path   = os.path.join(CFG["data_dir"], "test_main_features.parquet")
train_extra_path = os.path.join(CFG["data_dir"], "train_extra_features.parquet")
test_extra_path  = os.path.join(CFG["data_dir"], "test_extra_features.parquet")
target_path      = os.path.join(CFG["data_dir"], "train_target.parquet")


# Read the five input tables, in the same order as their path definitions.
train_main, test_main, train_extra, test_extra, target = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        train_main_path,
        test_main_path,
        train_extra_path,
        test_extra_path,
        target_path,
    )
)

# Wide training frame: main features, then extra features, then targets,
# all attached to the main table with left joins on the id column.
train_big = (
    train_main
    .join(train_extra, on=id_col, how="left")
    .join(target, on=id_col, how="left")
)

# Wide test frame: main + extra features only (no targets are joined).
test_big = test_main.join(test_extra, on=id_col, how="left")

# Every column of the target table except the id is treated as a label.
target_columns = [name for name in target.columns if name != id_col]

# Features are whatever remains in the joined frame once the id and the
# label columns are excluded (original column order is kept).
_non_feature_cols = {id_col, *target_columns}
feature_cols_all = [
    name for name in train_big.columns if name not in _non_feature_cols
]

# Feature names split by the table they came from.
main_feature_cols = [name for name in train_main.columns if name != id_col]
extra_feature_cols = [name for name in train_extra.columns if name != id_col]

# Feature matrix: nulls are replaced with 0 before converting to pandas.
X_df = train_big.select(feature_cols_all).fill_null(0).to_pandas()

# Label matrix; cast to int8 just in case some labels arrive as bool.
y_df = train_big.select(target_columns).to_pandas().astype(np.int8)

# -----------------------------
# Train/Val split (the held-out part is what permutation importance
# is evaluated on further below)
# -----------------------------
X_tr, X_val, y_tr, y_val = train_test_split(
    X_df,
    y_df,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# MODEL (hyperparameters are set here)
# -----------------------------
# fit() returns the estimator itself, so construction and training are chained.
model = ExtraTreesClassifier(
    n_estimators=400,
    n_jobs=-1,
    random_state=42,
).fit(X_tr, y_tr)

from pathlib import Path

# -----------------------------
# Permutation Importance (multilabel, scored with micro-averaged F1)
# -----------------------------
scorer = make_scorer(f1_score, average="micro")

# n_repeats can be raised to 10-20 at the cost of a longer run.
perm = permutation_importance(
    model,
    X_val,
    y_val,
    scoring=scorer,
    n_repeats=5,
    n_jobs=-1,
    random_state=42,
)

# One row per feature, most important first.
importances = pd.DataFrame(
    {
        "feature": feature_cols_all,
        "importance_mean": perm.importances_mean,
        "importance_std": perm.importances_std,
    }
)
importances = importances.sort_values("importance_mean", ascending=False)

# Polars copy of the table, used for filtering and for writing to parquet.
importance_pl = pl.from_pandas(importances)

# -----------------------------
# Select top K only from extra
# -----------------------------
K = CFG["top_k_extra"]

# Importance rows for extra features only, most important first.
_is_extra_feature = pl.col("feature").is_in(extra_feature_cols)
extra_importance_pl = importance_pl.filter(_is_extra_feature).sort(
    "importance_mean", descending=True
)

# Names of the K best extra features (fewer if fewer rows are available).
selected_extra_features = extra_importance_pl["feature"].head(K).to_list()

# All main features are always kept; the selected extras are appended.
final_feature_cols = [*main_feature_cols, *selected_extra_features]

# -----------------------------
# Final datasets
# -----------------------------
# Train keeps id + targets + features; test has no target columns.
train_final = train_big.select([id_col, *target_columns, *final_feature_cols])
test_final = test_big.select([id_col, *final_feature_cols])

# -----------------------------
# Save
# -----------------------------
os.makedirs(CFG["out_dir"], exist_ok=True)

# Output locations; every file name carries the run tag from CFG["extr_num"].
train_out_path = os.path.join(CFG["out_dir"], f"train_extr{CFG['extr_num']}.parquet")
test_out_path  = os.path.join(CFG["out_dir"], f"test_extr{CFG['extr_num']}.parquet")
imp_out_path   = os.path.join(CFG["out_dir"], f"perm_importances_extr{CFG['extr_num']}.parquet")
sel_out_path   = os.path.join(CFG["out_dir"], f"selected_extra_features_top{K}_extr{CFG['extr_num']}.txt")

# Parquet outputs: final train/test frames and the full importance table.
for frame, destination in (
    (train_final, train_out_path),
    (test_final, test_out_path),
    (importance_pl, imp_out_path),
):
    frame.write_parquet(destination)

# Plain-text list of the selected extra features, one name per line.
with open(sel_out_path, "w", encoding="utf-8") as f:
    f.writelines(name + "\n" for name in selected_extra_features)

# Run summary.
print("------ DONE (multilabel) ------")
print("K extra selected:", len(selected_extra_features))
print("train_final shape:", train_final.shape)
print("test_final shape :", test_final.shape)
print("Saved:")
for saved_path in (train_out_path, test_out_path, imp_out_path, sel_out_path):
    print(" -", saved_path)

print("\nTop-10 extra feature importances (perm, f1_micro):")
print(extra_importance_pl.head(10))