In [3]:
import pandas as pd
from joblib import load
from mitochime.hyperparam_search_top import load_dataset  # or reuse your helper

X_test, y_test, feature_names = load_dataset("../data/processed/test_noq.tsv")

models = {
    "catboost_tuned":        load("../models_noq_tuned/catboost_tuned.joblib"),
    "gradient_boosting_tuned": load("../models_noq_tuned/gradient_boosting_tuned.joblib"),
    "bagging_trees_tuned":   load("../models_noq_tuned/bagging_trees_tuned.joblib"),
    "logreg_l2_base":        load("../models_noq/logreg_l2.joblib"),  # not tuned
}

In [5]:
from sklearn.inspection import permutation_importance
import numpy as np
from pathlib import Path

feat_dir = Path("../reports/feature_importance")
feat_dir.mkdir(parents=True, exist_ok=True)  # make sure folder exists

def perm_importance(model, X, y, feature_names, scoring="f1", n_repeats=10):
    result = permutation_importance(
        model, X, y,
        n_repeats=n_repeats,
        random_state=42,
        n_jobs=-1,
        scoring=scoring,
    )
    df = pd.DataFrame({
        "feature": feature_names,
        "importance_mean": result.importances_mean,
        "importance_std": result.importances_std,
    }).sort_values("importance_mean", ascending=False)
    return df

imp_results = {}

for name, model in models.items():
    print(f"\n=== Permutation importance ({name}) ===")
    df_imp = perm_importance(model, X_test, y_test, feature_names, scoring="f1")
    imp_results[name] = df_imp

    # save
    df_imp.to_csv(feat_dir / f"perm_imp_{name}.tsv",
                  sep="\t", index=False)

    print(df_imp.head(10))


=== Permutation importance (catboost_tuned) ===
                 feature  importance_mean  importance_std
17   total_clipped_bases         0.082066        0.002651
20    kmer_js_divergence         0.076510        0.003378
19      kmer_cosine_diff         0.058167        0.002891
15         softclip_left         0.026290        0.002867
16        softclip_right         0.025894        0.002075
1                   mapq         0.006059        0.000839
22      microhomology_gc         0.000528        0.000842
21  microhomology_length         0.000340        0.000659
6       sa_min_delta_pos         0.000057        0.000748
13             sa_min_nm         0.000000        0.000000

=== Permutation importance (gradient_boosting_tuned) ===
                 feature  importance_mean  importance_std
17   total_clipped_bases         0.118542        0.002420
20    kmer_js_divergence         0.061129        0.002969
19      kmer_cosine_diff         0.044505        0.002501
15         softclip_lef



ValueError: X has 23 features, but SimpleImputer is expecting 24 features as input.