# Synthesize information about trained models

In [2]:
from pathlib import Path
import pandas as pd

<h3>Does dropping correlated features lead to a better model? (no)</h3>

In [3]:
# load
def load_agg(p: Path) -> pd.DataFrame:
    """Read one aggregated-results CSV and key it by metric name.

    Parameters
    ----------
    p : Path
        Location of a CSV file that contains a ``metric`` column.

    Returns
    -------
    pd.DataFrame
        The file's contents indexed by ``metric`` (cast to ``str``); every
        other column is kept as read.
    """
    agg = pd.read_csv(p)
    # Cast the labels to str before they become the index.
    return agg.assign(metric=agg["metric"].astype(str)).set_index("metric")

# compare per-model mean metrics between two result folders
# (runs with correlated columns deleted vs. not deleted)
def compare_agg_folders(no_corr_dir: str, corr_dir: str, out_p: str = "../results/deltas.csv"):
    """Compute per-model differences of mean metrics between two result folders.

    Each folder is scanned (non-recursively) for ``*_agg.csv`` files. Only file
    names present in BOTH folders are compared; files that exist in just one
    folder are skipped. For every shared file the delta is

        ``mean`` in ``no_corr_dir``  -  ``mean`` in ``corr_dir``

    so a positive value means the ``no_corr_dir`` run scored higher.

    Parameters
    ----------
    no_corr_dir : str
        Folder whose values are the minuend of the subtraction.
    corr_dir : str
        Folder whose values are subtracted.
    out_p : str
        CSV path the delta table is written to. Missing parent directories
        are created.

    Returns
    -------
    pd.DataFrame
        One row per model (file name without the ``_agg.csv`` part), one column
        per tracked metric. A metric missing from either file is NaN. When the
        folders share no file the frame is empty but still has the metric
        columns.
    """
    no_files = {p.name: p for p in Path(no_corr_dir).glob("*_agg.csv")}
    co_files = {p.name: p for p in Path(corr_dir).glob("*_agg.csv")}

    # sorted -> deterministic row order regardless of filesystem listing order
    common = sorted(set(no_files) & set(co_files))
    metric_cols = ["best_cv_acc", "test_acc", "test_precision", "test_recall", "test_f1", "test_roc_auc"]

    wide_rows = {}
    for fname in common:
        df_no = load_agg(no_files[fname])
        df_co = load_agg(co_files[fname])

        # delta_mean = no_corr - corr (aligned on the metric index)
        delta_mean = df_no["mean"] - df_co["mean"]

        model = fname.replace("_agg.csv", "")
        wide_rows[model] = delta_mean.reindex(metric_cols)

    # reindex(columns=...) is a no-op when rows exist, and guarantees the
    # metric columns are present when there was nothing to compare.
    deltas = pd.DataFrame.from_dict(wide_rows, orient="index").reindex(columns=metric_cols)
    deltas.index.name = "model"

    # to_csv fails if the target directory does not exist, so create it first.
    out_path = Path(out_p)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    deltas.to_csv(out_path)

    return deltas

# Per-model deltas, computed as (no_ftrs_dropped results) - (ftrs_dropped results).
# NOTE(review): the folder passed as `no_corr_dir` is the run where NO features
# were dropped -- confirm that matches the intended meaning of "no_corr".
deltas = compare_agg_folders(
    no_corr_dir="../results/no_ftrs_dropped",
    corr_dir="../results/ftrs_dropped",
)

print(deltas)


                best_cv_acc  test_acc  test_precision  test_recall   test_f1  \
model                                                                          
knn                0.003044  0.009091        0.036782     0.018182  0.021261   
logreg_elastic    -0.001081  0.000000        0.000000     0.000000  0.000000   
logreg_l2          0.003215 -0.018182       -0.035714    -0.054545 -0.056982   
rfc               -0.009502  0.009091        0.006667     0.018182  0.017647   
svc                0.002162 -0.009091       -0.003333     0.036364  0.003890   
xgb                0.001991  0.009091        0.029212    -0.018182 -0.003896   

                test_roc_auc  
model                         
knn             4.462810e-02  
logreg_elastic  2.220446e-16  
logreg_l2      -1.322314e-02  
rfc            -6.611570e-03  
svc            -4.958678e-03  
xgb             5.785124e-02  


<h3>Find top two models for primary and secondary evaluation metrics</h3>

In [4]:
# Rank all models on each eval metric (highest mean first, lowest std breaks ties)
# and keep the best TOP_N per metric.

root = Path("../results/no_ftrs_dropped")

# Maximum number of models kept per metric.
# NOTE(review): the section heading and the previous comment say "top two", but the
# cutoff in the code has been 10; kept at 10 so the saved CSV is unchanged -- confirm.
TOP_N = 10
RESULT_COLS = ["model", "metric", "mean", "std"]

rows = []

# sorted() makes the load order independent of filesystem enumeration order
for p in sorted(root.rglob("*_agg.csv")):
    df = pd.read_csv(p)

    # model name = file stem without the "_agg" marker
    model = p.stem.replace("_agg", "")

    # one record per metric row, built column-wise instead of via iterrows()
    rows.extend(df.assign(model=model)[RESULT_COLS].to_dict("records"))

if not rows:
    # Without this guard the sort below fails with an opaque KeyError: 'metric'.
    raise FileNotFoundError(f"no *_agg.csv metric rows found under {root}")

all_metrics = pd.DataFrame(rows, columns=RESULT_COLS)

per_metric = (all_metrics
    .sort_values(["metric", "mean", "std"], ascending=[True, False, True])
    .groupby("metric", as_index=False).head(TOP_N).reset_index(drop=True))

per_metric.to_csv('../results/best_per_metric.csv', index=False)
print(per_metric)

             model          metric      mean       std
0   logreg_elastic     best_cv_acc  0.594708  0.025016
1              xgb     best_cv_acc  0.564040  0.015839
2              rfc     best_cv_acc  0.560939  0.021646
3              knn     best_cv_acc  0.559602  0.013331
4        logreg_l2     best_cv_acc  0.555590  0.019660
5              svc     best_cv_acc  0.554509  0.025536
6   logreg_elastic        test_acc  0.681818  0.101639
7        logreg_l2        test_acc  0.645455  0.126131
8              rfc        test_acc  0.609091  0.104644
9              xgb        test_acc  0.590909  0.132522
10             svc        test_acc  0.581818  0.108521
11             knn        test_acc  0.581818  0.141567
12  logreg_elastic         test_f1  0.682120  0.079114
13       logreg_l2         test_f1  0.584508  0.192599
14             rfc         test_f1  0.577983  0.066453
15             knn         test_f1  0.557971  0.104257
16             xgb         test_f1  0.557702  0.112770
17        