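# Cross-validating the best estimators and their soft-voting ensembles

TODO: add a short summary of the findings.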

In [39]:
from heapq import nlargest

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

from src.helper_utilities import load_data
from src.modeling_utilities import f2_scorer

In [40]:
# Seed passed to load_data and to both train_test_split calls below.
random_state = 42
# Value passed as `cv` to every cross-validation call below (number of folds).
cv = 5

In [41]:
# Load the original dataset as DataFrames. `introduce_nans=0.01` is forwarded to
# load_data (presumably the share of values replaced by NaN -- confirm in
# src.helper_utilities).
X, y = load_data(mode='modeling', format='dataframe', introduce_nans=0.01, random_state=random_state)

# the "orange" dataset: 80/20 train/test split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=random_state
)

# the "green" dataset: 80/20 dev/validation split of the training part.
# Stratified like the split above so both parts keep the class ratio of y_train.
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=random_state
)

# X_... is a pd.DataFrame
X_test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
690,A11,15.0,A34,A42,975.0,A61,A73,2.0,A91,A101,3.0,A122,25.0,A143,A152,2.0,A173,1.0,A191,A201
296,A14,12.0,A32,A41,4675.0,A65,A72,,A92,A101,4.0,A123,20.0,A143,A151,1.0,A173,1.0,A191,A201
672,A14,60.0,A32,A40,10366.0,A61,A75,2.0,A93,A101,4.0,A122,42.0,A143,A152,1.0,A174,1.0,A192,A201


In [42]:
# Load the previously saved candidates. The cells below iterate over and slice this
# object and index each element with [-1] to reach its final step (presumably a
# sequence of sklearn Pipelines -- confirm where the file is written).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project.
best_estimators = joblib.load("models/best_estimators")

In [43]:
# Cross-validated metrics per candidate, keyed by a short display name.
collector = {}

# Maps each candidate object to its mean cross-validated F2; the selection cell
# at the end of the notebook ranks the candidates by this value.
best_estimators_and_ensembles = {}

# One scorer per reported metric; the keys double as the table's column names.
scoring = {
    "F2": f2_scorer,
    "AUC": 'roc_auc',
    "balanced accuracy": 'balanced_accuracy',
    "F1": 'f1',
}

for pl in best_estimators:
    final_step = pl[-1]
    # The "2" suffix marks a final step whose `penalty` attribute is 'l2', so it
    # gets a different row label than another model of the same class.
    name = final_step.__class__.__name__ + ("2" if getattr(final_step, 'penalty', None) == 'l2' else "")

    # A single cross_validate call fits every fold once and evaluates all metrics
    # on the same fitted models, instead of refitting the folds once per metric.
    cv_results = cross_validate(pl, X_train, y_train, scoring=scoring, cv=cv)
    f2_scores = cv_results["test_F2"]

    collector[name] = {
        "F2": f2_scores.mean(),
        "sd(F2)": f2_scores.std(),
        "AUC": cv_results["test_AUC"].mean(),
        "balanced accuracy": cv_results["test_balanced accuracy"].mean(),
        "F1": cv_results["test_F1"].mean(),
    }

    best_estimators_and_ensembles[pl] = f2_scores.mean()

print("Scores of the individual estimators:")
pd.DataFrame(collector).T.sort_values('F2', ascending=False).round(2)

Scores of the individual estimators:


Unnamed: 0,F2,sd(F2),AUC,balanced accuracy,F1
SVC,0.69,0.03,0.79,0.73,0.62
LogisticRegression2,0.69,0.05,0.79,0.73,0.62
LogisticRegression,0.66,0.06,0.79,0.71,0.6
LGBMClassifier,0.62,0.04,0.78,0.71,0.6
DecisionTreeClassifier,0.56,0.04,0.66,0.65,0.52


In [44]:
# Candidate ensembles: the first 2, 3, ... up to all entries of best_estimators
# (assumes best_estimators is ordered best-first -- TODO confirm where it is saved).
ensembles = [best_estimators[:n] for n in range(2, len(best_estimators)+1)]

# One scorer per reported metric; the keys double as the table's column names.
scoring = {
    "F2": f2_scorer,
    "AUC": 'roc_auc',
    "balanced accuracy": 'balanced_accuracy',
    "F1": 'f1',
}

for ensemble in ensembles:
    # VotingClassifier expects (name, estimator) pairs; members are named est1, est2, ...
    estimators = [(f"est{i+1}", est) for i, est in enumerate(ensemble)]
    vc = VotingClassifier(estimators, voting='soft')

    # A single cross_validate call fits every fold once and evaluates all metrics
    # on the same fitted ensembles, instead of refitting the folds once per metric.
    cv_results = cross_validate(vc, X_train, y_train, scoring=scoring, cv=cv)
    f2_scores = cv_results["test_F2"]

    collector[f"ensemble of {len(vc.estimators)}"] = {
        "F2": f2_scores.mean(),
        "sd(F2)": f2_scores.std(),
        "AUC": cv_results["test_AUC"].mean(),
        "balanced accuracy": cv_results["test_balanced accuracy"].mean(),
        "F1": cv_results["test_F1"].mean(),
    }

    # Register the ensemble next to the individual estimators for the final ranking.
    best_estimators_and_ensembles[vc] = f2_scores.mean()

print("Scores:")
pd.DataFrame(collector).T.sort_values('F2', ascending=False).round(2)


Scores:


Unnamed: 0,F2,sd(F2),AUC,balanced accuracy,F1
SVC,0.69,0.03,0.79,0.73,0.62
LogisticRegression2,0.69,0.05,0.79,0.73,0.62
LogisticRegression,0.66,0.06,0.79,0.71,0.6
ensemble of 3,0.65,0.07,0.8,0.73,0.62
ensemble of 4,0.64,0.05,0.8,0.73,0.62
LGBMClassifier,0.62,0.04,0.78,0.71,0.6
ensemble of 5,0.61,0.06,0.8,0.72,0.6
ensemble of 2,0.59,0.09,0.8,0.71,0.59
DecisionTreeClassifier,0.56,0.04,0.66,0.65,0.52


In [45]:
# Number of candidates (individual estimators and ensembles) to keep.
n_best = 5

# Rank the candidates by their stored mean F2 and keep the n_best highest.
# Note: the name is rebound from the score dict to a tuple of estimators here, so
# this cell cannot be re-run on its own; re-run the scoring cells first.
f2_of = best_estimators_and_ensembles.get
best_estimators_and_ensembles = tuple(nlargest(n_best, best_estimators_and_ensembles, key=f2_of))

# Persist the selection.
joblib.dump(best_estimators_and_ensembles, 'models/best_estimators_and_ensembles')

['models/best_estimators_and_ensembles']