In [2]:
from pycaret.classification import setup, compare_models, models, add_metric
from sklearn.metrics import average_precision_score
from dataclasses import dataclass, field
import pandas as pd
from loguru import logger
from scipy.stats import randint, uniform
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import average_precision_score
from sklearn.model_selection import (
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
df = pd.read_csv('data/classification_train.csv')

In [2]:
clf = setup(data = df, target = 'class', ignore_features=['id'], numeric_imputation='median', normalize=True)
add_metric('auprc', 'AUPRC', average_precision_score, target = 'pred_proba')

Unnamed: 0,Description,Value
0,Session id,7566
1,Target,class
2,Target type,Binary
3,Original data shape,"(1600, 33)"
4,Transformed data shape,"(1600, 64)"
5,Transformed train set shape,"(1120, 64)"
6,Transformed test set shape,"(480, 64)"
7,Ignore features,1
8,Ordinal features,2
9,Numeric features,20


Name                                                             AUPRC
Display Name                                                     AUPRC
Score Function       <pycaret.internal.metrics.EncodedDecodedLabels...
Scorer               make_scorer(average_precision_score, needs_pro...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: auprc, dtype: object

In [6]:
best = compare_models(include=models().index.tolist(), sort='AUPRC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,AUPRC,TT (Sec)
et,Extra Trees Classifier,0.8607,0.9284,0.6047,0.903,0.7203,0.634,0.6585,0.8714,0.146
gpc,Gaussian Process Classifier,0.8366,0.8973,0.6811,0.7637,0.7184,0.604,0.6072,0.8259,0.63
rbfsvm,SVM - Radial Kernel,0.8054,0.8654,0.4793,0.8076,0.5992,0.482,0.5117,0.7838,0.172
rf,Random Forest Classifier,0.8,0.8762,0.4213,0.8583,0.5596,0.4502,0.5004,0.7695,0.145
mlp,MLP Classifier,0.8223,0.8544,0.655,0.7379,0.693,0.5686,0.5714,0.7641,0.633
lightgbm,Light Gradient Boosting Machine,0.8089,0.8634,0.5587,0.7551,0.6398,0.5142,0.5265,0.7605,0.741
gbc,Gradient Boosting Classifier,0.8018,0.8607,0.515,0.7647,0.613,0.4868,0.5055,0.7503,0.196
lr,Logistic Regression,0.7652,0.8053,0.4998,0.6605,0.5659,0.4092,0.4189,0.6788,0.066
lda,Linear Discriminant Analysis,0.7607,0.8061,0.4882,0.6516,0.5554,0.3963,0.4059,0.6766,0.07
ada,Ada Boost Classifier,0.7795,0.8179,0.5529,0.6737,0.6038,0.4537,0.4603,0.6688,0.119


## We will focus on some of the best models from AutoML that were covered during classes and fine tune them
- Extra Trees Classifier (bagging)
- Random Forest Classifier (bagging)
- LightGBM (boosting)
- XGBoost (boosting)

In [16]:
import re
train = pd.read_csv('data/classification_train.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv('data/classification_test.csv').drop(columns=['Unnamed: 0'])
y = train['class']
y_out_of_sample = test['class']
x = train.drop(['class', 'id'], axis=1)
x_out_of_sample = test.drop(['class', 'id'], axis=1)

In [51]:
@dataclass
class HyperoptInput:
    model: BaseEstimator
    hyperopt_space: dict = field(default_factory=dict)


def get_pipeline_for_model(
    model: BaseEstimator, model_params: dict = None
):
    numerical_prep = make_pipeline(RobustScaler())
    categorical_prep = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse=False, drop="first"),
    )
    preprocess = ColumnTransformer(
        [
            (
                "numerical",
                numerical_prep,
                make_column_selector(dtype_include=["int64", "float64"]),
            ),
            (
                "categorical",
                categorical_prep,
                make_column_selector(dtype_include=object),
            ),
        ],
        remainder="passthrough",
    )
    preprocess.set_output(transform="pandas")
    return Pipeline(
        [
            ("preprocess", preprocess),
            ("model", model(**model_params if model_params else {})),
        ]
    )
base_hyperopt_inputs = [
    HyperoptInput(
        model=RandomForestClassifier,
        hyperopt_space={
            "model__n_jobs": [-1],
            "model__n_estimators": randint(100, 500),
            "model__max_depth": randint(3, 10),
            "model__min_samples_split": uniform(0.01, 0.1),
            "model__min_samples_leaf": uniform(0.01, 0.1),
        },
    ),
    HyperoptInput(
        model=XGBClassifier,
        hyperopt_space={
            "model__n_jobs": [-1],
            "model__n_estimators": randint(50, 500),
            "model__max_depth": randint(3, 10),
            "model__learning_rate": uniform(0.01, 0.3),
            "model__subsample": uniform(0.5, 0.5),
            "model__colsample_bytree": uniform(0.5, 0.5),
            "model__gamma": uniform(0, 5),
        },
    ),
    HyperoptInput(
        model=ExtraTreesClassifier,
        hyperopt_space={
            "model__n_jobs": [-1],
            "model__criterion": ["gini", "entropy", "log_loss"],
            "model__n_estimators": randint(100, 500),
            "model__max_depth": randint(3, 10),
            "model__min_samples_split": uniform(0.01, 0.1),
            "model__min_samples_leaf": uniform(0.01, 0.1),
            "model__class_weight": ["balanced", "balanced_subsample"],
        },
    ),
    HyperoptInput(
        model=LGBMClassifier,
    hyperopt_space={
        "model__n_jobs": [-1],
        "model__boosting_type": ["gbdt"],
        "model__n_estimators": randint(50, 500),
        "model__max_depth": randint(3, 6),
        "model__class_weight": ['balanced'],
        "model__learning_rate": uniform(0.01, 0.3),
        "model__subsample": uniform(0.5, 0.9),
        "model__colsample_bytree": uniform(0.5, 0.9),
        "model__reg_alpha": uniform(0, 3),
        "model__reg_lambda": uniform(0, 3),
    })
]
@dataclass
class OneModelHyperoptResult:
    best_model: BaseEstimator
    best_score: float
    cv_results: pd.DataFrame

    def get_model_name(self) -> str:
        return self.best_model["model"].__class__.__name__


@dataclass
class HyperoptResults:
    results: list

    def __post_init__(self):
        self._sort_by_best_score()

    def _sort_by_best_score(self, reversed: bool = True):
        self.results.sort(key=lambda x: x.best_score, reverse=reversed)

    def get_best_model(self):
        return self.results[0].best_model

    def get_best_score(self):
        return self.results[0].best_score

    def get_merged_df(self):
        results = pd.DataFrame()
        for result in self.results:
            results = pd.concat(
                [
                    results,
                    result.cv_results.assign(
                        model_name=result.get_model_name()
                    ),
                ],
                axis=0,
            )

        return results

    def get_all_dfs(self):
        return [
            (result.get_model_name(), result.cv_results)
            for result in self.results
        ]

    def get_all_scores(self):
        return [
            (result.get_model_name(), result.best_score)
            for result in self.results
        ]

    def get_all_models(self):
        return [
            (result.get_model_name(), result.best_model)
            for result in self.results
        ]


def run_hyperopt_one_model(
    x: pd.DataFrame,
    y: pd.Series,
    model_input: HyperoptInput,
    n_iter: int = 10,
    cv: int = 5,
    random_state: int = 42,
):
    pipeline = get_pipeline_for_model(model_input.model)
    search = RandomizedSearchCV(
        pipeline,
        model_input.hyperopt_space,
        n_iter=n_iter,
        scoring="average_precision",
        n_jobs=-1,
        cv=cv,
        random_state=random_state,
    )
    search.fit(x, y)
    return OneModelHyperoptResult(
        best_model=search.best_estimator_,
        best_score=search.best_score_,
        cv_results=pd.DataFrame(search.cv_results_),
    )
def run_hyperopt(
    hyperopt_inputs=base_hyperopt_inputs,
    n_iter: int = 10,
    cv: int = 5,
    random_state: int = 42,
) -> HyperoptResults:
    results = []
    for model_input in hyperopt_inputs:
        logger.info(f"Running hyperopt for {model_input.model.__name__}")
        result = run_hyperopt_one_model(
            x=x,
            y=y,
            model_input=model_input,
            n_iter=n_iter,
            cv=cv,
            random_state=random_state,
        )
        results.append(result)
        logger.info(f"Best score: {result.best_score}")
    return HyperoptResults(results=results)
results = run_hyperopt()

2023-12-06 21:14:56.560 | INFO     | __main__:run_hyperopt:178 - Running hyperopt for RandomForestClassifier
2023-12-06 21:15:23.996 | INFO     | __main__:run_hyperopt:188 - Best score: 0.7266996188107447
2023-12-06 21:15:24.001 | INFO     | __main__:run_hyperopt:178 - Running hyperopt for XGBClassifier
2023-12-06 21:15:47.837 | INFO     | __main__:run_hyperopt:188 - Best score: 0.7781547349599146
2023-12-06 21:15:47.838 | INFO     | __main__:run_hyperopt:178 - Running hyperopt for ExtraTreesClassifier
2023-12-06 21:16:03.432 | INFO     | __main__:run_hyperopt:188 - Best score: 0.6615877425037071
2023-12-06 21:16:03.434 | INFO     | __main__:run_hyperopt:178 - Running hyperopt for LGBMClassifier


[LightGBM] [Info] Number of positive: 488, number of negative: 1112
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2969
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


2023-12-06 21:16:14.649 | INFO     | __main__:run_hyperopt:188 - Best score: 0.7585231188444816


In [52]:
results.get_merged_df()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__colsample_bytree,param_model__gamma,param_model__learning_rate,param_model__max_depth,param_model__n_estimators,param_model__n_jobs,...,std_test_score,rank_test_score,model_name,param_model__boosting_type,param_model__class_weight,param_model__reg_alpha,param_model__reg_lambda,param_model__min_samples_leaf,param_model__min_samples_split,param_model__criterion
0,0.746273,0.005173,0.027226,0.002796,0.68727,4.753572,0.229598,7,70,-1,...,0.030645,9,XGBClassifier,,,,,,,
1,3.265392,0.03677,0.029027,0.002676,0.577997,0.290418,0.269853,6,409,-1,...,0.038829,2,XGBClassifier,,,,,,,
2,4.083583,0.137767,0.034857,0.013352,0.510292,4.849549,0.259733,8,435,-1,...,0.051374,10,XGBClassifier,,,,,,,
3,3.592142,0.075455,0.034963,0.004414,0.591702,1.521211,0.167427,6,394,-1,...,0.035929,5,XGBClassifier,,,,,,,
4,3.242565,0.028696,0.030719,0.00146,0.805926,0.697469,0.097643,9,239,-1,...,0.049297,6,XGBClassifier,,,,,,,
5,1.332634,0.058681,0.029382,0.005937,0.809193,1.91231,0.304969,3,180,-1,...,0.040148,7,XGBClassifier,,,,,,,
6,2.487211,0.040815,0.03372,0.00752,0.840154,2.252496,0.013979,3,365,-1,...,0.047532,8,XGBClassifier,,,,,,,
7,4.415582,0.050267,0.030992,0.003501,0.692708,0.079831,0.079268,6,416,-1,...,0.043188,1,XGBClassifier,,,,,,,
8,2.920445,0.099591,0.058157,0.061182,0.804998,4.165975,0.062009,3,469,-1,...,0.040938,3,XGBClassifier,,,,,,,
9,5.011561,0.141616,0.041362,0.020787,0.877681,2.125779,0.072382,6,359,-1,...,0.041425,4,XGBClassifier,,,,,,,


In [53]:
results.get_all_scores()

[('XGBClassifier', 0.7781547349599146),
 ('LGBMClassifier', 0.7585231188444816),
 ('RandomForestClassifier', 0.7266996188107447),
 ('ExtraTreesClassifier', 0.6615877425037071)]

In [54]:
best_model = results.get_best_model()
best_model

In [55]:
y_pred = best_model.predict(x_out_of_sample)
average_precision_score(y_out_of_sample, y_pred)

0.6661848635235732

In [58]:
for i in range(4):
    y_pred = results.results[i].best_model.predict(x_out_of_sample)
    print(f'oos score: {results.get_all_scores()[i][0]}: {average_precision_score(y_out_of_sample, y_pred)}')

oos score: XGBClassifier: 0.6661848635235732
oos score: LGBMClassifier: 0.6335969664138679
oos score: RandomForestClassifier: 0.52
oos score: ExtraTreesClassifier: 0.5147297297297297
