In [1]:
import os
import time
import json
import warnings
import numpy as np
import xgboost as xgb
import pandas as pd

warnings.simplefilter("ignore")

from sklearn import set_config
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.exceptions import ConvergenceWarning
from tqdm import tqdm

warnings.simplefilter("ignore", ConvergenceWarning)
set_config(transform_output="pandas")

In [2]:
class BayesSearchCVWithProgressBar(BayesSearchCV):
    def fit(self, X, y=None, groups=None, callback=None):
        self._progress_bar = tqdm(
            total=self.n_iter, desc="BayesSearchCV Progress", leave=True
        )

        def update_progress(*args, **kwargs):
            self._progress_bar.update(1)

        super().fit(X, y=y, groups=groups, callback=update_progress)

        self._progress_bar.close()

In [3]:
data_dir = os.path.join(".", "..", "data")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
raport_dir = os.path.join(".", "..", "raport")

os.makedirs(raport_dir, exist_ok=True)

In [4]:
"(LP, STATUS, HRBP, REKRUTER, MANAGER_RKS, MANAGER_POS, DYREKTOR_PIONU, SPOLKA, PION, DEPARTAMENT_REGION, JEDNOSTKA_ORGANIZACYJNA, NAZWA_STANOWISKA, POZIOM_STANOWISKA, LOKALIZACJA, RODZAJ_WAKATU, IMIE_NAZWISKO_POPRZEDNIEJ_OSOBY, MODEL_PROWADZENIA_REK_DO_POS, KRYTERIUM_DECYDUJACE, DATA_ZGLOSZENIA, DATA_OTWARCIA, DATA_OSTATNIEJ_AKTUALIZACJI, DATA_POLECENIA_PIERWSZEGO_KANDYDATA, DATA_POLECENIA_ZATRUDNIONEGO_KANDYDATA, DATA_ZLOZENIA_OFERTY, IMIE_NAZWISKO_ZATRUDNIONEGO, IMIE_ZATRUDNIONEGO, NAZWISKO_ZATRUDNIONEGO, SPOSOB_POZYSKANIA_KANDYDATA, BUMERANG, WYNAGRODZENIE, DATA_ROZPOCZECIA_PRACY, LICZBA_ODRZUCONYCH_OFERT, KOMENTARZ, CLEAR, CAN_DO, CLOSE_CARE, PASSION_FOR_BETTER, JOY, PLEC, CZAS_OD_ZGLOSZENIA_REK_DO_URUCHOMIENIA, CZAS_DO_POLECENIA_PIERWSZEGO_K_BIZNESOWI, CZAS_DO_POLECENIA_ZATRUDNIONEGO_K_BIZNESOWI, CZAS_DECYZJI_BIZNESU, CZAS_OD_ROZPOCZECIA_DO_ZLOZENIA, CZAS_OD_OSTATNIEJ_ZMIANY_PROFILU_DO_ZLOZENIA_OFERTY, TIME_TO_HIRE, ID_REKRUTACJI)".count(
    ","
)

46

# Bayes Search

In [5]:
def perform_bayes_search(
    clf_class,
    search_spaces,
    n_runs,
    cv=3,
    verbose=0,
    scoring="roc_auc",
    n_jobs=4,
    random_state=42,
    return_train_score=True,
):
    spaces = []
    bases = []
    for i in range(5):
        print(f"Creating optimization process for dataset {i}")
        bayes_search = BayesSearchCVWithProgressBar(
            estimator=clf_class(),
            search_spaces=search_spaces,
            scoring=scoring,
            cv=cv,
            verbose=verbose,
            n_jobs=n_jobs,
            n_iter=n_runs,
            return_train_score=return_train_score,
            random_state=random_state,
        )

        X_train = pd.read_csv(os.path.join(train_dir, f"X{i}_train_processed.csv"))
        y_train = pd.read_csv(os.path.join(train_dir, f"y{i}_train.csv")).values.ravel()
        base_model = clf_class().fit(X_train, y_train)

        t0 = time.time()
        bayes_search.fit(X_train, y_train)
        print(f"\t Took {t0 - time.time()} seconds.")

        spaces.append(bayes_search)
        bases.append(base_model)
    return spaces, bases

In [6]:
def generate_raports(grids, bases, name):
    raports = []
    for i, (grid, base_model) in enumerate(zip(grids, bases)):
        print(f"Generating raport for dataset {i}")

        results = grid.cv_results_
        X_test = pd.read_csv(os.path.join(test_dir, f"X{i}_test_processed.csv"))
        y_test = pd.read_csv(os.path.join(test_dir, f"y{i}_test.csv")).values.ravel()
        report = pd.DataFrame(
            {
                "params": results["params"],
                "mean_test_score": results["mean_test_score"],
                "std_test_score": results["std_test_score"],
                "rank_test_score": results["rank_test_score"],
            }
        )

        best_params = grid.best_params_
        best_score = grid.best_score_

        best_model = grid.best_estimator_

        y_pred = best_model.predict(X_test)
        classification_rep = classification_report(y_test, y_pred, output_dict=True)

        y_pred = base_model.predict(X_test)
        classification_rep_base = classification_report(
            y_test, y_pred, output_dict=True
        )

        detailed_report = {
            "best_params": best_params,
            "best_score": best_score,
            "results": report.to_dict(orient="records"),
            "classification_report": classification_rep,
            "classification_report_base": classification_rep_base,
        }
        raports.append(detailed_report)

    with open(os.path.join(raport_dir, name), "w") as f:
        json.dump(raports, f, indent=4)

    return raports

## XGBoost

Motivation stays the same as in [grid search](./model_optimisation_naive.ipynb). Yet will try this approach against larger models space.

In [7]:
param_space = {
    "n_estimators": [100, 300, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [10, 15, 20],
    "min_child_weight": [1, 2, 5, 10],
    "gamma": [0, 0.15, 0.3],
    "n_jobs": [-1],
    "eval_metric": ["logloss", "error", "auc"],
    "random_state": [42],
}

In [8]:
grids, spaces = perform_bayes_search(
    clf_class=xgb.XGBClassifier, search_spaces=param_space, n_runs=10**2, n_jobs=-1
)

Creating optimization process for dataset 0


BayesSearchCV Progress: 100%|██████████| 100/100 [02:32<00:00,  1.53s/it]


	 Took -152.68452095985413 seconds.
Creating optimization process for dataset 1


BayesSearchCV Progress: 100%|██████████| 100/100 [04:18<00:00,  2.58s/it]


	 Took -258.10764026641846 seconds.
Creating optimization process for dataset 2


BayesSearchCV Progress: 100%|██████████| 100/100 [04:29<00:00,  2.70s/it]


	 Took -269.5133328437805 seconds.
Creating optimization process for dataset 3


BayesSearchCV Progress: 100%|██████████| 100/100 [06:35<00:00,  3.96s/it]


	 Took -395.64903593063354 seconds.
Creating optimization process for dataset 4


BayesSearchCV Progress: 100%|██████████| 100/100 [04:02<00:00,  2.42s/it]

	 Took -242.16162610054016 seconds.





In [9]:
raports = generate_raports(grids, spaces, "xgboost_bayes.json")

Generating raport for dataset 0
Generating raport for dataset 1
Generating raport for dataset 2
Generating raport for dataset 3
Generating raport for dataset 4


## Logistic Regression

Motivation stays the same as in [grid search](./model_optimisation_naive.ipynb). Yet will try this approach against larger models space.

In [19]:
param_space = {
    "C": np.logspace(-4, 4, 10),
    # "solver": ["liblinear", "saga", "lbfgs"],
    # "penalty": ["l1", "l2", "elasticnet"],
    "max_iter": [50, 100, 200, 300, 500, 1000],
    "tol": [1e-3, 1e-4, 1e-5, 1e-6],
    "fit_intercept": [True, False],
    "class_weight": [None, "balanced"],
    "random_state": [42],
    "max_iter": [500],
}

In [20]:
grids, spaces = perform_bayes_search(
    clf_class=LogisticRegression,
    search_spaces=param_space,
    n_runs=10**2,
    n_jobs=-1,
)

Creating optimization process for dataset 0


BayesSearchCV Progress:   0%|          | 0/100 [00:54<?, ?it/s].73it/s]
BayesSearchCV Progress: 100%|██████████| 100/100 [03:42<00:00,  2.22s/it]


	 Took -222.11804389953613 seconds.
Creating optimization process for dataset 1


BayesSearchCV Progress: 100%|██████████| 100/100 [03:38<00:00,  2.18s/it]


	 Took -218.36975002288818 seconds.
Creating optimization process for dataset 2


BayesSearchCV Progress: 100%|██████████| 100/100 [03:21<00:00,  2.02s/it]


	 Took -201.65851020812988 seconds.
Creating optimization process for dataset 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

	 Took -197.96674990653992 seconds.
Creating optimization process for dataset 4


BayesSearchCV Progress: 100%|██████████| 100/100 [03:04<00:00,  1.84s/it]

	 Took -184.4291651248932 seconds.





In [21]:
raports = generate_raports(grids, spaces, "logistic_bayes.json")

Generating raport for dataset 0
Generating raport for dataset 1
Generating raport for dataset 2
Generating raport for dataset 3
Generating raport for dataset 4


## KNN

Motivation for this parameter space:

- [Official documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
- [Medium](https://medium.com/@mohtedibf/in-depth-parameter-tuning-for-knn-4c0de485baf6)
- [Some intuition](https://www.datasklr.com/select-classification-methods/k-nearest-neighbors)

In [22]:
param_space = {
    "n_neighbors": [3, 5, 7, 9, 11, 15, 30, 50],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "metric": ["euclidean", "manhattan", "minkowski", "chebyshev"],
    "leaf_size": [20, 30, 40, 50],
}

In [23]:
grids, spaces = perform_bayes_search(
    clf_class=KNeighborsClassifier,
    search_spaces=param_space,
    n_runs=10**2,
    n_jobs=-1,
)

Creating optimization process for dataset 0


BayesSearchCV Progress: 100%|██████████| 100/100 [03:27<00:00,  2.07s/it]


	 Took -207.469340801239 seconds.
Creating optimization process for dataset 1


BayesSearchCV Progress: 100%|██████████| 100/100 [04:00<00:00,  2.40s/it]


	 Took -240.05590415000916 seconds.
Creating optimization process for dataset 2


BayesSearchCV Progress: 100%|██████████| 100/100 [03:52<00:00,  2.32s/it]


	 Took -232.41501903533936 seconds.
Creating optimization process for dataset 3


BayesSearchCV Progress: 100%|██████████| 100/100 [04:34<00:00,  2.75s/it]


	 Took -274.81183886528015 seconds.
Creating optimization process for dataset 4


BayesSearchCV Progress: 100%|██████████| 100/100 [03:48<00:00,  2.28s/it]

	 Took -228.414208650589 seconds.





In [24]:
raports = generate_raports(grids, spaces, "knn_bayes.json")

Generating raport for dataset 0
Generating raport for dataset 1
Generating raport for dataset 2
Generating raport for dataset 3
Generating raport for dataset 4
