In [1]:
#importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pylab import rcParams
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn import tree 
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
import xgboost as xgb_package

In [2]:
#to visualise
import graphviz
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin/'

In [3]:
#reading files
training = pd.read_csv("training.csv", sep= ";", encoding='cp1252')

In [4]:
#what to drop
to_drop = ["neighbour code", "target code", "predictive"]

In [5]:
#classifiers
xgb = xgb_package.XGBClassifier(objective="binary:logistic", random_state=42, use_label_encoder=False)
rfc = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, max_depth= 10, max_samples=33,
                            bootstrap= True, random_state= 42,
                            class_weight= 'balanced')

In [6]:
#data preparation
def prepare_scale_data(training=training, to_drop= to_drop):
    """
    This function prepares the training data for the machine learning.
    Drops the previously selected columns using the to_drop list without renaming the remaining columns.
    Splits the training data to input parameters and labels.
    Lists the column of input parameters.
    Uses the StandardScaler to scale the training data.

    Args:
        to_drop: The list of column names which are going to be dropped from the training and unknown dataframe. 
        training: The dataframe of input parameters which are intended to use to the training of the machine learning model.
        
    Returns:
        X: The input data from the training dataframe.
        y: The labels for the training dataframe.
        feature_names: The list of column names in the X dataframe.

    """
    training_deleted = training.drop(columns= to_drop)
    X = training_deleted.iloc[:,:-1]
    y = training_deleted.iloc[:, -1]
    feature_names = list(X.columns)
    
    #skála igazítása
    scale = StandardScaler()
    scale.fit(X)
    X = scale.transform(X)
    
    return X, y, feature_names

In [7]:
#prepare all data
X, y, feature_names = prepare_scale_data()

In [8]:
#RF
clf = RandomForestClassifier(random_state= 42)
np.random.seed(0)

param_distributions = {"criterion": ["gini", "entropy", "log_loss"],
                      "max_depth": range(1,15+1, 2),
                      "min_samples_split": range(2,10+1, 2),
                      "min_samples_leaf": range(1,10+1, 2),
                      "min_weight_fraction_leaf": [0.5, 0.4, 0.3, 0.2, 0.1, 0],
                      "max_features": ["sqrt", "log2", None],
                      "max_leaf_nodes": range(2,30+1),
                      "oob_score": [True, False],
                      "n_jobs": range(1,50+1, 10),
                      "warm_start": [True, False],
                      "class_weight": ["balanced", "balanced_subsample", None],
                      "ccp_alpha": [0.5, 0.4, 0.3, 0.2, 0.1, 0],
                      "max_samples": range(1,72+1, 10)}

search = HalvingRandomSearchCV(clf, param_distributions,
                               resource='n_estimators',
                               max_resources= 100,
                               random_state= 42,
                               cv= 50,
                               verbose= 3,
                               n_jobs= 30).fit(X, y)
search.best_params_

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1
max_resources_: 100
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 1
Fitting 50 folds for each of 100 candidates, totalling 5000 fits
----------
iter: 1
n_candidates: 34
n_resources: 3
Fitting 50 folds for each of 34 candidates, totalling 1700 fits
----------
iter: 2
n_candidates: 12
n_resources: 9
Fitting 50 folds for each of 12 candidates, totalling 600 fits
----------
iter: 3
n_candidates: 4
n_resources: 27
Fitting 50 folds for each of 4 candidates, totalling 200 fits
----------
iter: 4
n_candidates: 2
n_resources: 81
Fitting 50 folds for each of 2 candidates, totalling 100 fits


{'warm_start': False,
 'oob_score': False,
 'n_jobs': 31,
 'min_weight_fraction_leaf': 0,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_samples': 61,
 'max_leaf_nodes': 27,
 'max_features': None,
 'max_depth': 5,
 'criterion': 'log_loss',
 'class_weight': 'balanced',
 'ccp_alpha': 0,
 'n_estimators': 81}

In [13]:
#XGB
clf = xgb_package.XGBClassifier(random_state= 42)
np.random.seed(0)

param_distributions = {"criterion": ["gini", "entropy", "log_loss"],
                      "max_depth": range(1,10+1, 2),
                      "max_leaves": range(1,20+1, 2),
                      "max_bin": range(2,40+1, 2),
                      "grow_policy": ["depthwise", "lossguide"],
                      "learning_rate": [1, 0.5, 0],
                      "booster": ["gbtree", "gblinear", "dart", None],
                      "gamma": [1, 0.75, 0.5, 0],
                      "min_child_weight": [0.75, 0.5, 0.25],
                      "max_delta_step": [1, 0.5, 0],
                      "subsample": [0.75, 0.5, 0],
                      "reg_alpha": [0.5, 0.25, 0],
                      "reg_lambda": [0.5, 0.25, 0],
                      "scale_pos_weight": [0.75, 0.5, 0.25],
                      "importance_type": ["gain", "weight", "cover", "total_gain", "total_cover"]}

search = HalvingRandomSearchCV(clf, param_distributions,
                               resource='n_estimators',
                               max_resources= 100,
                               random_state= 42,
                               cv= 50,
                               verbose= 3,
                               n_jobs= 40).fit(X, y)
search.best_params_

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1
max_resources_: 100
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 1
Fitting 50 folds for each of 100 candidates, totalling 5000 fits
----------
iter: 1
n_candidates: 34
n_resources: 3
Fitting 50 folds for each of 34 candidates, totalling 1700 fits
----------
iter: 2
n_candidates: 12
n_resources: 9
Fitting 50 folds for each of 12 candidates, totalling 600 fits
----------
iter: 3
n_candidates: 4
n_resources: 27
Fitting 50 folds for each of 4 candidates, totalling 200 fits
----------
iter: 4
n_candidates: 2
n_resources: 81
Fitting 50 folds for each of 2 candidates, totalling 100 fits
Parameters: { "criterion" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issu

{'subsample': 0.5,
 'scale_pos_weight': 0.75,
 'reg_lambda': 0.25,
 'reg_alpha': 0.5,
 'min_child_weight': 0.75,
 'max_leaves': 11,
 'max_depth': 7,
 'max_delta_step': 0,
 'max_bin': 38,
 'learning_rate': 1,
 'importance_type': 'cover',
 'grow_policy': 'lossguide',
 'gamma': 0,
 'criterion': 'log_loss',
 'booster': 'dart',
 'n_estimators': 81}