# **Hyper Parameter Optimisation**

## Objectives

* Document hyperparameter selection and tuning strategies

## Inputs

* outputs/datasets/collection/HeartDiseasePrediction.csv
* Instructions on data cleaning and feature engineering from the relevant notebooks

## Additional Comments

* There are no outputs from this notebook as it is solely for demonstration purposes


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os


# Record the directory the kernel is currently running in; a later cell uses
# this value to locate the parent folder.
current_dir = os.getcwd()
current_dir  # last expression in the cell, so the notebook displays it

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of `current_dir` the working directory so that relative
# paths such as outputs/... resolve from there.
# NOTE(review): `current_dir` is refreshed by the following cell, so running
# this cell again afterwards climbs one more level — confirm before re-running.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory after the change and display it for confirmation.
current_dir = os.getcwd()
current_dir  # displayed as the cell output

# Load Data

Load the raw dataset and replace values of 0 in RestingBP and Cholesterol with NaN ready for the ML pipeline

In [None]:
import pandas as pd
import numpy as np


# Read the collected dataset and, in the RestingBP and Cholesterol columns
# only, turn every 0 into NaN so the imputation steps of the ML pipeline
# treat those readings as missing.
df = pd.read_csv("outputs/datasets/collection/HeartDiseasePrediction.csv").replace(
    {"RestingBP": 0, "Cholesterol": 0},
    np.nan,
)

# Number of missing values per column (displayed as the cell output)
df.isna().sum()

---

# ML Pipelines and Data Preparation

## Pipeline for Data Cleaning and Feature Engineering

In [None]:
from sklearn.pipeline import Pipeline

#Data Cleaning
from feature_engine.imputation import MeanMedianImputer, RandomSampleImputer

# Feature Engineering
from feature_engine.discretisation import ArbitraryDiscretiser
from feature_engine.encoding import OrdinalEncoder


def DataCleaningandFeatEngPipeline():
    """Build the unfitted data-cleaning and feature-engineering pipeline.

    Steps, in order:
      1. ``median_imputation`` - median imputation of ``RestingBP``.
      2. ``random_sample_imputation`` - random-sample imputation of
         ``Cholesterol`` with ``random_state=1`` and ``seed='general'``.
      3. ``arbitrary_discretisation`` - bins ``Oldpeak`` using the edges
         -inf, 0, 1.5 and +inf.
      4. ``ordinal_encoding`` - arbitrary ordinal encoding of the listed
         variables.

    Returns:
        A new, unfitted ``Pipeline`` containing the four steps above.
    """
    encoded_variables = [
        "Sex",
        "ChestPainType",
        "FastingBS",
        "RestingECG",
        "ExerciseAngina",
        "ST_Slope",
    ]
    oldpeak_bin_edges = [-np.inf, 0, 1.5, np.inf]

    steps = [
        (
            "median_imputation",
            MeanMedianImputer(imputation_method="median",
                              variables=["RestingBP"]),
        ),
        (
            "random_sample_imputation",
            RandomSampleImputer(random_state=1,
                                seed='general',
                                variables=["Cholesterol"]),
        ),
        (
            "arbitrary_discretisation",
            ArbitraryDiscretiser(binning_dict={"Oldpeak": oldpeak_bin_edges}),
        ),
        (
            "ordinal_encoding",
            OrdinalEncoder(encoding_method="arbitrary",
                           variables=encoded_variables),
        ),
    ]

    return Pipeline(steps)

## Pipeline for Modelling

In [None]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Feature Selection
from sklearn.feature_selection import SelectFromModel

# ML Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


def ClassificationPipeline(model):
    """Wrap *model* in a scale -> select features -> classify pipeline.

    Args:
        model: Estimator that is handed both to ``SelectFromModel`` for the
            "feat_selection" step and used directly as the final "model" step.

    Returns:
        A new, unfitted ``Pipeline`` with the steps "scaler",
        "feat_selection" and "model", in that order.
    """
    return Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("feat_selection", SelectFromModel(model)),
        ("model", model),
    ])

## Split Data into Train and Test Sets

* The data is split into train and test sets and transformed using the data cleaning and feature engineering pipeline.

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows as the test set; HeartDisease is the target column.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["HeartDisease"]),
    df["HeartDisease"],
    test_size=0.2,
    random_state=0,
)

# Fit the cleaning/feature-engineering steps on the train set only, then apply
# the fitted steps unchanged to the test set.
data_cleaning_feat_eng_pipeline = DataCleaningandFeatEngPipeline()
X_train = data_cleaning_feat_eng_pipeline.fit_transform(X_train)
X_test = data_cleaning_feat_eng_pipeline.transform(X_test)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

* Target balancing is then carried out.

In [None]:
from imblearn.over_sampling import SMOTE


# Balance the target on the training data only; X_test / y_test are not
# resampled. The printed shapes show the train set size after resampling.
oversample = SMOTE(sampling_strategy='minority', random_state=0)
X_train, y_train = oversample.fit_resample(X_train, y_train)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

---

# Hyperparameter Optimisation

* Load custom hyperparameter optimisation function from Code Institute

In [None]:
from sklearn.model_selection import GridSearchCV


class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate model and summarise the fold scores.

    Attributes:
        models: Mapping of display name -> unfitted estimator.
        params: Mapping of the same names -> parameter grid for the
            ``ClassificationPipeline`` built around that estimator (keys
            such as ``model__n_estimators``).
        keys: The names in ``models``.
        grid_searches: Mapping of name -> fitted ``GridSearchCV``; empty
            until ``fit`` has been called.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every model and store it in ``grid_searches``.

        Args:
            X: Training features.
            y: Training target.
            cv: Cross-validation setting forwarded to ``GridSearchCV``.
            n_jobs: Number of parallel jobs forwarded to ``GridSearchCV``.
            verbose: Verbosity forwarded to ``GridSearchCV``.
            scoring: Scorer forwarded to ``GridSearchCV``.
            refit: Accepted for backward compatibility only. It is NOT
                forwarded, so ``GridSearchCV`` keeps its own default; this
                is left as-is because callers read ``best_estimator_``
                from the fitted searches.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            pipeline = ClassificationPipeline(self.models[key])
            gs = GridSearchCV(pipeline, self.params[key], cv=cv,
                              n_jobs=n_jobs, verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Tabulate per-candidate fold statistics across all fitted searches.

        Args:
            sort_by: Column used to sort the summary, highest value first.

        Returns:
            A tuple ``(summary, grid_searches)``: ``summary`` is a DataFrame
            with one row per parameter combination and the columns
            ``estimator``, ``min_score``, ``mean_score``, ``max_score``,
            ``std_score`` followed by the parameter columns;
            ``grid_searches`` is the mapping of fitted searches.

        Raises:
            ValueError: If there is nothing to summarise because ``fit``
                has not been called yet.
        """
        rows = []
        for name, gs in self.grid_searches.items():
            results = gs.cv_results_
            candidates = results['params']

            # Use the number of splits recorded on the fitted search rather
            # than the raw ``cv`` argument, which is only usable here when
            # it happens to be an int (it may also be None or a splitter).
            fold_scores = np.column_stack([
                results[f"split{i}_test_score"]
                for i in range(gs.n_splits_)
            ])

            for candidate, scores in zip(candidates, fold_scores):
                rows.append(pd.Series({
                    **candidate,
                    'estimator': name,
                    'min_score': min(scores),
                    'max_score': max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                }))

        if not rows:
            raise ValueError(
                "No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score',
                   'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

* Load custom model evaluation function from Code Institute

In [None]:
from sklearn.metrics import classification_report, confusion_matrix


def confusion_matrix_and_report(X, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    Args:
        X: Features to predict on.
        y: True target values for ``X``.
        pipeline: Fitted object exposing ``predict``.
        label_map: Class names, used for the matrix headers and as the
            report's ``target_names``.
    """
    prediction = pipeline.predict(X)

    # The predictions are deliberately passed as y_true so that they end up
    # on the rows of the matrix; that is why the index is labelled
    # "Prediction" and the columns "Actual".
    matrix = confusion_matrix(y_true=prediction, y_pred=y)
    actual_headers = ["Actual " + label for label in label_map]
    prediction_headers = ["Prediction " + label for label in label_map]

    print('---  Confusion Matrix  ---')
    print(pd.DataFrame(matrix,
                       columns=[actual_headers],
                       index=[prediction_headers]))
    print("\n")

    print('---  Classification Report  ---')
    report = classification_report(y, prediction, target_names=label_map)
    print(report, "\n")


def clf_performance(X_train, y_train, X_test, y_test, pipeline, label_map):
    """Print the confusion matrix and report for the train set, then the test set.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Test features and target.
        pipeline: Fitted object exposing ``predict``.
        label_map: Class names passed through to the report helper.
    """
    sections = (
        ("#### Train Set #### \n", X_train, y_train),
        ("#### Test Set ####\n", X_test, y_test),
    )
    for heading, features, target in sections:
        print(heading)
        confusion_matrix_and_report(features, target, pipeline, label_map)

In order to streamline the tuning and selection of hyperparameters, each algorithm was investigated and optimised individually before being tested against each other.

## RandomForestClassifier

In [None]:
# Single-entry mapping of display name -> unfitted estimator, the shape
# HyperparameterOptimizationSearch takes for its `models` argument.
model = {"RandomForestClassifier": RandomForestClassifier(random_state=0)}

Before carrying out the optimisation, it is best to understand what the hyperparameters to investigate mean:

* ```n_estimators``` - The number of decision trees in the 'forest'.
* ```max_depth``` - The maximum depth of each tree. If None, nodes are expanded until all leaves contain only a single class or all leaves contain fewer than the minimum number of samples required to split.
* ```min_samples_split``` - The minimum number of samples required to split a node.
* ```min_samples_leaf``` - The minimum number of samples required to be at a leaf node.
* ```max_leaf_nodes``` - The maximum number of leaf nodes per tree; trees are grown in best-first fashion until this limit is reached. If None, the number of leaf nodes is unlimited.
* ```max_features``` - The number of features to consider when looking for the best split. If None, then max_features is equal to the total number of features.

The first round of optimisation was carried out using the Code Institute recommended hyperparameter ranges.

In [None]:
# Round 1 grid for RandomForestClassifier: 3 x 3 x 2 x 2 x 2 = 72 parameter
# combinations. The "model__" prefix addresses the "model" step of
# ClassificationPipeline.
params_search = {
    "RandomForestClassifier": {
        "model__n_estimators": [50, 100, 140],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
        "model__max_leaf_nodes": [None, 50],
        }
        }

In [None]:
from sklearn.metrics import recall_score, make_scorer


# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance on the test set was significantly worse than on the train set, so the model may be overfitting with these parameters.

* Recall on heart disease was 95% for the train set and 79% for the test set.
* Precision on no heart disease was 95% for the train set and 73% for the test set.

For the second round of optimisation, the max_features hyperparameter was added.

In [None]:
# Round 2 grid for RandomForestClassifier: the round 1 grid plus
# max_features, giving 3 x 3 x 2 x 2 x 2 x 3 = 216 parameter combinations.
params_search = {
    "RandomForestClassifier": {
        "model__n_estimators": [50, 100, 140],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 50],
        "model__max_leaf_nodes": [None, 50],
        "model__max_features": [None, "sqrt", "log2"],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

This time, the model performs similarly on the train and test sets:

* Recall on heart disease was 89% for the train set and 88% for the test set.
* Precision on no heart disease was 86% for the train set and 81% for the test set.

For the third round, the values for n_estimators and min_samples_leaf were expanded.

In [None]:
# Round 3 grid for RandomForestClassifier: wider n_estimators and
# min_samples_leaf ranges, giving 6 x 3 x 2 x 5 x 2 x 3 = 1080 parameter
# combinations.
params_search = {
    "RandomForestClassifier": {
        "model__n_estimators": [50, 140, 300, 500, 800, 1200],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 50],
        "model__min_samples_leaf": [1, 2, 5, 10, 50],
        "model__max_leaf_nodes": [None, 50],
        "model__max_features": [None, "sqrt", "log2"],
        },
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The model performed the same as in the previous round:

* Recall on heart disease was 89% for the train set and 88% for the test set.
* Precision on no heart disease was 86% for the train set and 81% for the test set.

For the fourth round, the min_samples_split values were expanded to see whether any improvement in the test set performance could be found.

* The n_estimators range was also reduced, as the previous models had favoured lower numbers.

In [None]:
# Round 4 grid for RandomForestClassifier: wider min_samples_split range and
# a shorter n_estimators range, giving 4 x 3 x 5 x 5 x 2 x 3 = 1800 parameter
# combinations.
params_search = {
    "RandomForestClassifier": {
        "model__n_estimators": [50, 140, 300, 500],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 5, 10, 15, 50],
        "model__min_samples_leaf": [1, 2, 5, 10, 50],
        "model__max_leaf_nodes": [None, 50],
        "model__max_features": [None, "sqrt", "log2"],
        },
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The model performed the same as in the previous round:

* Recall on heart disease was 89% for the train set and 88% for the test set.
* Precision on no heart disease was 86% for the train set and 81% for the test set.

As no further improvements were found, these hyperparameter value ranges were chosen as the final set for testing.

## LogisticRegression

In [None]:
# Single-entry mapping of display name -> unfitted estimator, the shape
# HyperparameterOptimizationSearch takes for its `models` argument.
# No solver is passed, so LogisticRegression uses its default solver.
model = {"LogisticRegression": LogisticRegression(random_state=0)}

Before carrying out the optimisation, it is best to understand what the hyperparameters to investigate mean:

* ```penalty``` - Adds a regularisation penalty in order to tune overfitting.
* ```C``` - Inverse of regularisation strength; smaller values apply stronger regularisation, while larger values give more weight to fitting the training data.
* ```tol``` - Tolerance for stopping criteria.


The first round of optimisation was carried out using the Code Institute recommended hyperparameter ranges.

In [None]:
# Round 1 grid for LogisticRegression: 4 x 3 x 3 = 36 parameter combinations.
# NOTE(review): the estimator is created without a solver argument, so its
# default solver is used. scikit-learn documents the default 'lbfgs' solver
# as supporting only the 'l2' and None penalties, so the "l1" and
# "elasticnet" candidates presumably fail to fit and score NaN — confirm
# against the installed scikit-learn version.
params_search = {
    "LogisticRegression":{
        "model__penalty": ["l2", "l1", "elasticnet", None],
        "model__C": [2, 1, 0.5],
        "model__tol": [1e-3, 1e-4, 1e-5],
    },
    }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance on both the train and test sets was good.

* Recall on heart disease was 84% for the train set and 84% for the test set.
* Precision on no heart disease was 84% for the train set and 78% for the test set.

For the second round of optimisation, the C values were expanded upon.

In [None]:
# Round 2 grid for LogisticRegression: wider C range, giving
# 4 x 5 x 3 = 60 parameter combinations.
# NOTE(review): the estimator is created without a solver argument, so its
# default solver is used. scikit-learn documents the default 'lbfgs' solver
# as supporting only the 'l2' and None penalties, so the "l1" and
# "elasticnet" candidates presumably fail to fit and score NaN — confirm
# against the installed scikit-learn version.
params_search = {
    "LogisticRegression":{
        "model__penalty": ["l2", "l1", "elasticnet", None],
        "model__C": [10, 2, 1.0, 0.5, 0.1],
        "model__tol": [1e-3, 1e-4, 1e-5],
    },
    }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance on both the train and test sets was good.

* Recall on heart disease was 84% for the train set and 84% for the test set.
* Precision on no heart disease was 84% for the train set and 78% for the test set.

No further optimisation was carried out using this algorithm.

## XGBClassifier

In [None]:
# Single-entry mapping of display name -> unfitted estimator, the shape
# HyperparameterOptimizationSearch takes for its `models` argument.
model = {"XGBClassifier": XGBClassifier(random_state=0)}

Before carrying out the optimisation, it is best to understand what the hyperparameters to investigate mean:

* ```n_estimators``` - Number of gradient boosted trees.
* ```max_depth``` - Maximum tree depth.
* ```gamma``` - Minimum loss reduction required to split a node further, regularisation factor.
* ```learning_rate``` - Applies a weighting factor to corrections by new trees in order to slow down the learning and prevent overfitting.
* ```min_child_weight``` - Minimum sum of instance weights required in a child node; a split that would leave a child below this threshold is not made.
* ```subsample``` - The fraction of the dataset sampled for each tree.
* ```colsample_bytree``` - The fraction of features used for each tree.

The first round of optimisation was carried out using the Code Institute recommended hyperparameter ranges.

In [None]:
# Round 1 grid for XGBClassifier: 3 x 3 x 3 x 2 = 54 parameter combinations.
# The "model__" prefix addresses the "model" step of ClassificationPipeline.
params_search = {
    "XGBClassifier":{
        "model__n_estimators": [30, 80, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.1],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance on both the train and test sets was good, with slightly worse performance on the test set.

* Recall on heart disease was 86% for the train set and 79% for the test set.
* Precision on no heart disease was 84% for the train set and 72% for the test set.

For the second round of optimisation, min_child_weight was added to determine whether that gave any improvements.

In [None]:
# Round 2 grid for XGBClassifier: the round 1 grid plus min_child_weight,
# giving 3 x 3 x 3 x 2 x 4 = 216 parameter combinations.
params_search = {
    "XGBClassifier":{
        "model__n_estimators": [30, 80, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.1],
        "model__min_child_weight": [1, 3, 5, 7],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance was the same as the previous round of optimisation.

* Recall on heart disease was 86% for the train set and 79% for the test set.
* Precision on no heart disease was 84% for the train set and 72% for the test set.

For the third round of optimisation, subsample and colsample_bytree hyperparameters were both added.

In [None]:
# Round 3 grid for XGBClassifier: adds subsample and colsample_bytree,
# giving 3 x 3 x 3 x 2 x 4 x 5 x 5 = 5400 parameter combinations.
params_search = {
    "XGBClassifier": {
        "model__n_estimators": [30, 80, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.1],
        "model__min_child_weight": [1, 3, 5, 7],
        "model__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
        "model__colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance in this round was slightly worse than the previous round of optimisation.

* Recall on heart disease was 84% for the train set and 79% for the test set.
* Precision on no heart disease was 83% for the train set and 72% for the test set.

The best parameters chosen in the last two rounds of optimisation were quite different.
 
 For the fourth round, the range of values for n_estimators and gamma was increased while keeping other values the same.

In [None]:
# Round 4 grid for XGBClassifier: wider n_estimators and gamma ranges,
# giving 5 x 3 x 3 x 4 x 4 x 5 x 5 = 18000 parameter combinations.
params_search = {
    "XGBClassifier": {
        "model__n_estimators": [10, 30, 50, 100, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.05, 0.075, 0.1],
        "model__min_child_weight": [1, 3, 5, 7],
        "model__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
        "model__colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1.0],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance in this round was the same as the previous round of optimisation.

* Recall on heart disease was 84% for the train set and 79% for the test set.
* Precision on no heart disease was 83% for the train set and 72% for the test set.

For the fifth round of optimisation, the subsample and colsample_bytree parameters were removed in order to determine whether they were negatively impacting the optimisation.

In [None]:
# Round 5 grid for XGBClassifier: the round 4 grid without subsample and
# colsample_bytree, giving 5 x 3 x 3 x 4 x 4 = 720 parameter combinations.
params_search = {
    "XGBClassifier": {
        "model__n_estimators": [10, 30, 50, 100, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.05, 0.075, 0.1],
        "model__min_child_weight": [1, 3, 5, 7],
        }
        }

In [None]:
# Grid-search the current model over params_search with 5-fold
# cross-validation, scoring each candidate on recall for pos_label=1.
search = HyperparameterOptimizationSearch(models=model, params=params_search)
search.fit(
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    scoring=make_scorer(recall_score, pos_label=1),
)

In [None]:
# Rank every parameter combination by its mean cross-validated score.
(
    extensive_grid_search_summary,
    extensive_grid_search_pipelines,
) = search.score_summary(sort_by='mean_score')

# The first column of the summary is 'estimator', so the top-left cell is the
# name of the best-scoring model.
best_model = extensive_grid_search_summary.iloc[0, 0]
best_parameters = extensive_grid_search_pipelines[best_model].best_params_

# Best pipeline found by that model's grid search
classification_pipeline = extensive_grid_search_pipelines[best_model].best_estimator_

best_parameters

In [None]:
# Confusion matrix and classification report of the best pipeline,
# on the train set and then on the test set.
clf_performance(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    pipeline=classification_pipeline,
    label_map=["No Heart Disease", "Heart Disease"],
)

The performance on both the train and test sets recovered to the same as the first two rounds of optimisation.

* Recall on heart disease was 86% for the train set and 79% for the test set.
* Precision on no heart disease was 84% for the train set and 72% for the test set.

As no further improvements were made, these hyperparameter value ranges were used in final testing.

---

# Conclusions

Several rounds of hyperparameter optimisation were carried out using the RandomForestClassifier, LogisticRegression and XGBClassifier algorithms.

After individually testing each algorithm, the hyperparameter search values were compiled to be used in the final hyperparameter optimisation in the Modelling and Evaluation notebook.

In [None]:
# Final combined grids, one per algorithm, taken from the last round run for
# each: RandomForestClassifier 1800, LogisticRegression 60 and
# XGBClassifier 720 parameter combinations.
# NOTE(review): scikit-learn documents LogisticRegression's default 'lbfgs'
# solver as supporting only the 'l2' and None penalties; if the estimator is
# built without a solver argument, the "l1" and "elasticnet" candidates
# presumably fail to fit and score NaN — confirm against the installed
# scikit-learn version.
params_search = {
    "RandomForestClassifier": {
        "model__n_estimators": [50, 140, 300, 500],
        "model__max_depth": [None, 4, 15],
        "model__min_samples_split": [2, 5, 10, 15, 50],
        "model__min_samples_leaf": [1, 2, 5, 10, 50],
        "model__max_leaf_nodes": [None, 50],
        "model__max_features": [None, "sqrt", "log2"],
        },
    "LogisticRegression":{
        "model__penalty": ["l2", "l1", "elasticnet", None],
        "model__C": [10, 2, 1.0, 0.5, 0.1],
        "model__tol": [1e-3, 1e-4, 1e-5],
        },
    "XGBClassifier": {
        "model__n_estimators": [10, 30, 50, 100, 200],
        "model__max_depth": [None, 3, 15],
        "model__learning_rate": [0.001, 0.01, 0.1],
        "model__gamma": [0, 0.05, 0.075, 0.1],
        "model__min_child_weight": [1, 3, 5, 7],
        },
        }