In [3]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score 
from sklearn.linear_model import LogisticRegression

In [4]:
RANDOM_STATE = 1

# Grid Search Demo

## Bank dataset
Датасет от [тук](https://archive.ics.uci.edu/ml/datasets/bank+marketing).

Описание:
> The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed.

### Prepare data

In [5]:
bank_data = pd.read_csv("../datasets/bank/bank.csv", sep=";")
bank_features = bank_data.drop(columns="y")

bank_features = pd.get_dummies(bank_features)

bank_labels = bank_data["y"]
bank_labels = bank_labels.replace({"no": 0, "yes": 1})

bank_features_train, bank_features_test, bank_labels_train, bank_labels_test = \
train_test_split(bank_features, bank_labels, train_size=0.7, test_size=0.3, stratify=bank_labels)

In [6]:
print('Test and train split shapes:')
for a_set in [bank_features_train, bank_features_test, bank_labels_train, bank_labels_test]:
    print(a_set.shape)

Test and train split shapes:
(3164, 51)
(1357, 51)
(3164,)
(1357,)


### Tune hyper-parameters


https://scikit-learn.org/stable/modules/grid_search.html#tuning-the-hyper-parameters-of-an-estimator


In [7]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

> cv : int, cross-validation generator or an iterable, default=None
>     Determines the cross-validation splitting strategy.
>     Possible inputs for cv are:
> 
> - None, to use the default 5-fold cross validation,
> - integer, to specify the number of folds in a `(Stratified)KFold`,
> - :term:`CV splitter`,
> - An iterable yielding (train, test) splits as arrays of indices.

In [11]:
param_grid = {
    "C": [1e-3, 1, 1e2],
    "max_iter": [100, 500],
    "penalty": ["l2", "l1", "elasticnet"],
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring="f1",
    cv=6,
)

In [12]:
grid_search.fit(bank_features_train, bank_labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [13]:
grid_search.best_estimator_

In [14]:
grid_search.best_params_

{'C': 100.0, 'max_iter': 500, 'penalty': 'l2'}

In [15]:
grid_search.best_index_

15

In [16]:
grid_search.best_score_

0.4105925297465225

Дълъг и полезен репорт в cv_results_.

In [27]:
list(grid_search.cv_results_.keys())

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_C',
 'param_max_iter',
 'param_penalty',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']

In [24]:
grid_search.cv_results_["rank_test_score"]

array([ 6, 16, 15,  5, 14, 13,  4, 12, 11,  2, 10,  9,  3,  8,  7,  1, 17,
       18])

In [28]:
grid_search.cv_results_["mean_test_score"]

array([0.21075592,        nan,        nan, 0.25822066,        nan,
              nan, 0.29944044,        nan,        nan, 0.38221089,
              nan,        nan, 0.31584466,        nan,        nan,
       0.41059253,        nan,        nan])

In [18]:
grid_search.classes_

array([0, 1], dtype=int64)

In [19]:
grid_search.multimetric_

False

In [20]:
grid_search.scorer_

make_scorer(f1_score, average=binary)

In [21]:
grid_search.refit_time_

0.21883225440979004

In [22]:
grid_search.n_splits_

6

In [23]:
grid_search.feature_names_in_

array(['age', 'balance', 'day', 'duration', 'campaign', 'pdays',
       'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student',
       'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'default_no', 'default_yes', 'housing_no',
       'housing_yes', 'loan_no', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'], dtype=object)

### Tune hyper-parameters 2

In [30]:
param_grid2 = {
    "C": [0.001, 1, 10, 100, 1000],
    "max_iter": [100, 500, 1000, 2000],
    "penalty": ["l2",],
}

grid_search2 = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid2,
    scoring="f1",
    cv=6,
)

In [31]:
grid_search2.fit(bank_features_train, bank_labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt