In [1]:
%matplotlib inline

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score

# Cross Validation

Стъпки:
1. Разделяне на данните на $k$ групи.
2. Сформиране на трениращи данни от $k-1$ групи. Последната оставяме за тестване на модела.
3. Повтаряне на 2 докато изчепаме възможните данни.

Данните са почистени. Суровият датасет е от [тук](https://archive.ics.uci.edu/ml/datasets/bank+marketing).

Описание:
> The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed.

### Preprocess data

In [67]:
numeric_transformer = Pipeline([
    ("scaler", MinMaxScaler()),
])

categorical_transformer = Pipeline([
    ("one hot encoding", FunctionTransformer(func=lambda data: pd.get_dummies(data))),
])


numeric_selector = make_column_selector(dtype_exclude=object)
categorical_selector = make_column_selector(dtype_include=object)

features_preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_selector),
    ('numeric', numeric_transformer, numeric_selector),
])

In [68]:
bank_data = pd.read_csv("../datasets/bank/bank.csv", sep=";")

bank_features = bank_data.drop(columns="y")
bank_features.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [69]:
pd.DataFrame(features_preprocessor.fit_transform(bank_features))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.161765,0.068455,0.600000,0.024826,0.000000,0.000000,0.00
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.205882,0.108750,0.333333,0.071500,0.000000,0.389908,0.16
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.235294,0.062590,0.500000,0.059914,0.000000,0.379587,0.04
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.161765,0.064281,0.066667,0.064548,0.061224,0.000000,0.00
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.588235,0.044469,0.133333,0.073486,0.000000,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.205882,0.039999,0.966667,0.107580,0.081633,0.000000,0.00
4517,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.558824,0.000000,0.266667,0.049321,0.000000,0.000000,0.00
4518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.558824,0.048429,0.600000,0.048659,0.204082,0.000000,0.00
4519,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.132353,0.059731,0.166667,0.041377,0.061224,0.243119,0.12


In [None]:

replacer = FunctionTransformer(func = lambda data: data.replace({"no": 0, "yes": 1}))
# replacer.fit_transform(bank_data.y)



In [28]:
bank_data = pd.read_csv("../datasets/bank/bank.csv", sep=";")

bank_features = bank_data.drop(columns="y")

In [30]:
bank_features = pd.get_dummies(bank_features)

In [21]:
bank_labels = bank_data["y"]
bank_labels = bank_labels.replace({"no": 0, "yes": 1})

bank_features_train, bank_features_test, bank_labels_train, bank_labels_test = \
train_test_split(bank_features, bank_labels, train_size=0.7, test_size=0.3, stratify=bank_labels)

for a_set in [bank_features_train, bank_features_test, bank_labels_train, bank_labels_test]:
    print(a_set.shape)

(3164, 51)
(1357, 51)
(3164,)
(1357,)


### Model data

In [24]:
logistic_regression = LogisticRegression(C=10e6)

In [25]:
k_fold = StratifiedKFold(n_splits=5)

In [26]:
cross_val_score(
    estimator = logistic_regression,
    X = bank_features_train,
    y = bank_labels_train,
    cv=k_fold,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.88941548, 0.88467615, 0.88151659, 0.88625592, 0.90189873])