In [13]:
%matplotlib inline

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline

In [21]:
RANDOM_STATE = 1

# Cross Validation

## Bank dataset
Стъпки:
1. Разделяне на данните на $k$ групи.
2. Сформиране на трениращи данни от $k-1$ групи. Последната оставяме за тестване на модела.
3. Повтаряне на 2 докато изчепаме възможните данни.

Данните са почистени. Суровият датасет е от [тук](https://archive.ics.uci.edu/ml/datasets/bank+marketing).

Описание:
> The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed.

### Prepare data

In [15]:
bank_data = pd.read_csv("../datasets/bank/bank.csv", sep=";")
bank_features = bank_data.drop(columns="y")

bank_features = pd.get_dummies(bank_features)

bank_labels = bank_data["y"]
bank_labels = bank_labels.replace({"no": 0, "yes": 1})

bank_features_train, bank_features_test, bank_labels_train, bank_labels_test = \
train_test_split(bank_features, bank_labels, train_size=0.7, test_size=0.3, stratify=bank_labels)

In [17]:
print('Test and train split shapes:')
for a_set in [bank_features_train, bank_features_test, bank_labels_train, bank_labels_test]:
    print(a_set.shape)

Test and train split shapes:
(3164, 51)
(1357, 51)
(3164,)
(1357,)


### Model data

In [5]:
logistic_regression = LogisticRegression(C=10e6)

In [27]:
stratified_kfold = StratifiedKFold(n_splits=5)

In [12]:
bank_features_train.shape

(3164, 51)

https://scikit-learn.org/stable/modules/cross_validation.html#stratified-k-fold

In [38]:
for train_indeces, test_indeces in stratified_kfold.split(bank_features_train, bank_labels_train):
    print(
        f"train set indeces: {train_indeces.min()} - {train_indeces.max()}",
        f"test set indeces: {test_indeces.min()} - {test_indeces.max()}",
        "=" * 30,
        sep="\n",
    )

train set indeces: 617 - 3163
test set indeces: 0 - 635
train set indeces: 0 - 3163
test set indeces: 617 - 1268
train set indeces: 0 - 3163
test set indeces: 1222 - 1961
train set indeces: 0 - 3163
test set indeces: 1891 - 2539
train set indeces: 0 - 2539
test set indeces: 2509 - 3163


Ще осредни

In [44]:
scores = cross_val_score(
    estimator = LogisticRegression(C=10e6 , max_iter=500),
    scoring="accuracy",
    X = bank_features_train,
    y = bank_labels_train,
    cv=stratified_kfold,
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [45]:
scores

array([0.88625592, 0.89731438, 0.90363349, 0.90679305, 0.89082278])

5 scores since the num of folds k in kfold cross validation indexes is 5.

## TODO: Using pipeline

In [67]:
numeric_transformer = Pipeline([
    ("scaler", MinMaxScaler()),
])

categorical_transformer = Pipeline([
    ("one hot encoding", FunctionTransformer(func=lambda data: pd.get_dummies(data))),
])


numeric_selector = make_column_selector(dtype_exclude=object)
categorical_selector = make_column_selector(dtype_include=object)

features_preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_selector),
    ('numeric', numeric_transformer, numeric_selector),
])

In [4]:
bank_data = pd.read_csv("../datasets/bank/bank.csv", sep=";")

bank_features = bank_data.drop(columns="y")
bank_features.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [5]:
pd.DataFrame(features_preprocessor.fit_transform(bank_features))

NameError: name 'features_preprocessor' is not defined

In [None]:

replacer = FunctionTransformer(func = lambda data: data.replace({"no": 0, "yes": 1}))
# replacer.fit_transform(bank_data.y)

