# Cross-Validation

In [1]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# through regular file paths. Requires the Google Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Carregando os dados

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Location of the ABT CSV inside the mounted Drive folder.
abt_csv_path = '/content/drive/MyDrive/projeto-ia-codigos/projeto-ia-aula8/propensao_revenda_abt (1).csv'

# Load the full table and preview its first rows.
df_abt = pd.read_csv(abt_csv_path)
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [5]:
# Column roles used throughout the notebook.
# key_vars are identifier columns and are not part of the model features.
key_vars = [
    'data_ref_safra',
    'seller_id',
]
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

# Model inputs: categorical columns first, then the numeric ones.
features = [*cat_vars, *num_vars]

# Feature matrix: keep only the feature columns.
X = df_abt.loc[:, features]
# Target vector: the single label column.
y = df_abt.loc[:, target]

# Train-Test Split / Hold-Out

In [6]:
from sklearn.model_selection import train_test_split

# Hold-out split: 80% of the rows go to training, the remainder to testing.
# The fixed random_state keeps the split reproducible across runs.
holdout_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)
X_train, X_test, y_train, y_test = holdout_parts

## Pipeline utilizado

Vamos utilizar o estimador `DecisionTreeClassifier` para testar todos os cenários.

In [7]:
!pip install feature-engine==1.0.2

Collecting feature-engine==1.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/57/6d/0c7594c89bf07a7c447b1a251d4e04b07104d4a9332de71e1de42b78b838/feature_engine-1.0.2-py2.py3-none-any.whl (152kB)
[K     |██▏                             | 10kB 14.2MB/s eta 0:00:01[K     |████▎                           | 20kB 19.8MB/s eta 0:00:01[K     |██████▌                         | 30kB 13.0MB/s eta 0:00:01[K     |████████▋                       | 40kB 7.9MB/s eta 0:00:01[K     |██████████▊                     | 51kB 9.2MB/s eta 0:00:01[K     |█████████████                   | 61kB 10.7MB/s eta 0:00:01[K     |███████████████                 | 71kB 11.5MB/s eta 0:00:01[K     |█████████████████▏              | 81kB 11.3MB/s eta 0:00:01[K     |███████████████████▍            | 92kB 11.7MB/s eta 0:00:01[K     |█████████████████████▌          | 102kB 9.9MB/s eta 0:00:01[K     |███████████████████████▋        | 112kB 9.9MB/s eta 0:00:01[K     |███████████████████████

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import ArbitraryNumberImputer, CategoricalImputer

# Preprocessing + model, executed in this order on every fit/predict:
#   1. fill missing numeric values with the sentinel -999
#   2. fill missing categorical values with the label 'missing'
#   3. one-hot encode the categorical columns
#   4. decision tree classifier (seeded for reproducibility)
pipeline_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(arbitrary_number=-999, variables=num_vars)),
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('algoritmo', DecisionTreeClassifier(random_state=42)),
]

dt = Pipeline(steps=pipeline_steps)

In [9]:
from sklearn.metrics import accuracy_score

# Fit the pipeline on the training split, then predict the held-out split.
y_pred = dt.fit(X_train, y_train).predict(X_test)

# Hold-out accuracy: share of test rows whose predicted label matches y_test.
acc = accuracy_score(y_test, y_pred)
print(f'Acurácia = {acc:.3f}')

Acurácia = 0.770


# K-Fold Cross-Validation

In [11]:
from sklearn.model_selection import KFold, cross_val_score

# 5 shuffled folds with a fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy score per fold; n_jobs=-1 runs the folds on all available cores.
cv_results = cross_val_score(
    dt,
    X,
    y,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
)

# Report mean accuracy with the standard deviation across folds in parentheses.
print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')

Acurácia = 0.776 (0.005)


In [12]:
# Cross-validation with multiple metrics, reusing the K-Fold splitter `kf`.
from sklearn.model_selection import cross_validate

metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

cv_results = cross_validate(
    dt,
    X,
    y,
    scoring=metric_names,
    cv=kf,
    n_jobs=-1,
)

# Tabulate the returned dict of per-fold arrays for easier inspection.
cv_results_df = pd.DataFrame(cv_results)

In [None]:
# Display the cross-validation results table (one row per fold).
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.151442,0.086118,0.757524,0.728346,0.719844,0.72407,0.753605
1,0.156328,0.08164,0.757954,0.737098,0.720952,0.728936,0.754567
2,0.153021,0.082114,0.738925,0.731755,0.688951,0.709708,0.735864
3,0.147635,0.087487,0.739785,0.6881,0.719157,0.703286,0.737214
4,0.096457,0.046873,0.753118,0.719661,0.73487,0.727186,0.751288


In [None]:
# Column-wise average of the timings and metrics across folds.
cv_results_df.mean(axis=0)

fit_time          0.140977
score_time        0.076846
test_accuracy     0.749461
test_precision    0.720992
test_recall       0.716755
test_f1           0.718637
test_roc_auc      0.746508
dtype: float64

# Stratified K-Fold Cross-Validation

In [13]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5 shuffled, seeded folds built with the stratified splitter, which uses y
# when assigning rows to folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy score per fold; n_jobs=-1 runs the folds on all available cores.
cv_results = cross_val_score(
    dt,
    X,
    y,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
)

# Report mean accuracy with the standard deviation across folds in parentheses.
print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')

Acurácia = 0.774 (0.011)


In [14]:
# Stratified cross-validation with multiple metrics, reusing the splitter `skf`.
from sklearn.model_selection import cross_validate

metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

cv_results = cross_validate(
    dt,
    X,
    y,
    scoring=metric_names,
    cv=skf,
    n_jobs=-1,
)

# Tabulate the returned dict of per-fold arrays and show it.
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.084829,0.066134,0.783985,0.718826,0.715328,0.717073,0.770937
1,0.089217,0.068451,0.756983,0.674419,0.705596,0.689655,0.747217
2,0.103594,0.084603,0.765363,0.693431,0.693431,0.693431,0.751693
3,0.107839,0.070669,0.785847,0.71327,0.734146,0.723558,0.775959
4,0.060416,0.03655,0.77726,0.701176,0.726829,0.713772,0.767638


In [None]:
# Column-wise average of the timings and metrics across the stratified folds.
cv_results_df.mean(axis=0)

fit_time          0.137763
score_time        0.075901
test_accuracy     0.752388
test_precision    0.722990
test_recall       0.722702
test_f1           0.722789
test_roc_auc      0.749628
dtype: float64

# Leave-One-Out Cross-Validation

In [None]:
from sklearn.model_selection import LeaveOneOut

# Leave-one-out: each row is held out once as a single-sample test set, so the
# pipeline is refit once per row of X. Expect this cell to be slow.
loot = LeaveOneOut()

# Relies on cross_val_score having been imported by an earlier cell.
cv_results = cross_val_score(
    dt,
    X,
    y,
    scoring='accuracy',
    cv=loot,
    n_jobs=-1,
)

# Report mean accuracy with the standard deviation across splits in parentheses.
print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')