In [1]:
import numpy as np
import pandas as pd

# Carregando a ABT

In [4]:
# Location of the ABT CSV and the reference month (safra) held out for the
# out-of-time evaluation. Both are named once so the train / out-of-time
# split cannot drift apart when the cutoff is changed.
ABT_PATH = '/content/drive/MyDrive/projeto-ia-datasets/olist/output/propensao_revenda_abt.csv'
OOT_SAFRA = '2018-06-01'

# Load the ABT
df_abt = pd.read_csv(ABT_PATH)

# Training base: every safra strictly before the held-out month.
# NOTE(review): the column is read without date parsing, so this is presumably
# a string comparison that relies on an ISO YYYY-MM-DD format -- confirm.
df_train = df_abt.query(f'data_ref_safra < "{OOT_SAFRA}"')

# Evaluation base (out of time): only the held-out safra
df_oot = df_abt.query(f'data_ref_safra == "{OOT_SAFRA}"')

# Fail fast with a clear message instead of a confusing error at fit/score time
assert not df_train.empty, f'no training rows with data_ref_safra < {OOT_SAFRA}'
assert not df_oot.empty, f'no out-of-time rows with data_ref_safra == {OOT_SAFRA}'

# Identificar as Variáveis de Modelagem

In [11]:
# Column roles in the ABT. key_vars are listed here but are not part of the
# feature list built below.
key_vars = ['data_ref_safra', 'seller_id']
cat_vars = ['uf']
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]
target = 'nao_revendeu_next_6m'

# Model inputs: categorical columns first, then the numeric ones
features = [*cat_vars, *num_vars]

# Training data
X_train, y_train = df_train[features], df_train[target]

# Evaluation data (out of time)
X_oot, y_oot = df_oot[features], df_oot[target]

In [None]:
# instalando o pacote feature-engine
!pip install feature-engine==1.0.2

# Treinando uma Regressão Logística

In [7]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler

In [30]:
from sklearn.linear_model import LogisticRegression

# Preprocessing steps followed by the logistic regression estimator.
lr_steps = [
    # numeric columns: fill missing values with the median
    ('numeric_imputer', MeanMedianImputer(imputation_method='median', variables=num_vars)),
    # numeric columns: standardize with StandardScaler
    ('numeric_scaler', SklearnTransformerWrapper(transformer=StandardScaler(), variables=num_vars)),
    # categorical columns: replace missing values with the label 'missing'
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    # categorical columns: one-hot encode
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    # final estimator, seeded for reproducibility
    ('algoritmo', LogisticRegression(random_state=42)),
]
lr_model_pipe = Pipeline(steps=lr_steps)

In [32]:
# Avaliando o modelo na base de treino
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

# Stratified 5-fold split, shuffled with a fixed seed so the folds are reproducible
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the logistic regression pipeline on the training base,
# collecting one row of timings and scores per fold
cv_results = cross_validate(
    lr_model_pipe,
    X_train,
    y_train,
    cv=skf,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    n_jobs=-1,
)
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.240301,0.099492,0.839618,0.865672,0.732323,0.793434,0.906935
1,0.211055,0.097639,0.814126,0.843168,0.685606,0.756267,0.889642
2,0.230808,0.103882,0.814126,0.846395,0.681818,0.755245,0.899581
3,0.209977,0.122066,0.82103,0.846037,0.701643,0.767104,0.896102
4,0.144978,0.054462,0.816153,0.852615,0.680152,0.756681,0.903026


In [33]:
# Avaliando o modelo na base out of time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Fit the pipeline on the training data before scoring the out-of-time base
lr_model_pipe.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_oot = lr_model_pipe.predict(X_oot)
# ROC AUC uses the second predict_proba column
# (presumably the probability of the positive class -- confirm the label order)
y_proba_oot = lr_model_pipe.predict_proba(X_oot)[:, 1]

acc_oot, precision_oot, recall_oot, f1_oot = (
    metric(y_oot, y_pred_oot)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
rocauc_oot = roc_auc_score(y_oot, y_proba_oot)

# Single-row summary table of the out-of-time metrics
metricas_nomes = ['oot_accuracy', 'oot_precision', 'oot_recall', 'oot_f1', 'oot_rocauc']
metricas_valores = [[acc_oot, precision_oot, recall_oot, f1_oot, rocauc_oot]]

pd.DataFrame(metricas_valores, columns=metricas_nomes)

Unnamed: 0,oot_accuracy,oot_precision,oot_recall,oot_f1,oot_rocauc
0,0.74469,0.938482,0.580567,0.717359,0.879107


# Treinando uma Árvore de Decisão

In [34]:
from sklearn.tree import DecisionTreeClassifier

# Preprocessing steps followed by the decision tree estimator.
# Unlike the logistic regression pipeline, there is no scaling step here.
tree_steps = [
    # numeric columns: fill missing values with the median
    ('numeric_imputer', MeanMedianImputer(imputation_method='median', variables=num_vars)),
    # categorical columns: replace missing values with the label 'missing'
    ('categoric_imputer', CategoricalImputer(fill_value='missing', variables=cat_vars)),
    # categorical columns: one-hot encode
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    # final estimator, seeded for reproducibility
    ('algoritmo', DecisionTreeClassifier(random_state=42)),
]
tree_model_pipe = Pipeline(steps=tree_steps)

In [35]:
# Evaluating on the training base.
# Stratified 5-fold split, shuffled with a fixed seed so the folds are reproducible
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validate the decision tree pipeline, one row of timings and scores per fold
cv_results = cross_validate(
    tree_model_pipe,
    X_train,
    y_train,
    cv=skf,
    scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],
    n_jobs=-1,
)
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.139412,0.079412,0.773765,0.734615,0.723485,0.729008,0.766749
1,0.128628,0.0799,0.744557,0.695597,0.698232,0.696912,0.738209
2,0.141346,0.08042,0.771641,0.72796,0.729798,0.728878,0.765907
3,0.148478,0.078717,0.776421,0.72896,0.744627,0.73671,0.772039
4,0.085746,0.044736,0.776302,0.730673,0.740834,0.735719,0.771932


In [36]:
# Evaluating on the out-of-time base.
# Fit the pipeline on the training data before scoring
tree_model_pipe.fit(X_train, y_train)

# Hard class predictions feed the threshold-based metrics
y_pred_oot = tree_model_pipe.predict(X_oot)
# ROC AUC uses the second predict_proba column
# (presumably the probability of the positive class -- confirm the label order)
y_proba_oot = tree_model_pipe.predict_proba(X_oot)[:, 1]

acc_oot, precision_oot, recall_oot, f1_oot = (
    metric(y_oot, y_pred_oot)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
rocauc_oot = roc_auc_score(y_oot, y_proba_oot)

# Single-row summary table of the out-of-time metrics
metricas_nomes = ['oot_accuracy', 'oot_precision', 'oot_recall', 'oot_f1', 'oot_rocauc']
metricas_valores = [[acc_oot, precision_oot, recall_oot, f1_oot, rocauc_oot]]

pd.DataFrame(metricas_valores, columns=metricas_nomes)

Unnamed: 0,oot_accuracy,oot_precision,oot_recall,oot_f1,oot_rocauc
0,0.714415,0.817703,0.62834,0.710623,0.726057
