In [1]:
import numpy as np
import pandas as pd

In [3]:
df_abt = pd.read_csv('propensao_revenda_abt.csv')

df_train = df_abt.query('data_ref < "2018-03-01"')
df_oot   = df_abt.query('data_ref == "2018-03-01"')

In [4]:
df_abt.head()

Unnamed: 0,data_ref,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [5]:
key_vars = ['data_ref', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

features = cat_vars + num_vars

# Iniciando o Pycaret

In [7]:
from pycaret.classification import *

experimento = setup(
    data=df_train,
    test_data=df_oot,
    target=target,
    ignore_features=key_vars,
    categorical_features=cat_vars,
    numeric_features=num_vars,
    session_id=24,
    verbose=False
)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
uf,Categorical
tot_orders_12m,Numeric
tot_items_12m,Numeric
tot_items_dist_12m,Numeric
receita_12m,Numeric
recencia,Numeric
nao_revendeu_next_6m,Label


In [10]:
best_model = compare_models(fold=5, sort='auc')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8509,0.9221,0.7756,0.8231,0.7985,0.6804,0.6813,0.35
xgboost,Extreme Gradient Boosting,0.8518,0.921,0.7973,0.8108,0.8038,0.6847,0.6851,0.18
et,Extra Trees Classifier,0.8558,0.9195,0.7778,0.8331,0.8044,0.6904,0.6915,0.362
catboost,CatBoost Classifier,0.8435,0.9179,0.7771,0.8058,0.791,0.666,0.6664,8.22
lightgbm,Light Gradient Boosting Machine,0.8466,0.9159,0.7876,0.8059,0.7965,0.6735,0.6737,0.042
gbc,Gradient Boosting Classifier,0.8352,0.9127,0.7681,0.7936,0.7803,0.6485,0.6491,0.12
lda,Linear Discriminant Analysis,0.8103,0.8979,0.5803,0.8816,0.6997,0.5696,0.5963,0.01
ada,Ada Boost Classifier,0.8223,0.8974,0.7425,0.7821,0.7611,0.6199,0.6212,0.064
lr,Logistic Regression,0.8235,0.8973,0.6532,0.849,0.7381,0.6084,0.6204,2.022
nb,Naive Bayes,0.5525,0.8662,0.9474,0.4582,0.6176,0.2132,0.3049,0.482


In [12]:
predict_model(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8618,0.9224,0.8031,0.8319,0.8172,0.7062,0.7064


Unnamed: 0,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,uf_AM,uf_BA,uf_CE,uf_DF,uf_ES,...,uf_RJ,uf_RN,uf_RO,uf_RS,uf_SC,uf_SE,uf_SP,nao_revendeu_next_6m,Label,Score
0,3.0,3.0,1.0,2685.000000,133.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1,0.73
1,178.0,209.0,9.0,21621.130859,8.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.98
2,44.0,48.0,20.0,1029.199951,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.93
3,1.0,1.0,1.0,120.000000,75.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.82
4,124.0,132.0,72.0,15104.919922,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1869,4.0,4.0,3.0,124.599998,12.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0.76
1870,5.0,5.0,5.0,385.589996,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.85
1871,11.0,12.0,8.0,1450.199951,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.92
1872,13.0,13.0,3.0,1709.869995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.86


In [13]:
best_model = compare_models(fold=5, sort='auc', probability_threshold=0.8)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8189,0.9221,0.5556,0.9471,0.7,0.5828,0.6254,0.512
xgboost,Extreme Gradient Boosting,0.8292,0.921,0.6059,0.9179,0.7291,0.6123,0.6409,0.158
et,Extra Trees Classifier,0.8249,0.9195,0.5841,0.9305,0.7174,0.6001,0.6344,0.304
catboost,CatBoost Classifier,0.8072,0.9179,0.5331,0.9325,0.6775,0.5544,0.5989,5.916
lightgbm,Light Gradient Boosting Machine,0.81,0.9159,0.5496,0.92,0.6868,0.5631,0.6023,0.038
gbc,Gradient Boosting Classifier,0.7946,0.9127,0.5053,0.9202,0.6519,0.5232,0.5707,0.11
lda,Linear Discriminant Analysis,0.7637,0.8979,0.4144,0.9237,0.5717,0.4396,0.5071,0.01
ada,Ada Boost Classifier,0.6189,0.8974,0.0,0.0,0.0,0.0,0.0,0.052
lr,Logistic Regression,0.7845,0.8973,0.4768,0.9192,0.6275,0.4967,0.5498,0.054
nb,Naive Bayes,0.5691,0.8662,0.9444,0.4679,0.6257,0.2363,0.3255,0.01


In [14]:
best_model = compare_models(fold=5, sort='auc', cross_validation=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.8639,0.9246,0.8294,0.8192,0.8243,0.7133,0.7133,124.74
rf,Random Forest Classifier,0.8618,0.9224,0.8031,0.8319,0.8172,0.7062,0.7064,0.79
catboost,CatBoost Classifier,0.8575,0.9205,0.81,0.8179,0.8139,0.6985,0.6985,6.24
et,Extra Trees Classifier,0.865,0.9178,0.8031,0.8391,0.8207,0.7125,0.713,0.75
lightgbm,Light Gradient Boosting Machine,0.8527,0.9176,0.8017,0.8129,0.8073,0.6881,0.6881,12.22
gbc,Gradient Boosting Classifier,0.8388,0.9128,0.792,0.7898,0.7909,0.6598,0.6598,0.49
lr,Logistic Regression,0.826,0.8991,0.6865,0.8319,0.7523,0.6201,0.6269,6.18
lda,Linear Discriminant Analysis,0.8074,0.8984,0.5687,0.8913,0.6943,0.5635,0.5939,0.03
ada,Ada Boost Classifier,0.8335,0.8982,0.7601,0.7977,0.7784,0.6452,0.6457,0.19
nb,Naive Bayes,0.5678,0.8645,0.9431,0.4693,0.6267,0.2322,0.3209,0.0


In [15]:
best_model = compare_models(fold=5, sort='auc', include=['rf', 'xgboost', 'lightgbm', 'catboost'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8509,0.9221,0.7756,0.8231,0.7985,0.6804,0.6813,1.722
xgboost,Extreme Gradient Boosting,0.8518,0.921,0.7973,0.8108,0.8038,0.6847,0.6851,0.664
catboost,CatBoost Classifier,0.8435,0.9179,0.7771,0.8058,0.791,0.666,0.6664,6.596
lightgbm,Light Gradient Boosting Machine,0.8466,0.9159,0.7876,0.8059,0.7965,0.6735,0.6737,0.482


In [16]:
rf = create_model('rf', fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8498,0.9242,0.7603,0.832,0.7945,0.6765,0.6782
1,0.8312,0.91,0.7378,0.8041,0.7695,0.6367,0.6382
2,0.8612,0.9186,0.7857,0.8394,0.8117,0.702,0.7029
3,0.8484,0.9276,0.7782,0.815,0.7962,0.6755,0.676
4,0.8641,0.9299,0.8158,0.8251,0.8204,0.7111,0.7111
Mean,0.8509,0.9221,0.7756,0.8231,0.7985,0.6804,0.6813
Std,0.0116,0.0071,0.026,0.0124,0.0174,0.0259,0.0255


In [19]:
rf = create_model('rf', max_depth=5, n_estimators=300, fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8355,0.8978,0.7416,0.8115,0.775,0.6457,0.6473
1,0.8083,0.8849,0.7378,0.7548,0.7462,0.5922,0.5923
2,0.8541,0.8993,0.7594,0.8417,0.7984,0.6845,0.6868
3,0.8455,0.9057,0.7857,0.8038,0.7947,0.6708,0.671
4,0.8298,0.8982,0.7669,0.7816,0.7742,0.6376,0.6377
Mean,0.8346,0.8972,0.7583,0.7987,0.7777,0.6462,0.647
Std,0.0156,0.0068,0.0175,0.0292,0.0186,0.0318,0.0324


In [21]:
pull().loc[['Mean']]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mean,0.8346,0.8972,0.7583,0.7987,0.7777,0.6462,0.647


In [18]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [22]:
models = []
results = []

for max_depth in range(3, 21):
    model = create_model('rf', max_depth=max_depth, fold=5)
    model_results = pull().loc[['Mean']]
    models.append(model)
    results.append(model_results)

results_df = pd.concat(results, axis=0)
results_df.index = range(3, 21)
results_df.plot()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8526,0.9278,0.764,0.8361,0.7984,0.6827,0.6844
1,0.8226,0.9107,0.7416,0.7826,0.7615,0.6205,0.621
2,0.8598,0.9191,0.7782,0.8415,0.8086,0.6983,0.6996
3,0.8584,0.9279,0.797,0.8249,0.8107,0.6976,0.6979
4,0.8612,0.9294,0.812,0.8213,0.8166,0.705,0.705
Mean,0.8509,0.923,0.7786,0.8213,0.7992,0.6808,0.6816
Std,0.0145,0.0071,0.0247,0.0207,0.0197,0.031,0.031


In [29]:
rf_tuned = tune_model(rf, fold=5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8255,0.9088,0.7303,0.7959,0.7617,0.6244,0.6259
1,0.8083,0.8871,0.7416,0.7529,0.7472,0.5928,0.5928
2,0.8555,0.9058,0.7744,0.834,0.8031,0.6892,0.6904
3,0.8369,0.9173,0.7895,0.7836,0.7865,0.6546,0.6546
4,0.8255,0.9072,0.7782,0.7667,0.7724,0.6309,0.6309
Mean,0.8303,0.9052,0.7628,0.7866,0.7742,0.6384,0.6389
Std,0.0156,0.0099,0.0228,0.0279,0.0194,0.0322,0.0324


In [30]:
rf_tuned = tune_model(rf, fold=5, n_iter=50, optimize='AUC')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8369,0.922,0.8352,0.7611,0.7964,0.6609,0.6629
1,0.8112,0.898,0.8165,0.7243,0.7676,0.6095,0.6126
2,0.8455,0.9137,0.8421,0.7724,0.8058,0.6779,0.6796
3,0.8326,0.9193,0.8459,0.7475,0.7937,0.6538,0.6573
4,0.8283,0.9166,0.8684,0.731,0.7938,0.6486,0.6557
Mean,0.8309,0.9139,0.8416,0.7473,0.7915,0.6501,0.6536
Std,0.0114,0.0084,0.0168,0.018,0.0127,0.0226,0.0222


In [31]:
params = {
    'max_depth': [3, 5, 7, 11, 13, 15, 17, 21, 23, 25],
    'n_estimators': [100, 200, 300, 400, 500, 1000]
}

rf_tuned = tune_model(rf, fold=5, n_iter=50, optimize='AUC', custom_grid=params)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8455,0.9321,0.7528,0.8272,0.7882,0.667,0.6689
1,0.8212,0.912,0.7453,0.7773,0.761,0.6182,0.6186
2,0.8598,0.9192,0.782,0.8387,0.8093,0.6987,0.6998
3,0.8526,0.9295,0.797,0.8123,0.8046,0.6863,0.6864
4,0.8627,0.9306,0.8271,0.8148,0.8209,0.7095,0.7096
Mean,0.8484,0.9247,0.7808,0.8141,0.7968,0.676,0.6766
Std,0.0148,0.0078,0.0298,0.0206,0.0208,0.0321,0.0321


# Avaliando a performance do modelo