## Separa os dados de treino e teste, treina o modelo e faz a classificação com `predict_proba`

In [2]:
# importa bibliotecas
import os
import warnings

import pandas as pd
from joblib import dump
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
# desativa exibição de warnings
warnings.filterwarnings('ignore') 

In [4]:
# load the cleaned and pre-processed sales data
dfvendas = pd.read_csv(filepath_or_buffer='arquivos_tratados/dfvendas.csv')

In [5]:
# preview the first three rows
dfvendas.iloc[:3]

Unnamed: 0,order_id,customer_id,payment_type,payment_installments,payment_value,price_item,tipo_pagto_nro
0,00010242fe8c5a6d1ba2dd792cb16214,3ce436f183e68e07877b285a838db11a,credit_card,2,72.19,58.9,0
1,00018f77f2f0320c557190d7a144bdd3,f6dd3ec061db4e3987629fe6b26e5cce,credit_card,3,259.83,239.9,0
2,000229ec398224ef6ca0657da4fc703e,6489ae5e4333f3693df5ad4372dab6d3,credit_card,5,216.87,199.0,0


In [6]:
# split the frame into predictors (X) and target (y); both stay DataFrames
X = dfvendas.loc[:, ['payment_installments', 'payment_value', 'price_item']]
y = dfvendas.loc[:, ['tipo_pagto_nro']]

In [7]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
# NOTE(review): the split is not stratified — consider stratify=y if class balance matters
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# report the shape of each subset produced by the split (train first, then test)
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

(84160, 3)
(84160, 1)
(21040, 3)
(21040, 1)


In [9]:
# train the model
# y_train is a single-column DataFrame (shape (n, 1)), while scikit-learn expects
# a 1-D target; passing the column vector makes fit() emit a DataConversionWarning
# (invisible here because warnings are silenced) and flatten it implicitly.
# Flatten it explicitly so the call is correct on its own terms.
# NOTE(review): max_iter is left at its default and the features are unscaled;
# with warnings silenced a ConvergenceWarning would go unnoticed — confirm the
# solver actually converges.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train.values.ravel())

In [10]:
# score the held-out set: hard class labels and per-class probabilities
y_pred = clf.predict(X=X_test)
y_pred_proba = clf.predict_proba(X=X_test)

In [11]:
# display the class labels predicted for X_test
y_pred

array([0, 2, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
# build the confusion matrix (true labels vs. predicted labels) and display it
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[15186,   362,   100,     0],
       [ 3447,   582,     0,     0],
       [  806,    32,   176,     0],
       [  307,    42,     0,     0]], dtype=int64)

In [21]:
# report accuracy plus class-weighted precision, recall and F1 on the test set
# each entry: (label printed verbatim, scoring function, extra keyword arguments)
relatorio = (
    ('acurácia =', metrics.accuracy_score, {}),
    ('precisão =', metrics.precision_score, {'average': 'weighted'}),
    ('recall   =', metrics.recall_score, {'average': 'weighted'}),
    ('f1-score   =', metrics.f1_score, {'average': 'weighted'}),
)
for rotulo, metrica, opcoes in relatorio:
    print(rotulo, metrica(y_test, y_pred, **opcoes))

acurácia = 0.7577946768060837
precisão = 0.7121857417422233
recall   = 0.7577946768060837
f1-score   = 0.6955149421129944


In [14]:
# print the probabilities predicted for X_test (one row per sample, one column per class)
print(y_pred_proba)

[[9.99995334e-01 7.48632006e-07 3.91015600e-06 7.13209051e-09]
 [3.33932236e-01 1.43226087e-01 5.15298556e-01 7.54312170e-03]
 [4.79940642e-01 4.80075534e-01 1.41827772e-02 2.58010468e-02]
 ...
 [1.00000000e+00 7.31681972e-11 1.26213220e-11 2.09156951e-13]
 [5.42070015e-01 3.35602551e-01 1.05607423e-01 1.67200110e-02]
 [9.67447984e-01 2.05585789e-02 1.13091869e-02 6.84250350e-04]]


In [15]:
# wrap the probability array in a DataFrame
dfproba = pd.DataFrame(data=y_pred_proba)

In [16]:
# label each probability column with its payment type
# NOTE(review): assumes the column order matches the encoding of tipo_pagto_nro
# (0=credit_card, 1=boleto, 2=voucher, 3=debit_card) — confirm against the encoding step
dfproba.columns = [
    'perc_' + tipo
    for tipo in ('credit_card', 'boleto', 'voucher', 'debit_card')
]

In [17]:
# convert the probabilities to percentages rounded to two decimal places
# NOTE: re-running this cell multiplies by 100 again — run it only once per kernel session
dfproba = (dfproba * 100).round(2)

In [18]:
# add a row total to check that the four percentages sum to (about) 100%
dfproba['soma_perc'] = (
    dfproba['perc_credit_card']
    + dfproba['perc_boleto']
    + dfproba['perc_voucher']
    + dfproba['perc_debit_card']
)

In [19]:
# display the dataframe to inspect the percentages and their row totals
dfproba

Unnamed: 0,perc_credit_card,perc_boleto,perc_voucher,perc_debit_card,soma_perc
0,100.00,0.00,0.00,0.00,100.00
1,33.39,14.32,51.53,0.75,99.99
2,47.99,48.01,1.42,2.58,100.00
3,53.67,37.08,7.34,1.91,100.00
4,100.00,0.00,0.00,0.00,100.00
...,...,...,...,...,...
21035,53.73,33.75,10.81,1.71,100.00
21036,99.84,0.15,0.00,0.00,99.99
21037,100.00,0.00,0.00,0.00,100.00
21038,54.21,33.56,10.56,1.67,100.00


## Faz o Deploy do Modelo

In [20]:
# persist the trained classifier to disk
# (`os` and `dump` come from the notebook's import cell, keeping all imports together)
# create the target folder first: dump() does not create missing directories,
# so the cell would fail on a fresh checkout without it
os.makedirs('modelos', exist_ok=True)

dump(clf, 'modelos/classifica_proba.joblib')

['modelos/classifica_proba.joblib']