## Separa os dados de treino e teste, treina o modelo e faz a classificação predict_proba

In [116]:
# importa bibliotecas
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', None) # permite exibição de todas as linhas
pd.set_option('display.max_columns', None) # permite exibição de todas as colunas
import warnings
warnings.simplefilter('ignore') # desativa avisos de warning

In [117]:
# carrega arquivos limpos e tratados
dfx = pd.read_csv('arquivos_tratados/df_winequality-red.csv')

In [118]:
# verifica primeiras linhas
dfx.head(3)

Unnamed: 0,volatile acidity,citric acid,density,sulphates,alcohol,quality
0,0.7,0.0,0.9978,0.56,9.4,5.0
1,0.88,0.0,0.9968,0.68,9.8,5.0
2,0.76,0.04,0.997,0.65,9.8,5.0


In [119]:
# verifica os códigos de quality
dfx['quality'].value_counts()

quality
6.0    365
5.0    347
7.0     93
4.0     28
Name: count, dtype: int64

In [120]:
# separa variáveis independentes e dependentes
X = dfx[['volatile acidity', 'citric acid', 'density', 'sulphates', 'alcohol']]
y = dfx[['quality']]

In [121]:
# separa os sets de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [122]:
# verifica o tamanho de cada set resultante do split
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(666, 5)
(666, 1)
(167, 5)
(167, 1)


In [123]:
# treina o modelo
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

In [124]:
# prediz y com predict e predict_proba
y_pred_proba = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

In [125]:
# visualiza a predição de y de X_test
y_pred

array([5., 6., 6., 6., 5., 6., 6., 5., 5., 5., 7., 5., 5., 6., 6., 6., 5.,
       6., 6., 6., 5., 6., 5., 6., 5., 6., 5., 5., 6., 5., 5., 5., 5., 6.,
       6., 5., 5., 6., 5., 6., 5., 5., 5., 5., 6., 5., 6., 6., 6., 6., 6.,
       6., 6., 6., 6., 7., 5., 6., 6., 5., 6., 5., 6., 5., 6., 6., 5., 5.,
       6., 5., 6., 5., 5., 6., 5., 6., 5., 6., 5., 5., 6., 6., 6., 5., 5.,
       6., 6., 6., 5., 5., 6., 6., 5., 6., 5., 6., 5., 6., 6., 5., 6., 6.,
       6., 6., 5., 5., 5., 5., 6., 5., 5., 6., 5., 6., 5., 6., 5., 5., 6.,
       5., 5., 5., 6., 5., 6., 5., 5., 5., 5., 5., 6., 5., 5., 5., 5., 5.,
       5., 6., 6., 5., 5., 6., 6., 6., 6., 6., 6., 5., 6., 7., 5., 5., 5.,
       5., 5., 5., 6., 6., 5., 6., 6., 6., 5., 5., 5., 6., 6.])

In [126]:
# gera a matriz de confusão para análise
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[ 0,  4,  2,  0],
       [ 0, 53, 16,  0],
       [ 0, 24, 53,  2],
       [ 0,  3,  9,  1]], dtype=int64)

In [127]:
# verifica a acurácia, precisão e recall do resultado
print('acurácia =', metrics.accuracy_score(y_test, y_pred))
print('precisão =', metrics.precision_score(y_test, y_pred, average='weighted'))
print('recall   =', metrics.recall_score(y_test, y_pred, average='weighted'))
print('f1-score   =', metrics.f1_score(y_test, y_pred, average='weighted'))

acurácia = 0.6407185628742516
precisão = 0.6000392072996863
recall   = 0.6407185628742516
f1-score   = 0.6113508277562522


In [128]:
# visualiza a predição das probabilidades de y
print(y_pred_proba)

[[0.03225241 0.59341721 0.35374959 0.0205808 ]
 [0.02876333 0.23202296 0.60059714 0.13861657]
 [0.02631348 0.44692586 0.46813387 0.05862678]
 [0.0112249  0.12700564 0.63490084 0.22686863]
 [0.04407465 0.64532277 0.2966714  0.01393118]
 [0.03156358 0.24510351 0.60179828 0.12153463]
 [0.0129152  0.14867681 0.53803956 0.30036843]
 [0.03778531 0.51570238 0.41623725 0.03027506]
 [0.04194505 0.56414385 0.36264209 0.03126902]
 [0.03700149 0.69976823 0.24593366 0.01729662]
 [0.00315614 0.02043854 0.40597298 0.57043234]
 [0.05623715 0.7216331  0.20524194 0.01688781]
 [0.03680767 0.60954432 0.33759508 0.01605293]
 [0.02241041 0.27305522 0.54703407 0.15750031]
 [0.03414431 0.4187798  0.49024553 0.05683036]
 [0.01818863 0.17934541 0.60585739 0.19660856]
 [0.04805749 0.61820663 0.29486525 0.03887063]
 [0.0541318  0.34113107 0.49489969 0.10983745]
 [0.02761408 0.25192668 0.6210194  0.09943985]
 [0.01208572 0.11647283 0.52129308 0.35014837]
 [0.02767438 0.70589922 0.24882612 0.01760028]
 [0.02276243 

In [129]:
# transforma array em dataframe
dfproba = pd.DataFrame(y_pred_proba)

In [130]:
# renomeia as colunas
dfproba.columns = ['p6', 'p5', 'p7', 'p4']

In [131]:
# arredonda os resultados com duas casas decimai
dfproba = round(dfproba*100, 2)

In [132]:
# soma o total de percentuais para checar se fecha em 100%
dfproba['soma_perc'] = dfproba.p6 + dfproba.p5 + dfproba.p7 + dfproba.p4

In [133]:
# exibe o dataframe para checar os percentuais
dfproba

Unnamed: 0,p6,p5,p7,p4,soma_perc
0,3.23,59.34,35.37,2.06,100.0
1,2.88,23.2,60.06,13.86,100.0
2,2.63,44.69,46.81,5.86,99.99
3,1.12,12.7,63.49,22.69,100.0
4,4.41,64.53,29.67,1.39,100.0
5,3.16,24.51,60.18,12.15,100.0
6,1.29,14.87,53.8,30.04,100.0
7,3.78,51.57,41.62,3.03,100.0
8,4.19,56.41,36.26,3.13,99.99
9,3.7,69.98,24.59,1.73,100.0


## Faz o Deploy do Modelo

In [134]:
# faz o deploy do modelo
from joblib import dump

dump(clf, 'modelos/classifica_proba_wine.joblib')

['modelos/classifica_proba_wine.joblib']