## Separa os dados de treino e teste, treina o modelo e faz a classificação com predict_proba

In [39]:
# Imports: standard library first, then third-party packages.
import warnings

import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides any warning scikit-learn raises in later cells — confirm that is intended.
warnings.simplefilter('ignore')

# Lift pandas' display limits so frames are rendered with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [40]:
# Load the cleaned and preprocessed data file into a DataFrame.
dfx = pd.read_csv(filepath_or_buffer='arquivos_tratados/df_winequality-red.csv')

In [41]:
# Peek at the first three rows to confirm the columns loaded as expected.
dfx.head(n=3)

Unnamed: 0,volatile acidity,citric acid,density,sulphates,alcohol,quality
0,0.7,0.0,0.9978,0.56,9.4,5.0
1,0.88,0.0,0.9968,0.68,9.8,5.0
2,0.76,0.04,0.997,0.65,9.8,5.0


In [42]:
# Count how many rows fall into each quality code (the target classes).
dfx.quality.value_counts()

quality
5.0    434
6.0    423
7.0    115
4.0     33
Name: count, dtype: int64

In [43]:
# Split the frame into predictors (X) and target (y).
X = dfx.loc[:, ['volatile acidity', 'citric acid', 'density', 'sulphates', 'alcohol']]
y = dfx.loc[:, ['quality']]  # list selector: y stays a one-column DataFrame

In [44]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified although the quality classes are
# imbalanced (see the value counts above) — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [45]:
# Print the shape of each split, one per line: X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(804, 5)
(804, 1)
(201, 5)
(201, 1)


In [46]:
# Train the classifier.
# y_train is a single-column DataFrame (its shape is printed as (804, 1) above);
# flatten it to the 1-D label vector scikit-learn estimators expect, so fit() does
# not have to coerce a column vector (it warns about that, and warnings are muted
# in this notebook). The fitted model is the same.
# NOTE(review): default solver settings are used on unscaled features; convergence
# cannot be judged from here because warnings are suppressed — confirm.
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train.to_numpy().ravel())

In [47]:
# Predict on the test set: hard class labels first, then per-class probabilities.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)

In [48]:
# Display the predicted class labels for X_test.
y_pred

array([6., 6., 6., 6., 5., 7., 5., 6., 5., 5., 6., 5., 5., 5., 5., 5., 6.,
       6., 6., 5., 6., 6., 6., 5., 5., 6., 5., 6., 5., 6., 5., 6., 6., 5.,
       5., 6., 6., 5., 5., 6., 6., 5., 5., 6., 5., 6., 5., 6., 6., 6., 6.,
       6., 5., 5., 6., 6., 7., 5., 5., 5., 6., 5., 6., 5., 5., 6., 5., 5.,
       5., 6., 5., 6., 6., 6., 6., 6., 5., 6., 5., 7., 6., 5., 5., 6., 5.,
       6., 6., 6., 5., 6., 6., 6., 5., 5., 6., 5., 6., 5., 5., 5., 6., 6.,
       6., 5., 5., 6., 6., 6., 6., 6., 5., 5., 6., 6., 7., 6., 6., 5., 7.,
       6., 6., 5., 5., 5., 6., 7., 5., 6., 5., 6., 6., 6., 5., 6., 6., 6.,
       5., 5., 5., 6., 6., 5., 5., 5., 5., 5., 5., 5., 5., 6., 6., 6., 5.,
       5., 7., 5., 5., 6., 6., 5., 6., 5., 5., 6., 6., 5., 6., 5., 5., 5.,
       6., 5., 6., 6., 5., 5., 6., 6., 6., 5., 6., 5., 5., 6., 5., 5., 6.,
       5., 5., 6., 6., 6., 6., 6., 5., 6., 6., 5., 5., 7., 5.])

In [49]:
# Build the confusion matrix of true vs. predicted classes for inspection.
# Reading the output below: 123 hits on the main diagonal against 78 misses in the
# remaining cells, out of 201 test samples — 61.19% correct.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

array([[ 0,  4,  1,  0],
       [ 0, 63, 26,  0],
       [ 0, 27, 54,  2],
       [ 0,  0, 18,  6]], dtype=int64)

In [67]:
# Report accuracy plus support-weighted precision, recall and F1 on the test set.
# The classifier never predicts the first class (its confusion-matrix column is all
# zeros), so that class's precision and F1 are 0/0. zero_division=0 states explicitly
# that such terms count as 0 — the same value scikit-learn's default falls back to,
# but without relying on a suppressed UndefinedMetricWarning.
print('acurácia =', metrics.accuracy_score(y_test, y_pred))
print('precisão =', metrics.precision_score(y_test, y_pred, average='weighted', zero_division=0))
print('recall   =', metrics.recall_score(y_test, y_pred, average='weighted'))
print('f1-score   =', metrics.f1_score(y_test, y_pred, average='weighted', zero_division=0))

acurácia = 0.6119402985074627
precisão = 0.6115505643927365
recall   = 0.6119402985074627
f1-score   = 0.5946837600862558


In [51]:
# Print the predicted class probabilities for X_test (one row per sample, one column per class).
print(y_pred_proba)

[[0.03747655 0.21617846 0.59348292 0.15286207]
 [0.0351363  0.40447451 0.49392746 0.06646172]
 [0.02094403 0.11764522 0.58431013 0.27710063]
 [0.01327838 0.12870961 0.58766676 0.27034524]
 [0.05377915 0.77935571 0.16163257 0.00523257]
 [0.00518763 0.02118824 0.39775795 0.57586617]
 [0.04516373 0.70759295 0.2339974  0.01324592]
 [0.01679473 0.10374138 0.59960635 0.27985754]
 [0.03665431 0.54965925 0.37025205 0.04343438]
 [0.04407808 0.52866048 0.38700211 0.04025933]
 [0.01022099 0.06096444 0.5995153  0.32929928]
 [0.04417537 0.54935056 0.37749403 0.02898004]
 [0.03597652 0.60524365 0.33784593 0.0209339 ]
 [0.04529877 0.69034736 0.25117887 0.013175  ]
 [0.05336008 0.76576958 0.17029872 0.01057162]
 [0.03042832 0.68633068 0.26901825 0.01422274]
 [0.02277801 0.1277259  0.56632614 0.28316995]
 [0.03068297 0.20090916 0.61870743 0.14970044]
 [0.02717915 0.15912095 0.57249364 0.24120627]
 [0.06899676 0.73659501 0.1868686  0.00753963]
 [0.03380764 0.37336012 0.52092762 0.07190462]
 [0.03469218 

In [52]:
# Wrap the probability array in a DataFrame so it can be labelled and inspected.
dfproba = pd.DataFrame(data=y_pred_proba)

In [53]:
# Name each probability column after the class it belongs to.
# predict_proba orders its columns like clf.classes_, so deriving the names from
# that attribute (4.0 -> 'c4', 5.0 -> 'c5', ...) keeps labels and columns aligned
# instead of relying on a hand-written list.
dfproba.columns = [f'c{int(label)}' for label in clf.classes_]

In [54]:
# Convert the probabilities to percentages rounded to two decimal places.
# The frame is rebuilt from y_pred_proba (reusing the class-column names already set
# on dfproba) rather than from dfproba itself, so running this cell more than once
# does not multiply the values by 100 again.
dfproba = (
    pd.DataFrame(y_pred_proba, columns=dfproba.columns[:y_pred_proba.shape[1]])
    .mul(100)
    .round(2)
)

In [55]:
# Add up the four class percentages per row as a sanity check against 100%.
# The addends are already rounded to two decimals, so a total can be off by 0.01
# (e.g. 100.01 or 99.99).
dfproba['soma_perc'] = (
    dfproba['c4']
    + dfproba['c5']
    + dfproba['c6']
    + dfproba['c7']
)

In [56]:
# Display the DataFrame to check the per-class percentages and their row totals.
dfproba

Unnamed: 0,c4,c5,c6,c7,soma_perc
0,3.75,21.62,59.35,15.29,100.01
1,3.51,40.45,49.39,6.65,100.0
2,2.09,11.76,58.43,27.71,99.99
3,1.33,12.87,58.77,27.03,100.0
4,5.38,77.94,16.16,0.52,100.0
5,0.52,2.12,39.78,57.59,100.01
6,4.52,70.76,23.4,1.32,100.0
7,1.68,10.37,59.96,27.99,100.0
8,3.67,54.97,37.03,4.34,100.01
9,4.41,52.87,38.7,4.03,100.01


## Faz o Deploy do Modelo

In [57]:
# Persist the trained classifier to disk so it can be loaded elsewhere.
import os

from joblib import dump

MODEL_PATH = 'modelos/classifica_proba_wine.joblib'

# dump() does not create missing folders; make sure the target directory exists
# so this cell also works on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# Kept as the last expression so the cell still shows the list of written files.
dump(clf, MODEL_PATH)

['modelos/classifica_proba_wine.joblib']