In [152]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error, f1_score

from joblib import dump, load
from hyperopt import hp, fmin, tpe

In [153]:
df_x = pd.read_csv('../datasets/winequality_white_x_train.csv')
df_y = pd.read_csv('../datasets/winequality_white_y_train.csv')

df = pd.concat([df_x, df_y], axis=1)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,3,4,5,6,7,8,9
0,0.173077,0.196078,0.240964,0.084356,0.074184,0.118467,0.252900,0.083478,0.445455,0.244186,0.661290,0,0,0,0,1,0,0
1,0.432692,0.323529,0.307229,0.021472,0.109792,0.031359,0.459397,0.113553,0.272727,0.383721,0.548387,0,0,1,0,0,0,0
2,0.144231,0.225490,0.228916,0.151840,0.065282,0.177700,0.303944,0.117602,0.563636,0.279070,0.596774,0,0,0,1,0,0,0
3,0.307692,0.274510,0.180723,0.067485,0.091988,0.132404,0.310905,0.089069,0.554545,0.232558,0.677419,0,0,0,0,1,0,0
4,0.259615,0.098039,0.156627,0.012270,0.094955,0.132404,0.306265,0.134760,0.563636,0.581395,0.241935,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3276,0.230769,0.127451,0.313253,0.090491,0.112760,0.090592,0.264501,0.136302,0.454545,0.313953,0.306452,0,0,0,1,0,0,0
3277,0.307692,0.058824,0.192771,0.128834,0.089021,0.181185,0.306265,0.163678,0.454545,0.244186,0.225806,0,0,0,1,0,0,0
3278,0.365385,0.186275,0.313253,0.039877,0.100890,0.090592,0.331787,0.080586,0.272727,0.360465,0.548387,0,0,0,1,0,0,0
3279,0.240385,0.156863,0.174699,0.200920,0.077151,0.177700,0.290023,0.165028,0.409091,0.186047,0.419355,0,0,0,1,0,0,0


In [174]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)

In [375]:
x = x_train.values
y = y_train.values

def f(params):
    model = MLPClassifier(
        random_state=1,
        max_iter=2000,
        alpha=params['alpha'],
        learning_rate_init=params['learning_rate'],
        # hidden_layer_sizes=(150, 100, 50),
        hidden_layer_sizes=(20, 30, 30, 20),
        activation='relu',
        learning_rate='adaptive'
    )
    
    model.fit(x, y)
    preds = model.predict(x_test.values)
    score = mean_squared_error(preds, y_test.values)
    
    return score

In [376]:
params = {
    'alpha': hp.uniform('alpha', 0.001, 0.01),
    'learning_rate': hp.uniform('learning_rate', 0.0001, 0.001)
}

best_params = fmin(fn=f, space=params, max_evals=100, algo=tpe.suggest)

best_params

100%|██████████| 100/100 [06:30<00:00,  3.90s/trial, best loss: 0.11611219830397913]


{'alpha': 0.005132623589898519, 'learning_rate': 0.0006623398732465599}

In [377]:
clf = MLPClassifier(random_state=1,
        max_iter=2000,
        alpha=best_params['alpha'],
        learning_rate_init=best_params['learning_rate'],
        # hidden_layer_sizes=(150, 100, 50),
        hidden_layer_sizes=(20, 30, 30, 20),
        activation='relu',
        learning_rate='adaptive'
        )
clf.fit(x,y)

In [483]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
# clf = RandomForestClassifier(n_estimators=1000, n_jobs=200, random_state=1)
clf.fit(x, y)

In [484]:
preds = clf.predict(x_test.values)

In [485]:
accuracy_score(preds, y_test.values)

0.5722983257229832

In [486]:
f1 = f1_score(preds, y_test.values, average=None)
np.mean(f1)

0.33052552036852356

In [487]:
matrix = multilabel_confusion_matrix(preds, y_test.values)
matrix

array([[[655,   2],
        [  0,   0]],

       [[618,  18],
        [ 14,   7]],

       [[400,  95],
        [ 59, 103]],

       [[223, 106],
        [130, 198]],

       [[491,  41],
        [ 63,  62]],

       [[618,  18],
        [ 15,   6]],

       [[656,   1],
        [  0,   0]]], dtype=int64)

In [488]:
print(classification_report(preds, y_test.values))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.28      0.33      0.30        21
           2       0.52      0.64      0.57       162
           3       0.65      0.60      0.63       328
           4       0.60      0.50      0.54       125
           5       0.25      0.29      0.27        21
           6       0.00      0.00      0.00         0

   micro avg       0.57      0.57      0.57       657
   macro avg       0.33      0.34      0.33       657
weighted avg       0.58      0.57      0.58       657
 samples avg       0.57      0.57      0.57       657



  _warn_prf(average, modifier, msg_start, len(result))


In [489]:
# acc - 0.58143
# f1 - 0.32944

# dump(clf, '../models/clf_white_wine.joblib')