In [37]:
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore') 

# Preprocessing

In [38]:
x_df = pd.read_csv('../datasets/winequality_red_x_train.csv')
y_df = pd.read_csv('../datasets/winequality_red_y_train.csv')

In [39]:
#split training and testing set 
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

In [40]:
y_train.value_counts()

3  4  5  6  7  8
0  0  1  0  0  0    360
      0  1  0  0    349
         0  1  0    101
   1  0  0  0  0     28
   0  0  0  0  1     11
1  0  0  0  0  0      7
dtype: int64

### SMOTE
Model with smote performed worse.

In [None]:
#smote
sm = SMOTE(random_state=0)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())

In [None]:
y_train_res = pd.DataFrame(y_train_res)
y_train_res.value_counts()

# Model Selection

In [41]:
#note: commented out models cannot handle multiple columns in y_df
models = [
    # LogisticRegression(random_state=0), 
    MLPClassifier(random_state=0), 
    KNeighborsClassifier(), 
    # SVC(random_state=0),
    # GaussianProcessClassifier(random_state=0), 
    # QuadraticDiscriminantAnalysis(), 
    DecisionTreeClassifier(random_state=0), 
    RandomForestClassifier(random_state=0), 
    # AdaBoostClassifier(random_state=0), 
    # GaussianNB()
] 

In [42]:
for model in models:
    try:
        print('\n' + str(model))
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        print(f1_score(y_test, y_pred, average="macro"))
    except Exception as e:
        print(e)


MLPClassifier(random_state=0)
0.21342379914928933

KNeighborsClassifier()
0.24660496068368562

DecisionTreeClassifier(random_state=0)
0.3188712777041624

RandomForestClassifier(random_state=0)
0.3051047120418848


# Feature Engineering

From Feature Relevance ntbk, remove columns `["citric acid", "density", "free sulfur dioxide"]`

In [43]:
drop_features = ["citric acid", "density", "free sulfur dioxide"]

x_df_0 = x_df.drop(drop_features, axis=1)

x_df_0

Unnamed: 0,fixed acidity,volatile acidity,residual sugar,chlorides,total sulfur dioxide,pH,sulphates,alcohol
0,0.690265,0.157534,0.116438,0.111853,0.222615,0.299213,0.251497,0.307692
1,0.185841,0.431507,0.102740,0.110184,0.091873,0.637795,0.167665,0.676923
2,0.176991,0.349315,0.232877,0.135225,0.252650,0.362205,0.071856,0.169231
3,0.566372,0.123288,0.082192,0.070117,0.045936,0.448819,0.329341,0.323077
4,0.336283,0.428082,0.068493,0.130217,0.201413,0.354331,0.293413,0.184615
...,...,...,...,...,...,...,...,...
1066,0.398230,0.328767,0.068493,0.076795,0.014134,0.346457,0.179641,0.307692
1067,0.318584,0.352740,0.082192,0.101836,0.190813,0.433071,0.251497,0.384615
1068,0.230088,0.342466,0.123288,0.108514,0.279152,0.606299,0.125749,0.169231
1069,0.292035,0.054795,0.054795,0.070117,0.031802,0.456693,0.281437,0.538462


In [44]:
x_train_0, x_test_0, y_train_0, y_test_0 = train_test_split(x_df_0, y_df, test_size=0.2, random_state=0)

#checking with decision tree
model_01 = DecisionTreeClassifier(random_state=0)

model_01.fit(x_train_0, y_train_0)
pred_01 = model_01.predict(x_test_0)
print('DecisionTreeClassifier: {}'.format(f1_score(y_test_0, pred_01, average="macro")))

#checking with random forest
model_02 = RandomForestClassifier(random_state=0)

model_02.fit(x_train_0, y_train_0)
pred_02 = model_02.predict(x_test_0)
print('RandomForestClassifier: {}'.format(f1_score(y_test_0, pred_02, average="macro")))

DecisionTreeClassifier: 0.35668385633823424
RandomForestClassifier: 0.2939881114748136


In [45]:
dump(model_01, '../models/clf_red_wine.joblib')

['../models/clf_red_wine.joblib']

In [26]:
x_train = x_train_0
y_train = y_train_0
x_test = x_test_0
y_test = y_test_0

# Hyperparameter Tuning

DecisionTree hyperparameter tuning. Hypertuned models did not result to a better f1 score.

### Using hyperopt

In [30]:
#define DecisionTreeClassifier Hyperparameters
space = {'max_depth': hp.choice('max_depth', range(1,10)),
         'criterion': hp.choice('criterion', ["gini","entropy","log_loss"]),
         'min_samples_split': hp.choice('min_samples_split', range(1,10)),
         'min_samples_leaf': hp.choice('min_samples_leaf', range(1,10)),
         'splitter': hp.choice('splitter', ["best", "random"]),
        }

#define target function
def objective(params):

#create model instance with params
    model = DecisionTreeClassifier(**params)

#train model
    model.fit(x_train, y_train)

#evaluate and return model score

    scores = cross_val_score(model, x_train, y_train, cv=5, scoring='f1_macro')
    
    # Extract the best score
    best_score = max(scores)

    # Loss must be minimized
    loss = 1 - best_score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

#minimize target function with hyperopt
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=200)

best

100%|██████████| 200/200 [00:10<00:00, 18.37trial/s, best loss: 0.5731131516447259]


{'criterion': 2,
 'max_depth': 7,
 'min_samples_leaf': 0,
 'min_samples_split': 1,
 'splitter': 1}

In [31]:
model = DecisionTreeClassifier(random_state=0, criterion="log_loss", max_depth=8, splitter="random")

model.fit(x_train, y_train)
pred = model.predict(x_test)
print('DecisionTreeClassifier: {}'.format(f1_score(y_test, pred, average="macro")))


DecisionTreeClassifier: 0.25896172272358636


### Using GridSearch

In [34]:
model = DecisionTreeClassifier(random_state=0)

param_grid = {'max_depth': range(1,10),
         'criterion':  ["gini","entropy","log_loss"],
         'min_samples_split': range(1,10),
         'min_samples_leaf': range(1,5),
         'splitter':["best", "random"],
        }

clf = GridSearchCV(model, param_grid = param_grid, verbose=1)
best_clf = clf.fit(x_train, y_train)

Fitting 5 folds for each of 1944 candidates, totalling 9720 fits


In [35]:
print(best_clf.best_params_)

{'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 1, 'min_samples_split': 1, 'splitter': 'best'}


In [36]:
model = DecisionTreeClassifier(random_state=0, criterion="gini", max_depth=9, splitter="best",min_samples_leaf= 1, min_samples_split= 1)

model.fit(x_train, y_train)
pred = model.predict(x_test)
print('DecisionTreeClassifier: {}'.format(f1_score(y_test, pred, average="macro")))

DecisionTreeClassifier: 0.31611161107093394


# SKIPPED PARTS

### Hyperparameter Tuning (skip)

For hyperparameter tuning of original randomforestmodel

In [None]:
model = RandomForestClassifier()
param_grid = {   
    # 'n_estimators' : [25, 50, 75, 100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy', 'log_loss'],
    # 'max_depth': [20, 40, 60, 80, 100, None],
    'bootstrap': [True, False],
    # 'min_samples_split' : [2, 3, 4, 5],
    # 'min_samples_leaf' : [1, 2, 3, 4, 5],
    'max_features' : ['sqrt', 'log2', 'auto', None],
    }



clf = GridSearchCV(model, param_grid = param_grid, refit=True, verbose=3)
best_clf = clf.fit(x_train, y_train)

In [13]:
print(best_clf.best_params_)

{'bootstrap': True, 'criterion': 'gini', 'max_features': 'sqrt'}


In [None]:
y_pred = best_clf.predict(x_test)

print(classification_report(y_test, y_pred))

In [None]:
model = RandomForestClassifier()
param_grid = [{   
    'n_estimators' : [25, 50, 75, 100, 125, 150, 175, 200],
    'max_depth': [20, 40, 60, 80, 100, None],
    'min_samples_split' : [2, 3, 4, 5],
    'min_samples_leaf' : [1, 2, 3, 4, 5],
    }
]


clf = GridSearchCV(model, param_grid = param_grid, refit=True, verbose=3)
best_clf = clf.fit(x_train, y_train)

In [16]:
print(best_clf.best_params_)

{'max_depth': 60, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}


In [None]:
y_pred = best_clf.predict(x_test)

print(classification_report(y_test, y_pred))

### Test for multiple columns in y_df (skipped)

Already proven that decision trees can handle multiple columns in y_df

In [3]:
#split training and testing set 
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=0)

In [None]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\n\n\n' + str(model))
print(classification_report(y_test, y_pred))

In [None]:
#with results from hyperparameter tuning
model = RandomForestClassifier(max_depth=60, min_samples_leaf=1, min_samples_split=2, n_estimators=150)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\n\n\n' + str(model))
print(classification_report(y_test, y_pred))

In [22]:
dump(model, '../models/clf_red_wine.joblib')

['../models/clf_red_wine.joblib']

# from aldrin

In [None]:
from hyperopt import fmin, tpe, hp, Trials
from sklearn.ensemble import RandomForestClassifier

#define RandomForestClassifier Hyperparameters
space = {'max_depth': hp.choice('max_depth', range(1,100)),
         'criterion': hp.choice('criterion', ["gini","entropy","log_loss"]),
         'n_estimators': hp.choice('n_estimators', range(1,100))
        }

#define target function
def objective(params):

#create model instance with params
    model = RandomForestClassifier(**params)

#train model
    model.fit(X_train, y_train)

#evaluate and return model score

    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    
    # Extract the best score
    best_score = max(scores)

    # Loss must be minimized
    loss = 1 - best_score

    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'status': STATUS_OK}

#minimize target function with hyperopt
best = fmin(objective,
            space,
            algo=tpe.suggest,
            max_evals=200)

best