In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix, f1_score

import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [157]:
# Read the white- and red-wine tables from the transformed_wine_data/ folder.
white_wine_trans, red_wine_trans = (
    pd.read_csv(csv_path)
    for csv_path in (
        'transformed_wine_data/white_wine_trans.csv',
        'transformed_wine_data/red_wine_trans.csv',
    )
)


<h3> Logistic Regression Approach </h3>

<h3> White wine  </h3>

In [158]:
# Feature matrix: every column except the target label and the 'quality' / 'type' columns.
X_white = white_wine_trans.drop(columns=['quality_label', 'quality', 'type'])

In [159]:
# Classification target: the 'quality_label' column.
y_white = white_wine_trans['quality_label']

In [160]:
# Hold out 30% of the white-wine rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_white,
    y_white,
    test_size=0.3,
    random_state=101,
)

In [161]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

<h5> Try on default settings first </h5>

In [162]:
# Baseline model with default regularisation (L2 penalty, C = 1).
# multi_class="ovr" trains one binary classifier per label (one-vs-rest) and
# predicts the label whose classifier scores highest.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
).fit(scaled_X_train, y_train)

y_pred = log_model.predict(scaled_X_test)

In [163]:
# Share of test rows whose predicted label equals the true label.
accuracy_score(y_test,y_pred)

0.5319727891156463

In [97]:
# Confusion-matrix plot, disabled by wrapping it in a string literal: the cell
# only echoes the string and draws nothing.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; if this
# is re-enabled, ConfusionMatrixDisplay.from_estimator is the replacement.
"""
fig, ax = plt.subplots(figsize=(5, 5))
plot_confusion_matrix(log_model,scaled_X_test,y_test, ax = ax)
plt.xticks (rotation = 90)
"""

'\nfig, ax = plt.subplots(figsize=(5, 5))\nplot_confusion_matrix(log_model,scaled_X_test,y_test, ax = ax)\nplt.xticks (rotation = 90)\n'

In [164]:
# Per-class precision / recall / f1 plus macro and weighted averages on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00        55
         low       0.50      0.02      0.03        60
lower_middle       0.57      0.57      0.57       438
      middle       0.53      0.75      0.62       666
upper_middle       0.37      0.12      0.18       251

    accuracy                           0.53      1470
   macro avg       0.39      0.29      0.28      1470
weighted avg       0.49      0.53      0.48      1470



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Overall accuracy of 53%. <br>
Decent f1 scores for the frequent classes (lower_middle, middle). <br>
Very poor f1 scores for the infrequent classes (high, low, upper_middle).

<h3> GridSearch for Best Hyper Parameters </h3>

In [171]:
# Search space for the one-vs-rest logistic regression.
# NOTE(review): the scikit-learn documentation recommends log-spaced values for C
# (e.g. np.logspace), but this grid is linearly spaced, so no value between
# 0.001 and ~5.56 is tried.
penalty = ['l1', 'l2']
C = np.linspace(0.001, 50, 10)

# Base estimator handed to the grid search.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
)

# Scoring notes:
#   - f1_micro (used below) is the same number as plain accuracy here.
#   - f1_macro is the unweighted mean of the per-class f1 scores.
#   - the weighted average is left aside for now, because f1 for every class
#     is meant to count equally.
# NOTE(review): f1_micro does not treat classes equally; f1_macro would match
# the stated goal -- confirm which metric is intended.
grid_model = GridSearchCV(
    estimator=log_model,
    param_grid={'C': C, 'penalty': penalty},
    scoring='f1_micro',
)

In [172]:
# Run the cross-validated search over the C / penalty grid on the scaled training data.
grid_model.fit(scaled_X_train, y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=5000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e-03, 5.55644444e+00, 1.11118889e+01, 1.66673333e+01,
       2.22227778e+01, 2.77782222e+01, 3.33336667e+01, 3.88891111e+01,
       4.44445556e+01, 5.00000000e+01]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_micro')

run time = 15 s

In [173]:
# Best C / penalty combination found by the search.
grid_model.best_params_

{'C': 16.667333333333335, 'penalty': 'l2'}

<h3> Predict and Evaluate </h3>

In [174]:
# Predict test labels with the tuned model (overwrites the earlier y_pred).
y_pred = grid_model.predict(scaled_X_test)

In [175]:
# Test accuracy of the tuned model.
accuracy_score(y_test,y_pred)

0.5312925170068027

In [98]:
# Confusion-matrix plot for the tuned model, disabled by wrapping it in a string
# literal: the cell only echoes the string and draws nothing.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; if this
# is re-enabled, ConfusionMatrixDisplay.from_estimator is the replacement.
"""
fig, ax = plt.subplots(figsize=(5, 5))
plot_confusion_matrix(grid_model,scaled_X_test,y_test, ax = ax)
plt.xticks(rotation = 90)
"""

'\nfig, ax = plt.subplots(figsize=(5, 5))\nplot_confusion_matrix(grid_model,scaled_X_test,y_test, ax = ax)\nplt.xticks(rotation = 90)\n'

In [176]:
# Per-class metrics for the tuned model on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00        55
         low       0.50      0.02      0.03        60
lower_middle       0.57      0.57      0.57       438
      middle       0.53      0.75      0.62       666
upper_middle       0.37      0.12      0.19       251

    accuracy                           0.53      1470
   macro avg       0.39      0.29      0.28      1470
weighted avg       0.49      0.53      0.48      1470



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Even after hyperparameter tuning, accuracy still at 53%. <br>
The f1 scores tell us that the model struggles with the less frequent classes (low, upper_middle, high). <br>


<h3> Red Wine </h3>

In [177]:
# Target is the quality label; features are all remaining columns except 'quality' and 'type'.
y_red = red_wine_trans['quality_label']
X_red = red_wine_trans.drop(columns=['quality', 'type', 'quality_label'])

In [178]:
# 70 / 30 train-test split of the red-wine data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_red,
    y_red,
    test_size=0.3,
    random_state=101,
)

# Standardise: fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [179]:
# Baseline red-wine model with default regularisation (L2 penalty, C = 1).
# multi_class="ovr": one binary classifier per label; the label with the
# highest predicted probability wins.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
).fit(scaled_X_train, y_train)

y_pred = log_model.predict(scaled_X_test)

In [180]:
# Test accuracy of the default red-wine model.
accuracy_score(y_test,y_pred)

0.5833333333333334

In [181]:
# Per-class precision / recall / f1 for the default red-wine model.
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.33      0.04      0.07        24
lower_middle       0.68      0.75      0.71       208
      middle       0.50      0.58      0.54       180
upper_middle       0.46      0.27      0.34        63

    accuracy                           0.58       480
   macro avg       0.39      0.33      0.33       480
weighted avg       0.56      0.58      0.56       480



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of 58%. <br>
Macro f1 score average = 33%. 

<h5> GridSearchCV to find optimal hyper parameters </h5>

In [184]:
# Search space: both penalties, and ten C values spaced linearly from 0.1 to 100.
# NOTE(review): a log-spaced grid (np.logspace) is usually preferred for C -- confirm.
penalty = ['l1', 'l2']
C = np.linspace(0.1, 100, 10)

# Base estimator handed to the grid search.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
)

# Candidates are ranked by micro-averaged f1.
grid_model = GridSearchCV(
    estimator=log_model,
    param_grid={'C': C, 'penalty': penalty},
    scoring='f1_micro',
)

In [185]:
# Run the cross-validated search on the scaled red-wine training data,
# then display the winning C / penalty combination.
grid_model.fit(scaled_X_train, y_train)
grid_model.best_params_

{'C': 0.1, 'penalty': 'l2'}

In [186]:
# Predict test labels with the tuned model (overwrites the earlier y_pred).
y_pred = grid_model.predict(scaled_X_test)

In [187]:
# Test accuracy of the tuned red-wine model.
accuracy_score(y_test,y_pred)

0.58125

In [99]:
# Confusion-matrix plot for the tuned red-wine model, disabled by wrapping it in
# a string literal: the cell only echoes the string and draws nothing.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; if this
# is re-enabled, ConfusionMatrixDisplay.from_estimator is the replacement.
"""
fig, ax = plt.subplots(figsize=(5, 5))
plot_confusion_matrix(grid_model,scaled_X_test,y_test, ax = ax)
"""

'\nfig, ax = plt.subplots(figsize=(5, 5))\nplot_confusion_matrix(grid_model,scaled_X_test,y_test, ax = ax)\n'

In [188]:
# Per-class metrics for the tuned red-wine model on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.00      0.00      0.00        24
lower_middle       0.67      0.77      0.72       208
      middle       0.50      0.58      0.54       180
upper_middle       0.42      0.21      0.28        63

    accuracy                           0.58       480
   macro avg       0.32      0.31      0.31       480
weighted avg       0.53      0.58      0.55       480



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of 58%. Macro avg f1 score of 31% (down from 33% with the default settings). <br>
Tuning of hyperparameters achieves no improvement.

<h3> Logistic Regression With SMOTE </h3>

<h3> White Wine </h3>

<h5> Split the data into training and test data first, and also scale, before doing over/under-sampling </h5>

In [241]:
# Fresh 70 / 30 split of the white-wine data (same seed as before) so that
# resampling starts from the original, un-resampled training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_white,
    y_white,
    test_size=0.3,
    random_state=101,
)

In [242]:
# Standardise before resampling: fit on the training split, reuse the fitted scaler on the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [243]:
# Class distribution of the training labels before any resampling.
y_train.value_counts()

middle          1532
lower_middle    1019
upper_middle     629
high             125
low              123
Name: quality_label, dtype: int64

In [244]:
# Two-step rebalancing of the scaled TRAINING split only (the test split is untouched).
# Step 1 - undersample: cap 'middle' and 'lower_middle' at 1000 rows; the other
#          classes keep their current counts.
under_strategy_white = {'middle': 1000, 'lower_middle': 1000, 'upper_middle': 629, 'low': 123, 'high': 125}
# Step 2 - oversample with SMOTE: raise 'upper_middle' to 800 and 'low' / 'high' to 400.
over_strategy_white = {'middle': 1000, 'lower_middle': 1000, 'upper_middle': 800, 'low': 400, 'high': 400}

# Both samplers are random (row selection / synthetic interpolation). Seed them,
# using the same seed as the train/test split, so every run trains on the same
# resampled data and the reported scores are reproducible.
under_white = RandomUnderSampler(sampling_strategy=under_strategy_white, random_state=101)
over_white = SMOTE(sampling_strategy=over_strategy_white, random_state=101)

# NOTE: these two lines overwrite scaled_X_train / y_train in place; re-run the
# split and scaling cells before re-running this cell.
scaled_X_train, y_train = under_white.fit_resample(scaled_X_train, y_train)
scaled_X_train, y_train = over_white.fit_resample(scaled_X_train, y_train)

In [245]:
# Class distribution of the training labels after under- and over-sampling.
y_train.value_counts()

lower_middle    1000
middle          1000
upper_middle     800
high             400
low              400
Name: quality_label, dtype: int64

In [246]:
# Default-regularisation model (L2 penalty, C = 1) trained on the resampled data.
# multi_class="ovr": one binary classifier per label; the label with the
# highest predicted probability wins.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
).fit(scaled_X_train, y_train)

y_pred = log_model.predict(scaled_X_test)

In [247]:
# Test accuracy of the model trained on resampled white-wine data.
accuracy_score(y_test,y_pred)

0.4673469387755102

In [248]:
# Per-class metrics on the (un-resampled) test split.
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.22      0.09      0.13        55
         low       0.23      0.37      0.29        60
lower_middle       0.50      0.67      0.57       438
      middle       0.58      0.33      0.42       666
upper_middle       0.38      0.58      0.46       251

    accuracy                           0.47      1470
   macro avg       0.38      0.41      0.37      1470
weighted avg       0.50      0.47      0.46      1470



<h3> Tune hyperparameters </h3>

In [249]:
# Search space: both penalties, and twenty C values spaced linearly from 0.01 to 30.
# NOTE(review): a log-spaced grid (np.logspace) is usually preferred for C -- confirm.
penalty = ['l1', 'l2']
C = np.linspace(0.01, 30, 20)

# Base estimator handed to the grid search.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
)

# Candidates are ranked by micro-averaged f1.
grid_model = GridSearchCV(
    estimator=log_model,
    param_grid={'C': C, 'penalty': penalty},
    scoring='f1_micro',
)

In [250]:
# Run the cross-validated search on the resampled, scaled training data.
# NOTE(review): the folds are drawn from already-resampled data, so validation
# folds contain synthetic rows; CV scores may be optimistic.
grid_model.fit(scaled_X_train, y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=5000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e-02, 1.58842105e+00, 3.16684211e+00, 4.74526316e+00,
       6.32368421e+00, 7.90210526e+00, 9.48052632e+00, 1.10589474e+01,
       1.26373684e+01, 1.42157895e+01, 1.57942105e+01, 1.73726316e+01,
       1.89510526e+01, 2.05294737e+01, 2.21078947e+01, 2.36863158e+01,
       2.52647368e+01, 2.68431579e+01, 2.84215789e+01, 3.00000000e+01]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_micro')

In [251]:
# Best C / penalty combination found by the search.
grid_model.best_params_

{'C': 17.37263157894737, 'penalty': 'l1'}

In [252]:
# Predict test labels with the tuned model (overwrites the earlier y_pred).
y_pred = grid_model.predict(scaled_X_test)

In [253]:
# Test accuracy of the tuned model trained on resampled data.
accuracy_score(y_test,y_pred)

0.4666666666666667

In [254]:
# Per-class metrics for the tuned model on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.21      0.09      0.13        55
         low       0.23      0.35      0.27        60
lower_middle       0.50      0.67      0.57       438
      middle       0.59      0.33      0.42       666
upper_middle       0.38      0.58      0.46       251

    accuracy                           0.47      1470
   macro avg       0.38      0.40      0.37      1470
weighted avg       0.50      0.47      0.46      1470



Logistic regression with SMOTE - White wine <br> <br>

accuracy: 47% <br>
f1 score macro avg: 37%

<h3> Logistic Regression With SMOTE </h3>

<h3> Red Wine </h3>

<h5> Split the data into training and test data first, and also scale, before doing over/under-sampling </h5>

In [255]:
# Fresh 70 / 30 split of the red-wine data (same seed as before) so that
# resampling starts from the original, un-resampled training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_red,
    y_red,
    test_size=0.3,
    random_state=101,
)

In [256]:
# Standardise before resampling: fit on the training split, reuse the fitted scaler on the test split.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [257]:
# Class distribution of the red-wine training labels before any resampling.
y_train.value_counts()

lower_middle    473
middle          458
upper_middle    136
low              39
high             13
Name: quality_label, dtype: int64

In [258]:
# Two-step rebalancing of the scaled TRAINING split only (the test split is untouched).
# Step 1 - undersample: cap 'middle' and 'lower_middle' at 400 rows; the other
#          classes keep their current counts.
under_strategy_red = {'middle': 400, 'lower_middle': 400, 'upper_middle': 136, 'low': 39, 'high': 13}
# Step 2 - oversample with SMOTE: raise 'upper_middle' to 300 and 'low' / 'high' to 100.
over_strategy_red = {'middle': 400, 'lower_middle': 400, 'upper_middle': 300, 'low': 100, 'high': 100}

# Both samplers are random (row selection / synthetic interpolation). Seed them,
# using the same seed as the train/test split, so every run trains on the same
# resampled data and the reported scores are reproducible.
under_red = RandomUnderSampler(sampling_strategy=under_strategy_red, random_state=101)
over_red = SMOTE(sampling_strategy=over_strategy_red, random_state=101)

# NOTE: these two lines overwrite scaled_X_train / y_train in place; re-run the
# split and scaling cells before re-running this cell.
scaled_X_train, y_train = under_red.fit_resample(scaled_X_train, y_train)
scaled_X_train, y_train = over_red.fit_resample(scaled_X_train, y_train)

In [259]:
# Class distribution of the red-wine training labels after under- and over-sampling.
y_train.value_counts()

lower_middle    400
middle          400
upper_middle    300
high            100
low             100
Name: quality_label, dtype: int64

In [260]:
# Default-regularisation model (L2 penalty, C = 1) trained on the resampled red-wine data.
# multi_class="ovr": one binary classifier per label; the label with the
# highest predicted probability wins.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
).fit(scaled_X_train, y_train)

y_pred = log_model.predict(scaled_X_test)

In [261]:
# Test accuracy of the model trained on resampled red-wine data.
accuracy_score(y_test,y_pred)

0.5229166666666667

In [262]:
# Per-class metrics on the (un-resampled) test split.
print (classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.25      0.12      0.17        24
lower_middle       0.69      0.70      0.70       208
      middle       0.47      0.34      0.40       180
upper_middle       0.34      0.63      0.45        63

    accuracy                           0.52       480
   macro avg       0.35      0.36      0.34       480
weighted avg       0.53      0.52      0.52       480



<h3> Tune hyperparameters </h3>

In [263]:
# Search space: both penalties, and twenty C values spaced linearly from 0.01 to 30.
# NOTE(review): a log-spaced grid (np.logspace) is usually preferred for C -- confirm.
penalty = ['l1', 'l2']
C = np.linspace(0.01, 30, 20)

# Base estimator handed to the grid search.
log_model = LogisticRegression(
    solver='saga',
    multi_class="ovr",
    max_iter=5000,
)

# Candidates are ranked by micro-averaged f1.
grid_model = GridSearchCV(
    estimator=log_model,
    param_grid={'C': C, 'penalty': penalty},
    scoring='f1_micro',
)

In [264]:
# Run the cross-validated search on the resampled, scaled red-wine training data.
# NOTE(review): the folds are drawn from already-resampled data, so validation
# folds contain synthetic rows; CV scores may be optimistic.
grid_model.fit(scaled_X_train, y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=5000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e-02, 1.58842105e+00, 3.16684211e+00, 4.74526316e+00,
       6.32368421e+00, 7.90210526e+00, 9.48052632e+00, 1.10589474e+01,
       1.26373684e+01, 1.42157895e+01, 1.57942105e+01, 1.73726316e+01,
       1.89510526e+01, 2.05294737e+01, 2.21078947e+01, 2.36863158e+01,
       2.52647368e+01, 2.68431579e+01, 2.84215789e+01, 3.00000000e+01]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_micro')

In [265]:
# Best C / penalty combination found by the search.
grid_model.best_params_

{'C': 4.745263157894737, 'penalty': 'l2'}

In [266]:
# Predict test labels with the tuned model (overwrites the earlier y_pred).
y_pred = grid_model.predict(scaled_X_test)

In [267]:
# Test accuracy of the tuned model trained on resampled red-wine data.
accuracy_score(y_test,y_pred)

0.5229166666666667

In [268]:
# Per-class metrics for the tuned red-wine model on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.25      0.12      0.17        24
lower_middle       0.69      0.69      0.69       208
      middle       0.48      0.36      0.41       180
upper_middle       0.35      0.63      0.45        63

    accuracy                           0.52       480
   macro avg       0.35      0.36      0.34       480
weighted avg       0.54      0.52      0.52       480



Logistic Regression with SMOTE – red wine: <br>

accuracy: 52% <br>
f1 macro avg: 34%