In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.tree import DecisionTreeClassifier

In [38]:
white_wine_trans = pd.read_csv('transformed_wine_data/white_wine_trans.csv')
red_wine_trans = pd.read_csv('transformed_wine_data/red_wine_trans.csv')

In [39]:
X_white = white_wine_trans.drop(['quality_label', 'quality', 'type'], axis = 1)
y_white = white_wine_trans['quality_label']

X_red = red_wine_trans.drop(['quality_label', 'quality', 'type'], axis = 1)
y_red = red_wine_trans['quality_label']

NOTE: Since decision tree evaluates each feature individually (one at a time), there's no need for feature scaling

<h3> Decision Tree Approach – Red Wine </h3>

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_red, y_red, test_size=0.3, random_state=101)

<h5> Start with default hyper parameters </h3>

In [41]:
model = DecisionTreeClassifier()

In [42]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [43]:
base_pred = model.predict(X_test)

<h3> Evaluate </h3>

In [44]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

        high       0.11      0.20      0.14         5
         low       0.20      0.17      0.18        24
lower_middle       0.75      0.68      0.71       208
      middle       0.57      0.61      0.59       180
upper_middle       0.48      0.52      0.50        63

    accuracy                           0.60       480
   macro avg       0.42      0.44      0.43       480
weighted avg       0.61      0.60      0.61       480



Accuracy: 61% <br>
f1 macro avg: 43%

In [45]:
model.feature_importances_

array([0.08339032, 0.10068184, 0.07905877, 0.04475705, 0.07328515,
       0.06197075, 0.1055492 , 0.07003865, 0.06127897, 0.14071855,
       0.17927074])

In [46]:
pd.DataFrame(index=X_red.columns,data=model.feature_importances_, columns=['Feature Importance'])

Unnamed: 0,Feature Importance
fixed acidity,0.08339
volatile acidity,0.100682
citric acid,0.079059
residual sugar,0.044757
chlorides,0.073285
free sulfur dioxide,0.061971
total sulfur dioxide,0.105549
density,0.070039
pH,0.061279
sulphates,0.140719


<h3> Visualise the Tree </h3>

In [47]:
from sklearn.tree import plot_tree

In [48]:
"""
plt.figure(figsize=(5,5))
plot_tree(model);
"""

'\nplt.figure(figsize=(5,5))\nplot_tree(model);\n'

run time = 17 secs

In [49]:
"""
plt.figure(figsize=(5,5),dpi=150)
plot_tree(model,filled=True,feature_names=X_red.columns);
"""

'\nplt.figure(figsize=(5,5),dpi=150)\nplot_tree(model,filled=True,feature_names=X_red.columns);\n'

Code has been suppressed due to the tree's illegibility. 

<h3> Experiment with hyper parameters </h3>

In [50]:
decision_tree = DecisionTreeClassifier(max_depth=15)
decision_tree.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=15)

In [51]:
preds = decision_tree.predict(X_test)

In [52]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

        high       0.14      0.20      0.17         5
         low       0.14      0.12      0.13        24
lower_middle       0.72      0.68      0.70       208
      middle       0.57      0.61      0.59       180
upper_middle       0.50      0.49      0.50        63

    accuracy                           0.60       480
   macro avg       0.42      0.42      0.42       480
weighted avg       0.60      0.60      0.60       480



In [53]:
decision_tree = DecisionTreeClassifier(max_depth=30, max_leaf_nodes = 100)
decision_tree.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=30, max_leaf_nodes=100)

In [54]:
preds = decision_tree.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.40      0.08      0.14        24
lower_middle       0.74      0.71      0.72       208
      middle       0.57      0.66      0.61       180
upper_middle       0.46      0.51      0.48        63

    accuracy                           0.62       480
   macro avg       0.43      0.39      0.39       480
weighted avg       0.62      0.62      0.61       480



Here you've increased the overall accuracy but worsened the f1 scores.

<h3> Grid Search for hyper parameters </h3>

In [60]:
decision_tree = DecisionTreeClassifier()
param_grid = {'max_depth': [3, 5, 10,15,20,30,50],
              'max_leaf_nodes': [10, 30, 50, 70, 100],
              'min_samples_split': [2, 5, 10, 15, 20]}
grid_model = GridSearchCV(decision_tree, param_grid, scoring = 'f1_macro')

In [61]:
grid_model.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 10, 15, 20, 30, 50],
                         'max_leaf_nodes': [10, 30, 50, 70, 100],
                         'min_samples_split': [2, 5, 10, 15, 20]},
             scoring='f1_macro')

In [62]:
grid_model.best_params_

{'max_depth': 50, 'max_leaf_nodes': 100, 'min_samples_split': 5}

In [63]:
y_pred = grid_model.predict(X_test)

In [64]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         5
         low       0.29      0.08      0.13        24
lower_middle       0.73      0.70      0.71       208
      middle       0.56      0.63      0.59       180
upper_middle       0.46      0.49      0.48        63

    accuracy                           0.61       480
   macro avg       0.41      0.38      0.38       480
weighted avg       0.60      0.61      0.60       480



Even after tuning the hyper parameters, still have: <br>
accuracy: 61% <br>
f1 macro avg: 38%

<h3> Decision Tree Approach – White Wine </h3>

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X_white, y_white, test_size=0.3, random_state=101)

<h5> Start with default hyper parameters </h3>

In [67]:
model = DecisionTreeClassifier()

In [68]:
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [69]:
base_pred = model.predict(X_test)

<h3> Evaluate </h3>

In [70]:
print(classification_report(y_test,base_pred))

              precision    recall  f1-score   support

        high       0.37      0.42      0.39        55
         low       0.28      0.25      0.26        60
lower_middle       0.60      0.62      0.61       438
      middle       0.65      0.61      0.63       666
upper_middle       0.50      0.55      0.53       251

    accuracy                           0.58      1470
   macro avg       0.48      0.49      0.48      1470
weighted avg       0.58      0.58      0.58      1470



Accuracy: 58% <br>
f1 macro avg: 48%

In [71]:
model.feature_importances_

array([0.07751163, 0.09744865, 0.09065341, 0.09390186, 0.06030415,
       0.1003461 , 0.08812947, 0.07460188, 0.0869908 , 0.0791402 ,
       0.15097184])

In [72]:
pd.DataFrame(index=X_red.columns,data=model.feature_importances_, columns=['Feature Importance'])

Unnamed: 0,Feature Importance
fixed acidity,0.077512
volatile acidity,0.097449
citric acid,0.090653
residual sugar,0.093902
chlorides,0.060304
free sulfur dioxide,0.100346
total sulfur dioxide,0.088129
density,0.074602
pH,0.086991
sulphates,0.07914


<h3> Visualise the Tree </h3>

In [None]:
from sklearn.tree import plot_tree

In [None]:
"""
plt.figure(figsize=(5,5))
plot_tree(model);
"""

'\nplt.figure(figsize=(5,5))\nplot_tree(model);\n'

run time = 17 secs

In [None]:
"""
plt.figure(figsize=(5,5),dpi=150)
plot_tree(model,filled=True,feature_names=X_red.columns);
"""

'\nplt.figure(figsize=(5,5),dpi=150)\nplot_tree(model,filled=True,feature_names=X_red.columns);\n'

Code has been suppressed due to the tree's illegibility. 

<h3> Experiment with hyper parameters </h3>

In [73]:
decision_tree = DecisionTreeClassifier(max_depth=15)
decision_tree.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=15)

In [74]:
preds = decision_tree.predict(X_test)

In [75]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

        high       0.38      0.33      0.35        55
         low       0.18      0.15      0.17        60
lower_middle       0.59      0.60      0.59       438
      middle       0.61      0.62      0.61       666
upper_middle       0.49      0.48      0.48       251

    accuracy                           0.56      1470
   macro avg       0.45      0.44      0.44      1470
weighted avg       0.56      0.56      0.56      1470



In [76]:
decision_tree = DecisionTreeClassifier(max_depth=30, max_leaf_nodes = 100)
decision_tree.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=30, max_leaf_nodes=100)

In [77]:
preds = decision_tree.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

        high       0.35      0.15      0.21        55
         low       0.31      0.08      0.13        60
lower_middle       0.55      0.59      0.57       438
      middle       0.55      0.63      0.59       666
upper_middle       0.43      0.32      0.37       251

    accuracy                           0.53      1470
   macro avg       0.44      0.35      0.37      1470
weighted avg       0.51      0.53      0.51      1470



<h3> Grid Search for hyper parameters </h3>

In [90]:
decision_tree = DecisionTreeClassifier()
param_grid = {'max_depth': [3, 5, 10,15,20,30,50],
              'max_leaf_nodes': [10, 30, 50, 70, 100],
              'min_samples_split': [2, 5, 10, 15, 20]}
grid_model = GridSearchCV(decision_tree, param_grid, scoring = 'f1_micro')

In [91]:
grid_model.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 10, 15, 20, 30, 50],
                         'max_leaf_nodes': [10, 30, 50, 70, 100],
                         'min_samples_split': [2, 5, 10, 15, 20]},
             scoring='f1_micro')

In [92]:
grid_model.best_params_

{'max_depth': 30, 'max_leaf_nodes': 100, 'min_samples_split': 10}

In [93]:
y_pred = grid_model.predict(X_test)

In [94]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.35      0.15      0.21        55
         low       0.31      0.08      0.13        60
lower_middle       0.55      0.59      0.57       438
      middle       0.55      0.63      0.59       666
upper_middle       0.43      0.32      0.37       251

    accuracy                           0.53      1470
   macro avg       0.44      0.35      0.37      1470
weighted avg       0.51      0.53      0.51      1470



Accuracy: 53% <br>
f1 macro avg: 37%

<h2> Decision Tree with Smote </h2>

<h3> White Wine </h3>

In [114]:
# Working with white wine data
X_train, X_test, y_train, y_test = train_test_split(X_white, y_white, test_size=0.3, random_state=101)

In [115]:
y_train.value_counts()

middle          1532
lower_middle    1019
upper_middle     629
high             125
low              123
Name: quality_label, dtype: int64

In [116]:
under_strategy_white = {'middle': 1200, 'lower_middle': 900, 'upper_middle': 600, 'low': 123, 'high': 125}
over_strategy_white = {'middle': 1200, 'lower_middle': 900, 'upper_middle': 600, 'low': 600, 'high': 600}

under_white = RandomUnderSampler(sampling_strategy = under_strategy_white)
over_white = SMOTE (sampling_strategy = over_strategy_white)

X_train, y_train = under_white.fit_resample(X_train, y_train)
X_train, y_train = over_white.fit_resample(X_train, y_train)

In [117]:
y_train.value_counts()

middle          1200
lower_middle     900
high             600
low              600
upper_middle     600
Name: quality_label, dtype: int64

In [118]:
decision_tree = DecisionTreeClassifier()
param_grid = {'max_depth': [3, 5, 10,15,20,30,50],
              'max_leaf_nodes': [10, 30, 50, 70, 100],
              'min_samples_split': [2, 5, 10, 15, 20]}
grid_model = GridSearchCV(decision_tree, param_grid, scoring = 'f1_micro')

In [110]:
grid_model.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 10, 15, 20, 30, 50],
                         'max_leaf_nodes': [10, 30, 50, 70, 100],
                         'min_samples_split': [2, 5, 10, 15, 20]},
             scoring='f1_micro')

In [111]:
grid_model.best_params_

{'max_depth': 20, 'max_leaf_nodes': 70, 'min_samples_split': 20}

In [112]:
y_pred = grid_model.predict(X_test)

In [113]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.14      0.31      0.19        55
         low       0.16      0.37      0.22        60
lower_middle       0.56      0.46      0.51       438
      middle       0.54      0.58      0.56       666
upper_middle       0.39      0.20      0.27       251

    accuracy                           0.46      1470
   macro avg       0.36      0.38      0.35      1470
weighted avg       0.49      0.46      0.47      1470



Without smote: <br>
Accuracy: 53% <br>
f1 macro avg: 37% <br><br>

With smote: <br>
Accuracy: 46% <br>
f1 macro avg: 35% <br>

Performance got worse

<h3> Red Wine </h3>

In [128]:
# Working with white wine data
X_train, X_test, y_train, y_test = train_test_split(X_red, y_red, test_size=0.3, random_state=101)

In [129]:
y_train.value_counts()

lower_middle    473
middle          458
upper_middle    136
low              39
high             13
Name: quality_label, dtype: int64

In [130]:
under_strategy_red = {'middle': 450, 'lower_middle': 450, 'upper_middle': 136, 'low': 39, 'high': 13}
over_strategy_red = {'middle': 450, 'lower_middle': 450, 'upper_middle': 400, 'low': 200, 'high': 200}

under_red = RandomUnderSampler(sampling_strategy = under_strategy_red)
over_red = SMOTE (sampling_strategy = over_strategy_red)

X_train, y_train = under_red.fit_resample(X_train, y_train)
X_train, y_train = over_red.fit_resample(X_train, y_train)

In [131]:
y_train.value_counts()

lower_middle    450
middle          450
upper_middle    400
high            200
low             200
Name: quality_label, dtype: int64

In [132]:
decision_tree = DecisionTreeClassifier()
param_grid = {'max_depth': [3, 5, 10,15,20,30,50],
              'max_leaf_nodes': [10, 30, 50, 70, 100],
              'min_samples_split': [2, 5, 10, 15, 20]}
grid_model = GridSearchCV(decision_tree, param_grid, scoring = 'f1_micro')

In [133]:
grid_model.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [3, 5, 10, 15, 20, 30, 50],
                         'max_leaf_nodes': [10, 30, 50, 70, 100],
                         'min_samples_split': [2, 5, 10, 15, 20]},
             scoring='f1_micro')

In [134]:
grid_model.best_params_

{'max_depth': 30, 'max_leaf_nodes': 100, 'min_samples_split': 10}

In [135]:
y_pred = grid_model.predict(X_test)

In [136]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        high       0.14      0.40      0.21         5
         low       0.26      0.33      0.29        24
lower_middle       0.68      0.62      0.65       208
      middle       0.51      0.46      0.48       180
upper_middle       0.39      0.51      0.44        63

    accuracy                           0.53       480
   macro avg       0.40      0.46      0.41       480
weighted avg       0.55      0.53      0.54       480



Red wine without smote: <br>
accuracy: 61% <br>
f1 macro avg: 38% <br>

Red wine with smote: <br>
accuracy: 53% <br>
f1 macro avg: 41% 