<a href="https://colab.research.google.com/github/leechanwoo-kor/wine-quality-prediction/blob/main/kaggle/Wine_Quality_Model_Comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# For ML models
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC ,SVR
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# For Data Processing
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

# For Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Miscellaneous
import os
import random

In [2]:
# df = pd.read_csv('/kaggle/input/wine-quality-dataset/WineQT.csv')
url = 'https://raw.githubusercontent.com/leechanwoo-kor/wine-quality-prediction/main/kaggle/WineQT.csv'
df = pd.read_csv(url)
del df['Id']
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6


In [11]:
#categorize wine quality
bins = (2,6.5,8)
group_names = ['bad','good']
categories = pd.cut(df['quality'], bins, labels = group_names)
df['quality'] = categories

In [13]:
#splitting data to X ve y
X = df.drop(['quality'], axis = 1)
y = df['quality']

In [14]:
# Encoding our dependent variable:Quality column
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
# Splitting the dataset into the Training set and Test set.%20 of dataset for test set,%80 for training set.
# from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Models

In [17]:
model_comparison = {}

## Decision Tree Classifier

In [18]:
parameters = {'max_depth': [5,10,15,20]}

Tree_model = DecisionTreeClassifier()

clf = GridSearchCV(Tree_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(X_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(X_test)
model_comparison['DecisionTreeClassifier'] = [accuracy_score(y_test,y_pred), f1_score(y_test,y_pred, average='weighted')]
print('\n')
print(classification_report(y_test,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'max_depth': 10}


              precision    recall  f1-score   support

           0       0.92      0.93      0.92       200
           1       0.46      0.41      0.44        29

    accuracy                           0.86       229
   macro avg       0.69      0.67      0.68       229
weighted avg       0.86      0.86      0.86       229



## KNeighbors Classifier

In [21]:
parameters = {'n_neighbors': [10,20,30,40,50]}

K_model = KNeighborsClassifier()

clf = GridSearchCV(K_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(X_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(X_test)
model_comparison['KNeighborsClassifier'] = [accuracy_score(y_test,y_pred), f1_score(y_test,y_pred, average='weighted')]
print('\n')
print(classification_report(y_test,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'n_neighbors': 10}


              precision    recall  f1-score   support

           0       0.91      0.95      0.93       200
           1       0.50      0.31      0.38        29

    accuracy                           0.87       229
   macro avg       0.70      0.63      0.66       229
weighted avg       0.85      0.87      0.86       229



## Random Forest Classifier

In [24]:
parameters = {'n_estimators': [160,180,200], 'max_depth':[18,20,22,24]}

rf = RandomForestClassifier()

clf = GridSearchCV(rf, parameters)
print("Searching for best hyperparameters ...")
clf.fit(X_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(X_test)
model_comparison['RandomForestClassifier'] = [accuracy_score(y_test,y_pred), f1_score(y_test,y_pred, average='weighted')]
print('\n')
print(classification_report(y_test,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'max_depth': 20, 'n_estimators': 160}


              precision    recall  f1-score   support

           0       0.93      0.96      0.94       200
           1       0.64      0.48      0.55        29

    accuracy                           0.90       229
   macro avg       0.78      0.72      0.75       229
weighted avg       0.89      0.90      0.89       229



## XGBoost

In [25]:
parameters = {'n_estimators': [100, 150, 200], 'max_depth':[16, 18, 20]}

xgboost = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

clf = GridSearchCV(xgboost, parameters)
print("Searching for best hyperparameters ...")
clf.fit(X_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(X_test)
model_comparison['XGBoost'] = [accuracy_score(y_test, y_pred), f1_score(y_test,y_pred, average='weighted')]
print('\n')
print(classification_report(y_test,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'max_depth': 16, 'n_estimators': 200}


              precision    recall  f1-score   support

           0       0.93      0.96      0.94       200
           1       0.64      0.48      0.55        29

    accuracy                           0.90       229
   macro avg       0.78      0.72      0.75       229
weighted avg       0.89      0.90      0.89       229



## Model Comparison

In [27]:
model_comparison_df = pd.DataFrame.from_dict(model_comparison).T
model_comparison_df.columns = ['Accuracy', 'F1 Score']
model_comparison_df = model_comparison_df.sort_values('F1 Score', ascending=True)
model_comparison_df.style.background_gradient(cmap='Blues')

Unnamed: 0,Accuracy,F1 Score
KNeighborsClassifier,0.873362,0.860238
DecisionTreeClassifier,0.864629,0.861441
RandomForestClassifier,0.899563,0.893534
XGBoost,0.899563,0.893534


In [28]:
fig = go.Figure(data=[
    go.Bar(name='F1 Score', y=model_comparison_df.index, x=model_comparison_df['F1 Score'], orientation='h'),
    go.Bar(name='Accuracy', y=model_comparison_df.index, x=model_comparison_df['Accuracy'], orientation='h')
])
fig.update_layout(barmode='group')
fig.show()