In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import resample
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, classification_report, accuracy_score

In [3]:
full_df = pd.read_csv("C:/Users/Kiat Kai/Desktop/NOTES/Y4S2/ST4248/winequality-red.csv")
full_df.shape

(1599, 12)

## Train Test Split

In [7]:
# Split into X and Y
Y = full_df.iloc[:,-1]
X = full_df.drop(full_df.columns[-1], axis=1)

# Now, train test split
X_train, Xtest, Y_train, Ytest = train_test_split(X, Y, train_size=0.8, random_state=100)

## 1) LDA

In [32]:
lda = LinearDiscriminantAnalysis()
y_pred = lda.fit(X_train, Y_train).predict(Xtest)
f1_lda = f1_score(Ytest, y_pred, average='macro')
acc_lda = accuracy_score(y_pred, Ytest)
print("F1 Score for Linear Discriminant Analysis Classifier is", f1_lda)
print("Accuracy Score for Linear Discriminant Analysis Classifier is", acc_lda)
print(classification_report(Ytest,y_pred))

F1 Score for Linear Discriminant Analysis Classifier is 0.6
Accuracy Score for Linear Discriminant Analysis Classifier is 0.6
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       1.00      0.08      0.15        12
           5       0.68      0.70      0.69       133
           6       0.58      0.61      0.59       137
           7       0.37      0.47      0.41        32
           8       0.00      0.00      0.00         3

    accuracy                           0.60       320
   macro avg       0.44      0.31      0.31       320
weighted avg       0.61      0.60      0.59       320



  _warn_prf(average, modifier, msg_start, len(result))


## 2) QDA

In [31]:
qda = QuadraticDiscriminantAnalysis()
y_pred = qda.fit(X_train, Y_train).predict(Xtest)
f1_qda = f1_score(Ytest, y_pred, average='macro')
acc_qda = accuracy_score(y_pred, Ytest)
print("F1 Score for Quadratic Discriminant Analysis Classifier is", f1_qda)
print("Accuracy Score for Quadratic Discriminant Analysis Classifier is", acc_qda)
print(classification_report(Ytest,y_pred))

F1 Score for Quadratic Discriminant Analysis Classifier is 0.553125
Accuracy Score for Quadratic Discriminant Analysis Classifier is 0.553125
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.20      0.08      0.12        12
           5       0.64      0.66      0.65       133
           6       0.56      0.50      0.53       137
           7       0.40      0.62      0.49        32
           8       0.00      0.00      0.00         3

    accuracy                           0.55       320
   macro avg       0.30      0.31      0.30       320
weighted avg       0.55      0.55      0.55       320



  _warn_prf(average, modifier, msg_start, len(result))


## 3) Naive Bayes

In [30]:
gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(Xtest)
f1_nb = f1_score(Ytest, y_pred, average='macro')
acc_nb = accuracy_score(y_pred, Ytest)
print("F1 Score for Naive Bayes Classifier is", f1_nb)
print("Accuracy Score for Naive Bayes Classifier is", acc_nb)
print(classification_report(Ytest,y_pred))

F1 Score for Naive Bayes Classifier is 0.53125
Accuracy Score for Naive Bayes Classifier is 0.53125
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.25      0.08      0.12        12
           5       0.66      0.69      0.67       133
           6       0.56      0.43      0.49       137
           7       0.28      0.56      0.37        32
           8       0.00      0.00      0.00         3

    accuracy                           0.53       320
   macro avg       0.29      0.29      0.28       320
weighted avg       0.55      0.53      0.53       320



## 4) Random Forest

In [44]:
rfc=RandomForestClassifier(random_state=100)

param_grid = { 
    'n_estimators': [200, 300, 400, 500],
    'max_features': [3,4],
    'max_depth' : [4,6,8,10,12,14],
    'criterion' :['gini', 'entropy']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 10)
CV_rfc.fit(X_train, Y_train)



GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=100),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 6, 8, 10, 12, 14],
                         'max_features': [3, 4],
                         'n_estimators': [200, 300, 400, 500]})

In [45]:
CV_rfc.best_params_

{'criterion': 'gini', 'max_depth': 12, 'max_features': 4, 'n_estimators': 400}

In [46]:
# Best parameters after tuning:
rf = RandomForestClassifier(random_state=100, n_estimators=400, criterion='gini', max_depth=12, max_features=4)
y_pred = rf.fit(X_train, Y_train).predict(Xtest)
f1_rf = f1_score(Ytest, y_pred, average='macro')
acc_rf = accuracy_score(y_pred, Ytest)
print("F1 Score for Random Forest Classifier is", f1_rf)
print("Accuracy Score for Random Forest Classifier is", acc_rf)
print(classification_report(Ytest,y_pred))

F1 Score for Random Forest Classifier is 0.35445901049082656
Accuracy Score for Random Forest Classifier is 0.721875
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        12
           5       0.79      0.81      0.80       133
           6       0.71      0.74      0.72       137
           7       0.54      0.69      0.60        32
           8       0.00      0.00      0.00         3

    accuracy                           0.72       320
   macro avg       0.34      0.37      0.35       320
weighted avg       0.69      0.72      0.70       320



  _warn_prf(average, modifier, msg_start, len(result))


## 5) Support Vector Classifier

In [54]:
svc=SVC(random_state=100)

param_grid = { 
    'C': [0.1,0.8,0.9,1,1.1,1.2,1.3,1.4],
    'kernel': ['linear','rbf']
}

CV_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv= 5)
CV_svc.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=100),
             param_grid={'C': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
                         'kernel': ['linear', 'rbf']})

In [55]:
CV_svc.best_params_

{'C': 1.2, 'kernel': 'linear'}

In [51]:
# Best parameters after tuning:
svc = SVC(random_state=100, C=1.2, kernel="linear")
y_pred = svc.fit(X_train, Y_train).predict(Xtest)
f1_svc = f1_score(Ytest, y_pred, average='macro')
acc_svc = accuracy_score(y_pred, Ytest)
print("F1 Score for Support Vector Classifier is", f1_svc)
print("Accuracy Score for Support Vector Classifier is", acc_svc)
print(classification_report(Ytest,y_pred))

F1 Score for Support Vector Classifier is 0.220895315531738
Accuracy Score for Support Vector Classifier is 0.609375
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        12
           5       0.67      0.71      0.69       133
           6       0.56      0.74      0.64       137
           7       0.00      0.00      0.00        32
           8       0.00      0.00      0.00         3

    accuracy                           0.61       320
   macro avg       0.21      0.24      0.22       320
weighted avg       0.52      0.61      0.56       320



  _warn_prf(average, modifier, msg_start, len(result))


## 6) Logistic Regression

In [17]:
LR = LogisticRegression(random_state=100)

param_grid = {
    'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']
}

# Create grid search object
CV_Lr = GridSearchCV(estimator=LR, param_grid=param_grid, cv= 5)
CV_Lr.fit(X_train, Y_train)



GridSearchCV(cv=5, estimator=LogisticRegression(random_state=100),
             param_grid={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']})

In [18]:
CV_Lr.best_params_

{'C': 78.47599703514607, 'penalty': 'l2', 'solver': 'liblinear'}

In [27]:
# Best parameters after tuning:
Lr = LogisticRegression(random_state=100, C=78.47599703514607, penalty='l2',solver='liblinear')
y_pred = Lr.fit(X_train, Y_train).predict(Xtest)
f1_Lr = f1_score(Ytest, y_pred, average='macro')
acc_lr = accuracy_score(y_pred, Ytest)
print("F1 Score for Multinomial Logistic Regression is", f1_Lr)
print("Accuracy Score for Logistics Regression is", acc_lr)
print(classification_report(Ytest,y_pred))

F1 Score for Multinomial Logistic Regression is 0.60625
Accuracy Score for Logistics Regression is 0.60625
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00        12
           5       0.67      0.72      0.70       133
           6       0.57      0.67      0.62       137
           7       0.35      0.19      0.24        32
           8       0.00      0.00      0.00         3

    accuracy                           0.61       320
   macro avg       0.27      0.26      0.26       320
weighted avg       0.56      0.61      0.58       320



  _warn_prf(average, modifier, msg_start, len(result))


## 7) Decision tree

In [20]:
DT = DecisionTreeClassifier(random_state=100)

param_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_features' :['auto', 'sqrt', 'log2'],
    'max_depth':np.arange(1,20)
}

# Create grid search object
CV_tree = GridSearchCV(estimator=DT, param_grid=param_grid, cv= 5)
CV_tree.fit(X_train, Y_train)


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=100),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                         'max_features': ['auto', 'sqrt', 'log2']})

In [21]:
CV_tree.best_params_

{'criterion': 'entropy', 'max_depth': 18, 'max_features': 'auto'}

In [26]:
# Best parameters after tuning:
Dt = DecisionTreeClassifier(random_state=100, criterion='gini', max_depth=16, max_features='auto')
y_pred = Dt.fit(X_train, Y_train).predict(Xtest)
f1_Dt = f1_score(Ytest, y_pred, average='macro')
acc_dt = accuracy_score(y_pred, Ytest)
print("F1 Score for Decision Tree is", f1_Dt)
print("Accuracy Score for decision tree classifier is", acc_dt)
print(classification_report(Ytest,y_pred))

F1 Score for Decision Tree is 0.615625
Accuracy Score for decision tree classifier is 0.615625
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         3
           4       0.22      0.17      0.19        12
           5       0.69      0.74      0.71       133
           6       0.66      0.55      0.60       137
           7       0.42      0.66      0.51        32
           8       0.00      0.00      0.00         3

    accuracy                           0.62       320
   macro avg       0.33      0.35      0.34       320
weighted avg       0.62      0.62      0.61       320



  _warn_prf(average, modifier, msg_start, len(result))


## Evaluating all models

In [56]:
# Dataframe to contain model results
model_results = pd.DataFrame(columns=["Models","F1 Score","Accuracy Score"])

# LDA
model_results = model_results.append(pd.DataFrame({"Models":"Linear Discriminant Analysis", 
                                  "F1 Score":f1_lda, "Accuracy Score":acc_lda}
                                                  , index = [0]), ignore_index = False)
# QDA
model_results = model_results.append(pd.DataFrame({"Models":"Quadratic Discriminant Analysis", 
                                  "F1 Score":f1_qda, "Accuracy Score":acc_qda}
                                                  , index = [1]), ignore_index = False)
# Naive Bayes
model_results = model_results.append(pd.DataFrame({"Models":"Naive Bayes", 
                                  "F1 Score":f1_nb, "Accuracy Score":acc_nb}
                                                  , index = [2]), ignore_index = False)
# Random Forest
model_results = model_results.append(pd.DataFrame({"Models":"Random Forest", 
                                  "F1 Score":f1_rf, "Accuracy Score":acc_rf}
                                                  , index = [3]), ignore_index = False)
# Support Vector Classifier
model_results = model_results.append(pd.DataFrame({"Models":"Support Vector Classifier", 
                                  "F1 Score":f1_svc, "Accuracy Score":acc_svc}
                                                  , index = [4]), ignore_index = False)
# Multinomial Logistic Regression
model_results = model_results.append(pd.DataFrame({"Models":"Multinomial Logistic Regression", 
                                  "F1 Score":f1_Lr, "Accuracy Score":acc_lr}
                                                  , index = [5]), ignore_index = False)
# Decision Tree
model_results = model_results.append(pd.DataFrame({"Models":"Decision Tree", 
                                  "F1 Score":f1_Dt, "Accuracy Score":acc_dt}
                                                  , index = [6]), ignore_index = False)

model_results.to_csv("model_results_noupsample.csv")
model_results.sort_values(by="F1 Score", ascending = False)

Unnamed: 0,Models,F1 Score,Accuracy Score
6,Decision Tree,0.615625,0.615625
5,Multinomial Logistic Regression,0.60625,0.60625
0,Linear Discriminant Analysis,0.6,0.6
1,Quadratic Discriminant Analysis,0.553125,0.553125
2,Naive Bayes,0.53125,0.53125
3,Random Forest,0.354459,0.721875
4,Support Vector Classifier,0.220895,0.609375
