# Import libraries

In [None]:
# Update scikit-learn to prevent version mismatches.
# (The package name on PyPI is "scikit-learn"; %pip installs into the running kernel's environment.)
# %pip install --upgrade scikit-learn

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import pickle

# Import data
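Needed to use the "sep" argument because the data is separated by semicolons. (Note: the `read_csv` calls below do not pass `sep`, so the files in `Data/` appear to be comma-separated already.)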

In [5]:
# Read each wine-quality table from the local Data/ directory into its own DataFrame.
df_red = pd.read_csv(filepath_or_buffer="Data/winequality-red.csv")
df_white = pd.read_csv(filepath_or_buffer="Data/winequality-white.csv")


In [6]:
# Preview the first five rows of the red-wine table.
df_red.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [7]:
# Preview the first five rows of the white-wine table.
df_white.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


# clean data 
(turns out no cleaning was really needed)

In [8]:
# Report the (rows, columns) shape of each table before any cleaning.
print(
    f'red wine DF initial: {df_red.shape}',
    f'white wine DF initial: {df_white.shape}',
    sep='\n',
)

red wine DF initial: (1599, 12)
white wine DF initial: (4898, 12)


In [9]:
#drop any null rows
# Discard every row that contains at least one missing value
# (the dropna defaults, spelled out explicitly).
df_red = df_red.dropna(axis=0, how="any")
df_white = df_white.dropna(axis=0, how="any")

In [10]:
# Report the shapes again so they can be compared with the pre-dropna values.
print(
    f'red wine DF after dropNA: {df_red.shape}',
    f'white wine DF dropNA: {df_white.shape}',
    sep='\n',
)

red wine DF after dropNA: (1599, 12)
white wine DF dropNA: (4898, 12)


# train_test_split

In [11]:
#red wine y-values
# Target vectors (y-values): the "quality" column of each table.
red_targets = df_red.loc[:, "quality"]
white_targets = df_white.loc[:, "quality"]

In [12]:
#red wine x-values
# Feature matrices (x-values): every column except the "quality" target.
red_features = df_red.drop("quality", axis=1)
white_features = df_white.drop("quality", axis=1)


In [13]:
# Partition each wine type into train/test sets, stratifying on the quality
# label. The fixed seed makes the split repeatable; the split proportion is
# left at the library default.
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    red_features,
    red_targets,
    random_state=43,
    stratify=red_targets,
)

X_train_white, X_test_white, y_train_white, y_test_white = train_test_split(
    white_features,
    white_targets,
    random_state=43,
    stratify=white_targets,
)

# scale data

In [14]:
# Fit one MinMaxScaler per wine type on the TRAINING features only, so the
# scaling parameters are learned without looking at the held-out rows.
X_red_scaler = MinMaxScaler().fit(X_train_red)
X_white_scaler = MinMaxScaler().fit(X_train_white)

# Apply each fitted scaler to its own train and test partitions.
X_train_red_scaled = X_red_scaler.transform(X_train_red)
X_train_white_scaled = X_white_scaler.transform(X_train_white)
X_test_red_scaled = X_red_scaler.transform(X_test_red)
X_test_white_scaled = X_white_scaler.transform(X_test_white)

# Persist the fitted red-wine scaler. The context manager guarantees the file
# is flushed and closed; passing a bare open() to pickle.dump never closed it.
with open("X_red_scaler.pkl", "wb") as scaler_file:
    pickle.dump(X_red_scaler, scaler_file)

# instantiate and train the initial models

In [15]:
# Baseline red-wine classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest, its
# scores, and any grid search that clones this estimator are reproducible on
# a fresh Restart & Run All.
rf_red = RandomForestClassifier(n_estimators=100, random_state=43)
rf_red = rf_red.fit(X_train_red_scaled, y_train_red)




In [16]:
# Baseline white-wine classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the forest, its
# scores, and any grid search that clones this estimator are reproducible on
# a fresh Restart & Run All.
rf_white = RandomForestClassifier(n_estimators=100, random_state=43)
rf_white = rf_white.fit(X_train_white_scaled, y_train_white)


# test initial models

In [18]:
# Score the baseline red model on the data it was trained on and on the
# held-out test partition; a large gap between the two suggests overfitting.
rf_red_training_score, rf_red_test_score = (
    rf_red.score(X_train_red_scaled, y_train_red),
    rf_red.score(X_test_red_scaled, y_test_red),
)

print(rf_red_training_score, rf_red_test_score, sep="\n")


1.0
0.71


In [None]:
# Score the baseline white model on the data it was trained on and on the
# held-out test partition; a large gap between the two suggests overfitting.
rf_white_training_score, rf_white_test_score = (
    rf_white.score(X_train_white_scaled, y_train_white),
    rf_white.score(X_test_white_scaled, y_test_white),
)

print(rf_white_training_score, rf_white_test_score, sep="\n")


# grid search to optimize hyperparameters for red wine values

In [None]:
# Build the GridSearchCV model for the red-wine forest.

# Candidate values for each hyperparameter being searched.
n_estimators = list(range(360, 367))  # 360 through 366 inclusive
criterion = ["gini", "entropy"]
max_depth = [40, 50, 55, 60, 62, 63, 64, 65, 67, 69, 70]
bootstrap = [True, False]

# Every combination of the lists above is evaluated.
param_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    bootstrap=bootstrap,
)

red_grid = GridSearchCV(estimator=rf_red, param_grid=param_grid, verbose=3)



In [None]:
# Run the red-wine grid search on the scaled training partition (can be slow).
red_grid.fit(X=X_train_red_scaled, y=y_train_red)

In [None]:
# Show the winning hyperparameter combination and its score from the search.
print(
    f'best parameters are: {red_grid.best_params_}',
    f'best grid training score is: {red_grid.best_score_}',
    sep='\n',
)

In [None]:
# Evaluate the fitted red-wine grid search on the held-out test partition.
red_grid_test_score = red_grid.score(X_test_red_scaled, y_test_red)
print(f'test data score: {red_grid_test_score}')

# grid search notes for red wine values

First grid search: <br>
n_estimators = [10, 100, 250] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [2, 8, 16, 32, 64] <br>
bootstrap = [True, False] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 16, 'n_estimators': 100} <br>
best grid training score is: 0.6697350069735006 <br>
test data score: 0.7075 <br>


Second grid search: <br>
n_estimators = [10, 100, 250, 350, 500, 750 ] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [2, 8, 16, 32, 64, 250, 500, 1000] <br>
bootstrap = [True, False] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 64, 'n_estimators': 500} <br>
best grid training score is: 0.6764086471408646 <br>
test data score: 0.7125 <br>

Third grid search: <br>
n_estimators = [10, 100, 250, 350, 400, 450, 475, 500, 550] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [2, 8, 16, 32, 64, 74, 84, 94, 120] <br>
bootstrap = [True, False] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 32, 'n_estimators': 350} <br>
best grid training score is: 0.6747350069735006 <br>
test data score: 0.7125 <br>

Fourth grid search: <br>
n_estimators = [250, 275, 300, 350, 375, 400, 425, 450] <br>
criterion= ["entropy"] <br>
max_depth = [16, 32, 36, 40, 44, 52, 58, 64, 74, 84] <br>
bootstrap = [True] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 64, 'n_estimators': 375} <br>
best grid training score is: 0.67557880055788 <br>
test data score: 0.705 <br>

Fifth grid search: <br>
n_estimators = [350, 360, 375, 385, 395, 400] <br>
criterion= ["entropy"] <br>
max_depth = [32, 36, 40, 44, 52, 58, 62, 64, 66, 68, 74] <br>
bootstrap = [True] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 44, 'n_estimators': 360} <br>
best grid training score is: 0.6755822873082288 <br>
test data score: 0.7 <br>

Sixth grid search: <br>
**This grid search returns the best training score as well as the best test score for the red wine data.** <br>
n_estimators = [350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 375, 385] <br>
criterion= ["gini","entropy"] <br>
max_depth = [32, 36, 40, 41, 42,43,44,45,46,47,48,52, 58, 62, 64, 66, 68] <br>
bootstrap = [True,False] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 64, 'n_estimators': 362} <br>
best grid training score is: 0.6780718270571826 <br>
test data score: 0.7125 <br>



Seventh grid search: <br>
n_estimators = [355, 360, 361, 362, 363, 364, 365, 366, 370] <br>
criterion= ["entropy"] <br>
max_depth = [30, 35, 40, 45, 50, 55, 60, 65, 70] <br>
bootstrap = [True] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 65, 'n_estimators': 363} <br>
best grid training score is: 0.6772454672245467 <br>
test data score: 0.71 <br>


Eighth grid search: <br>
**This grid search as well as the previous one both seem to be overfitting to the training data since the test data accuracies have declined in the past two searches.** <br>
n_estimators = [360, 361, 362, 363, 364, 365, 366] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [40, 50, 55, 60, 62, 63, 64, 65, 67, 69, 70] <br>
bootstrap = [True, False] <br>
best parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 62, 'n_estimators': 362} <br>
best grid training score is: 0.67807880055788 <br>
test data score: 0.7075 <br>


# retraining red model with optimized parameters 

In [40]:
# Instantiate the red-wine forest with the hyperparameters chosen from the
# grid-search notes. random_state is fixed (same seed as the train/test split)
# so the pickled model can be regenerated identically.
rf_red_optimized = RandomForestClassifier(
    n_estimators=362,
    max_depth=64,
    criterion='entropy',
    bootstrap=True,
    random_state=43,
)

# Train the optimized model on the scaled red-wine training partition.
rf_red_optimized = rf_red_optimized.fit(X_train_red_scaled, y_train_red)

# Pickle the optimized red-wine model. The context manager guarantees the file
# is flushed and closed; passing a bare open() to pickle.dump never closed it.
with open("rf_red.pkl", "wb") as model_file:
    pickle.dump(rf_red_optimized, model_file)

In [41]:
# Predicted quality labels from the optimized red model for the test partition.
predictions = rf_red_optimized.predict(X=X_test_red_scaled)

In [42]:
# Display the full array of predicted quality labels.
predictions

array([5, 7, 5, 6, 5, 7, 7, 6, 6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 6, 6,
       5, 5, 6, 7, 6, 6, 5, 6, 5, 5, 5, 5, 6, 6, 5, 6, 5, 5, 5, 7, 5, 5,
       5, 6, 5, 5, 7, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 7, 5, 5, 6,
       5, 6, 6, 5, 7, 5, 6, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 6, 6, 5, 7, 7,
       6, 6, 5, 6, 5, 7, 7, 5, 6, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 6,
       7, 7, 5, 6, 6, 6, 6, 5, 6, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5, 6, 5, 5,
       5, 5, 5, 6, 6, 5, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 6, 5, 5,
       6, 5, 6, 6, 5, 5, 7, 5, 6, 6, 5, 5, 6, 5, 5, 5, 6, 6, 6, 5, 5, 6,
       6, 5, 6, 5, 5, 5, 6, 5, 5, 6, 5, 7, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6,
       5, 5, 5, 5, 6, 5, 7, 5, 5, 5, 5, 5, 6, 7, 5, 5, 6, 7, 5, 5, 5, 6,
       5, 6, 6, 6, 5, 6, 7, 6, 6, 6, 6, 7, 6, 7, 6, 6, 6, 6, 6, 6, 5, 5,
       5, 5, 6, 6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 7, 6, 7, 5, 6, 5, 5, 6, 5,
       5, 5, 5, 5, 6, 7, 6, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6, 5, 5, 5, 6, 5,
       5, 6, 6, 5, 6, 6, 5, 6, 5, 7, 6, 6, 7, 5, 5,

# grid search to optimize hyperparameters for white wine values

In [None]:
# Build the GridSearchCV model for the white-wine forest.

# Candidate values for each hyperparameter being searched.
n_estimators = [*range(750, 1001, 50), 1100, 1250]  # 750..1000 in steps of 50, then 1100 and 1250
criterion = ["gini", "entropy"]
max_depth = [15, 35, 45, 50, 65, 75, 95, 105]
bootstrap = [True, False]

# Every combination of the lists above is evaluated.
param_grid = dict(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    bootstrap=bootstrap,
)

white_grid = GridSearchCV(estimator=rf_white, param_grid=param_grid, verbose=3)


In [None]:
# Run the white-wine grid search on the scaled training partition (can be slow).
white_grid.fit(X=X_train_white_scaled, y=y_train_white)

In [None]:
# Show the winning hyperparameter combination and its score from the search.
print(
    f'best white parameters are: {white_grid.best_params_}',
    f'best white grid training score is: {white_grid.best_score_}',
    sep='\n',
)

In [None]:
# Evaluate the fitted white-wine grid search on the held-out test partition.
white_grid_test_score = white_grid.score(X_test_white_scaled, y_test_white)
print(f'test data score: {white_grid_test_score}')

# grid search notes for white wine values

first white wine grid search: <br>
n_estimators = [100, 250, 500, 750, 1000] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [25, 50, 75, 100, 150, 250] <br>
bootstrap = [True, False] <br>
best white parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 25, 'n_estimators': 1000} <br>
best white grid training score is: 0.6689469684331499 <br>
test data score: 0.6971428571428572 <br>

second white wine grid search: <br>
n_estimators = [750, 1000, 1250, 1500, 1750] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [15, 20, 25, 30, 35, 40, 45, 50] <br>
bootstrap = [True, False] <br>
best white parameters are: {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 50, 'n_estimators': 750} <br>
best white grid training score is: 0.6705788800533837 <br>
test data score: 0.6914285714285714 <br>

third white wine grid search: <br>
n_estimators = [750, 800, 850, 900, 950, 1000, 1100, 1250] <br>
criterion= ["gini", "entropy"] <br>
max_depth = [15, 35, 45, 50, 65, 75, 95, 105] <br>
bootstrap = [True, False] <br>
best white parameters are: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 45, 'n_estimators': 850} <br>
best white grid training score is: 0.6692179651152014 <br>
test data score: 0.6979591836734694 <br>







# list out most important prediction factors

In [None]:
# Pair each red-wine feature name with the baseline forest's importance for it
# and list the pairs from most to least important.
sorted(zip(rf_red.feature_importances_, red_features.columns), reverse=True)

In [None]:
# Pair each white-wine feature name with the baseline forest's importance for it
# and list the pairs from most to least important.
sorted(zip(rf_white.feature_importances_, white_features.columns), reverse=True)

# create a prediction tool with user input
maybe use sliders?


In [None]:
# Prompt the user for a single value labelled "alcohol"; input() returns it as a string.
test_input = input("alcohol: ")

In [None]:
# Echo the raw text the user typed at the prompt.
print(test_input)
# NOTE(review): this line recomputes the scaled red training matrix and does not
# use test_input — looks like a placeholder for scaling the user's value; confirm.
X_train_red_scaled = X_red_scaler.transform(X_train_red)

# output confusion matrices

In [None]:
# Red confusion matrix: baseline red forest (rf_red) evaluated on the test partition.
predictions = rf_red.predict(X_test_red_scaled)
cm = confusion_matrix(y_true=y_test_red, y_pred=predictions, labels=rf_red.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=rf_red.classes_)

# Draw the matrix, then write the current figure to disk as a JPEG.
disp.plot()
plt.savefig(fname='model_figures/red_random_forest_confusion.jpg')

In [None]:
# White confusion matrix: baseline white forest (rf_white) evaluated on the test partition.
predictions = rf_white.predict(X_test_white_scaled)
cm = confusion_matrix(y_true=y_test_white, y_pred=predictions, labels=rf_white.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=rf_white.classes_)

# Draw the matrix, then write the current figure to disk as a JPEG.
disp.plot()
plt.savefig(fname='model_figures/white_random_forest_confusion.jpg')