In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [2]:
# read the wine data CSV file
wineDF = pd.read_csv("winequality-red.csv")

wineDF.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# split the data into y (quality) X (all other columns)
y = wineDF["quality"]
X = wineDF.drop(columns="quality")

In [4]:
y[:5]

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [5]:
X[:5]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [6]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# make the standardScaler object
scaler = StandardScaler()

In [8]:
# fit the data onto the training data
xScaler = scaler.fit(x_train)

In [9]:
# scale the data using .transform function
x_train_scaled = xScaler.transform(x_train)
x_test_scaled = xScaler.transform(x_test)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize the RandomForestRegressor model with desired hyperparameters
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Train the model on the scaled training data
model.fit(x_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred = model.predict(x_test_scaled)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("RandomForestRegressor Evaluation Metrics:")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")



RandomForestRegressor Evaluation Metrics:
MSE: 0.3337913324666683
RMSE: 0.5777467719223262
MAE: 0.43659240640828384
R-squared: 0.48271335364448764


In [11]:
# import the tree Classifier module
from sklearn import tree

In [12]:
# make an object to hold the decision tree classifier using the .DecisionTreeClassifier function
treeModel = tree.DecisionTreeClassifier()

In [13]:
# use .fit() to fit the model on our scaled x training data and the ytraining data
treeModel = treeModel.fit(x_train_scaled, y_train)

In [14]:
# make a list of predicted values
predictedValues = treeModel.predict(x_test_scaled)
predictedValues[:3]

array([7, 6, 4])

In [15]:
predictedValues

array([7, 6, 4, 6, 5, 7, 6, 5, 6, 8, 4, 6, 5, 6, 5, 6, 6, 5, 7, 7, 6, 6,
       5, 6, 5, 6, 5, 7, 6, 5, 5, 5, 6, 6, 7, 5, 5, 7, 5, 5, 6, 5, 6, 6,
       6, 7, 7, 6, 5, 5, 5, 7, 5, 5, 5, 5, 4, 6, 5, 6, 6, 6, 6, 5, 6, 5,
       6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 5, 5, 6, 5, 6, 5, 5, 6, 6,
       7, 6, 5, 5, 6, 7, 5, 5, 5, 5, 6, 4, 8, 6, 7, 7, 6, 8, 7, 7, 7, 6,
       5, 5, 5, 5, 7, 6, 4, 5, 7, 7, 7, 6, 4, 5, 6, 7, 7, 5, 6, 7, 5, 5,
       5, 7, 7, 6, 5, 7, 5, 4, 5, 6, 6, 5, 7, 6, 5, 6, 5, 6, 8, 5, 5, 6,
       5, 5, 5, 5, 8, 6, 6, 5, 5, 8, 6, 7, 5, 6, 5, 6, 6, 5, 6, 6, 6, 6,
       8, 5, 7, 6, 7, 5, 6, 5, 5, 6, 5, 5, 6, 8, 6, 5, 5, 5, 5, 8, 6, 5,
       5, 5, 7, 5, 6, 6, 5, 6, 5, 5, 7, 6, 5, 6, 5, 6, 5, 3, 5, 5, 7, 6,
       6, 4, 5, 5, 5, 7, 6, 7, 7, 6, 5, 5, 7, 7, 5, 6, 5, 7, 6, 5, 5, 5,
       5, 6, 5, 5, 7, 6, 7, 6, 5, 5, 6, 5, 7, 6, 5, 6, 6, 6, 6, 6, 6, 5,
       6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 6, 7, 6,
       6, 6, 6, 5, 5, 7, 5, 6, 5, 6, 5, 5, 5, 7, 5,

In [16]:
# import the modules for the accuracy score and classification report
from sklearn.metrics import accuracy_score, classification_report

In [17]:
# calculate the accuracy score
treeAccuracyScore = accuracy_score(y_test, predictedValues)
print(f"Decision Tree Accuracy Score: {treeAccuracyScore * 100:.2f}%")

Decision Tree Accuracy Score: 61.25%


In [18]:
# classification report
print(classification_report(y_test, predictedValues))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.22      0.18      0.20        11
           5       0.71      0.67      0.69       136
           6       0.61      0.59      0.60       128
           7       0.55      0.65      0.60        40
           8       0.11      0.33      0.17         3

    accuracy                           0.61       320
   macro avg       0.37      0.40      0.38       320
weighted avg       0.62      0.61      0.62       320



In [19]:
# attempt to improve the prerformance using an ensemble classifier (Random Forest)
# import the random forest classifier module
from sklearn.ensemble import RandomForestClassifier

In [20]:
# make the random forest classifier model - use 500 estimators
rfModel =  RandomForestClassifier(n_estimators=500, random_state=42)

In [21]:
# fit the model on the scaled trained data
rfModel = rfModel.fit(x_train_scaled, y_train)

In [22]:
# make a list of predicted values using the scaled testing data
rfPredictedValues = rfModel.predict(x_test_scaled)
rfPredictedValues[:3]

array([6, 6, 5])

In [23]:
# check the accuracy score
rfAccuracyScore = accuracy_score(y_test, rfPredictedValues)
print(f"Random Forest Accuracy Score: {rfAccuracyScore * 100:.2f}%")

Random Forest Accuracy Score: 67.50%


In [24]:
# check the classification report
print(classification_report(y_test, rfPredictedValues))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.70      0.75      0.72       136
           6       0.64      0.72      0.68       128
           7       0.78      0.53      0.63        40
           8       0.50      0.33      0.40         3

    accuracy                           0.68       320
   macro avg       0.44      0.39      0.40       320
weighted avg       0.65      0.68      0.66       320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# extract the feature importances from the random forest classifier
importantFeatures = rfModel.feature_importances_

# sort the features by their importance, zip them, then display them
zippedFeatures = sorted(zip(importantFeatures, X.columns), reverse=True)
zippedFeatures

[(0.15013120034318636, 'alcohol'),
 (0.11050550137030789, 'sulphates'),
 (0.10588744184231347, 'total sulfur dioxide'),
 (0.10240114261026227, 'volatile acidity'),
 (0.08961770969687342, 'density'),
 (0.0803094366609652, 'chlorides'),
 (0.07644213772198243, 'pH'),
 (0.07553100907034485, 'fixed acidity'),
 (0.07269342722714263, 'residual sugar'),
 (0.0708693560248985, 'citric acid'),
 (0.06561163743172306, 'free sulfur dioxide')]

In [26]:
# define a basic neural network model: 14 inputs (11 columns) -> 20 neurons on 1 layer -> 1 output (0 or 1)
nnModel01 = tf.keras.models.Sequential()

In [27]:
# Add the layers
nnModel01.add(
    tf.keras.layers.Dense(
        units=20,
        activation='relu',
        input_shape=(11,)
    )
)

nnModel01.add(
    tf.keras.layers.Dense(
        units=1,
        activation='linear'
    )
)

In [31]:
# Import the required metric
from tensorflow.keras.metrics import RootMeanSquaredError

# Compile the model
nnModel01.compile(loss='mean_squared_error', metrics=['mae', RootMeanSquaredError()])

# Train the model
fitModel01 = nnModel01.fit(x_train_scaled, y_train, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [32]:
# Make predictions on the test set
y_pred1 = nnModel01.predict(x_test_scaled)

# Calculate and print the R-squared score
r2 = r2_score(y_test, y_pred1)
print(f"R-squared (R2): {r2:.2f}")

R-squared (R2): 0.34


In [33]:
import numpy as np
from sklearn.metrics import accuracy_score

# Make rounded predictions by rounding the predicted values
roundedPred = np.round(y_pred1)

# Convert the rounded predictions to integers
roundedPred = roundedPred.astype(int)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, roundedPred)
print(f"Accuracy Score: {accuracy*100:.2f}%")

Accuracy Score: 61.88%


In [35]:
# Prompt the user for input
alcohol = float(input("Enter alcohol content: "))
sulphates = float(input("Enter sulphates content: "))
total_sulfur_dioxide = float(input("Enter total sulfur dioxide: "))
volatile_acidity = float(input("Enter volatile acidity: "))
density = float(input("Enter density: "))
chlorides = float(input("Enter chlorides: "))
pH = float(input("Enter pH: "))
fixed_acidity = float(input("Enter fixed acidity: "))
residual_sugar = float(input("Enter residual sugar: "))
citric_acid = float(input("Enter citric acid: "))
free_sulfur_dioxide = float(input("Enter free sulfur dioxide: "))

# Create a DataFrame from the user input
input_data = pd.DataFrame({
    'fixed acidity': [fixed_acidity],
    'volatile acidity': [volatile_acidity],
    'citric acid': [citric_acid],
    'residual sugar': [residual_sugar],
    'chlorides': [chlorides],
    'free sulfur dioxide': [free_sulfur_dioxide],
    'total sulfur dioxide': [total_sulfur_dioxide],
    'density': [density],
    'pH': [pH],
    'sulphates': [sulphates],
    'alcohol': [alcohol]
})

# Scale the input data using the pre-trained scaler
input_scaled = pd.DataFrame(scaler.transform(input_data), columns=input_data.columns)

# Make predictions for the user input
predicted_quality = nnModel01.predict(input_scaled)[0][0]
rounded_predicted_quality = round(predicted_quality)

# Print the predicted quality
print(f"Predicted Wine Quality: {rounded_predicted_quality}")


Enter alcohol content: 9.4
Enter sulphates content: 0.56
Enter total sulfur dioxide: 34
Enter volatile acidity: 0.7
Enter density: 0.9978
Enter chlorides: 0.076
Enter pH: 3.51
Enter fixed acidity: 7.4
Enter residual sugar: 1.9
Enter citric acid: 0
Enter free sulfur dioxide: 11
Predicted Wine Quality: 5


In [36]:
# Prompt the user for input
alcohol = float(input("Enter alcohol content: "))
sulphates = float(input("Enter sulphates content: "))
total_sulfur_dioxide = float(input("Enter total sulfur dioxide: "))
volatile_acidity = float(input("Enter volatile acidity: "))
density = float(input("Enter density: "))
chlorides = float(input("Enter chlorides: "))
pH = float(input("Enter pH: "))
fixed_acidity = float(input("Enter fixed acidity: "))
residual_sugar = float(input("Enter residual sugar: "))
citric_acid = float(input("Enter citric acid: "))
free_sulfur_dioxide = float(input("Enter free sulfur dioxide: "))

# Create a DataFrame from the user input
input_data = pd.DataFrame({
    'fixed acidity': [fixed_acidity],
    'volatile acidity': [volatile_acidity],
    'citric acid': [citric_acid],
    'residual sugar': [residual_sugar],
    'chlorides': [chlorides],
    'free sulfur dioxide': [free_sulfur_dioxide],
    'total sulfur dioxide': [total_sulfur_dioxide],
    'density': [density],
    'pH': [pH],
    'sulphates': [sulphates],
    'alcohol': [alcohol]
})

# Scale the input data using the pre-trained scaler
input_scaled = pd.DataFrame(scaler.transform(input_data), columns=input_data.columns)

# Make predictions for the user input
predicted_quality = nnModel01.predict(input_scaled)[0][0]
rounded_predicted_quality = round(predicted_quality)

# Print the predicted quality
print(f"Predicted Wine Quality: {rounded_predicted_quality}")


Enter alcohol content: 10
Enter sulphates content: 0.47
Enter total sulfur dioxide: 21
Enter volatile acidity: 0.65
Enter density: 0.9946
Enter chlorides: 0.065
Enter pH: 3.39
Enter fixed acidity: 7.3
Enter residual sugar: 1.2
Enter citric acid: 0
Enter free sulfur dioxide: 15
Predicted Wine Quality: 5
