In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [2]:
# Load the red-wine quality data from the CSV file in the working directory
wineDF = pd.read_csv(filepath_or_buffer="winequality-red.csv")

# Preview the first five rows
wineDF.head(n=5)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Separate the predictors (every column except quality) from the target (quality)
X = wineDF.drop(columns=["quality"])
y = wineDF["quality"]

In [4]:
y.head(5)

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [5]:
X.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the quality-class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# make the StandardScaler object (it is fitted in a later cell)
scaler = StandardScaler()

In [8]:
# fit the scaler on the training data only, so the scaling statistics come from the training split.
# scikit-learn's fit() returns the estimator it was called on, so xScaler and
# scaler are two names for the same fitted object.
xScaler = scaler.fit(x_train)

In [9]:
# Apply the fitted scaler to both splits (training first, then testing)
x_train_scaled, x_test_scaled = (
    xScaler.transform(split) for split in (x_train, x_test)
)

In [10]:
# import the modules for the accuracy score and classification report
from sklearn.metrics import accuracy_score, classification_report

In [11]:
# attempt to improve the performance using an ensemble classifier (Random Forest)
# import the random forest classifier module
from sklearn.ensemble import RandomForestClassifier

In [12]:
# import the random forest classifier module
from sklearn.ensemble import RandomForestClassifier

# Ignore the warning related to feature names
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Thin subclass of RandomForestClassifier, kept under its original name so the
# rest of the notebook keeps working unchanged.
class ModifiedRandomForest(RandomForestClassifier):
    """RandomForestClassifier whose ``fit`` delegates straight to the parent class."""

    def fit(self, X, y, sample_weight=None):
        """Fit the forest on ``X`` / ``y`` and return the fitted estimator.

        Args:
            X: Training features.
            y: Training targets.
            sample_weight: Optional per-sample weights. Forwarded unchanged so
                this override accepts the same arguments as
                ``RandomForestClassifier.fit`` instead of rejecting them.

        Returns:
            Whatever the parent ``fit`` returns (the fitted estimator).
        """
        return super().fit(X, y, sample_weight=sample_weight)

# Build the random forest classifier: 500 trees, with a fixed seed for repeatable results
rfModel = ModifiedRandomForest(
    random_state=42,
    n_estimators=500,
)

In [13]:
# fit the model on the scaled training data
# (the result of fit() is bound back to rfModel, so the cell displays no output)
rfModel = rfModel.fit(x_train_scaled, y_train)

In [14]:
# predict a quality label for every row of the scaled testing data
# (the result is an array, as the output below shows)
rfPredictedValues = rfModel.predict(x_test_scaled)
# peek at the first three predictions
rfPredictedValues[:3]

array([6, 6, 5])

In [15]:
# Share of test rows whose predicted quality matches the true quality, shown as a percentage
rfAccuracyScore = accuracy_score(
    y_true=y_test,
    y_pred=rfPredictedValues,
)
print(f"Random Forest Accuracy Score: {rfAccuracyScore * 100:.2f}%")

Random Forest Accuracy Score: 67.50%


In [16]:
# check the classification report.
# If a quality class is never predicted, its precision is undefined.
# zero_division=0 states explicitly that such entries are reported as 0,
# rather than relying on the default, which reports 0 and emits a warning.
print(classification_report(y_test, rfPredictedValues, zero_division=0))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00        11
           5       0.70      0.75      0.72       136
           6       0.64      0.72      0.68       128
           7       0.78      0.53      0.63        40
           8       0.50      0.33      0.40         3

    accuracy                           0.68       320
   macro avg       0.44      0.39      0.40       320
weighted avg       0.65      0.68      0.66       320



In [17]:
# Pull the per-feature importances out of the fitted forest
importantFeatures = rfModel.feature_importances_

# Pair each importance with its column name, order the pairs from most to
# least important, then display them
zippedFeatures = list(zip(importantFeatures, X.columns))
zippedFeatures.sort(reverse=True)
zippedFeatures

[(0.15013120034318636, 'alcohol'),
 (0.11050550137030789, 'sulphates'),
 (0.10588744184231347, 'total sulfur dioxide'),
 (0.10240114261026227, 'volatile acidity'),
 (0.08961770969687342, 'density'),
 (0.0803094366609652, 'chlorides'),
 (0.07644213772198243, 'pH'),
 (0.07553100907034485, 'fixed acidity'),
 (0.07269342722714263, 'residual sugar'),
 (0.0708693560248985, 'citric acid'),
 (0.06561163743172306, 'free sulfur dioxide')]

In [18]:
def predict_wine_quality(model=None, feature_scaler=None):
    """Prompt for one wine's measurements and print its predicted quality.

    The eleven measurements are read from standard input one at a time,
    arranged in the column order used for training, scaled, and passed to the
    classifier. The predicted label is printed; nothing is returned.

    Args:
        model: Fitted estimator with a ``predict`` method. Defaults to the
            notebook-level ``rfModel``. Pass a reloaded model to exercise it.
        feature_scaler: Fitted transformer with a ``transform`` method.
            Defaults to the notebook-level ``scaler``.

    Raises:
        ValueError: If an answer cannot be converted to ``float``.
    """
    # Fall back to the notebook-level objects so existing zero-argument calls
    # behave exactly as before.
    if model is None:
        model = rfModel
    if feature_scaler is None:
        feature_scaler = scaler

    # (column name, prompt text) pairs, in the order the questions are asked.
    prompts = [
        ('alcohol', "Enter alcohol content: "),
        ('sulphates', "Enter sulphates content: "),
        ('total sulfur dioxide', "Enter total sulfur dioxide: "),
        ('volatile acidity', "Enter volatile acidity: "),
        ('density', "Enter density: "),
        ('chlorides', "Enter chlorides: "),
        ('pH', "Enter pH: "),
        ('fixed acidity', "Enter fixed acidity: "),
        ('residual sugar', "Enter residual sugar: "),
        ('citric acid', "Enter citric acid: "),
        ('free sulfur dioxide', "Enter free sulfur dioxide: "),
    ]
    answers = {column: float(input(prompt)) for column, prompt in prompts}

    # The question order differs from the training column order, so the
    # single-row frame is laid out explicitly in the order the scaler was
    # fitted with.
    feature_order = [
        'fixed acidity',
        'volatile acidity',
        'citric acid',
        'residual sugar',
        'chlorides',
        'free sulfur dioxide',
        'total sulfur dioxide',
        'density',
        'pH',
        'sulphates',
        'alcohol',
    ]
    input_data = pd.DataFrame(
        [[answers[column] for column in feature_order]],
        columns=feature_order,
    )

    # Scale the input and hand the result to the model as the plain array that
    # transform() returns. The model was trained on such an array; wrapping it
    # in a DataFrame again is what made scikit-learn warn about feature names.
    input_scaled = feature_scaler.transform(input_data)

    # A classifier returns the class label itself, so no rounding is needed.
    predicted_quality = model.predict(input_scaled)[0]

    # Print the predicted quality
    print(f"Predicted Wine Quality: {predicted_quality}")

In [19]:
# Call the function: it prompts for the eleven measurements, then prints the predicted quality
predict_wine_quality()

Enter alcohol content: 9.4
Enter sulphates content: 0.65
Enter total sulfur dioxide: 34
Enter volatile acidity: 0.7
Enter density: 0.9978
Enter chlorides: 0.076
Enter pH: 3.51
Enter fixed acidity: 7.4
Enter residual sugar: 1.9
Enter citric acid: 0
Enter free sulfur dioxide: 11
Predicted Wine Quality: 5


In [20]:
import joblib

# Save the trained model to a file in the working directory.
# NOTE(review): only the model is written here; no visible cell saves the
# fitted scaler, which predict_wine_quality() also applies before predicting.
# Confirm whether it should be persisted alongside the model.
joblib.dump(rfModel, 'qualityWineRandomForestModel.pkl')

['qualityWineRandomForestModel.pkl']

In [21]:
# Load the model back into the notebook. joblib files are pickles, which can
# execute arbitrary code when loaded, so only load files from a trusted source
# (this one was written by this notebook).
loaded_model = joblib.load('qualityWineRandomForestModel.pkl')

# Sanity-check the round trip: the reloaded model must reproduce the
# predictions the in-memory model made on the scaled test set.
assert (loaded_model.predict(x_test_scaled) == rfPredictedValues).all(), \
    "Reloaded model does not reproduce the original predictions"

In [None]:
# NOTE(review): called without arguments, predict_wine_quality() runs against
# the notebook-level rfModel, so this call does not exercise loaded_model.
predict_wine_quality()