# Wine Quality

In [11]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [14]:
df = pd.read_csv("winequality.csv")
df.sample(5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
6029,red,5.9,0.19,0.21,1.7,0.045,57.0,135.0,0.99341,3.32,0.44,9.5,5
3914,white,6.2,0.39,0.24,4.8,0.037,45.0,138.0,0.99174,3.23,0.43,11.2,7
3951,white,6.3,0.33,0.2,17.9,0.066,36.0,161.0,0.9991,3.14,0.51,8.8,5
2043,white,7.4,0.18,0.34,2.7,0.03,30.0,107.0,0.992,2.97,0.53,11.0,6
6442,red,8.4,0.37,0.43,2.3,0.063,12.0,19.0,0.9955,3.17,0.81,11.2,7


In [15]:
# wine type
df["wine_type"] = [1 if wine_type == "white" else 0 for wine_type in df["type"]]
df.sample(5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
5118,red,7.8,0.34,0.37,2.0,0.082,24.0,58.0,0.9964,3.34,0.59,9.4,6,0
462,white,5.9,0.25,0.19,12.4,0.047,50.0,162.0,0.9973,3.35,0.38,9.5,5,1
5807,red,7.3,0.305,0.39,1.2,0.059,7.0,11.0,0.99331,3.29,0.52,11.5,6,0
317,white,6.6,0.24,0.35,7.7,0.031,36.0,135.0,0.9938,3.19,0.37,10.5,5,1
4816,white,6.1,0.41,0.2,12.6,0.032,54.0,136.0,0.99516,2.91,0.43,10.6,6,1


In [16]:
# wine quality
df["wine_quality"] = [1 if quality > 5 else 0 for quality in df["quality"]]
df.sample(5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type,wine_quality
5214,red,9.6,0.56,0.23,3.4,0.102,37.0,92.0,0.9996,3.3,0.65,10.1,5,0,0
1482,white,6.7,0.16,0.49,2.4,0.046,57.0,187.0,0.9952,3.62,0.81,10.4,6,1,1
2300,white,7.3,0.3,0.33,2.3,0.043,28.0,125.0,0.99084,3.34,0.44,12.6,7,1,1
5845,red,8.3,0.28,0.48,2.1,0.093,6.0,12.0,0.99408,3.26,0.62,12.4,7,0,1
4071,white,7.6,0.26,0.32,1.3,0.048,23.0,76.0,0.9903,2.96,0.46,12.0,6,1,1


In [17]:
df.drop(columns=["type", "quality"], inplace=True)
df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,wine_type,wine_quality
418,7.4,0.21,0.27,1.2,0.041,27.0,99.0,0.9927,3.19,0.33,9.8,1,1
1944,6.0,0.45,0.42,1.1,0.051,61.0,197.0,0.9932,3.02,0.4,9.0,1,0
422,7.0,0.21,0.28,8.6,0.045,37.0,221.0,0.9954,3.25,0.54,10.4,1,1
2277,7.4,0.14,0.3,1.3,0.033,25.0,91.0,0.99268,3.53,0.39,10.6,1,1
3234,6.6,0.25,0.34,3.0,0.054,22.0,141.0,0.99338,3.26,0.47,10.4,1,1


In [21]:
X = df.drop(columns=["wine_quality"])
y = df["wine_quality"]


In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y, test_size = 0.2, stratify=y, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5197, 12)
(1300, 12)
(5197,)
(1300,)


In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

tf.random.set_seed(42) #This is crucial for ensuring reproducibility in your experiments.

## Callbacks

#### Model Checkpoint - saves the model on a current epoch if the validation metric improved from the previous epoch

In [65]:
cb_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath = "/content/drive/MyDrive/checkpoints/model-{epoch:02d}-{val_accuracy:2f}.hdf5",
    monitor = "val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1
)




#### ReduceLROnPlateau - reduces learning rate when there's no progress in training

In [66]:
cb_reducelr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    mode="min",
    factor=0.1,
    patience=10,
    verbose=1,
    min_lr=0.0001
)

In [67]:
cb_earlystop = tf.keras.callbacks.EarlyStopping(
    monitor = "val_accuracy",
    mode="max",
    min_delta=0.001,
    patience=10,
    verbose=1
)

In [68]:
cb_csvlogger = tf.keras.callbacks.CSVLogger(
    filename = "training_log.csv",
    separator = ",",
    append=False
)

## Model Training

In [74]:
model = Sequential([
    Dense(units=64, activation="relu"),
    Dense(units=64, activation="relu"),
    Dense(units=64, activation="relu"),
    Dense(units=1, activation="sigmoid")]
)

In [75]:
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam()
metrics = tf.keras.metrics.BinaryAccuracy(name="accuracy")

model.compile(
    loss=loss,
    optimizer=opt,
    metrics=metrics
)

In [78]:
model.fit(
    X_train,
    y_train,
    epochs = 1000,
    validation_data = (X_test, y_test),
    callbacks=[cb_checkpoint, cb_reducelr, cb_earlystop, cb_csvlogger]
)

Epoch 1/1000
Epoch 1: val_accuracy did not improve from 0.36692
Epoch 2/1000
Epoch 2: val_accuracy did not improve from 0.36692
Epoch 3/1000
Epoch 3: val_accuracy did not improve from 0.36692
Epoch 4/1000
Epoch 4: val_accuracy did not improve from 0.36692
Epoch 5/1000
Epoch 5: val_accuracy did not improve from 0.36692
Epoch 6/1000
Epoch 6: val_accuracy did not improve from 0.36692
Epoch 7/1000
Epoch 7: val_accuracy did not improve from 0.36692
Epoch 8/1000
Epoch 8: val_accuracy did not improve from 0.36692
Epoch 9/1000
Epoch 9: val_accuracy did not improve from 0.36692
Epoch 10/1000
Epoch 10: val_accuracy did not improve from 0.36692

Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 11/1000
Epoch 11: val_accuracy did not improve from 0.36692
Epoch 11: early stopping


<keras.src.callbacks.History at 0x7bf0143de290>