<a href="https://colab.research.google.com/github/imiro/bangkit-w05-winequality/blob/notebook-refactor/bangkit-w05-winequality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation

In [1]:
# Install the tensorflow_docs helper package (imported below as `tfdocs`) from its GitHub repository.
!pip install -q git+https://github.com/tensorflow/docs

  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone


In [2]:
#@title Import statements
#@content Tensorflow version

import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

2.2.0-rc2


In [0]:
#@title (Misc) GDrive integration
# Mount Google Drive at /content/gdrive; the export cells at the end of the
# notebook write the trained model under that mount point.
# `os` is used later for os.path.dirname() on the checkpoint paths.
import os
from google.colab import drive
drive.mount('/content/gdrive')

# Read dataset from CSV file

In [0]:
# Location of the red-wine quality CSV in the project repository.
url = 'https://raw.githubusercontent.com/imiro/bangkit-w05-winequality/master/datasets/winequality-red.csv'

# Load the dataset, skipping malformed rows with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0 in
# favour of `on_bad_lines`, so try the current keyword first and only fall back
# to the legacy one on older pandas releases.
try:
  df = pd.read_csv(url, on_bad_lines='warn')
except TypeError:
  # pandas < 1.3 rejects the unknown `on_bad_lines` keyword before reading anything.
  df = pd.read_csv(url, error_bad_lines=False)

# Dataset characteristics

In [0]:
#@title Import plotting functions

import matplotlib.pyplot as plt
import seaborn as sns
import pandas.util.testing as tm
from collections import Counter

In [0]:
# Preview the first 10 rows of the loaded frame.
df.head(10)

Here we explore what our dataset has to offer.

In [0]:
# Print the column dtypes and non-null counts.
df.info()

In [0]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [0]:
# List the column names.
df.columns

In [0]:
# Count how many rows carry each raw `quality` score.
Counter(df['quality'])

In [0]:
# Bar chart of the number of rows per `quality` score.
sns.countplot(x='quality', data=df)

In [0]:
# Pairwise relationship plots across the columns of `df`.
sns.pairplot(df)

In [0]:
def quality_to_rating(quality):
  """Collapse a raw quality score into one of three rating classes.

  Args:
    quality: Raw quality score of a single row.

  Returns:
    0 for scores in [0, 5), 1 for scores in [5, 7), and 2 for anything else.
  """
  if 0 <= quality < 5:
    return 0
  if 5 <= quality < 7:
    return 1
  return 2

# `Rating` is the 3-class label the model is trained on.
df['Rating'] = df['quality'].map(quality_to_rating)

# No second rating column is added to `df`: every column other than `quality`
# and `Rating` is fed to the model as a feature, so a copy of the raw score (or
# of the label) left in the frame would leak the answer into the inputs.

In [0]:
# df['Rating'] = df['Rating'].map({0: 'Bad', 1: 'Good'})
# df = pd.get_dummies(df, prefix='', prefix_sep='')
# df.tail()

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Rating
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023,1.09631
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569,0.407354
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0,1.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0,1.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0,1.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0,2.0


# Data Prep

In [0]:
# Work on a copy so `df` keeps the raw data, then remove the original `quality`
# score: the label is the `Rating` column derived from it, so keeping `quality`
# as a feature would leak the label.
dataset = df.copy()
dataset.pop("quality")

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: quality, Length: 1599, dtype: int64

## Create test data

The source does not provide a separate test set, so we have to carve one out of the available data.

We decided to train on 80% of the data and hold out the remaining 20% as the test data.

To avoid bias, we divide the train and test data with random sampling.

In [0]:
# Hold-out split: randomly sample 80% of the rows for training (fixed seed, so
# the split is reproducible) and keep every row that was not sampled for testing.
train_dataset = dataset.sample(random_state=0, frac=0.8)
test_dataset = dataset.drop(index=train_dataset.index)

In [0]:
train_dataset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Rating
1109,10.8,0.470,0.43,2.10,0.171,27.0,66.0,0.99820,3.17,0.76,10.8,1
1032,8.1,0.820,0.00,4.10,0.095,5.0,14.0,0.99854,3.36,0.53,9.6,1
1002,9.1,0.290,0.33,2.05,0.063,13.0,27.0,0.99516,3.26,0.84,11.7,2
487,10.2,0.645,0.36,1.80,0.053,5.0,14.0,0.99820,3.17,0.42,10.0,1
979,12.2,0.450,0.49,1.40,0.075,3.0,6.0,0.99690,3.13,0.63,10.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
281,7.7,0.270,0.68,3.50,0.358,5.0,10.0,0.99720,3.25,1.08,9.9,2
932,7.6,0.400,0.29,1.90,0.078,29.0,66.0,0.99710,3.45,0.59,9.5,1
732,7.3,0.835,0.03,2.10,0.092,10.0,19.0,0.99660,3.39,0.47,9.6,1
715,7.2,0.490,0.18,2.70,0.069,13.0,34.0,0.99670,3.29,0.48,9.2,1


In [0]:
# Per-feature summary statistics of the training split, one row per feature.
# The `Rating` label is excluded; the `mean` and `std` columns are what the
# normalisation step reads.
train_stats = train_dataset.describe().drop(columns="Rating").T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1279.0,8.310164,1.74419,4.6,7.1,7.9,9.25,15.9
volatile acidity,1279.0,0.525571,0.176645,0.12,0.39,0.52,0.635,1.58
citric acid,1279.0,0.271618,0.196822,0.0,0.09,0.26,0.43,0.79
residual sugar,1279.0,2.516341,1.300985,0.9,1.9,2.2,2.6,13.9
chlorides,1279.0,0.087347,0.045666,0.012,0.071,0.079,0.091,0.611
free sulfur dioxide,1279.0,15.868647,10.43888,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1279.0,46.488663,32.952841,6.0,22.0,38.0,62.0,289.0
density,1279.0,0.996739,0.001888,0.99007,0.9956,0.99676,0.997855,1.0032
pH,1279.0,3.312588,0.153923,2.87,3.21,3.31,3.4,4.01
sulphates,1279.0,0.655012,0.160192,0.33,0.55,0.62,0.73,1.98


## Separate label and features

In [0]:
# Detach the `Rating` label from each split. `pop` removes the column in place,
# so after this cell the two frames contain features only, and running the
# cell a second time raises KeyError because the column is already gone.
train_labels = train_dataset.pop('Rating')
test_labels = test_dataset.pop('Rating')

## Normalize data

Earlier, in the description of the dataset characteristics, we saw that the value ranges of the features differ widely.

For example, values of `fixed acidity` range from `4.6` to `15.9`, while `total sulfur dioxide` ranges from `6` to `289`. Meanwhile, `pH` values only range from `2.74` to `4.01`.

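We fear that this can cause issues when training the model. Therefore, we convert each feature value to its Z-score, $z = (x - \mu) / \sigma$, where $\mu$ and $\sigma$ are the mean and standard deviation of that feature in the training set.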

In [0]:
def norm(x):
  """Convert `x` to z-scores using the training-set statistics.

  Reads the notebook-level `train_stats` table and aligns its `mean` and `std`
  entries with the columns of `x`.

  Args:
    x: Frame of feature values.

  Returns:
    `(x - mean) / std`, computed column by column.
  """
  feature_means = train_stats['mean']
  feature_stds = train_stats['std']
  centered = x - feature_means
  return centered / feature_stds
# Scale both splits with the training-set mean/std (via `train_stats`), so no
# statistic computed from the test rows enters the preprocessing.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)

In [0]:
# Sanity check: number of training labels.
train_labels.shape

(1279,)

# Define a function that builds the model

In [0]:
def build_model(my_learning_rate, num_features=None, num_classes=3,
                hidden_units=128, dropout_rate=0.2):
  """Build and compile a small feed-forward classifier.

  Architecture: Flatten -> Dense(hidden_units, relu) -> Dropout -> Dense(num_classes).
  The last layer has no activation, so the model outputs raw logits; the loss
  is configured with `from_logits=True` to match.

  Args:
    my_learning_rate: Learning rate passed to the Adam optimizer.
    num_features: Number of input features. When omitted, the column count of
      the notebook-level `train_dataset` is used, so that frame must already
      have its label column removed.
    num_classes: Number of output classes; labels are expected as integer
      class ids because the loss is the sparse categorical cross-entropy.
    hidden_units: Width of the hidden Dense layer.
    dropout_rate: Dropout rate applied after the hidden layer.

  Returns:
    A compiled Sequential model that reports accuracy as its metric.
  """
  if num_features is None:
    # Default to the feature frame prepared earlier in the notebook.
    num_features = len(train_dataset.keys())

  model = keras.Sequential([
    layers.Flatten(input_shape=[num_features]),
    layers.Dense(hidden_units, activation='relu'),
    layers.Dropout(dropout_rate),
    layers.Dense(num_classes),
  ])

  optimizer = tf.keras.optimizers.Adam(learning_rate=my_learning_rate)

  model.compile(optimizer=optimizer,
                loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

In [0]:
# example_batch = normed_train_data[:10]
# example_result = model.predict(example_batch)
# example_result

# Set hyperparameters

In [0]:
# Number of passes over the training data (passed to model.fit as `epochs`).
EPOCHS = 100
# Learning rate handed to build_model for the Adam optimizer.
learning_rate = 0.01

# Build and train the model

In [0]:
# Build the compiled classifier and print its layer / parameter summary.
model = build_model(learning_rate)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 11)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1536      
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 387       
Total params: 1,923
Trainable params: 1,923
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Target location on the mounted Drive for weight checkpoints.
checkpoint_path = "/content/gdrive/My Drive/Trained_Models/wine_binary_quality/wine_binary_quality.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that would save the weights (weights only) during training.
# NOTE(review): `cp_callback` is never passed to `model.fit` below -- both
# `callbacks=` lines are commented out -- so no checkpoint is written here.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
# Train on the normalised features for EPOCHS epochs; the commented-out
# arguments are alternative settings that are currently disabled.
history = model.fit(
  normed_train_data, train_labels,
  # train_dataset, train_labels,
  epochs=EPOCHS, 
  # validation_split = 0.2, 
  # verbose=0,
  # callbacks=[cp_callback])
  # callbacks=[tfdocs.modeling.EpochDots()])
  # batch_size=1
  )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [0]:
# Evaluate on the held-out split, using the same normalisation as training.
test_loss, test_acc = model.evaluate(normed_test_data, test_labels, verbose=2)
# test_loss, test_acc = model.evaluate(test_dataset, test_labels, verbose=2)
print('Test accuracy:', test_acc)

10/10 - 0s - loss: 1.1818 - accuracy: 0.8656
Test accuracy: 0.8656250238418579


In [0]:
# The trained model outputs logits (its last Dense layer has no activation),
# so append a Softmax layer to obtain per-class probabilities.
probability_model = tf.keras.Sequential([model, 
                                         tf.keras.layers.Softmax()])

In [0]:
# Class probabilities for every row of the normalised test split.
predictions = probability_model.predict(
    normed_test_data)
# predictions

In [0]:
# True labels of the test split, for comparison with the predictions.
test_labels

In [0]:
# Predicted class (index of the highest probability) for the last test row.
i = len(predictions)
np.argmax(predictions[i-1])

1

# (Misc) Export model(s) to GDrive

In [0]:
# Checkpoint location inside the export folder on the mounted Drive.
# NOTE(review): the later export cells use literal paths and do not reference
# `checkpoint_path` / `checkpoint_dir` -- confirm whether these are still needed.
checkpoint_path = "/content/gdrive/My Drive/Trained_Models/wine_binary_classifier/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

In [0]:
# Save only the trained weights (no architecture or optimizer state) to Drive.
model.save_weights('/content/gdrive/My Drive/Trained_Models/wine_binary_classifier/wine_binary_quality_model')

In [0]:
# Export directory on the mounted Drive; used by model.save and the TFLite converter.
# NOTE(review): `model_save_name` is not referenced by the cells shown here.
model_save_name = 'wine_binary_classifier'
save_model_path = '/content/gdrive/My Drive/Trained_Models/wine_binary_classifier'

In [0]:
# Export the full model as a SavedModel directory (the TFLite conversion reads it back).
model.save(save_model_path)

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Trained_Models/wine_binary_classifier/assets


INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Trained_Models/wine_binary_classifier/assets


In [0]:
# Also export the full model as a single HDF5 file (selected by the .h5 extension).
model.save('/content/gdrive/My Drive/Trained_Models/wine_binary_classifier/wine_binary_classifier.h5') 

In [0]:
# Convert the SavedModel directory exported above into a TensorFlow Lite model.
converter = tf.lite.TFLiteConverter.from_saved_model(save_model_path)
tflite_model = converter.convert()

# Write through a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
tflite_path = "/content/gdrive/My Drive/Trained_Models/wine_binary_classifier/wine_binary_classifier.tflite"
with open(tflite_path, "wb") as tflite_file:
  bytes_written = tflite_file.write(tflite_model)

# Display the size of the written model in bytes.
bytes_written

8776