In [1]:
import pandas as pd 
import numpy as np

import tensorflow as tf

from tensorflow.keras import layers, optimizers, regularizers
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Sequential

from tensorflow.keras.utils import plot_model
#from kt_utils import *
import tensorflow.keras.backend as K

import seaborn as sns

from sklearn import preprocessing, model_selection 

import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

In [2]:
data = pd.read_csv("../data/wine_red_white.csv")
data.drop("index", axis=1, inplace=True)
data["quality"] = data["quality"].astype('object')
data.sample(5)


Unnamed: 0,color,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
247,red,8.2,0.6,0.17,2.3,0.072,11.0,73.0,0.9963,3.2,0.45,9.3,5
5134,white,6.4,0.28,0.44,7.1,0.048,49.0,179.0,0.99528,3.15,0.48,9.2,5
2546,white,8.5,0.16,0.35,1.6,0.039,24.0,147.0,0.9935,2.96,0.36,10.0,5
6159,white,5.4,0.27,0.22,4.6,0.022,29.0,107.0,0.98889,3.33,0.54,13.8,6
511,red,10.0,0.59,0.31,2.2,0.09,26.0,62.0,0.9994,3.18,0.63,10.2,6


In [3]:
# Split red / white
df_splitter = data["color"] == 'red'
df_red = data[df_splitter]
df_white = data[~df_splitter]

In [4]:
# method for detecting the outliers using interquantilerange technique 
def detect_outliers(data): 
    quantile1, quantile3 = np.percentile(data, [25, 75])  # create two quantiles for 25% and 75%
    iqr_val = quantile3 - quantile1                       # interquantilerange value
    lower_bound_value = quantile1 - (1.5 * iqr_val)       # lower limit of the data, anything greater are not outliers
    upper_bound_value = quantile3 + (1.5 * iqr_val)       # upper limit of the data, anything less are not outliers
            
    return lower_bound_value, upper_bound_value

In [5]:
def drop_outliers(df):
    for column, dtype in df.dtypes.iteritems():
        if dtype != "object":
            min_outlier, max_outlier = detect_outliers(df[column])
            #print(column, min_outlier, max_outlier)
            # drop lower than min
            df = df.drop(df[df[column] < min_outlier].index)
            # drop upper than mean
            df = df.drop(df[df[column] > max_outlier].index)
    return df

In [6]:
df_red = drop_outliers(df_red)
df_white = drop_outliers(df_white)

In [7]:
data = pd.concat([df_red, df_white], ignore_index=True)

In [8]:
data["color"] = pd.get_dummies(data["color"], drop_first=True)
data

Unnamed: 0,color,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,0,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,0,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,0,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5103,1,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
5104,1,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
5105,1,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
5106,1,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [9]:
X = data.drop(["quality","citric acid","chlorides", "free sulfur dioxide"], axis=1)
y = data["quality"] 

X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size=0.2, stratify=y)

print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

(4086, 9) (4086,) (1022, 9) (1022,)


In [10]:
#from sklearn.preprocessing import StandardScaler

# Define the scaler 
#scaler = StandardScaler().fit(X_train)
# Scale the train set
#X_train = scaler.transform(X_train)
# Scale the test set
#X_test = scaler.transform(X_test)

In [11]:
import pandas as pd
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [12]:
X_train

array([[1.        , 0.33333333, 0.11891892, ..., 0.13953488, 0.26388889,
        0.06896552],
       [1.        , 0.14666667, 0.27027027, ..., 0.46511628, 0.19444444,
        0.74137931],
       [1.        , 0.21333333, 0.20540541, ..., 0.3255814 , 0.51388889,
        0.46551724],
       ...,
       [1.        , 0.18666667, 0.27027027, ..., 0.45348837, 0.36111111,
        0.17241379],
       [0.        , 0.56      , 0.77837838, ..., 0.59302326, 0.59722222,
        0.34482759],
       [1.        , 0.33333333, 0.15135135, ..., 0.48837209, 0.61111111,
        0.10344828]])

In [13]:
y_train_hot = pd.get_dummies(y_train)
y_train_hot

Unnamed: 0,3,4,5,6,7,8,9
4638,0,0,0,0,0,1,0
3600,0,0,0,0,1,0,0
4549,0,0,0,1,0,0,0
603,0,0,1,0,0,0,0
1751,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
831,0,0,0,0,1,0,0
1673,0,0,1,0,0,0,0
3086,0,0,1,0,0,0,0
597,0,0,0,1,0,0,0


In [14]:
y_test_hot = pd.get_dummies(y_test)
y_test_hot

Unnamed: 0,3,4,5,6,7,8,9
722,0,0,0,1,0,0,0
1309,0,0,0,1,0,0,0
285,0,0,0,1,0,0,0
2299,0,0,0,0,1,0,0
461,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
3728,0,0,0,1,0,0,0
3872,0,0,0,0,1,0,0
1181,0,0,0,1,0,0,0
1942,0,0,0,0,0,1,0


In [15]:
winemod1 = Sequential()
winemod1.add(Dense(30, input_dim=X.shape[1], activation='relu'))
winemod1.add(Dense(7, activation='softmax'))
#winemod1.add(Dense(1, input_dim=11, activation='relu'))
#winemod1.add(Dense(512, input_dim=X.shape[1], activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(256, activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(128, activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(64, activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(32, activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(16, activation='relu'))
#winemod1.add(Dropout(.2))
#winemod1.add(Dense(1))

In [16]:
winemod1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                300       
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 217       
Total params: 517
Trainable params: 517
Non-trainable params: 0
_________________________________________________________________


In [17]:
class NeuralNet:
    
    # initalize the class variables
    def __init__(self, learning_rate = 0.001, batch_size = 100, epochs = 1000):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
    
    # One hot encoding of the labels
    def dense_to_one_hot(self, labels_dense, num_classes = 2):
        # Intialize the output variable
        labels_one_hot = []
        # Frame the label array
        for label in labels_dense:
            indices = [1]*num_classes
            indices[label] = 0
            labels_one_hot.append(indices)
        return labels_one_hot

    # Create batches of the training set
    def make_batch(self, X, y, batch_size):
        # Compute the length of the data
        y_size = len(y)
        # Create random batches
        index_sample = np.random.choice(X.shape[0], batch_size, replace=False)
        y_array = np.array(y)
        # Subset by the index
        X_batch = X[index_sample, :]
        y_batch = y_array[index_sample]
        return X_batch, y_batch
    
    # Computes the soft max layer
    def softmax_layer(self, X_tensor, num_units):
        # Get the size of the input from the tensor
        num_inputs = X_tensor.get_shape()[1].value
        # Create weight, bias and label variables
        W = tf.Variable(tf.zeros([num_inputs, num_units]), name='W')
        b = tf.Variable(tf.zeros([num_units]), name='b')
        # Compute out y on the this layer
        y = tf.nn.softmax(tf.matmul(X_tensor, W) + b)
        return y
    
    # Computes the Rectified Linear Units
    def relu_layer(self, X_tensor, num_units):
        # Get the size of the input from the tensor
        num_inputs = X_tensor.get_shape()[1].value
        # Create weight, bias and label variables
        W = tf.Variable(tf.random_uniform([num_features, num_units]), name='W')
        b = tf.Variable(tf.zeros([num_units]), name='b')
        # Compute the rectified linear unit activation on this point
        y = tf.nn.relu(tf.matmul(X_tensor, W) + b, name='relu')
        return y
    
    # Compute teh cost function
    def define_cost_function(self, y, y_tensor, batch_size):
        # Define the cross entropy cost
        cost = -tf.reduce_sum(y_tensor * tf.log(y), name='cross_entropy') / batch_size
        return cost
    
    # Train the model
    def train(self, cost, learning_rate):
        # Use Gradient descent to compute the optimum
        training_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
        return training_step
    
    # Formulates the accuracy
    def compute_accuracy(self, y, y_tensor):
        # Check the predictions
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_tensor, 1))
        # Compute the accuracy
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"), name='accuracy')
        return accuracy
    
    # bulid the two layer Net
    def two_layers(self):
        # Create hidden, relu layer
        with tf.name_scope("hidden_layer") as scope:
            y_relu = self.relu_layer(X_placeholder, hidden_layer_units)

        # Create softmax layer
        with tf.name_scope("softmax") as scope:
            y_softmax = self.softmax_layer(y_relu, num_classes)

        # Define cost function
        with tf.name_scope("cost_function") as scope:
            global cost
            cost = self.define_cost_function(y_softmax, y_placeholder, batch_size)
            tf.summary.scalar("cost", cost)

        # Define training step
        with tf.name_scope("training") as scope:
            global training_step
            training_step = self.train(cost, learning_rate)

        # Calculate model accuracy
        with tf.name_scope("accuracy") as scope:
            global accuracy
            accuracy = self.compute_accuracy(y_softmax, y_placeholder)
            tf.summary.scalar("accuracy", accuracy)

In [None]:
# Hyperparamters
learning_rate = 0.001
batch_size = X_train.shape[0] // 10
num_features = X_train.shape[1]
num_classes = 2
epochs = 1000
epoch_list = []
epochs_to_print = epochs // 10
hidden_layer_units = 30
avg_cost_list = []

# Define the placeholder
X_placeholder = tf.placeholder(tf.float32, [None, num_features], name='X')
y_placeholder = tf.placeholder(tf.float32, [None, num_classes], name='y')

# Create the NeuralNet Class object
NN = NeuralNet()

# Do one hot encoding for the labels
y_train_one_hot = NN.dense_to_one_hot(y_train, num_classes = num_classes)
y_test_one_hot = NN.dense_to_one_hot(y_test, num_classes = num_classes)

# Call the layers 
NN.two_layers()

# Merge summaries for TensorBoard
merged_summaries = tf.summary.merge_all()

# Start the tensorflow session
with tf.Session() as sess:
    # Create the log directory
    log_directory = 'tmp/logs'
    summary_writer = tf.summary.FileWriter(log_directory, sess.graph)
    
    # Intialize the global variables
    tf.global_variables_initializer().run()
    
    cost_sum = 0
    for i in range(epochs):
        # Make the batches
        X_batch, y_batch = NN.make_batch(np.array(X_train), np.array(y_train_one_hot), batch_size)
        # Feeder
        feed_dict = {X_placeholder: X_batch, y_placeholder: y_batch}
        # Compute the cost
        _, current_cost = sess.run([training_step, cost], feed_dict)
        # Sum the overall cost
        cost_sum += current_cost
        
        # Print average cost periodically
        if i % epochs_to_print == 99:
            average_cost = cost_sum / epochs_to_print
            avg_cost_list.append(round(average_cost, 4))
            epoch_list.append(i+1)
            print("Epoch: {:4d}, average cost = {:0.3f}".format(i+1, average_cost))
            cost_sum = 0
    
    print('\nFinished model fitting.')
 
    # Calculate final accuracy
    X_batch, y_batch = NN.make_batch(np.array(X_test), np.array(y_test_one_hot), batch_size)
    feed_dict = {X_placeholder: X_test, y_placeholder: y_test_one_hot}
    print("\nFinal accuracy = {:0.3f}%".format(sess.run(accuracy * 100, feed_dict)))

In [18]:
#winemod1.compile(optimizer = "adam", loss = "mse", metrics = ["mae", "mse"])
winemod1.compile(optimizer='adam', loss=define_cost_function, metrics=['accuracy'])

In [19]:
history = winemod1.fit(x = X_train, y = y_train_hot, epochs = 1000,verbose=1, batch_size = 100,validation_data=(X_test, y_test_hot))

Epoch 1/1000


TypeError: in user code:

    /Users/jeromecoumont/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)

    TypeError: tf__define_cost_function() missing 2 required positional arguments: 'y_tensor' and 'batch_size'


In [None]:
from tensorflow.keras.callbacks import History
def plot_history(history):
    if isinstance(history, History):
        history = {"": history}
        
    fig, axs = plt.subplots(nrows=1,ncols=2,figsize=(10,4))
    axs[0].set(xlabel="Epochs", ylabel="Loss", title="Training loss")
    axs[1].set(xlabel="Epochs", ylabel="Accuracy", title="Training accuracy")
    
    for k,h in history.items():
        loss = h.history['loss']
        accuracy = h.history['accuracy']
        axs[0].plot(range(len(loss)), loss, label=k)
        axs[1].plot(range(len(accuracy)), accuracy, label=k)    
    
    if len(history) > 1:
        axs[0].legend()
        axs[1].legend()
    plt.show()

In [None]:
plot_history(history)

In [None]:
preds = winemod1.evaluate(x = X_test, y = y_test_hot)
print()
print ("Loss (MSE)= " + str(preds[0]))
print ("Test Accuracy = " + str(preds[1]))
#print ("Test MSE = " + str(preds[2]))


In [None]:
test_predictions = winemod1.predict(X_test)
test_predictions.shape

In [None]:
test_predictions

In [None]:
pred_classes = test_predictions.argmax(axis=-1) + 3
pred_classes.shape

In [None]:
print(np.round(test_predictions.flatten(),1))

df = pd.DataFrame({'true':y_test, 'pred':pred_classes}) #test_predictions.flatten(), 'v':1})
#df = df.groupby(["true", "pred"]).agg(['count']).reset_index()
df

In [None]:
a = plt.axes(aspect='equal')
plt.scatter(df['true'], df['pred'], s=df['v'])
plt.xlabel("True values [Quality score]")
plt.ylabel("Predictions [Quality score]")
lims = [0,10]
plt.plot(lims,lims, c="r")
plt.xlim(lims)
plt.ylim(lims)


In [None]:
from sklearn.metrics import r2_score

r2_score(y_test, test_predictions)

In [None]:
import seaborn as sns

In [None]:
a = plt.axes(aspect='equal')
sns.boxplot(x="true", y="pred", data=df, palette="Set2" )