# Final model
In this script we run our final model. We run a k-fold cross-validation to obtain a reliable estimate of its performance. We also train a separate model on all the training data, so that we have a model we can evaluate on the test data.

In [1]:
import zipfile
import os
import cv2
import pandas as pd
from os import chdir, listdir

import matplotlib.pyplot as plt
import numpy as np
import random

import tensorflow as tf
from tensorflow.keras import layers, models, preprocessing, regularizers
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.losses import MeanSquaredError
from keras import backend as K
from keras import activations

# Import tabular data

The tabular data is imported. It contains information on whether several elements are present in the image, such as blur, a human, a group, etc. For the training data, the table also contains the pawpularity score. For the test data, the table contains only the image ID and the features. There is also a sample submission table, which contains the pawpularity score for the test data.

In [3]:
# Load the train, test, and sample-submission tables.
csv_train_data = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
csv_test_data = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')
sample_submission = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/sample_submission.csv')

# Drop rows with missing values. dropna() returns a new dataframe instead of
# changing the existing one, so the result must be assigned back to have any
# effect. (The original author noted that the data has no missing values, so
# this is expected to drop nothing.)
csv_train_data = csv_train_data.dropna()

# Show the first rows as a quick check of what was loaded
csv_train_data.head()

In [4]:
# Histogram of the target (Pawpularity) over all training samples
hist_axes = plt.gca()
hist_axes.hist(csv_train_data['Pawpularity'], bins=100)

# Title, axis labels and the range of the score axis
hist_axes.set_title("Data distribution of the tabular data")
hist_axes.set_xlabel("Pawpularity score")
hist_axes.set_ylabel("Occurence")
hist_axes.set_xlim(0, 100)

plt.show()

# Import image data
The images are imported from the folders. Each image is reshaped to a 64x64 image. In this way all the images have the same shape and we do not use much memory, to speed up analysis. After the images are imported, the images and their names are shuffled. This is done, so we can later take a validation sample containing a random subsample of the dataset. It could be that the images in the dataset contain some order, so by shuffling we ensure that the subset for the validation data is random.


In [5]:
def reshape_images(path, n):
    """
    This function loads all images in a directory, resizes each one to
    n x n pixels and divides the pixel values by 255.

    Parameters:
        path: directory that contains the image files.
        n: width and height (in pixels) of the resized images.

    Returns:
        A tuple (images, image_names). images is a list with one rescaled
        array per file; image_names is a list with the file name of each
        image without its extension. Both lists are in the same (sorted by
        file name) order.

    Raises:
        ValueError: if a file in the directory cannot be read as an image.
    """
    # Preset the lists
    images = []
    image_names = []

    # os.listdir returns the entries in arbitrary order; sorting makes the
    # result reproducible between runs
    for file_name in sorted(os.listdir(path)):

        # Build the full path instead of changing the working directory, so
        # calling this function does not affect the rest of the process
        file_path = os.path.join(path, file_name)

        # cv2.imread reports a failed read by returning None, not by raising
        image = cv2.imread(file_path)
        if image is None:
            raise ValueError(f"Could not read image file: {file_path}")

        # Resize the image to n x n
        image = cv2.resize(image, (n, n), interpolation=cv2.INTER_AREA)

        # Get the name of the image without its extension, whatever the
        # length of the extension (.jpg, .jpeg, ...)
        image_names.append(os.path.splitext(file_name)[0])

        # Rescale the pixels and store in the list
        images.append(image / 255)

    return images, image_names

# Load the train and test images from their folders.
# Every image is resized to 64 x 64 pixels.
train_imgs, train_names = reshape_images(
    path='/kaggle/input/petfinder-pawpularity-score/train', n=64)
test_imgs, test_names = reshape_images(
    path='/kaggle/input/petfinder-pawpularity-score/test', n=64)

# Combine tabular data with images
To ensure that the dataframe has the same order as the images in the list, we sort the dataframe based on the names of the images. Otherwise an image could be paired with the pawpularity score of a different image, and the model would learn from incorrect outputs.

In [6]:
def sort_dataframe(data, images, names):
    """
    This function sorts the dataframe of the csv data according to the image names.

    Parameters:
        data: dataframe with an 'Id' column that holds the image names.
        images: the loaded images. Only used to pair one name with each
            image, so no more names than images are processed.
        names: the image names, in the order the rows should get.

    Returns:
        A new dataframe with one row per name, in the order of names, with
        the columns of data and a fresh index 0..n-1. If an Id occurs more
        than once in data, its first row is used.

    Raises:
        IndexError: if a name does not occur in the 'Id' column.
    """
    # Pair names with images the way zip does: stop at the shorter of the two
    ordered_names = [name for _, name in zip(images, names)]

    # Map every Id to the row position of its first occurrence
    first_position = {}
    for position, image_id in enumerate(data['Id']):
        first_position.setdefault(image_id, position)

    # Fail with a clear message when an image has no row in the table
    missing = [name for name in ordered_names if name not in first_position]
    if missing:
        raise IndexError(
            f"Image names not found in the 'Id' column: {missing[:5]}")

    # Select all rows in a single step (keeps the column dtypes and avoids
    # growing a dataframe row by row), then reset the index
    positions = [first_position[name] for name in ordered_names]
    return data.iloc[positions].reset_index(drop=True)

# Put every table in the order in which the images were loaded
train_data_sorted = sort_dataframe(
    data=csv_train_data, images=train_imgs, names=train_names)
test_data_sorted = sort_dataframe(
    data=csv_test_data, images=test_imgs, names=test_names)
sample_submission_sorted = sort_dataframe(
    data=sample_submission, images=test_imgs, names=test_names)

# Processing data
The tabular data is split in x and y values and converted to numpy arrays, so the neural network can handle the data. Moreover, the image data is converted to numpy arrays.

In [7]:
# Remove samples with an extreme pawpularity score: exactly 100, or below 5
pawpularity = train_data_sorted['Pawpularity']
outlier_mask = ((pawpularity == 100) | (pawpularity < 5)).to_numpy()

# Index labels of the rows that are removed (kept under the original name in
# case other cells refer to it)
indexNames = train_data_sorted.index[outlier_mask]
train_data_new = train_data_sorted.drop(indexNames)

# np.delete works with positions, not index labels, so pass the row positions
# of the outliers explicitly
train_imgs_new = np.delete(train_imgs, np.flatnonzero(outlier_mask), axis=0)

In [8]:
# Column positions in the tables: 1-12 are the 12 input features,
# 13 is the pawpularity score of the training table
feature_columns = slice(1, 13)
target_column = 13

# Training data: input features and pawpularity score
x_tabular = train_data_new.iloc[:, feature_columns].to_numpy()
y = train_data_new.iloc[:, target_column].to_numpy()

# Testing data: input features from the test table, pawpularity from the
# second column of the sample submission table
x_test_tabular = test_data_sorted.iloc[:, feature_columns].to_numpy()
y_test = sample_submission_sorted.iloc[:, 1].to_numpy()

# Image data as numpy arrays
x_images = np.array(train_imgs_new)
test_imgs_array = np.array(test_imgs)

# Create separate neural networks
We create a tabular neural network to handle the data in the csv. Then we create a convolutional neural network to handle the image data. Both neural networks have no output layer, since they will be concatenated to one neural network, which will give the output.

In [9]:
def build_neural_net(input_size, hidden_nodes):
    """
    Build the dense network for the tabular data: three relu layers of
    hidden_nodes units each, with dropout (0.4) before the second and the
    third layer. The first layer takes inputs of length input_size.
    The model has no output layer.
    """
    # The layers are listed in the order in which the data passes through them
    return models.Sequential([
        # First hidden layer, which also fixes the input size
        layers.Dense(units=hidden_nodes, activation='relu', input_shape=(input_size,)),

        # Second hidden layer, preceded by dropout
        layers.Dropout(0.4),
        layers.Dense(units=hidden_nodes, activation="relu"),

        # Third hidden layer, preceded by dropout
        layers.Dropout(0.4),
        layers.Dense(units=hidden_nodes, activation="relu"),
    ])

In [10]:
def build_convol_net(image_size, hidden_nodes):
    """
    Build the convolutional network for the image data.

    The network has three convolutional blocks (with 64, 128 and 256
    filters), each made of two 3x3 convolutions, max pooling and batch
    normalization, followed by a flattening layer and two regularized dense
    layers. The model has no output layer.

    Parameters:
        image_size: shape of one input image, passed as input_shape to the
            first layer (the caller in this file uses (64, 64, 3)).
        hidden_nodes: number of units in each of the two dense layers.

    Returns:
        The Sequential model.
    """
    # Create a sequential model object
    model = models.Sequential()

    # Create the convolutional blocks
    for block_index, n_filters in enumerate((64, 128, 256)):
        if block_index == 0:
            # Only the first layer of the model needs the input shape
            model.add(layers.Conv2D(filters=n_filters, kernel_size=(3,3), activation='relu',
                                    input_shape=image_size, padding='same'))
        else:
            # Every block after the first one starts with dropout
            model.add(layers.Dropout(0.4))
            model.add(layers.Conv2D(filters=n_filters, kernel_size=(3,3), activation='relu',
                                    padding='same'))

        model.add(layers.Conv2D(filters=n_filters, kernel_size=(3,3), activation='relu',
                                padding='same'))
        model.add(layers.MaxPool2D(pool_size=(2,2), strides=2))
        model.add(layers.BatchNormalization())

    # Create a flattening layer
    model.add(layers.Flatten())

    # Create two dense layers, each preceded by dropout and with L2
    # regularization on the kernel, the bias and the activity
    for _ in range(2):
        model.add(layers.Dropout(0.3))
        model.add(layers.Dense(units=hidden_nodes, activation="relu",
                  kernel_regularizer=regularizers.l2(1e-3),
                  bias_regularizer=regularizers.l2(1e-3),
                  activity_regularizer=regularizers.l2(1e-3)))

    return model

## Concatenate tabular and image data models
Concatenate the tabular and image models to create one neural network that can handle both types of data. This neural network will give the prediction of the pawpularity.

In [11]:
def linear_limit(x):
    """
    Linear activation function of which the output is clipped to the
    range 0 to 100.
    """
    # Apply the linear activation first, then clip its result
    return K.clip(activations.linear(x), 0, 100)

In [12]:
def concatenate_models(model1, model2, hidden_nodes):
    """
    Join two neural network models, model1 and model2, into one model.
    The outputs of both models are concatenated and passed through three
    regularized dense layers of hidden_nodes units and one output node.
    The resulting model takes the inputs of model1 and model2, in that order.
    """
    def regularized_dense():
        # Dense relu layer with L2 regularization on kernel, bias and activity
        return layers.Dense(hidden_nodes, activation="relu",
                            kernel_regularizer=regularizers.l2(1e-1),
                            bias_regularizer=regularizers.l2(1e-1),
                            activity_regularizer=regularizers.l2(1e-1))

    # Start from the concatenated outputs of both models
    tensor = layers.concatenate([model1.output, model2.output])

    # Three hidden layers; dropout is placed in front of the second and third
    for layer_number in range(3):
        if layer_number > 0:
            tensor = layers.Dropout(0.4)(tensor)
        tensor = regularized_dense()(tensor)

    # Single output node with the clipped linear activation
    prediction = layers.Dense(1, activation=linear_limit)(tensor)

    # The combined model maps the inputs of both models to the prediction
    return models.Model(inputs=[model1.input, model2.input], outputs=prediction)

In [13]:
# Part of code from: https://www.tensorflow.org/tutorials/keras/regression

def plot_loss(history):
    """
    Show the training loss and the training root mean squared error per
    epoch, next to each other in one figure.
    """
    tracked_metrics = ('loss', 'root_mean_squared_error')

    # One panel per metric
    _, panels = plt.subplots(1, 2, figsize=(20, 5))

    for panel, metric_name in zip(panels, tracked_metrics):
        # Curve of the metric over the epochs
        panel.plot(history.history[metric_name])
        panel.legend(['training'], loc='best')

        # Title and axis labels
        panel.set_title('Model ' + metric_name)
        panel.set_ylabel(metric_name)
        panel.set_xlabel('epoch')

    plt.show()


def train_and_evaluate(model, image_x, tabular_x, train_y,
                       x_test_imgs, x_test_tabular, test_y, epochs=20, preprocess=None, augment=None):
    """
    This function trains and evaluates a model. It first compiles the model
    with the loss and metric. It then makes a train and a validation generator
    for the image data, based on the preprocess and augment input.
    It then trains the model on both the image and tabular data for epochs
    times. The values of the loss and metric are plotted, and the model is
    evaluated on the test data.

    Parameters:
        model: the model to train; it is compiled inside this function.
        image_x: training images.
        tabular_x: training tabular features.
        train_y: training targets.
        x_test_imgs: images used for the evaluation.
        x_test_tabular: tabular features used for the evaluation.
        test_y: targets used for the evaluation.
        epochs: number of training epochs.
        preprocess: keyword arguments for ImageDataGenerator that are used
            for both the train and the validation generator (None means none).
        augment: extra keyword arguments for ImageDataGenerator that are only
            used for the train generator (None means none).

    Returns:
        The result of model.evaluate on the test data. The caller in this
        file reads index 0 as the loss and index 1 as the RMSE.
    """
    # Use None as default instead of a dictionary, so no dictionary object
    # is shared between calls
    preprocess = {} if preprocess is None else preprocess
    augment = {} if augment is None else augment

    # Compile model and use mean squared error as loss and root mean squared
    # error as metric. No optimizer is passed, so the Keras default is used.
    model.compile(loss=MeanSquaredError(), metrics=[RootMeanSquaredError()])

    # Preprocess the image data; the generators are fitted on the training
    # images, as required for the featurewise options
    train_gen = preprocessing.image.ImageDataGenerator(**preprocess, **augment)
    train_gen.fit(image_x)

    # The validation generator gets the preprocessing, but no augmentation
    val_gen = preprocessing.image.ImageDataGenerator(**preprocess)
    val_gen.fit(image_x)

    # Train the model by fitting both tabular and image data at the same time
    history = model.fit(train_gen.flow([image_x, tabular_x], train_y), epochs=epochs)

    # Plot the loss and metric
    plot_loss(history)

    # Evaluate the model on the test data
    test_metrics = model.evaluate(val_gen.flow([x_test_imgs, x_test_tabular], test_y))

    return test_metrics

# Build the two networks that are joined later: one for the 12 tabular
# features and one for the 64 x 64 colour images
tabular_NN = build_neural_net(12, hidden_nodes=20)
image_size = (64, 64, 3)
image_NN = build_convol_net(image_size, hidden_nodes=20)

# Join both networks; the image network comes first, so the combined model
# expects its inputs as [images, tabular]
concat_model = concatenate_models(image_NN, tabular_NN, hidden_nodes=20)

# Train on both the tabular and the image data for 60 epochs, with the images
# centered and scaled featurewise
normalisation = {'featurewise_center': True, 'featurewise_std_normalization': True}
test_acc = train_and_evaluate(
    concat_model, x_images, x_tabular, y,
    test_imgs_array, x_test_tabular, y_test,
    epochs=60, preprocess=normalisation)

# NOTE(review): y_test is taken from the sample submission table, so these
# numbers compare against that table - confirm that this is intended.
# NOTE(review): index 0 is the compiled loss; presumably it also includes the
# regularization penalties of the layers - confirm before reading it as MSE.
print(f"Test MSE: {test_acc[0]}, Test RMSE: {test_acc[1]}")

In [14]:
# Let the trained model predict the Pawpularity of the test samples.
# NOTE(review): the images are passed to predict as they are, while training
# fed them through an ImageDataGenerator with featurewise normalization -
# confirm whether the same preprocessing should be applied here.
img_result = concat_model.predict([test_imgs_array, x_test_tabular])

# Put the predictions in a dataframe with a single 'Pawpularity' column
final_result = pd.DataFrame(img_result, columns=['Pawpularity'])

In [15]:
# Store predictions of the Pawpularity in a csv file
for ids, paw in zip(test_data_sorted['Id'], final_result['Pawpularity']):
    # Row of the submission table that belongs to this image
    location = sample_submission[sample_submission['Id'] == ids].index[0]

    # Assign with a single .loc call on the dataframe itself. Chained
    # indexing (df[col].loc[row] = value) may write to a temporary copy and
    # leave the dataframe unchanged.
    sample_submission.loc[location, 'Pawpularity'] = paw

sample_submission.to_csv('/kaggle/working/submission.csv', index=False)
