#### This notebook uses the new train data set. Use the code by Goodness to generate the new train set. 
#### The environment is google colab

In [None]:
# Colab only: expose Google Drive under /content/drive so files stored there can be read.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

In [2]:
## Import the necessary libraries

import pandas as pd
import numpy as np
import random as rn

from keras.models import Model, load_model
from keras.layers import *
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import optimizers

import tensorflow as tf
import keras

# Seed Python's `random`, NumPy and TensorFlow with the same value so that
# repeated runs start from the same random state.
SEED = 234
rn.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


###### Use pandas to load the data sets. The train data has some columns that are not needed; line 2 of the code drops them. Check the result with train.head().

In [5]:
# Training ratings; the second line removes the columns listed as not needed.
train = pd.read_csv("train2.csv")
train = train.drop(columns=['Commedian', 'Commedian_ID', 'Jokes_ID', 'Place', 'Place_ID', 'Version'])

# Rows to predict, and the submission template that will receive the predictions.
test = pd.read_csv("test.csv")
sub = pd.read_csv("SampleSubmission.csv")

###### Convert the test Viewers_ID column to an int by dropping its leading character. If the train Viewers_ID column is not already an int, apply the exact same code to it.

In [6]:
# Drop the first character of every ID string and store the remainder as int32.
test['Viewers_ID'] = test['Viewers_ID'].str[1:].astype('int32')

###### Merge both train and test data.  

In [8]:
# Stack train on top of test with a fresh 0..n-1 index so both can be encoded together.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
df = pd.concat([train, test], ignore_index=True)

In [9]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,Joke_identifier,Rating,Response_ID,Viewers_ID
0,Klint De Drunk Enugu 1,0.11,A1_Klint De Drunk Enugu 1,1
1,Klint De Drunk Enugu 2,-4.64,A1_Klint De Drunk Enugu 2,1
2,Klint De Drunk PH 1,-3.39,A1_Klint De Drunk PH 1,1
3,Klint De Drunk PH 2,0.44,A1_Klint De Drunk PH 2,1
4,Klint De Drunk Lagos 1,-4.83,A1_Klint De Drunk Lagos 1,1


###### The next line of code turns the Joke_identifier column of the merged data set into an integer code between 0 and (number of unique Joke_identifier values - 1).

In [None]:
# Replace each joke identifier with its integer category code; encoding the merged
# frame gives train and test rows the same mapping.
joke_categories = df["Joke_identifier"].astype("category")
df["Joke_identifier"] = joke_categories.cat.codes

###### We separate our data back into train and test data

In [10]:
# The merged frame was built as train rows followed by test rows, so the split point
# is the number of train rows. Deriving it from the data (instead of hard-coding
# 672287) keeps the split correct if train2.csv is regenerated with a different size.
n_train = len(train)
train = df[:n_train]
test = df[n_train:]

###### Create embeddings for viewers_id and joke_identifier.
###### Create a neural network model using the functional API

In [11]:
# Two integer inputs per sample: the viewer id and the encoded joke id.
user_id_input = Input(shape=[1], name='user')
item_id_input = Input(shape=[1], name='item')


embedding_size = 15 # 5

# An Embedding layer needs one row per possible index, i.e. (largest index + 1).
# The row count of `train` was used before, which only worked while every id happened
# to be smaller than the number of training rows, and allocated far more rows than
# there are distinct ids. Ids from `test` are included so prediction never looks up
# an index outside the table.
n_users = int(max(train["Viewers_ID"].max(), test["Viewers_ID"].max())) + 1
n_items = int(max(train["Joke_identifier"].max(), test["Joke_identifier"].max())) + 1

user_embedding = Embedding(output_dim=embedding_size, input_dim=n_users,
                           input_length=1, name='user_embedding')(user_id_input)

item_embedding = Embedding(output_dim=embedding_size, input_dim=n_items,
                           input_length=1, name='item_embedding')(item_id_input)


#### neural network model

# Join the two embeddings, flatten them to a single vector and regularise with dropout.
x = Concatenate()([user_embedding, item_embedding])
x = Flatten()(x)
x = Dropout(0.7)(x)

# One hidden layer followed by a lighter dropout.
x = Dense(256, activation='relu', use_bias=True)(x)
x = Dropout(0.3)(x)

# Single linear output unit: the predicted rating.
y = Dense(1)(x)



# Regression model trained with mean squared error and Adam (learning rate 0.003).
model = Model(inputs=[user_id_input, item_id_input], outputs=y)
model.compile(optimizer=optimizers.Adam(0.003), loss='mse')

In [None]:
#model.summary()

In [12]:
# Where the best weights seen so far are written during training.
file_path = 'best_model.hdf5'

# Save the model only when the validation loss improves.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    save_best_only=True,
    mode='min',
)

# Stop training once the validation loss has not improved for 3 consecutive epochs.
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
)

###### Train the model, load the model and use it to predict the rating for the test data.

In [None]:
# Keras carves the validation_split fraction off the END of the arrays *before* any
# shuffling. The rows shown by df.head() are grouped by viewer, so without a prior
# shuffle the validation set would presumably consist of viewers never seen in
# training, and val_loss (which drives the checkpoint and early stopping) would not
# reflect the model's real fit. Shuffle the rows once, reproducibly, before fitting.
train_shuffled = train.sample(frac=1, random_state=234)

# Plain numpy arrays are passed so Keras slices by position, not by the shuffled index labels.
model.fit(
    [train_shuffled["Viewers_ID"].values, train_shuffled["Joke_identifier"].values],
    train_shuffled["Rating"].values,
    batch_size=256,
    epochs=50,
    validation_split=0.1,
    shuffle=True,
    callbacks=[checkpoint, earlystop],
)

In [None]:
# Reload the checkpoint written by ModelCheckpoint (saved only when val_loss improved).
model = load_model(filepath=file_path)

In [None]:
# Feed the test viewer and joke ids in the same order the model's inputs were declared,
# then cast the predicted ratings to Python-float precision.
test_inputs = [test["Viewers_ID"], test["Joke_identifier"]]
preds = model.predict(test_inputs).astype(float)

In [None]:
# Round the predicted ratings to two decimal places.
preds = np.round(preds, decimals=2)

In [None]:
# Write the predictions into the submission template (matched by row position)
# and save it without the index column.
sub["Rating"] = preds
sub.to_csv(path_or_buf="submission.csv", index=False)