In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import tensorflow as tf

import ast
import math

## Importing the data

We will start off with importing all the necessary packages and the data.
After looking at the data manually, we saw that 6 rows were 'broken' so to say with a lot of misaligned columns, so we decided to skip them.
We will be using the selected columns from the report.

In [2]:
keywords_df = pd.read_csv('keywords.csv', low_memory=False)
movies_df = pd.read_csv('movies_metadata.csv',skiprows=[19730, 19731, 29503, 29504, 35587, 35588], usecols=['id', 'vote_average', 'genres'])

## Preprocessing data

First off we will be converting the **JSON** formats of columns **'genres'** and **'keywords'** to a list format using the help of `ast.literal_eval()`,
which helps to parse the columns' objects into the desired type.

In [3]:
movies_df['genres'] = movies_df['genres'].apply(lambda genres_list: [genres['name'] for genres in ast.literal_eval(genres_list)])

keywords_df['keywords'] = keywords_df['keywords'].apply(lambda keywords_list: [keywords['name'] for keywords in ast.literal_eval(keywords_list)])

Mvies and keywords dataframes will be merged by their common column, **'id'**

In [4]:
data = keywords_df.merge(movies_df, on='id')

data

Unnamed: 0,id,keywords,genres,vote_average
0,862,"[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]",7.7
1,8844,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]",6.9
2,15602,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]",6.5
3,31357,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]",6.1
4,11862,"[baby, midlife crisis, confidence, aging, daug...",[Comedy],5.7
...,...,...,...,...
46473,439050,[tragic love],"[Drama, Family]",4.0
46474,111109,"[artist, play, pinoy]",[Drama],9.0
46475,67758,[],"[Action, Drama, Thriller]",3.8
46476,227506,[],[],0.0


Now we will do some data cleaning. We will remove movies with:

- No genres or keywords
- 0 vote_average

In [5]:
print(f'The number of movies before performing all cleaning operations: {len(data)}')

data.drop(data[(data['genres'].map(lambda genres: len(genres)) == 0) | (data['keywords'].map(lambda keywords: len(keywords)) == 0)].index, inplace=True)
data.drop(data[data['vote_average'] == 0.0].index, inplace=True)
data = data.reset_index(drop=True)


print(f'The number of movies after performing all cleaning operations: {len(data)}')

The number of movies before performing all cleaning operations: 46478
The number of movies after performing all cleaning operations: 30012


## Splitting data

Firstly, we will split the data's features into variable **X** and the ratings into variable **y**

In [6]:
y = data.loc[:, 'vote_average']
X = data.loc[:, ['keywords', 'genres']]

In [7]:
X

Unnamed: 0,keywords,genres
0,"[jealousy, toy, boy, friendship, friends, riva...","[Animation, Comedy, Family]"
1,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,"[fishing, best friend, duringcreditsstinger, o...","[Romance, Comedy]"
3,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,"[baby, midlife crisis, confidence, aging, daug...",[Comedy]
...,...,...
30007,"[revenge, murder, serial killer, new york city...","[Horror, Mystery, Thriller]"
30008,[blair witch],"[Mystery, Horror]"
30009,"[witch, mythology, legend, serial killer, mock...",[Horror]
30010,[tragic love],"[Drama, Family]"


As the data contains many **unique** keywords, we decided that we will only use movies with the most popular keywords for our learning model. We will count the occurences of keywords so we can select only movies with the most popular keywords.

In [8]:
keyword_counts = {}

for word_list in X['keywords']:
    for word in word_list:
        if word in keyword_counts:
            keyword_counts[word] += 1
        else:
            keyword_counts[word] = 1

# sorting the dictionary
keyword_counts = {key: value for key, value in sorted(keyword_counts.items(), key=lambda item: item[1], reverse=True)}

keyword_counts

{'woman director': 2753,
 'independent film': 1891,
 'murder': 1278,
 'based on novel': 803,
 'sex': 679,
 'musical': 675,
 'violence': 650,
 'nudity': 631,
 'revenge': 614,
 'biography': 604,
 'suspense': 587,
 'love': 559,
 'female nudity': 553,
 'sport': 525,
 'police': 447,
 'duringcreditsstinger': 438,
 'sequel': 435,
 'teenager': 434,
 'friendship': 406,
 'world war ii': 383,
 'drug': 351,
 'prison': 342,
 'high school': 313,
 'martial arts': 310,
 'stand-up comedy': 309,
 'rape': 304,
 'suicide': 304,
 'film noir': 299,
 'kidnapping': 298,
 'serial killer': 289,
 'family': 286,
 'monster': 282,
 'alien': 279,
 'silent film': 278,
 'dystopia': 271,
 'blood': 266,
 'paris': 265,
 'new york': 263,
 'gay': 256,
 'marriage': 250,
 'christmas': 250,
 'gore': 245,
 'short': 245,
 'death': 243,
 'zombie': 241,
 'london england': 235,
 'gangster': 234,
 'small town': 234,
 'aftercreditsstinger': 231,
 'prostitute': 230,
 'detective': 229,
 'romance': 226,
 'male nudity': 226,
 'vampire':

We decided to use the top 1000 most popular keywords for our learning model.

In [9]:
vocabulary = list(keyword_counts)[0:1000]

for word_list in X['keywords']:
    word_list[:] = [word for word in word_list if word in vocabulary]

X

Unnamed: 0,keywords,genres
0,"[jealousy, boy, friendship, friends, rivalry]","[Animation, Comedy, Family]"
1,"[disappearance, based on children's book]","[Adventure, Fantasy, Family]"
2,"[best friend, duringcreditsstinger]","[Romance, Comedy]"
3,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,"[baby, midlife crisis, aging, daughter, mother...",[Comedy]
...,...,...
30007,"[revenge, murder, serial killer, new york city]","[Horror, Mystery, Thriller]"
30008,[],"[Mystery, Horror]"
30009,"[witch, mythology, legend, serial killer, mock...",[Horror]
30010,[],"[Drama, Family]"


We will now remove the rows which have empty lists in the **'keywords'** column

In [10]:
null_indices = set()

for i, row in enumerate(X['keywords']):
    if not row:
        null_indices.add(i)

X = X.drop(null_indices, axis=0).reset_index(drop=True)
y = y.drop(null_indices, axis=0).reset_index(drop=True)

We will make a list of unique keywords and genres to use them for column names when executing **one-hot-encoding** later.

In [11]:
keyword_column_names = []
genres_column_names = []

for i, row in X.iterrows():
    for word in row.keywords:
        if word not in keyword_column_names:
            keyword_column_names.append(word)
    for genre in row.genres:
        if genre not in genres_column_names:
            genres_column_names.append(genre)

Next, we will be using **MultiLabelBinarizer()** from sklearn to one-hot-encode **'keywords'** and **'genres'** columns.

In [12]:
mlb = MultiLabelBinarizer()
X_keywords = pd.DataFrame(mlb.fit_transform(X['keywords']), columns=keyword_column_names)
X_genres = pd.DataFrame(mlb.fit_transform(X['genres']), columns=genres_column_names)

X = pd.concat([X_keywords, X_genres], axis=1)

X

Unnamed: 0,jealousy,boy,friendship,friends,rivalry,disappearance,based on children's book,best friend,duringcreditsstinger,based on novel,...,Horror,History,Science Fiction,Mystery,War,Foreign,Music,Documentary,Western,TV Movie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26751,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26752,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26753,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
26754,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


## Splitting the data

Splitting the data for training and testing. For this we will be using **train_test_split** from sklearn.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=10)

X.shape

(26756, 1020)

## Balancing dataset

We will also try training the model on a more balanced dataset.

In [14]:
train_new = pd.concat([X_train, y_train], axis=1)

train_0_4 = train_new[train_new.vote_average < 5]
train_5_6 = train_new[(train_new.vote_average >= 5) & (train_new.vote_average < 7)]
train_7_10 = train_new[train_new.vote_average >= 7]

train_5_6 = train_5_6.sample(n=3389, random_state=0)
train_7_10 = train_7_10.sample(n=3389, random_state=0)

train_new = pd.concat([train_0_4, train_5_6, train_7_10]).sort_index()

X_train_new = train_new.drop(columns=['vote_average'])
y_train_new = train_new['vote_average']

## Training

We will train a neural network using **Keras deep learning API**. We built a neural network consisting of 4 layers: input layer, **2 densely connected layers with 64 units each** and an output layer. The dense layers use **ReLU activation functions**, we are using **Adam optimization algorithm** to optimize the model. We ran 50 epochs as the model's performance plateaued around that point. The model's hyperparameters have been chosen based on trial and error testing.

In [15]:
inputs = tf.keras.Input(shape=(len(keyword_column_names) + len(genres_column_names),))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs, outputs)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')

history = model.fit(X_train, y_train, validation_split=0.1, batch_size=32, epochs=50, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

model2 = tf.keras.models.clone_model(model)
model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')

history2 = model2.fit(X_train_new, y_train_new, validation_split=0.1, batch_size=32, epochs=50, callbacks=[tf.keras.callbacks.ReduceLROnPlateau()])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

## Testing the model

Having the model predict on the testing data.

In [16]:
y_preds = model.predict(X_test)
y_preds



array([[7.2592034],
       [5.6519966],
       [7.256482 ],
       ...,
       [5.813736 ],
       [5.821364 ],
       [5.793685 ]], dtype=float32)

In [17]:
y_test = y_test.to_numpy()
y_preds = np.squeeze(y_preds)
y_test

array([5.5, 5.5, 7.6, ..., 6. , 5. , 4.9])

Results from the model that was trained on the balanced dataset

In [18]:
y_preds_new = model2.predict(X_test)
y_preds_new



array([[7.2027473],
       [5.5331955],
       [8.802425 ],
       ...,
       [5.208526 ],
       [6.594243 ],
       [6.053894 ]], dtype=float32)

Defining our way of determining the accuracy of the models.
For this we will be using root mean squared error.

In [19]:
def MSE(y_target, y_pred):
    sum = 0
    for i in range(len(y_target)):
        sum += (y_target[i] - y_pred[i]) ** 2
    return sum / len(y_target)

def RMSE(y_target, y_pred):
    return math.sqrt(MSE(y_target, y_pred))

Finding the **accuracy** of our model

In [20]:
print(f'RMSE for model (unbalanced) - {RMSE(y_test, y_preds)}')
print(f'RMSE for model (balanced) - {RMSE(y_test, y_preds_new)}')

RMSE for model (unbalanced) - 1.10113642947715
RMSE for model (balanced) - 1.1727449875307518
