In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Concatenate, Embedding, Reshape, Dot
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Step 1: Load and preprocess the MovieLens dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/FINAL Project/data'
movies = pd.read_csv(f'{file_path}/ml-latest-small/movies.csv')
movies['genres'] = movies['genres'].str.split('|')
print(movies.shape)
movies.head()

(9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [6]:
ratings = pd.read_csv(f'{file_path}/ml-latest-small/ratings.csv')
ratings = ratings.drop('timestamp',axis=1)
print(ratings.shape)
ratings.head()

(100836, 3)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
# Normalize the ratings between 0 and 1
max_rating = ratings['rating'].max()
min_rating = ratings['rating'].min()
print('Maximum Rating:', max_rating)
print('Minimum Rating:', min_rating)

ratings['rating'] = (ratings['rating'] - min_rating) / (max_rating - min_rating)
ratings.head()

5.0
0.5


Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,workplace,world war ii,wormhole,writer,writers,writing,wry,youth,zombie,zombies
0,1,307,0.666667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,481,0.666667,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
2,1,1091,0.222222,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1257,0.888889,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1449,0.888889,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
tags = pd.read_csv(f'{file_path}/ml-latest/tags_filtered_200.csv')
print(tags.shape)
tags.head()

(492004, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,260,sci-fi,1442169410
2,14,318,imdb top 250,1442615195
3,14,318,justice,1442615192
4,14,1682,philosophy,1442615158


In [9]:
#make sure all dataframes have the same movies included
movies = movies[movies['movieId'].isin(ratings['movieId'])]
movies = movies.reset_index(drop=True)

tags = tags[tags['movieId'].isin(ratings['movieId'])]
tags = tags.reset_index(drop=True)

movies = movies[movies['movieId'].isin(tags['movieId'])]
movies = movies.reset_index(drop=True)

ratings = ratings[ratings['movieId'].isin(tags['movieId'])]
ratings = ratings.reset_index(drop=True)

In [10]:
print(f"There are: {ratings['userId'].nunique()} unqiue users")
print(f"There are: {ratings['movieId'].nunique()} unqiue films in the ratings dataframe")
print(f"There are: {movies['movieId'].nunique()} unqiue films in the movies dataframe")
print(f"There are: {tags['movieId'].nunique()} unqiue films in the tags dataframe")
print(f"There are: {tags['tag'].nunique()} unqiue tags in the tags dataframe")

There are: 610 unqiue users
There are: 9186 unqiue films in the ratings dataframe
There are: 9186 unqiue films in the movies dataframe
There are: 9186 unqiue films in the tags dataframe
There are: 733 unqiue tags in the tags dataframe


In [None]:
# #remove tags that occur less than 120 times from the tags dataframe
# from collections import Counter

# #get values for every tag and count the occurances
# all_tags = tags['tag'].values
# tag_counts = Counter(all_tags)

# # Sort the tags by their frequencies in descending order
# sorted_tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the sorted tags(its a very long output)
# # for tag, count in sorted_tags:
# #     print(tag, count)

# # Create a list of tags to remove (occurring less than 120 times)
# no_of_tag_occurances = 120
# tags_to_remove = [tag for tag, count in tag_counts.items() if count < no_of_tag_occurances]
# print(len(tags_to_remove), "tags to be removed from dataframe")

# #iterate through tags dataframe and find the index numbers of rows to be removed
# index_to_remove = []
# for num, tag in enumerate(tags['tag']):
#     if tag in tags_to_remove:
#         index_to_remove.append(num)

# tags_filtered = tags.drop(index_to_remove)
# tags_filtered = tags_filtered.reset_index(drop=True)
# print(tags_filtered.shape)
# tags_filtered.head()

In [None]:
# tags_filtered.to_csv(f'{file_path}/tags_filtered_120.csv', index=False)

# Join Data

In [11]:
#aggregate tags into list by movie id and attach to dataframe
tags_agg = tags.groupby('movieId')['tag'].agg(list)
movies_tag_df = pd.merge(movies, tags_agg, how='inner', on='movieId')

#eliminate duplicate tag in tag lists by turning into a set and then back into a list
movies_tag_df['tag'] = movies_tag_df['tag'].apply(lambda tags: list({tag for tag in tags}))

#display dataframe
print(movies_tag_df.shape)
movies_tag_df.head()

(9186, 4)


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[BD-Video, animated, time travel, itaege, Nati..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[childish, comedy, scary, time travel, adventu..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[comedy, funny, duringcreditsstinger, sequel, ..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[revenge, based on novel or book, CLV, charact..."
4,5,Father of the Bride Part II (1995),[Comedy],"[parent child relationship, humorous, pregnanc..."


In [12]:
# Extract the genre information
genres = movies_tag_df['genres'].values

# Perform one-hot encoding on the genre lists
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(genres)

# Retrieve the list of genre classes
genre_classes = mlb.classes_

# Create a new dataframe with the one-hot encoded genre features
genre_df = pd.DataFrame(genre_features, columns=genre_classes)
genre_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# Extract the tag information
tags = movies_tag_df['tag'].values

# Perform one-hot encoding on the tag lists
mlb = MultiLabelBinarizer()
tag_features = mlb.fit_transform(tags)

# Retrieve the list of tag classes
tag_classes = mlb.classes_

# Create a new dataframe with the one-hot encoded tag features
tag_df = pd.DataFrame(tag_features, columns=tag_classes)
tag_df.head()

Unnamed: 0,007,1930s,1950s,1960s,1970s,1980s,19th century,70mm,AFI 100,Action,...,whimsical,wilderness,witch,witty,wizards,woman director,women,writers,zombie,zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
movies_features = pd.concat([movies_tag_df,genre_df,tag_df],axis=1)
print(movies_features.shape)
movies_features.head()

(9186, 757)


Unnamed: 0,movieId,title,genres,tag,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,whimsical,wilderness,witch,witty,wizards,woman director,women,writers,zombie,zombies
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[BD-Video, animated, time travel, itaege, Nati...",0,0,1,1,1,1,...,0,0,0,1,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[childish, comedy, scary, time travel, adventu...",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[comedy, funny, duringcreditsstinger, sequel, ...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[revenge, based on novel or book, CLV, charact...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],"[parent child relationship, humorous, pregnanc...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
movielens_df = pd.merge(ratings,movies_features,how='left', on = 'movieId')
movielens_df = movielens_df.drop(['title','genres','tag'],axis=1)
print(movielens_df.shape)
movielens_df.head()

(100009, 756)


Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,whimsical,wilderness,witch,witty,wizards,woman director,women,writers,zombie,zombies
0,1,1,4.0,0,0,1,1,1,1,0,...,0,0,0,1,0,0,0,0,0,0
1,1,3,4.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,4.0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,1,47,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,50,5.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(movielens_df, test_size=0.2, random_state=42)

# Determine the maximum values of userId and movieId for embedding layer sizes
max_user_id = ratings['userId'].max()
max_movie_id = ratings['movieId'].max()

# Determine the number of genre and tag features
genre_len = len(genre_df.columns)
tag_len = len(tag_df.columns)

# Prepare the input data for the neural network
train_user_ids = train_data['userId'].values
train_movie_ids = train_data['movieId'].values
train_genres = train_data.iloc[:, 3:23].values
train_tags = train_data.iloc[:, 23:].values
train_ratings = train_data['rating'].values

test_user_ids = test_data['userId'].values
test_movie_ids = test_data['movieId'].values
test_genres = test_data.iloc[:, 3:23].values
test_tags = test_data.iloc[:, 23:].values
test_ratings = test_data['rating'].values

In [17]:
print(train_user_ids.shape)
print(train_movie_ids.shape)
print(train_genres.shape)
print(train_tags.shape)
print(train_ratings.shape)

(80007,)
(80007,)
(80007, 20)
(80007, 733)
(80007,)


In [18]:
def create_hybrid_model(max_user_id, max_movie_id, genre_len, tag_len, embedding_size):
    # Input Layers
    user_input = Input(name='user_input', shape=(1,))
    movie_input = Input(name='movie_input', shape=(1,))
    genre_input = Input(name="genre_input", shape=(genre_len,))
    tag_input = Input(name="tag_input", shape=(tag_len,))

    # Collaborative Filtering
    cf_user_embedding = Embedding(name='cf_user_embedding', input_dim=max_user_id+1, output_dim=embedding_size)(user_input)
    cf_user_reshape = Reshape(name='cf_user_reshape', target_shape=(embedding_size,))(cf_user_embedding)
    cf_movie_embedding = Embedding(name='cf_movie_embedding', input_dim=max_movie_id+1, output_dim=embedding_size)(movie_input)
    cf_movie_reshape = Reshape(name='cf_movie_reshape', target_shape=(embedding_size,))(cf_movie_embedding)
    cf_user_movie_dot = Dot(name='cf_user_movie_dot', normalize=True, axes=1)([cf_user_reshape, cf_movie_reshape])

    # Neural Network
    nn_user_embedding = Embedding(name='nn_user_embedding', input_dim=max_user_id+1, output_dim=embedding_size)(user_input)
    nn_user_reshape = Reshape(name='nn_user_reshape', target_shape=(embedding_size,))(nn_user_embedding)
    nn_movie_embedding = Embedding(name='nn_movie_embedding', input_dim=max_movie_id+1, output_dim=embedding_size)(movie_input)
    nn_movie_reshape = Reshape(name='nn_movie_reshape', target_shape=(embedding_size,))(nn_movie_embedding)
    nn_concat = Concatenate()([nn_user_reshape, nn_movie_reshape])
    nn_dense = Dense(name="nn_dense", units=int(embedding_size/2), activation='relu')(nn_concat)

    # Genre Features
    genre_features = Dense(name="genre_features", units=genre_len, activation='relu')(genre_input)

    # Tag Features
    tag_features = Dense(name="tag_features", units=tag_len, activation='relu')(tag_input)

    # Merge all
    pred_ratings = Concatenate()([cf_user_movie_dot, nn_dense, genre_features, tag_features])
    pred_ratings = Dense(name="pred_ratings", units=1, activation='linear')(pred_ratings)

    # Create and compile the model
    model = Model(inputs=[user_input, movie_input, genre_input, tag_input], outputs=pred_ratings, name="hybrid_model")
    model.compile(loss='mse', optimizer='adam')

    return model

In [19]:
# Define the embedding sizes
embedding_size = 50

# Create the hybrid model
hybrid_model = create_hybrid_model(max_user_id, max_movie_id, genre_len, tag_len, embedding_size)

#training inputs
epochs = 5
batch_size = 64

# Train the model
hybrid_model.fit([train_user_ids, train_movie_ids, train_genres, train_tags], train_ratings, epochs = epochs, batch_size=batch_size, shuffle=True, validation_split=0.3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7a77616fc460>

In [20]:
predictions = hybrid_model.predict([test_user_ids, test_movie_ids, test_genres, test_tags])
print("the highest rating the model predicted is:", np.max(predictions))
rmse = np.sqrt(mean_squared_error(test_ratings, predictions))
print('RMSE:', rmse)

the highest rating the model predicted is: 5.5540023
RMSE: 0.9204621401055346


In [21]:
predictions = np.clip(predictions, a_min=0, a_max=1.0)
print("the highest rating the model predicted is:", np.max(predictions))
rmse = np.sqrt(mean_squared_error(test_ratings, predictions))
print('RMSE:', rmse)

the highest rating the model predicted is: 1.0
RMSE: 2.706790037643457


In [None]:
# Define model inputs for hyperparameters tuning
param_grid = {
    'max_user_id': [max_user_id],
    'max_movie_id': [max_movie_id],
    'genre_len': [genre_len],
    'tag_len': [tag_len],
    'embedding_size': [50, 100],
}

# Manually set a list of values for epochs and batch_size
epochs_list = [5, 10, 20]
batch_size_list = [32, 64, 128, 256]

best_score = None
best_params = {}

# Nested loops for hyperparameter tuning
for embedding_size in param_grid['embedding_size']:
    for batch_size in batch_size_list:
        for epochs in epochs_list:
            # Create the hybrid model
            hybrid_model = create_hybrid_model(max_user_id, max_movie_id, genre_len, tag_len, embedding_size)

            # Train the model
            hybrid_model.fit([train_user_ids, train_movie_ids, train_genres, train_tags], train_ratings, epochs=epochs, batch_size=batch_size, shuffle=True, validation_split=0.3)

            # Evaluate the model performance
            mse_score = -hybrid_model.evaluate([train_user_ids, train_movie_ids, train_genres, train_tags], train_ratings)

            # Keep track of the best hyperparameters based on MSE score
            if best_score is None or mse_score < best_score:
                best_score = mse_score
                best_params = {
                    'embedding_size': embedding_size,
                    'batch_size': batch_size,
                    'epochs': epochs,
                }

print("Best Hyperparameters:", best_params)
print("Best Model Score (MSE):", best_score)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20