In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Concatenate, Embedding, Reshape, Dot
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MultiLabelBinarizer

In [20]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to tell a high-RAM runtime from a standard one.
runtime_message = 'Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!'
print(runtime_message)

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
# Mount Google Drive at /content/drive so the data files referenced by later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# Step 1: Load and preprocess the MovieLens dataset
# Folder on the mounted Drive that holds the MovieLens CSV files.
file_path = '/content/drive/MyDrive/Colab Notebooks/FINAL Project/data/ml-latest'

# Read the movie catalogue and turn the pipe-delimited genre string into a list of genres per movie.
movies = pd.read_csv(f'{file_path}/movies.csv').assign(
    genres=lambda frame: frame['genres'].str.split('|')
)
print(movies.shape)
movies.head()

(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [19]:
# Read the user ratings; the timestamp column is not used anywhere in this notebook, so drop it on load.
ratings = pd.read_csv(f'{file_path}/ratings.csv').drop(columns='timestamp')
print(ratings.shape)
ratings.head()

(27753444, 3)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [22]:
# Take a smaller sample of user ratings
# Even with the colab GPU there isn't enough compute for whole data set!
# Note: this keeps the first `sample_size` rows of the file; it is not a random draw.
sample_size = 2_000_000
ratings = ratings.iloc[:sample_size].copy()
print(ratings.shape)
ratings.head()

(2000000, 3)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


In [23]:
# Load the user-applied tags from the pre-filtered tags file.
tags_csv = f'{file_path}/tags_filtered_200.csv'
tags = pd.read_csv(tags_csv)
print(tags.shape)
tags.head()

(607432, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,260,sci-fi,1442169410
2,14,318,imdb top 250,1442615195
3,14,318,justice,1442615192
4,14,1682,philosophy,1442615158


In [35]:
# Make sure all dataframes have the same movies included.
# Only movieIds present in ALL three frames (catalogue, ratings and tags) are kept, so every
# remaining rating can later be joined to a complete row of movie features. Filtering against
# a single shared id set also guarantees the three frames agree, which pairwise filtering
# does not (e.g. a rated + tagged movie that is missing from the catalogue).
common_movie_ids = (
    set(movies['movieId']) & set(ratings['movieId']) & set(tags['movieId'])
)

# Boolean-mask filtering keeps the original row order; indexes are rebuilt to 0..n-1
# because later cells rely on positional alignment between frames.
movies = movies[movies['movieId'].isin(common_movie_ids)].reset_index(drop=True)
tags = tags[tags['movieId'].isin(common_movie_ids)].reset_index(drop=True)
ratings = ratings[ratings['movieId'].isin(common_movie_ids)].reset_index(drop=True)

In [36]:
# Report how many distinct users, films and tags remain after filtering.
# The three film counts are expected to match if the frames were filtered consistently.
print(f"There are: {ratings['userId'].nunique()} unique users")
print(f"There are: {ratings['movieId'].nunique()} unique films in the ratings dataframe")
print(f"There are: {movies['movieId'].nunique()} unique films in the movies dataframe")
print(f"There are: {tags['movieId'].nunique()} unique films in the tags dataframe")
print(f"There are: {tags['tag'].nunique()} unique tags in the tags dataframe")

There are: 20503 unqiue users
There are: 21674 unqiue films in the ratings dataframe
There are: 21674 unqiue films in the movies dataframe
There are: 21674 unqiue films in the tags dataframe
There are: 876 unqiue tags in the tags dataframe


In [37]:
# #remove tags that occur fewer than 200 times from the tags dataframe
# from collections import Counter

# #get values for every tag and count the occurrences
# all_tags = tags['tag'].values
# tag_counts = Counter(all_tags)

# # Sort the tags by their frequencies in descending order
# sorted_tags = sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

# # Print the sorted tags(its a very long output)
# # for tag, count in sorted_tags:
# #     print(tag, count)

# # Create a list of tags to remove (occurring less than 200 times)
# no_of_tag_occurances = 200
# tags_to_remove = [tag for tag, count in tag_counts.items() if count < no_of_tag_occurances]
# print(len(tags_to_remove), "tags to be removed from dataframe")

# #iterate through tags dataframe and find the index numbers of rows to be removed
# index_to_remove = []
# for num, tag in enumerate(tags['tag']):
#     if tag in tags_to_remove:
#         index_to_remove.append(num)

# tags_filtered = tags.drop(index_to_remove)
# tags_filtered = tags_filtered.reset_index(drop=True)
# print(tags_filtered.shape)
# tags_filtered.head()

In [38]:
# tags_filtered.to_csv(f'{file_path}/tags_filtered_200.csv', index=False)

# Join Data

In [39]:
# Collect every tag applied to a movie into one list per movieId, then attach that list to the movie rows.
tags_agg = tags.groupby('movieId')['tag'].agg(list)
movies_tag_df = movies.merge(tags_agg, how='inner', on='movieId')

# The aggregated lists can repeat a tag; round-trip through a set so each tag appears once per movie.
movies_tag_df['tag'] = movies_tag_df['tag'].apply(
    lambda tag_list: list({label for label in tag_list})
)

# Show the size and first rows of the joined frame.
print(movies_tag_df.shape)
movies_tag_df.head()

(21674, 4)


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[bullying, USA, friendship, cute, humorous, An..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Robin Williams, comedy, childish, based on a ..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[comedy, funny, duringcreditsstinger, sequel, ..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[revenge, divorce, chick flick, CLV, based on ..."
4,5,Father of the Bride Part II (1995),[Comedy],"[remake, parent child relationship, family, Co..."


In [40]:
# Multi-hot encode the per-movie genre lists: one 0/1 column per distinct genre.
mlb = MultiLabelBinarizer()

# Array of genre lists, one entry per row of movies_tag_df.
genres = movies_tag_df['genres'].values
genre_features = mlb.fit_transform(genres)

# Column labels, in the order the binarizer laid out the encoded matrix.
genre_classes = mlb.classes_

# Wrap the encoded matrix in a dataframe (default 0..n-1 index) and preview it.
genre_df = pd.DataFrame(data=genre_features, columns=genre_classes)
genre_df.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [41]:
# Extract the tag information.
# Stored under its own name: rebinding `tags` here would replace the tags DataFrame with a
# NumPy array and break any earlier cell that is re-run (e.g. tags['movieId'], tags.groupby).
tag_lists = movies_tag_df['tag'].values

# Perform multi-hot encoding on the tag lists: one 0/1 column per distinct tag.
# `mlb` is rebound here, so from this point on it refers to the tag binarizer.
mlb = MultiLabelBinarizer()
tag_features = mlb.fit_transform(tag_lists)

# Retrieve the list of tag classes (the column order of the encoded matrix)
tag_classes = mlb.classes_

# Create a new dataframe with the encoded tag features
tag_df = pd.DataFrame(tag_features, columns=tag_classes)
tag_df.head()

Unnamed: 0,007,1920s,1930s,1950s,1960s,1970s,1980s,19th century,3d,70mm,...,witch,witty,wizards,woman director,women,world war ii,writer,writers,zombie,zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Put the movie metadata, genre indicators and tag indicators side by side, one row per movie.
feature_blocks = [movies_tag_df, genre_df, tag_df]
movies_features = pd.concat(feature_blocks, axis='columns')
print(movies_features.shape)
movies_features.head()

(21674, 900)


Unnamed: 0,movieId,title,genres,tag,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,witch,witty,wizards,woman director,women,world war ii,writer,writers,zombie,zombies
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[bullying, USA, friendship, cute, humorous, An...",0,0,1,1,1,1,...,0,1,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[Robin Williams, comedy, childish, based on a ...",0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[comedy, funny, duringcreditsstinger, sequel, ...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]","[revenge, divorce, chick flick, CLV, based on ...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],"[remake, parent child relationship, family, Co...",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Attach each rating's movie features, then discard the descriptive columns that are not model inputs.
movielens_df = (
    ratings
    .merge(movies_features, how='left', on='movieId')
    .drop(columns=['title', 'genres', 'tag'])
)
print(movielens_df.shape)
movielens_df.head()

(1985585, 899)


Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,witch,witty,wizards,woman director,women,world war ii,writer,writers,zombie,zombies
0,1,307,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,481,3.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,1091,1.5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1257,4.5,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1449,4.5,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [44]:
# Min-max scale the ratings into the range 0..1.
# The extremes are kept in max_rating / min_rating because later cells use them to undo the scaling.
# Note: this rescales movielens_df['rating'] in place, so the cell should only be run once per load.
max_rating = movielens_df['rating'].max()
min_rating = movielens_df['rating'].min()
print(max_rating)
print(min_rating)

movielens_df['rating'] = movielens_df['rating'].sub(min_rating).div(max_rating - min_rating)
movielens_df.head()

5.0
0.5


Unnamed: 0,userId,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,witch,witty,wizards,woman director,women,world war ii,writer,writers,zombie,zombies
0,1,307,0.666667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,481,0.666667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,1091,0.222222,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1257,0.888889,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1449,0.888889,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [45]:
# Split the dataset into training and testing sets (80/20, fixed seed for repeatability)
train_data, test_data = train_test_split(movielens_df, test_size=0.2, random_state=42)

# Determine the maximum values of userId and movieId for embedding layer sizes
# (raw ids are fed to the embeddings directly, so the tables must cover 0..max id).
max_user_id = ratings['userId'].max()
max_movie_id = ratings['movieId'].max()

# Determine the number of genre and tag features
genre_len = len(genre_df.columns)
tag_len = len(tag_df.columns)

# Column layout of movielens_df: userId, movieId, rating, then the genre columns, then the
# tag columns. The slice bounds are derived from genre_len so they stay correct if the
# number of genres changes, instead of hard-coding 3:23 / 23:.
feature_start = 3
genre_stop = feature_start + genre_len

# Prepare the input data for the neural network
train_user_ids = train_data['userId'].values
train_movie_ids = train_data['movieId'].values
train_genres = train_data.iloc[:, feature_start:genre_stop].values
train_tags = train_data.iloc[:, genre_stop:].values
train_ratings = train_data['rating'].values

test_user_ids = test_data['userId'].values
test_movie_ids = test_data['movieId'].values
test_genres = test_data.iloc[:, feature_start:genre_stop].values
test_tags = test_data.iloc[:, genre_stop:].values
test_ratings = test_data['rating'].values

In [46]:
print(train_user_ids.shape)
print(train_movie_ids.shape)
print(train_genres.shape)
print(train_tags.shape)
print(train_ratings.shape)

(1588468,)
(1588468,)
(1588468, 20)
(1588468, 876)
(1588468,)


# Define and Run Model

In [48]:
# Hybrid recommender: a collaborative-filtering branch (matrix factorisation plus a small
# neural network over user/movie embeddings) and a content-based branch (genre and tag
# indicator vectors), concatenated and mapped to a single predicted rating.

# Define the embedding sizes for users and movies
embedding_size = 50

############# COLLABORATIVE FILTERING ###############################################################################################
# Input Layers: one raw user id and one raw movie id per sample
user_input = Input(name='user_input', shape=(1,))
movie_input = Input(name='movie_input', shape=(1,))


# Matrix Factorization
## embed & reshape
## input_dim is max id + 1 because the raw ids are used directly as embedding indices.
cf_user_embedding = Embedding(name='cf_user_embedding', input_dim = max_user_id+1, output_dim = embedding_size)(user_input)
cf_user_reshape = Reshape(name = 'cf_user_reshape', target_shape=(embedding_size,))(cf_user_embedding)
cf_movie_embedding = Embedding(name='cf_movie_embedding', input_dim = max_movie_id+1, output_dim = embedding_size)(movie_input)
cf_movie_reshape = Reshape(name = 'cf_movie_reshape', target_shape=(embedding_size,))(cf_movie_embedding)

## Dot Product of users and movie embeddings
## normalize=True L2-normalises both vectors first, so the result is their cosine similarity.
cf_user_movie_dot = Dot(name = 'cf_user_movie_dot', normalize=True, axes=1)([cf_user_reshape, cf_movie_reshape])

#Neural Network
## embed & reshape (separate embedding tables from the matrix-factorisation branch)
nn_user_embedding = Embedding(name='nn_user_embedding', input_dim = max_user_id+1, output_dim = embedding_size)(user_input)
nn_user_reshape = Reshape(name = 'nn_user_reshape', target_shape=(embedding_size,))(nn_user_embedding)
nn_movie_embedding = Embedding(name='nn_movie_embedding', input_dim = max_movie_id+1, output_dim = embedding_size)(movie_input)
nn_movie_reshape = Reshape(name = 'nn_movie_reshape', target_shape=(embedding_size,))(nn_movie_embedding)

## concat & dense
nn_concat = Concatenate()([nn_user_reshape, nn_movie_reshape])
nn_dense = Dense(name="nn_dense", units=int(embedding_size/2), activation='relu')(nn_concat)



############# CONTENT BASED ###############################################################################################

#Genre Features
# Defined exactly once: a second, identical pair of layers would leave an orphaned Input and
# Dense carrying the same layer names as the ones actually wired into the model.
genre_input = Input(name="genre_input", shape=(genre_len,))
genre_features = Dense(name="genre_features", units=genre_len, activation='relu')(genre_input)



#TAG Features
tag_input = Input(name="tag_input", shape=(tag_len,))
tag_features = Dense(name="tag_features", units=tag_len, activation='relu')(tag_input)


############# OUTPUT ######################################################################################################


# Merge all
pred_ratings = Concatenate()([cf_user_movie_dot, nn_dense, genre_features, tag_features])
# Linear output: predictions are not bounded to the 0..1 target range, so later cells clip them.
pred_ratings = Dense(name="pred_ratings", units=1, activation='linear')(pred_ratings)

# Compile
hybrid_model = Model(inputs=[user_input, movie_input,genre_input,tag_input], outputs=pred_ratings, name="hybrid_model")
hybrid_model.compile(loss='mse', optimizer='adam')

In [49]:
# Training hyper-parameters.
epochs = 5
# Previously declared as 32 but never used: fit() was called with a hard-coded 128.
# The variable now holds the value that was actually used and is passed through below.
batch_size = 128

# Train the model
# validation_split=0.3: Keras holds out the final 30% of the supplied arrays for validation
# (taken before shuffling); the rest is shuffled each epoch.
hybrid_model.fit([train_user_ids, train_movie_ids, train_genres, train_tags], train_ratings, epochs = epochs, batch_size=batch_size, shuffle=True, validation_split=0.3)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ccb202c5f90>

In [50]:
# Score the held-out test set with the trained model (ratings are still on the 0..1 scale here).
test_inputs = [test_user_ids, test_movie_ids, test_genres, test_tags]
predictions = hybrid_model.predict(test_inputs)

# The output layer is linear, so the largest prediction shows whether the model overshoots 1.0.
print("the highest rating the model predicted is:", predictions.max())

# Root-mean-squared error between true and predicted normalised ratings.
rmse = np.sqrt(mean_squared_error(test_ratings, predictions))
print('RMSE:', rmse)

the highest rating the model predicted is: 1.3184536
RMSE: 0.2045729164900602


In [51]:
# Force predictions into the valid normalised range [0, 1] and re-score.
predictions = predictions.clip(min=0, max=1.0)
print("the highest rating the model predicted is:", predictions.max())

# RMSE after clipping, for comparison with the unclipped score from the previous cell.
rmse = np.sqrt(mean_squared_error(test_ratings, predictions))
print('RMSE:', rmse)

the highest rating the model predicted is: 1.0
RMSE: 0.20384048145209618


# Recommend Movies

In [52]:
# Recommend the top "n" highest-predicted films that the test user has not rated yet.
n = 3
test_user = 222

# Per-movie model inputs. movie_ids and movie_titles come from the same frame, and
# genre_df / tag_df are the blocks that frame was concatenated from, so position i
# refers to the same movie in all four arrays.
movie_ids = movies_features['movieId'].values
movie_titles = movies_features['title'].values
movie_genres = genre_df.values
movie_tags = tag_df.values

# Movies the user has already rated (taken from the full rated set, train and test alike)
user_seen_movies = movielens_df[movielens_df['userId'] == test_user]['movieId'].values

# Get predictions for all movies for the selected user; the user id is repeated once per movie
user_id_column = np.full(len(movie_ids), test_user)
user_predictions = hybrid_model.predict([user_id_column, movie_ids, movie_genres, movie_tags])

# Unnormalize the ratings back to the original star scale
user_predictions = user_predictions * (max_rating - min_rating) + min_rating

# Rank movie positions by predicted rating, best first. Ranking happens BEFORE clipping so
# that predictions above 5.0 keep their relative order; flattening gives a plain 1-D
# array of positions instead of a column of one-element arrays.
top_movie_indices = user_predictions.ravel().argsort()[::-1]

# Clip ratings so ratings above 5.0 are displayed as 5 star ratings
user_predictions = np.clip(user_predictions, a_min=None, a_max=5.0)

# Filter out the movies that the user has already seen from the recommendations
# (one vectorised membership test instead of a Python-level `in` per movie),
# then limit the recommendations to the top "n" unseen movies.
unseen_mask = ~np.isin(movie_ids[top_movie_indices], user_seen_movies)
top_unseen_movie_indices = top_movie_indices[unseen_mask][:n]

# Print the title and predicted rating of the top "n" movies
print(f"Top {n} Recommended Films for User", test_user)
for index in top_unseen_movie_indices:
    movie_title = movie_titles[index]
    rating = user_predictions[index, 0]
    print("Movie:", movie_title, "| Predicted Rating:", rating)

Top 3 Recommended Films for User 222
Movie: Five, The (Gonin) (1995) | Predicted Rating: 5.0
Movie: Pale Rider (1985) | Predicted Rating: 5.0
Movie: Planet Earth (2006) | Predicted Rating: 5.0


In [53]:
# Build a table of every film the test user has rated and append the model's predicted rating.

# Ratings made by the test user. .copy() makes this an independent frame, so the column
# assignment below cannot write into (or warn about) a slice of movielens_df.
user_ratings = movielens_df[movielens_df['userId'] == test_user][['userId', 'movieId', 'rating']].copy()
# Convert the stored 0..1 ratings back to the original star scale
user_ratings['rating'] = user_ratings['rating'] * (max_rating - min_rating) + min_rating
user_ratings = pd.merge(user_ratings, movies,on = 'movieId', how='left')
user_films_viewed = user_ratings['movieId'].values

# Create an array of the user ID to match the shape of user_films_viewed
user_id_array = np.full(len(user_films_viewed), test_user)

# Look the genre & tag features up BY movieId, in the exact order of user_films_viewed.
# Selecting with isin() would return rows in movies_features order instead, which only
# lines up with user_films_viewed if the user's ratings happen to be in that same order.
features_by_movie = movies_features.set_index('movieId')
user_movie_features = features_by_movie.loc[user_films_viewed]

# With movieId moved into the index the leading columns are title, genres, tag; the genre
# indicator columns follow and the tag indicator columns take up the remainder.
genre_start = 3
genre_stop = genre_start + genre_len
user_movie_genres = user_movie_features.iloc[:, genre_start:genre_stop].values
user_movie_tags = user_movie_features.iloc[:, genre_stop:].values


# Get the model predictions for the user and movie IDs, rescaled to stars and clipped
user_predictions = hybrid_model.predict([user_id_array, user_films_viewed, user_movie_genres, user_movie_tags])
user_predictions = user_predictions * (max_rating - min_rating) + min_rating
user_predictions = np.clip(user_predictions, a_min=0, a_max=5.0)

# Add a column for predicted rating to the data frame (flattened from (n, 1) to (n,))
user_ratings['predicted_rating'] = user_predictions.ravel()
user_ratings.sort_values(by='rating',ascending=False).head(20)



Unnamed: 0,userId,movieId,rating,title,genres,predicted_rating
0,222,19,5.0,Ace Ventura: When Nature Calls (1995),[Comedy],3.456064
66,222,76251,5.0,Kick-Ass (2010),"[Action, Comedy]",3.337032
86,222,89745,5.0,"Avengers, The (2012)","[Action, Adventure, Sci-Fi, IMAX]",4.56489
78,222,87483,5.0,Mr. Popper's Penguins (2011),[Comedy],3.939748
77,222,86882,5.0,Midnight in Paris (2011),"[Comedy, Fantasy, Romance]",4.799458
76,222,86880,5.0,Pirates of the Caribbean: On Stranger Tides (2...,"[Action, Adventure, Fantasy, IMAX]",4.8852
75,222,86332,5.0,Thor (2011),"[Action, Adventure, Drama, Fantasy, IMAX]",4.497845
73,222,85788,5.0,Insidious (2010),"[Fantasy, Horror, Thriller]",4.032255
70,222,80549,5.0,Easy A (2010),"[Comedy, Romance]",4.854212
69,222,80463,5.0,"Social Network, The (2010)",[Drama],4.84447
