# Movie Collaborative Recommender

### Libraries
- https://pandas.pydata.org/
- http://www.numpy.org/
- https://docs.python.org/3/library/pickle.html

### Source
- https://www.linkedin.com/learning/machine-learning-ai-foundations-recommendations

In [1]:
# Configuration

# user_id to get recommendations for (between 1 and 100)
user_id_to_search = 1

In [2]:
# Libraries

import os
import pickle
import webbrowser
from pathlib import Path

import numpy as np
import pandas as pd

import matrix_factorization_utilities

## Load data

In [3]:
# Load user ratings
# The preview below shows one row per rating, with columns user_id, movie_id and value.
raw_dataset_df = pd.read_csv('data/movie_ratings_data_set.csv')
raw_dataset_df.head()

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4


In [4]:
# Load movie titles
# movie_id becomes the index so later cells can look movies up (and join) by id.
movies_df = pd.read_csv('data/movies.csv', index_col='movie_id')
movies_df.head()

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"
4,Just a Regular Family,reality
5,The Big City Judge 2,legal drama


In [5]:
# Convert the running list of user ratings into a matrix:
# one row per user_id, one column per movie_id, NaN where a user has not rated a movie.
# If a (user, movie) pair appears more than once, the highest rating is kept.
# The aggregation is passed by name ('max') rather than as np.max: recent pandas
# versions warn when handed the NumPy callable and recommend the string instead.
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')
ratings_df.head()

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0


## Build model

In [6]:
# Apply matrix factorization to find the latent features.
# The project helper returns two factor matrices, U and M, whose product is the
# dense matrix of predicted ratings.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(ratings_df.values,
                                                                    num_features=15,
                                                                    regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M matrices
predicted_ratings = np.matmul(U, M)

# Label the predictions with the same user_id rows / movie_id columns as the input matrix
predicted_ratings_df = pd.DataFrame(index=ratings_df.index, columns=ratings_df.columns, data=predicted_ratings)

# Save all the ratings to a csv file.
# The folder is created first: to_csv does not create missing directories, so a
# fresh checkout without output/ would otherwise fail here.
os.makedirs("output", exist_ok=True)
predicted_ratings_df.to_csv("output/predicted_ratings.csv")

predicted_ratings_df.head()

         Current function value: 32.504364
         Iterations: 3000
         Function evaluations: 4477
         Gradient evaluations: 4477


Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.003219,3.920675,4.155678,3.527622,3.920314,4.142751,2.655788,4.066306,4.010429,3.805244,...,3.797055,3.984582,3.33024,3.990531,3.457332,3.108841,2.723685,3.356601,3.040114,4.570281
2,4.970693,4.967969,4.445326,3.903774,4.831247,4.706595,3.394144,4.698896,4.53167,4.837432,...,4.314343,4.494218,4.414652,4.163664,3.462445,4.388993,2.994315,4.346963,4.013346,3.99941
3,4.017211,3.985701,4.955713,4.088405,4.528626,4.271157,2.818616,4.337878,4.491406,4.367755,...,4.080597,4.316983,3.879431,4.474723,3.205893,3.238396,2.233283,2.591451,3.445053,4.599094
4,4.991065,4.987251,4.928978,4.961957,5.010571,4.961595,3.945017,5.076058,4.951922,5.017218,...,4.062708,4.741866,4.503092,4.906702,3.116599,3.413158,2.008294,4.223563,4.171658,4.64432
5,4.980619,4.24051,5.461142,4.373737,5.162612,4.555445,3.15028,4.638079,4.982059,4.918989,...,4.506947,4.723241,3.737432,5.073127,3.017239,3.61039,2.996268,2.031798,4.980752,4.987311


## Save model

In [7]:
# Save features and predicted ratings to files for later use.
# Make sure the target folder exists so a fresh checkout does not fail on open().
os.makedirs("models", exist_ok=True)

# Each file is written inside a `with` block so the handle is flushed and closed
# before the "Prediction" section reads the same files back.
with open("models/user_features.dat", "wb") as f:
    pickle.dump(U, f)

with open("models/product_features.dat", "wb") as f:
    pickle.dump(M, f)

with open("models/predicted_ratings.dat", "wb") as f:
    pickle.dump(predicted_ratings, f)

## Validate model

In [8]:
# Measure accuracy with training, testing datasets

# Load user ratings
raw_training_dataset_df = pd.read_csv('data/movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('data/movie_ratings_data_set_testing.csv')

# Convert the running list of user ratings into a matrix
# ('max' by name: recent pandas versions warn when given np.max as the aggregator)
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='movie_id', aggfunc='max')
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Put the test matrix on exactly the same user/movie grid as the training matrix.
# The predictions below have the training matrix's shape, so without this step a
# test split that is missing (or reorders) a user or movie would be compared
# against the wrong cells, or fail on a shape mismatch. Cells with no test rating
# stay NaN; users/movies absent from training are dropped because the model has
# no prediction for them.
ratings_testing_df = ratings_testing_df.reindex(index=ratings_training_df.index,
                                                columns=ratings_training_df.columns)

# Apply matrix factorization to find the latent features.
# The results are kept under validation_* names so this held-out experiment does
# not overwrite U, M and predicted_ratings from the full model built earlier.
validation_U, validation_M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_training_df.values,
    num_features=11,
    regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
validation_predicted_ratings = np.matmul(validation_U, validation_M)

# Measure RMSE
# NOTE(review): assumes the RMSE helper ignores NaN (unrated) cells - confirm in
# matrix_factorization_utilities.
rmse_training = matrix_factorization_utilities.RMSE(ratings_training_df.values, validation_predicted_ratings)
rmse_testing = matrix_factorization_utilities.RMSE(ratings_testing_df.values, validation_predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))

Optimization terminated successfully.
         Current function value: 315.538580
         Iterations: 875
         Function evaluations: 1306
         Gradient evaluations: 1306
Training RMSE: 0.24952560869598406
Testing RMSE: 1.2096509829672488


## Prediction

In [9]:
# Load prediction rules from data files.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing, so
# only load .dat files that this notebook (or another trusted source) produced.
# `with` blocks make sure every file handle is closed once its data is read.
with open("models/user_features.dat", "rb") as f:
    U = pickle.load(f)

with open("models/product_features.dat", "rb") as f:
    M = pickle.load(f)

with open("models/predicted_ratings.dat", "rb") as f:
    predicted_ratings = pickle.load(f)

# Load movie titles (re-read from disk, indexed by movie_id)
movies_df = pd.read_csv('data/movies.csv', index_col='movie_id')

In [10]:
print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

# Every rating this user has given, with title/genre attached by movie_id
reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on='movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])

print("\nMovies we will recommend:")

# Find the user's row by label rather than by `user_id - 1`.
# The positional shortcut is only right when user ids are exactly 1..N with no
# gaps; for user_id 0 it would silently return the LAST user's predictions.
if user_id_to_search not in ratings_df.index:
    raise KeyError("user_id {} has no row in the ratings matrix".format(user_id_to_search))
user_row = ratings_df.index.get_loc(user_id_to_search)

# Label each predicted rating with its movie_id so that the assignment below
# aligns on movie_id instead of on row position. A movie listed in movies.csv
# that has no column in the ratings matrix gets NaN rather than shifting every
# other rating onto the wrong title (or raising a length-mismatch error).
user_ratings = pd.Series(predicted_ratings[user_row],
                         index=ratings_df.columns.get_level_values('movie_id'),
                         name='rating')
movies_df['rating'] = user_ratings

# Drop what the user has already rated, then rank the rest by predicted rating
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[~movies_df.index.isin(already_reviewed)]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))

Movies previously reviewed by user_id 1:
               title                  genre  value
0      The Sheriff 4   crime drama, western      4
1  Mafia Underground  crime drama, thriller      4
2        Biker Gangs    crime drama, action      4
3      The Sheriff 1   crime drama, western      4
4     The Spy Family              spy drama      4
5      The Sheriff 3   crime drama, western      5

Movies we will recommend:
                              title                  genre    rating
movie_id                                                            
34            The Serious Detective        detective drama  4.570281
3                     The Sheriff 2   crime drama, western  4.155678
6                 Attack on Earth 1         sci-fi, action  4.142751
19        Fake News about Fake News         satire, comedy  4.119082
8          Sci-Fi Murder Detectives  supernatural, mystery  4.066306


## Troubleshooting

In [11]:
# Create a web page view of the data for easy viewing (first 100 rows only)
html = raw_dataset_df[0:100].to_html()

# Save the html to a temporary file, creating the output folder if it is missing
os.makedirs("output", exist_ok=True)
with open("output/data.html", "w") as f:
    f.write(html)

# Open the web page in our web browser.
# Path.as_uri() builds a well-formed file:// URL (correct slashes, percent-encoded
# spaces and special characters); plain string formatting does not on Windows or
# for paths containing spaces.
full_filename = os.path.abspath("output/data.html")
webbrowser.open(Path(full_filename).as_uri())

True

In [12]:
# Create a web page view of the movie list for easy viewing
html = movies_df.to_html()

# Save the html to a temporary file, creating the output folder if it is missing
os.makedirs("output", exist_ok=True)
with open("output/movie_list.html", "w") as f:
    f.write(html)

# Open the web page in our web browser.
# Path.as_uri() builds a well-formed file:// URL (correct slashes, percent-encoded
# spaces and special characters); plain string formatting does not on Windows or
# for paths containing spaces.
full_filename = os.path.abspath("output/movie_list.html")
webbrowser.open(Path(full_filename).as_uri())

True

In [13]:
# Create a web page view of the review matrix for easy viewing
# (unrated cells are rendered as blanks instead of "NaN")
html = ratings_df.to_html(na_rep="")

# Save the html to a temporary file, creating the output folder if it is missing
os.makedirs("output", exist_ok=True)
with open("output/review_matrix.html", "w") as f:
    f.write(html)

# Open the web page in our web browser.
# Path.as_uri() builds a well-formed file:// URL (correct slashes, percent-encoded
# spaces and special characters); plain string formatting does not on Windows or
# for paths containing spaces.
full_filename = os.path.abspath("output/review_matrix.html")
webbrowser.open(Path(full_filename).as_uri())

True