# Similar Movie Recommender

### Source

- https://www.linkedin.com/learning/machine-learning-ai-foundations-recommendations

## Setup

### Configuration

In [2]:
# ID of the reference movie: the rest of the notebook looks for movies similar to this one.
# Change this value to explore a different movie.
movie_id: int = 5

### Libraries

- https://pandas.pydata.org/
- https://numpy.org/

In [3]:
import numpy as np
import pandas as pd
import matrix_factorization_utilities

## Load data

In [4]:
# Read the raw ratings file into a DataFrame and preview its first five rows
# (the preview shows one row per rating: user_id, movie_id, value).
df = pd.read_csv('data/movie_ratings_data_set.csv')
df.head()

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4


In [5]:
# Read the movie catalogue (title and genre), indexed by movie_id so that a movie
# can be looked up by its id with .loc, then preview the first five rows.
movies_df = pd.read_csv('data/movies.csv', index_col='movie_id')
movies_df.head()

Unnamed: 0_level_0,title,genre
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The Sheriff 1,"crime drama, western"
2,The Big City Judge 1,legal drama
3,The Sheriff 2,"crime drama, western"
4,Just a Regular Family,reality
5,The Big City Judge 2,legal drama


In [6]:
# Convert the running list of user ratings into a matrix: one row per user_id,
# one column per movie_id, NaN where a user has not rated a movie. If a user
# rated the same movie more than once, the highest rating is kept.
# The aggregation is given as the string 'max' rather than np.max: newer pandas
# versions emit a FutureWarning when a NumPy callable is passed here, and the
# string form produces the same result.
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc='max')

# Create a csv file of the data for easy viewing (missing ratings are written as empty fields)
ratings_df.to_csv("output/review_matrix.csv", na_rep="")

ratings_df.head(5)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0


## Build model

In [8]:
# Factor the ratings matrix into two smaller matrices, U and M, using 15 latent
# features and a regularization amount of 1.0.
# NOTE(review): the exact shapes of U and M depend on the project-local helper
# matrix_factorization_utilities, which is not shown here - confirm there.
U, M = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings_df.values,
    num_features=15,
    regularization_amount=1.0,
)

# Transpose M so that the later cells can index it one movie per row (M[row] -> that movie's features)
M = np.transpose(M)

Optimization terminated successfully.
         Current function value: 312.762757
         Iterations: 1582
         Function evaluations: 2365
         Gradient evaluations: 2365


## Prediction

### Get current movie features

In [9]:
# Get the name and genre of the movie selected by movie_id
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Find the row of M that belongs to this movie. M was factorized from
# ratings_df.values, so its rows are taken to follow the order of ratings_df's
# movie_id columns. Looking the position up by id (instead of using
# movie_id - 1) stays correct when movie ids are not contiguous or do not start
# at 1, and raises a KeyError if the movie has no ratings column at all rather
# than silently picking another movie's features.
movie_row = ratings_df.columns.get_level_values('movie_id').get_loc(movie_id)

# Get the features for the selected movie that we found via matrix factorization
current_movie_features = M[movie_row]

print("The attributes for this movie are:")
print(current_movie_features)

We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.66540002 -0.82926466 -0.72701454  0.52204921 -0.84848216 -1.84147034
 -0.78728766  0.25962615 -0.11966971  0.11401201 -0.15075569 -0.17696512
 -0.23309564 -0.81290838  1.08279139]


### Finding similar movies

In [11]:
# 1. Subtract the current movie's features from every movie's features
#    (broadcasts current_movie_features across the rows of M)
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Sum the per-feature differences of each movie to get a total 'difference score'
#    for each movie (0 means identical features)
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie.
#    The scores are labelled with the movie ids of the ratings matrix columns first,
#    so pandas matches them to movies_df by movie_id instead of by row position.
#    This keeps titles and scores aligned even if movies.csv is in a different order
#    or lists a movie with no ratings (such a movie gets NaN and sorts last).
movies_df['difference_score'] = pd.Series(
    total_difference,
    index=ratings_df.columns.get_level_values('movie_id'),
)

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the result, showing the 5 lowest difference scores for the selected movie_id.
#    The selected movie itself is part of the list, with a score of 0.
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])

The five most similar movies are:
                            title  difference_score
movie_id                                           
5            The Big City Judge 2          0.000000
10        Surrounded by Zombies 1          1.872421
9                     Biker Gangs          2.599769
3                   The Sheriff 2          2.695851
24           The Big City Judge 3          2.787371
