# Content Based Recommendation - Latent Factors
[Based on a dummy data set by Adam Geitgey]

In [None]:
import pandas as pd
import numpy as np

## Genres of some 34 fictional movies

In [None]:
# Load the movie catalogue. `movie_id` becomes the row index so that a movie
# can be looked up by its id with `.loc` further down.
movies_df = pd.read_csv(
    '_movies.csv',
    index_col='movie_id',
)
movies_df.head(5)

## Read 680 user movie ratings by 100 unique users

In [None]:
# Load the raw ratings table. Later cells rely on its `user_id` and `movie_id`
# columns.
ratings_df = pd.read_csv('_movie_ratings.csv')
ratings_df.head(5)

In [None]:
# Refresher on DataFrames: How do you find the number of unique users?
# `Series.nunique()` counts the distinct values in the `user_id` column.
ratings_df['user_id'].nunique()






## Convert `ratings_df` into a `user_ratings_df` matrix
Hint: use `pivot_table`

In [None]:
# Reshape the long ratings table into a user x movie matrix: one row per
# user_id, one column per movie_id, and NaN where a user has not rated a movie.
# aggfunc='max' only matters if a user rated the same movie more than once; in
# that case the highest of those ratings is kept.
# NOTE(review): no `values=` is passed, so every column other than user_id and
# movie_id is pivoted and the resulting column index keeps an extra top level
# named after it. This assumes the rating is the only other column -- confirm
# against ratings_df.head(). Later cells index the matrix by position or reuse
# `.columns` wholesale, so the extra level does not get in their way.
user_ratings_df = ratings_df.pivot_table(
    index='user_id',
    columns='movie_id',
    aggfunc='max',
)
user_ratings_df.head()

#### Note that although we view the above as a matrix, its data type is still a pandas DataFrame

In [None]:
# Compare the types of the long ratings table and the pivoted user x movie table.
type(ratings_df), type(user_ratings_df)

# Factor the `user_ratings_df` matrix

In [None]:
import matrix_factorization_utilities as m

In [None]:
# Apply matrix factorization to find the latent features

# Now, we convert the dataframe "matrix" to a real 2D numpy matrix
mat = user_ratings_df.values

# A cell only renders its last expression, so a bare `type(mat)` in the middle
# of the cell is silently discarded. Print it so that both the type and the
# array itself show up in the output.
print(type(mat))
mat

In [None]:
# Factor the ratings matrix into two smaller matrices using 15 latent features.
# Judging by how they are used below, U has one row per user and M has one
# column per movie.
U, M = m.low_rank_matrix_factorization(
    mat,
    num_features=15,
)

#### U and M are numpy matrices

In [None]:
# Check what kind of object the factorization helper returned for each factor.
type(U), type(M)

In [None]:
# Compare the two factors' dimensions: the matrix product of U and M computed
# below requires U's column count to equal M's row count.
U.shape, M.shape

### Find all predicted ratings by multiplying U by M

In [None]:
# Matrix-multiply the two factors: entry [i, j] is the predicted rating that
# the user in row i of U gives to the movie in column j of M.
predicted_ratings_np = U @ M

In [None]:
# This should match the shape of the original ratings matrix, because the
# result is wrapped with user_ratings_df's index and columns further down.
predicted_ratings_np.shape

In [None]:
# Display the raw array of predicted ratings.
predicted_ratings_np

In [None]:
# Row at position 2, i.e. the third row (NumPy indexing is 0-based). Under the
# `user_id - 1` convention used later in this notebook, that is user 3.
predicted_ratings_np[2]

### Convert the numpy matrix into a dataframe for easy viewing

In [None]:
# Wrap the predictions in a DataFrame that reuses the row and column labels of
# the original user x movie table, so values can be read by label.
predicted_ratings_df = pd.DataFrame(
    predicted_ratings_np,
    index=user_ratings_df.index,
    columns=user_ratings_df.columns,
)

In [None]:
# The same predictions, now labelled with user_ratings_df's index and columns.
predicted_ratings_df

## Find similar movies

In [None]:
# Transpose M so that each row (rather than each column) holds one movie's
# latent features; a movie's feature vector is then a plain row lookup.
MT = M.T

In [None]:
# Quick reminder of what the movie table looks like (first five rows).
movies_df.head(5)

In [None]:
# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the chosen movie's row from movies_df (movie_id is the index, so .loc
# looks it up by id)
movie_information = movies_df.loc[movie_id]

In [None]:
# Show the row of movies_df for the selected movie.
movie_information

In [None]:
#M

#### Using the features we got via matrix factorization, get the features for the movie

In [None]:
# Since movie_ids start from 1 and indexing in NumPy arrays/matrices starts from 0,
# we need to subtract 1 from movie_id.
# NOTE(review): this also assumes the movie ids are contiguous and in the same
# order as the rows of MT -- confirm against the data.

current_movie_features = MT[movie_id - 1]
current_movie_features

## How would you get this same information from `M`?

### To find similar movies we:

#### 1. Subtract the current movie's features from every other movie's features

In [None]:
# Broadcasting subtracts the chosen movie's feature vector from every row of
# MT, giving one row of per-feature differences for each movie.
difference = np.subtract(MT, current_movie_features)
difference

#### 2. Take the absolute value of that difference (so all numbers are positive)

In [None]:
# Drop the sign so that positive and negative gaps cannot cancel out when the
# per-feature differences are summed.
absolute_difference = np.absolute(difference)

#### 3. Each movie has 15 features. Sum those 15 features to get a total 'difference score' for each movie

In [None]:
# Collapse each movie's row of absolute differences into a single score by
# summing across the feature axis (axis=1).
total_difference = absolute_difference.sum(axis=1)

In [None]:
# One difference score per row of MT, i.e. one per movie.
total_difference

In [None]:
# Check how many scores there are: the count must equal len(movies_df) for the
# column assignment in the next step to succeed.
total_difference.shape

#### 4. Create a new column in the movie list with the difference score for each movie

In [None]:
# NOTE: this adds the column to movies_df in place, so every later cell that
# shows movies_df will include it. The scores come from a NumPy array and are
# therefore matched to rows by position, which assumes movies_df's rows are in
# the same order as the rows of MT -- TODO confirm.
movies_df['difference_score'] = total_difference

In [None]:
# First five rows of the movie table, now including `difference_score`.
movies_df.head(5)

#### 5. Sort the movie list by difference score, from least different to most different

In [None]:
# Smallest score first: the lower the difference score, the more similar the
# movie's latent features are to the chosen movie's.
sorted_movie_list = movies_df.sort_values(by='difference_score', ascending=True)

#### 6. The 5 most similar movies to movie_id

In [None]:
# Show the five rows with the lowest difference score. The chosen movie has a
# difference of 0 from itself, so expect it to be the first row.
sorted_movie_list[['title', 'difference_score']].head(5)

# Make a Recommendation

In [None]:
# The user to build recommendations for. Set this to any user_id that appears
# in ratings_df.
user_id = 2

In [None]:
# Keep only the ratings that were submitted by the chosen user.
reviewed_movies_df = ratings_df.loc[ratings_df['user_id'].eq(user_id)]
reviewed_movies_df

In [None]:
# Attach each reviewed movie's details by matching the `movie_id` column of the
# user's ratings against the `movie_id` index of movies_df.
reviewed_movies_joined_df = reviewed_movies_df.merge(movies_df, on='movie_id')
reviewed_movies_joined_df

In [None]:
#predicted_ratings_df

## Recommended movies

In [None]:
# Look the user's row up by label instead of by position. `user_id - 1` is only
# the right row when the user ids are exactly 1..N with no gaps, whereas
# predicted_ratings_df carries user_ratings_df's row index, so `.loc` finds the
# row that actually belongs to this user.
# `.values` keeps the result a plain NumPy array, as it was before.
user_ratings = predicted_ratings_df.loc[user_id].values
user_ratings

In [None]:
# Adds (or overwrites) a `rating` column on movies_df in place. user_ratings is
# a NumPy array, so the values are matched to rows by position; this assumes
# movies_df's row order matches the column order of the ratings matrix --
# TODO confirm.
movies_df['rating'] = user_ratings

In [None]:
# First five rows of the movie table, now including the predicted `rating`.
movies_df.head(5)

In [None]:
# First five of the ratings this user has already submitted.
reviewed_movies_df.head(5)

In [None]:
# Ids of the movies this user has already rated.
already_reviewed = reviewed_movies_df['movie_id']

# Keep only movies whose id (the index of movies_df) is NOT among the ones
# already rated, then rank what is left by predicted rating, best first.
recommended_df = (
    movies_df[~movies_df.index.isin(already_reviewed)]
    .sort_values(by='rating', ascending=False)
)

# Top five recommendations (all columns of movies_df are shown).
recommended_df.head()