# Movie Recommender 1.0

###### April, 2020

In [None]:
"""
Context
In this lab, you will be implementing a simple movie recommender system.
Dataset details
You will be using the ml-1m dataset from the MovieLens website.
You will be using movies.dat and ratings.dat for building your recommender.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
""" Step 0. load and preprocess data """

# Column names are taken from the dataset's README.txt. Every .dat file uses
# '::' as its field separator and has no header row.
#
# Two details apply to all three reads below:
#   * Paths use forward slashes so the same relative path resolves on
#     Windows, Linux and macOS (a backslash is only a separator on Windows).
#   * engine='python' is requested explicitly: a multi-character separator is
#     not supported by the default C parser, so pandas would otherwise fall
#     back to the python parser and emit a ParserWarning on every call.
user_col_names = 'UserID::Gender::Age::Occupation::Zip-code'.split('::')
users = pd.read_table('ml-1m/users.dat', header=None, sep='::',
                      names=user_col_names, engine='python')
users.head()
users.shape
users.info()
users.describe()


# load and preprocess movies.dat
# This file is read as ISO-8859-1 (Latin-1), as in the original call.
movie_col_names = 'MovieID::Title::Genres'.split('::')
movies = pd.read_table('ml-1m/movies.dat', header=None, sep='::',
                       names=movie_col_names, encoding='ISO-8859-1',
                       engine='python')
movies.head()
movies.shape
movies.info()
movies.describe()


# load and preprocess ratings.dat
rating_col_names = 'UserID::MovieID::Rating::Timestamp'.split('::')
ratings = pd.read_table('ml-1m/ratings.dat', header=None, sep='::',
                        names=rating_col_names, engine='python')
ratings.head()
ratings.shape
ratings.info()
ratings.describe()

In [None]:
#### load and preprocess data ####

""" Step 1. Create the rating matrix (the pivot below yields u x m: users as rows, movies as columns) """
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reshape the long ratings table into a wide grid: one row per UserID, one
# column per MovieID, each cell holding that user's Rating for that movie.
R_df = ratings.pivot(index='UserID', columns='MovieID', values='Rating')
# User/movie pairs with no rating come out of the pivot as NaN; store 0 there.
R_df = R_df.fillna(0)
R_df.head()

In [None]:
# Pull the rating grid out of pandas as a plain NumPy array (rows and columns
# keep the order they have in R_df).

# DataFrame.as_matrix() is no longer available in pandas, so the values are
# copied into a fresh array through NumPy instead.
R = np.array(R_df.to_numpy())
print(R)

In [None]:
""" Step 2. Normalize the matrix """

# Average every row of R (one row per user). The zeros stored for unrated
# movies are part of each average.
user_ratings_mean = R.mean(axis=1)
pd.DataFrame(user_ratings_mean).head()

# Centre each user's row on zero: the means are turned into a column vector
# so broadcasting subtracts one mean per row.
R_demeaned = R - user_ratings_mean[:, np.newaxis]
pd.DataFrame(R_demeaned).head()

In [None]:
""" Step 3. Compute SVD to get U, S, and V. scipy.sparse.linalg.svds() is used here (rather than np.linalg.svd()) so that only k components are computed """
from scipy.sparse.linalg import svds

# Truncated SVD of the de-meaned ratings, keeping k = 50 singular
# values/vectors.
U, sigma, Vt = svds(R_demeaned, k=50)
U.shape
# svds hands back the singular values as a 1-D array; place them on the
# diagonal of a square matrix so U, sigma and Vt can be chained with plain
# matrix products when building the predictions.
sigma = np.diag(sigma)
sigma.shape

Vt.shape
V = np.transpose(Vt)
V.shape

In [None]:
""" Step 4. From your V.T select 50 components """
# Vt comes from the svds(..., k = 50) call in Step 3, so it already holds the
# 50 requested components; show its shape followed by its contents.
print(Vt.shape, '\n\n', Vt)

In [None]:
# Recombine the three SVD factors into the rank-50 approximation of the
# de-meaned ratings, then add each user's mean back (as a column vector) so
# the values return to the original rating scale.
all_user_predicted_ratings = (U @ sigma @ Vt) + user_ratings_mean[:, np.newaxis]
all_user_predicted_ratings.shape
pd.DataFrame(all_user_predicted_ratings).head()

# Prediction matrix used to look up the top k movies: same column labels
# (MovieIDs) as R_df, with a default 0..n-1 row index.
preds_df = pd.DataFrame(data=all_user_predicted_ratings, columns=R_df.columns)
preds_df.shape
preds_df.columns
# 6040 users, 3706 movies
preds_df.head()

In [None]:
""" Step 5. Implement a function that takes movieID as input and then implement cosine similarity
            along with sorting to recommend the top 10 movies. """
            
# Cosine similarity measures how similar two vectors are irrespective of their magnitude (it depends only on the angle between them).

# use columns of preds_df as vectors to compute cosine similarity

def cos_sim(a, b):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Derived from the dot-product identity ``a . b = |a| * |b| * cos(theta)``.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, the same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b``. When either vector
        has zero length the angle is undefined; 0.0 is returned in that case
        instead of the NaN (and RuntimeWarning) a 0/0 division would produce.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the division: a NaN result would otherwise leak into any later
    # sorting of similarities, where NumPy orders NaN after every real number.
    if norm_product == 0:
        return 0.0
    return np.dot(a, b) / norm_product

def movie_cosine(movieID):
    """Return the cosine similarity between one movie and every movie.

    Each movie is represented by its column of ``preds_df`` (the predicted
    ratings across all users).

    Args:
        movieID: MovieID of the reference movie. It must be one of the
            column labels of ``preds_df``.

    Returns:
        A list with one similarity per column of ``preds_df``, in column
        order (so the entry for ``movieID`` itself is the self-similarity).

    Raises:
        KeyError: If ``movieID`` is not a column label of ``preds_df``.
    """
    if movieID not in preds_df.columns:
        raise KeyError(
            'movieID {!r} is not a column of preds_df'.format(movieID))
    # Select the reference column by label. The columns of preds_df are
    # labelled with MovieIDs, and position (movieID - 1) only coincides with
    # that label when the IDs run 1, 2, 3, ... without gaps, which nothing in
    # this notebook guarantees.
    target = preds_df[movieID]
    return [cos_sim(target, column) for _, column in preds_df.items()]

def find_top10(movieID):
    """Return the ten movies most similar to ``movieID``.

    Similarities come from ``movie_cosine``; movies are ranked from most to
    least similar and the queried movie itself is left out.

    Args:
        movieID: MovieID of the movie to find recommendations for.

    Returns:
        A 3-tuple ``(top10, '\\n', info)`` where ``top10`` is a NumPy array of
        up to ten MovieIDs ordered from most to least similar, and ``info``
        is a DataFrame with the matching rows of ``movies`` in that same
        order (columns are NaN for an ID that has no row in ``movies``).
    """
    scores = np.array(movie_cosine(movieID), dtype=float)
    # argsort places NaN after every real number, so after reversing, an
    # undefined similarity would rank first; push such entries to the bottom.
    scores = np.where(np.isnan(scores), -np.inf, scores)
    # Column labels (MovieIDs) ordered from highest to lowest similarity.
    ranked_ids = np.array(preds_df.columns)[np.argsort(scores)[::-1]]
    # Exclude the queried movie explicitly rather than assuming it is the
    # first hit, then keep the ten best of what remains.
    top10 = ranked_ids[ranked_ids != movieID][:10]
    # Match on the MovieID column: row position in `movies` is not
    # (MovieID - 1) unless the IDs are contiguous from 1. A left merge keeps
    # the similarity order of top10.
    info = pd.DataFrame({'MovieID': top10}).merge(
        movies, on='MovieID', how='left')
    return (top10, '\n', info)

# Example query: recommendations for movieID 2.
find_top10(2)

In [None]:
# Example query: recommendations for movieID 200.
find_top10(200)

In [None]:
# Example query: recommendations for movieID 3000.
find_top10(3000)