<h1>Basic Recommender System</h1>

<h2>Imports</h2>

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz
from random import random
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split



<h2>Functions</h2>

In [2]:
def filter_data():
    """
    Function to ready data for recommendation. Returns a movie-user matrix 
    of interesting data. This data is a matrix of all userId's on the columns
    and all movieId's as the rows. Values are ratings where the all movie values have
    more than 50.
    
    Returns:
    movie_user_matrix (scipy.sparse.csr.csr_matrix)
    hashmap (dict): holds movie title and index of that movie
    """
    
    # Read data
    movies = pd.read_csv("data/movies.csv",usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
    ratings = pd.read_csv("data/ratings.csv", usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
    
    # The count of each movie 
    movies_count = pd.DataFrame(ratings.groupby('movieId').size(), columns=['count'])

    # Get popular movies that only have more than 50 ratings
    popular = list(set(movies_count.query('count >= 50').index))
    movies_filtered = ratings.movieId.isin(popular).values

    # The count of each user
    users_count = pd.DataFrame(ratings.groupby('userId').size(), columns=['count'])
    
    # Get popular users that have more than 50 ratings
    active_users = list(set(users_count.query('count >= 50').index))
    users_filtered = ratings.userId.isin(active_users).values
    
    ratings_filtered = ratings[movies_filtered & users_filtered]
    
    # Create movie-user matrix which will be needed for KNN, empty values are set to 0
    movie_user = ratings_filtered.pivot(index='movieId', columns='userId', values='rating').fillna(0)
        
    # Hashmap that point from movie title to index
    hashmap = {
        movie: i for i, movie in 
        enumerate(list(movies.set_index('movieId').loc[movie_user.index].title))
    }
    
    # Since movie_user matrix is very sparse
    # we transform matrix to scipy sparse matrix 
    movie_user_matrix = csr_matrix(movie_user.values)
    
    
    
    return movie_user_matrix, hashmap

In [3]:
def recommend(model, movie_user_matrix, movie):
    """
    Recommends 10 movies for the user based on a movie name they input
    
    Arguments:
    model ('sklearn.neighbors._unsupervised.NearestNeighbors')
    movie_user_matrix (scipy.sparse.csr.csr_matrix): compressed sparse matrix
    movie (int): index of the most similar movie title to the user_movie string
    
    Returns:
    recommendations (list)
    """
    
    # Fit the data to the model
    model.fit(movie_user_matrix)
    
    distance, index = model.kneighbors(movie_user_matrix[movie], n_neighbors=11)
    
    # Recommendations
    recommendations = \
            sorted(
                list(
                    zip(
                        index.squeeze().tolist(),
                        distance.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]
    
    return recommendations

In [4]:
def find_movie(hashmap, user_movie):
    """
    Finds the movie name in the dictionary
    
    Arguments:
    hashmap (dict): holds movie title and index of that movie
    user_movie (string): user entered movie title
    
    Returns:
    movie (int): index of the most similar movie title to the user_movie string
                 returns -1 if movie not found
    """
    
    movies = []
    
    for key, val in hashmap.items():
        # Uses fuzzywuzzy to get similarity in name and picks closest match
        ratio = fuzz.ratio(key.lower(), user_movie.lower())
        
        if ratio >= 60:
            movies.append((key, val, ratio))
    
    movies = sorted(movies, key=lambda x: x[2])[::-1]

    if not movies:
        # change this to a special character
        movie = -1
    else:
        movie = movies[0][1]
        
    return movie

<h2>Testing</h2>

In [7]:
movie_user_matrix, hashmap = filter_data()
# movie_user_matrix = movie_user_matrix.transpose()
# hold all value indices as a list of tuples
# all_cell_indices = indices_of_matrix(movie_user_matrix)
# random_cells = get_random_indices(all_cell_indices, 0.2)

user_input = input("Enter a movie title: ")
movie = find_movie(hashmap, user_input)
if movie == -1:
    print("Movie not found.")
else:
    model = NearestNeighbors(n_neighbors=11, algorithm='brute', metric='cosine', n_jobs=-1)
    recommendations = recommend(model, movie_user_matrix, movie)
    
    reverse_hashmap = {v: k for k, v in hashmap.items()}

    print('Recommendations for {}:'.format(movie))
    for i, (idx, dist) in enumerate(recommendations):
        print('{0}: {1} '.format(i, reverse_hashmap[idx], dist))


Enter a movie title: V for Vendetta
Recommendations for 396:
0: Lord of the Rings: The Two Towers, The (2002) 
1: Lord of the Rings: The Return of the King, The (2003) 
2: Lord of the Rings: The Fellowship of the Ring, The (2001) 
3: Pan's Labyrinth (Laberinto del fauno, El) (2006) 
4: Departed, The (2006) 
5: Kill Bill: Vol. 2 (2004) 
6: Kill Bill: Vol. 1 (2003) 
7: Sin City (2005) 
8: 300 (2007) 
9: Batman Begins (2005) 
