# Recommendation Systems

We will build four recommendation systems and then combine them to help compensate for weakenesses in each.

1. Popularity Ranking (Combining Ratings and Number of Ratings)
2. Nearest Neighbors Based Recommender
3. Correlation-Based Recommender
4. Genre-Based TFIDF Recommender

In [276]:
import pandas as pd

ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

print("ratings.csv\n",ratings.head())
print()
print("movies.csv\n",movies.head())

ratings.csv
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

movies.csv
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


### Popularity Ranking (Combining Ratings and Number of Ratings)

In [288]:
# Add some features to our movies, including a custom metric for rating.
import math

movie_freq = ratings[['userId', 'movieId']].groupby(
    'movieId').count().reset_index()
movie_freq.columns = ['movieId', 'n_ratings']

mean_rating = ratings.groupby('movieId')[['rating']].mean()
mean_rating = mean_rating.rename(columns={'rating': 'mean_rating'})
combined = pd.merge(movie_freq, mean_rating, on='movieId')
combined = pd.merge(combined, movies, on='movieId')

# We use a custom metric that combines mean rating with the number of ratings.
# This attempts to quantify the increased chance that someone would like to see
# a movie that many others have seen.
combined['log_ratings'] = combined['n_ratings'].apply(
    lambda x: math.log(
        max(x-5,0.1) # Don't count the first five ratings
        ,100) # Only count ratings at a slow logarithmic rate
    -0.85) # Adjust so that movies with 50 ratings are not affected.

combined['adj_rating'] = combined['mean_rating']+combined['log_ratings']
combined = combined[[
    'movieId',
    'mean_rating',
    'n_ratings',
    'log_ratings',
    'adj_rating'
]]

adj_movies = movies.merge(combined, on='movieId')
adj_movies.sort_values(by='adj_rating', ascending=False).head()

Unnamed: 0,movieId,title,genres,mean_rating,n_ratings,log_ratings,adj_rating
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317,0.397077,4.826099
2224,2959,Fight Club (1999),Action|Crime|Drama|Thriller,4.272936,218,0.31419,4.587126
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307,0.390003,4.587072
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251,0.345468,4.576543
659,858,"Godfather, The (1972)",Crime|Drama,4.289062,192,0.285921,4.574983
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329,0.405273,4.569406
1938,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278,0.368081,4.560527
461,527,Schindler's List (1993),Drama|War,4.225,220,0.316219,4.541219
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.237745,204,0.299427,4.537172
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279,0.368875,4.530166


In [304]:
# Build a base class for our recommenders.

class Recommender:
    def __init__(self,movies):
        self.movies = movies
        self.movie_dict = dict(zip(movies['movieId'], movies.to_dict('records')))

    def movie_string(self,movie_id):
        movie = self.movie_dict[movie_id]
        return "  {:50s} {:5d} {:.2f} {:3d}\n    {}".format(
            movie['title'][0:50],
            movie_id,
            movie['mean_rating'],
            movie['n_ratings'],
            movie['genres'],
        )
    
    def recommend_similar_movies(self,movie_id,k=10):
        similar_ids = self.find_similar_movies(movie_id,k)

        class_name = self.__class__.__name__
        movie_string = self.movie_string(movie_id)
        print(f"{class_name}: Since you watched...\n{movie_string}\n... you might also like:")
        
        for i in similar_ids[0:k]:
            movie = self.movie_dict[i]
            if movie is None:
                continue
            print(self.movie_string(i))

    def test_run(self,k=10):
        
        # Toy Story (1995)   
        self.recommend_similar_movies(1)
        print()
        
        # Star Wars: Episode VI - Return of the Jedi (1983)
        self.recommend_similar_movies(1210)
        print()
        
        # Monty Python's Life of Brian (1979)
        self.recommend_similar_movies(1080)

### Nearest Neighbors Based Recommender

In [305]:
# This KNN Recommender class was built from this walk-through:
# https://www.geeksforgeeks.org/recommendation-system-in-python/

import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

class KnnRecommender(Recommender):
    def __init__(self,ratings,movies):
        super().__init__(movies)
        N = len(ratings['userId'].unique())
        M = len(ratings['movieId'].unique())
     
        self.user_mapper = dict(zip(np.unique(ratings["userId"]), list(range(N))))
        self.movie_mapper = dict(zip(np.unique(ratings["movieId"]), list(range(M))))
     
        self.user_inv_mapper = dict(zip(list(range(N)), np.unique(ratings["userId"])))
        self.movie_inv_mapper = dict(zip(list(range(M)), np.unique(ratings["movieId"])))
        
        user_index = [self.user_mapper[i] for i in ratings['userId']]
        movie_index = [self.movie_mapper[i] for i in ratings['movieId']]
    
        self.X = csr_matrix((ratings["rating"], (movie_index, user_index)), shape=(M, N))
        
    def find_similar_movies(self,movie_id,k, metric='cosine', show_distance=False):
        neighbour_ids = []
        
        if movie_id not in self.movie_mapper:
            print(f"Movie ID {movie_id} not found in movie_mapper!")
            return []
    
        movie_ind = self.movie_mapper[movie_id]
        movie_vec = self.X[movie_ind]
        k += 1
        kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
        kNN.fit(self.X)
        movie_vec = movie_vec.reshape(1, -1)
        neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)

        for i in range(0, k):
            n = neighbour.item(i)
            neighbour_ids.append(self.movie_inv_mapper[n])
        
        neighbour_ids.pop(0) 
        return neighbour_ids

In [306]:
knn_rec = KnnRecommender(ratings,adj_movies)
knn_rec.test_run()

KnnRecommender: Since you watched...
  Toy Story (1995)                                       1 3.92 215
    Adventure|Animation|Children|Comedy|Fantasy
... you might also like:
  Toy Story 2 (1999)                                  3114 3.86  97
    Adventure|Animation|Children|Comedy|Fantasy
  Jurassic Park (1993)                                 480 3.75 238
    Action|Adventure|Sci-Fi|Thriller
  Independence Day (a.k.a. ID4) (1996)                 780 3.45 202
    Action|Adventure|Sci-Fi|Thriller
  Star Wars: Episode IV - A New Hope (1977)            260 4.23 251
    Action|Adventure|Sci-Fi
  Forrest Gump (1994)                                  356 4.16 329
    Comedy|Drama|Romance|War
  Lion King, The (1994)                                364 3.94 172
    Adventure|Animation|Children|Drama|Musical|IMAX
  Star Wars: Episode VI - Return of the Jedi (1983)   1210 4.14 196
    Action|Adventure|Sci-Fi
  Mission: Impossible (1996)                           648 3.54 162
    Action|Adventur

### Correlation-Based Recommender

In [307]:
# This "corrwith" class built from the walk-through here:
# https://analyticsindiamag.com/deep-tech/how-to-build-your-first-recommender-system-using-python-movielens-dataset/

import warnings

class CorrRecommender(Recommender):
    def __init__(self,ratings,movies):
        super().__init__(movies)
        
        self.data = ratings.merge(movies,on='movieId', how='left')
        self.movie_user = self.data.pivot_table(index='userId',columns='title',values='rating')
        
        avg_ratings = pd.DataFrame(self.data.groupby('title')['rating'].mean())
        avg_ratings['Total Ratings'] = pd.DataFrame(self.data.groupby('title')['rating'].count())
        self.avg_ratings = avg_ratings

    def find_similar_movies(self,movie_id,k):
        movie_title = self.movie_dict[movie_id]['title']

        movie_slice = self.movie_user[movie_title]

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            correlations = self.movie_user.corrwith(movie_slice)
            
        recommendation = pd.DataFrame(correlations,columns=['Correlation'])
        recommendation.dropna(inplace=True)
        recommendation = recommendation.merge(self.movies,on='title')
        
        recc = recommendation[recommendation['n_ratings']>100].sort_values('Correlation',ascending=False).reset_index()
        similar_ids = list(recc['movieId'])
        similar_ids.pop(0)
        return similar_ids[0:k]

In [308]:
corr_rec = CorrRecommender(ratings,adj_movies)
corr_rec.test_run()

CorrRecommender: Since you watched...
  Toy Story (1995)                                       1 3.92 215
    Adventure|Animation|Children|Comedy|Fantasy
... you might also like:
  Incredibles, The (2004)                             8961 3.84 125
    Action|Adventure|Animation|Children|Comedy
  Finding Nemo (2003)                                 6377 3.96 141
    Adventure|Animation|Children|Comedy
  Aladdin (1992)                                       588 3.79 183
    Adventure|Animation|Children|Comedy|Musical
  Monsters, Inc. (2001)                               4886 3.87 132
    Adventure|Animation|Children|Comedy|Fantasy
  Mrs. Doubtfire (1993)                                500 3.39 144
    Comedy|Drama
  Amelie (Fabuleux destin d'Amélie Poulain, Le) (200  4973 4.18 120
    Comedy|Romance
  American Pie (1999)                                 2706 3.38 103
    Comedy|Romance
  Die Hard: With a Vengeance (1995)                    165 3.56 144
    Action|Crime|Thriller
  E.T. the Ex

### Genre-Based TFIDF Recommender

In [309]:
# The "Genre" class built from the walk-through here:
# https://github.com/khanhnamle1994/movielens/blob/master/Content_Based_and_Collaborative_Filtering_Models.ipynb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class GenreRecommender(Recommender):
    def __init__(self,ratings,movies):
        super().__init__(movies)

        # Break up the big genre string into a string array
        movies = movies[['title','genres','movieId']].copy()
        movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
        
        # Convert genres to string value
        movies['genres'] = movies['genres'].fillna("").astype('str')
        
        tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
        tfidf_matrix = tf.fit_transform(movies['genres'])

        self.g_cosine = linear_kernel(tfidf_matrix, tfidf_matrix)
    
        # Build a 1-dimensional array with movie ids
        self.g_movie_ids = movies['movieId']
        self.g_movie_idx = pd.Series(movies.index, index=movies['movieId'])

    # Function that get movie recommendations based on the cosine similarity score of movie genres
    def find_similar_movies(self,movie_id,k):
        idx = self.g_movie_idx[movie_id]
        sim_scores = list(enumerate(self.g_cosine[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:k]
        movie_indices = [i[0] for i in sim_scores]
        return self.g_movie_ids.iloc[movie_indices]

In [310]:
genre_rec = GenreRecommender(ratings,adj_movies)
genre_rec.test_run()

GenreRecommender: Since you watched...
  Toy Story (1995)                                       1 3.92 215
    Adventure|Animation|Children|Comedy|Fantasy
... you might also like:
  Antz (1998)                                         2294 3.24  45
    Adventure|Animation|Children|Comedy|Fantasy
  Toy Story 2 (1999)                                  3114 3.86  97
    Adventure|Animation|Children|Comedy|Fantasy
  Adventures of Rocky and Bullwinkle, The (2000)      3754 2.22   9
    Adventure|Animation|Children|Comedy|Fantasy
  Emperor's New Groove, The (2000)                    4016 3.72  37
    Adventure|Animation|Children|Comedy|Fantasy
  Monsters, Inc. (2001)                               4886 3.87 132
    Adventure|Animation|Children|Comedy|Fantasy
  Wild, The (2006)                                   45074 2.50   1
    Adventure|Animation|Children|Comedy|Fantasy
  Shrek the Third (2007)                             53121 3.02  21
    Adventure|Animation|Children|Comedy|Fantasy
  Tale o

### Combined Recommendation System

In [311]:
class CombinedRecommender(Recommender):
    def __init__(self,ratings,movies):
        super().__init__(movies)
        self.movies = movies
    
    def make_index_hash(self,input_list):
        index_hash={}
        for index, element in enumerate(input_list):
            index_hash[element]=index
        return index_hash

    def find_similar_movies(self,movie_id,k):
        k_list = knnr.find_similar_movies(movie_id,1000)
        c_list = corrr.find_similar_movies(movie_id,1000)
        g_list = grec.find_similar_movies(movie_id,1000)
        r_list = self.movies.sort_values(by='adj_rating', ascending=False)['movieId']
        
        k_hash = self.make_index_hash(k_list)
        c_hash = self.make_index_hash(c_list)
        g_hash = self.make_index_hash(g_list)
        r_hash = self.make_index_hash(r_list)
    
        k_default = len(k_list)
        c_default = len(c_list)
        g_default = len(g_list)
        r_default = len(r_list)        
        
        scores={}
        for index, row in self.movies.iterrows():
            movie_id = row['movieId']
            
            # Closest neighbor (based on ratings)
            k_score = k_hash.get(movie_id,k_default)
            
            # Best correlation (based on ratings)
            c_score = c_hash.get(movie_id,c_default)
            
            # Similar genre
            g_score = g_hash.get(movie_id,g_default)
            
            # Highly rated movies, regardless of similarity
            r_score = g_hash.get(movie_id,r_default)
            
            scores[movie_id]=k_score+c_score+g_score+r_score
            
        results = sorted(scores, key=scores.get)
        return results
   

In [312]:
combined = CombinedRecommender(ratings,adj_movies)
combined.test_run()

CombinedRecommender: Since you watched...
  Toy Story (1995)                                       1 3.92 215
    Adventure|Animation|Children|Comedy|Fantasy
... you might also like:
  Monsters, Inc. (2001)                               4886 3.87 132
    Adventure|Animation|Children|Comedy|Fantasy
  Shrek (2001)                                        4306 3.87 170
    Adventure|Animation|Children|Comedy|Fantasy|Romance
  Finding Nemo (2003)                                 6377 3.96 141
    Adventure|Animation|Children|Comedy
  Toy Story 2 (1999)                                  3114 3.86  97
    Adventure|Animation|Children|Comedy|Fantasy
  Incredibles, The (2004)                             8961 3.84 125
    Action|Adventure|Animation|Children|Comedy
  Bug's Life, A (1998)                                2355 3.52  92
    Adventure|Animation|Children|Comedy
  Antz (1998)                                         2294 3.24  45
    Adventure|Animation|Children|Comedy|Fantasy
  Ice Age (200