In [44]:
# What is implicit feedback?

# In Part 1, we learned that collaborative filtering is based on the assumption that similar users like similar things. 
# The user-item matrix, or "utility matrix", is the foundation of collaborative filtering. 
# In the utility matrix, rows represent users and columns represent items.

# The cells of the matrix are populated by a given user's degree of preference towards an item, which can come in the form of:

# explicit feedback: direct feedback towards an item (e.g., movie ratings which we explored in Part 1)
# implicit feedback: indirect behaviour towards an item (e.g., purchase history, browsing history, search behaviour)

# Implicit feedback makes assumptions about a user's preference based on their actions towards items. Let's take Netflix for example. 
# If you binge-watch a show and blaze through all seasons in a week, there's a high chance that you like that show. 
# However, if you start watching a series and stop halfway through the first episode, there's suspicion to believe that you probably don't like that show.

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

import implicit

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [45]:
# Step 2: Load the Data
ratings = pd.read_csv("data/ml-latest-small/ratings.csv")
movies = pd.read_csv("data/ml-latest-small/movies.csv")

In [46]:
# For this implicit feedback tutorial, we'll treat movie ratings as the number of times that a user watched a movie. 
# For example, if Jane (a user in our database) gave Batman a rating of 1 and Legally Blonde a rating of 5, we'll assume that Jane watched Batman one time and Legally Blonde five times.

# Step 3: Transforming the data

# user_mapper: maps user id to user index
# movie_mapper: maps movie id to movie index
# user_inv_mapper: maps user index to user id
# movie_inv_mapper: maps movie index to movie id

def create_X(df):
    """
    Generates a sparse matrix from ratings dataframe

    Args: 
        df: pandas dataframe

    Returns:
        X: sparse matrix
        user_mapper: dict that maps user indices to user id's
        user_inv_mapper: dict that maps movie id's to movie indices
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    N = df['userId'].nunique()
    M = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df['userId']), list(range(N))))
    movie_mapper = dict(zip(np.unique(df['movieId']), list(range(M))))

    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df['movieId'])))

    user_index = [user_mapper[i] for i in df['userId']]
    movie_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df['rating'], (movie_index, user_index)), shape=(M,N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [47]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings)

In [48]:
# Creating movie title mappers

# We need to interpret a movie title from its index in the user-item matrix and vice versa. Let's create 2 helper functions that make this interpretation easy:

from fuzzywuzzy import process

def movie_finder(title):
    all_titles = movies['title'].tolist()
    closest_match = process.extractOne(title, all_titles)
    return closest_match[0]

movie_title_mapper = dict(zip(movies['title'], movies['movieId']))
movie_title_inv_mapper = dict(zip(movies['movieId'], movies['title']))

def get_movie_index(title):
    fuzzy_title = movie_finder(title)
    movie_id = movie_title_mapper[fuzzy_title]
    movie_idx = movie_mapper[movie_id]
    return movie_idx

def get_movie_title(movie_idx):
    movie_id = movie_inv_mapper[movie_idx]
    title = movie_title_inv_mapper[movie_id]
    return title

In [49]:
# Step 4: Building our implicit feedback recommender model

# The implicit package is built around a linear algebra technique called matrix factorization), which can help us discover latent features underlying the interactions between users and movies. These latent features give a more compact representation of user tastes and item descriptions. Matrix factorization is particularly useful for very sparse data and can enhance the quality of recommendations. The algorithm works by factorizing the original user-item matrix into two factor matrices:

# user-factor matrix (n_users, k)
# item-factor matrix (k, n_items)
# We are reducing the dimensions of our original matrix into "taste" dimensions. We cannot interpret what each latent feature 
#  represents. However, we could imagine that one latent feature may represent users who like romantic comedies from the 1990s, while another latent feature may represent movies which are independent foreign language films.

# In traditional matrix factorization, such as SVD, we would attempt to solve the factorization at once which can be very computationally expensive. As a more practical alternative, we can use a technique called Alternating Least Squares (ALS) instead. With ALS, we solve for one factor matrix at a time:

# Step 1: hold user-factor matrix fixed and solve for the item-factor matrix
# Step 2: hold item-factor matrix fixed and solve for the user-item matrix
# We alternate between Step 1 and 2 above, until the dot product of the item-factor matrix and user-item matrix is approximately equal to the original X (user-item) matrix. This approach is less computationally expensive and can be run in parallel.

# The implicit package implements matrix factorization using Alternating Least Squares (see docs here). Let's initiate the model using the AlternatingLeastSquares class.

model = implicit.als.AlternatingLeastSquares(factors=50)

# This model comes with a couple of hyperparameters that can be tuned to generate optimal results:

# factors (k): number of latent factors,
# regularization (lambda): prevents the model from overfitting during training
# In this tutorial, we'll set k = 50
#  and lambda = 0.01
#  (the default). In a real-world scenario, I highly recommend tuning these hyperparameters before generating recommendations to generate optimal results.

# The next step is to fit our model with our user-item matrix.

model.fit(X)

  0%|          | 0/15 [00:00<?, ?it/s]

  solver(
  solver(
  solver(
  solver(
  solver(
  solver(


In [50]:
# Now, let's test out the model's recommendations. 
# We can use the model's similar_items() method which returns the most relevant movies of a given movie. 
# We can use our helpful get_movie_index() function to get the movie index of the movie that we're interested in.

movie_of_interest = 'forrest gump'

movie_index = get_movie_index(movie_of_interest)
related = model.similar_items(movie_index)
related
# The output of similar_items() is not user-friendly. We'll need to use our get_movie_title() function to interpret what our results are.

  ids, scores = topk(
  ids, scores = topk(
  ids, scores = topk(


(array([314, 187, 167,  26, 284,  28, 308,  35, 553, 516], dtype=int32),
 array([1.        , 0.5931347 , 0.49845117, 0.48768184, 0.4794307 ,
        0.45930648, 0.44614336, 0.42688155, 0.4246078 , 0.4230297 ],
       dtype=float32))

In [51]:
print(f"Because you watched {movie_finder(movie_of_interest)}...")
for r in related:
    recommended_title = get_movie_title(r[0])
    if recommended_title != movie_finder(movie_of_interest):
        print(recommended_title)

Because you watched Forrest Gump (1994)...
Jumanji (1995)


In [52]:
# Step 5: Generating User-Item Recommendations

user_id = 95

user_ratings = ratings[ratings['userId']==user_id].merge(movies[['movieId', 'title']])
user_ratings = user_ratings.sort_values('rating', ascending=False)
print(f"Number of movies rated by user {user_id}: {user_ratings['movieId'].nunique()}")

Number of movies rated by user 95: 168


In [53]:
user_ratings = ratings[ratings['userId']==user_id].merge(movies[['movieId', 'title']])
user_ratings = user_ratings.sort_values('rating', ascending=False)
top_5 = user_ratings.head()
top_5

Unnamed: 0,userId,movieId,rating,timestamp,title
24,95,1089,5.0,1048382826,Reservoir Dogs (1992)
34,95,1221,5.0,1043340018,"Godfather: Part II, The (1974)"
83,95,3019,5.0,1043340112,Drugstore Cowboy (1989)
26,95,1175,5.0,1105400882,Delicatessen (1991)
27,95,1196,5.0,1043340018,Star Wars: Episode V - The Empire Strikes Back...


In [54]:
bottom_5 = user_ratings[user_ratings['rating']<3].tail()
bottom_5

Unnamed: 0,userId,movieId,rating,timestamp,title
93,95,3690,2.0,1043339908,Porky's Revenge (1985)
122,95,5283,2.0,1043339957,National Lampoon's Van Wilder (2002)
100,95,4015,2.0,1043339957,"Dude, Where's My Car? (2000)"
164,95,7373,1.0,1105401093,Hellboy (2004)
109,95,4732,1.0,1043339283,Bubble Boy (2001)


In [55]:
X_t = X.T.tocsr()

user_idx = user_mapper[user_id]
recommendations = model.recommend(user_idx, X_t)
recommendations

ValueError: user_items must contain 1 row for every user in userids

In [39]:
# We can't interpret the results as is since movies are represented by their index. 
# We'll have to loop over the list of recommendations and get the movie title for each movie index.

for r in recommendations:
    recommend_title = get_movie_title(r[0])
    print(recommended_title)

NameError: name 'recommendations' is not defined