In [None]:
#hide
from fastbook import *
setup_book()

# Collaborative Filtering Deep Dive

## A First Look at the Data

In [None]:
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)

In [None]:
# read file containing user ratings for movies
# into a table with 4 columns: user, movie, rating, timestamp
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])
ratings.head()

In [None]:
# demonstration of movie latent factors
last_skywalker = np.array([0.98,0.9,-0.9])

In [None]:
# demonstration of user latent factors
user1 = np.array([0.9,0.8,-0.6])

In [None]:
# demonstration of predicting rating from user and movie latent factors
(user1*last_skywalker).sum()

In [None]:
# continued demonstration
casablanca = np.array([-0.99,-0.3,0.8])

In [None]:
# continued demonstration
(user1*casablanca).sum()

## Learning the Latent Factors

## Creating the DataLoaders

In [None]:
# read file containing movie genre information and other metadata; we shall not use genre information
# we use this file only to get the movie titles
# movies is a table with 2 columns: movie, title
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)
movies.head()

In [None]:
# merge ratings and movies tables joining on movie column
ratings = ratings.merge(movies)
ratings.head()

In [None]:
# create a DataLoaders object from the ratings table, using titles to identify movies rather than movie ids
# by default, 0th column is user_name, which correctly corresponds to our user column
# by default, 1st column is item_name, which corresponds to our movie column; we override this to use title
# by default, 2nd column is rating_name, which correctly corresponds to our rating column
# bs=64 is actually the default batch size, but we specify it here for clarity
dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)
dls.show_batch()

In [None]:
# echoes the two classes user and title, and their respective values
dls.classes

In [None]:
n_users  = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_factors = 5

# create a matrix of 5 latent factors to represent each user
user_factors = torch.randn(n_users, n_factors)
# create a matrix of 5 latent factors to represent each movie
movie_factors = torch.randn(n_movies, n_factors)

In [None]:
# demonstration of extracting the 5 latent factors for a particular user (the user at index 3, counting from 0) using matrix multiplication by a one-hot vector
one_hot_3 = one_hot(3, n_users).float()

In [None]:
# continued demonstration
user_factors.t() @ one_hot_3

In [None]:
# demonstration using index notation
user_factors[3]

## Collaborative Filtering from Scratch

In [None]:
# Creating a PyTorch Module
# but we first demonstrate a simple class
class Example:
    "Minimal class used to demonstrate storing an attribute and defining a method."

    def __init__(self, a):
        # Keep the constructor argument so `say` can use it later.
        self.a = a

    def say(self, x):
        "Return a greeting that mentions the stored `a` followed by `x`."
        greeting = f'Hello {self.a}, {x}.'
        return greeting

In [None]:
# Demonstration of instantiating an object of class Example and using it
ex = Example('Sylvain')
ex.say('nice to meet you')

In [None]:
# First let us review the model's input and output
# In its most general form, its input is the concatenation of two vectors:
#   one vector contains unnormalized weights of users to consider
#   the other vector contains unnormalized weights of movies to consider
#   the output is how much our considered users like our considered movies weighted by the weights as a scalar
# In the case of training on the rating a user gave a movie, the input is a one-hot vector of the user and a one-hot vector of the movie

# Our module represents the learned latent factors of users and movies as two embedding layers; Embedding is meant for sparse data
class DotProduct(Module):
    "Predict a rating as the dot product of a user's and a movie's latent factors."

    # NOTE(review): there is no `super().__init__()` call; presumably `Module` is
    # fastai's variant, which does not need one - confirm.
    def __init__(self, n_users, n_movies, n_factors):
        # One row of `n_factors` learnable latent factors per user and per movie.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        # Column 0 of `x` is used to look up users, column 1 to look up movies.
        user_idxs, movie_idxs = x[:,0], x[:,1]
        # Looking up row i of an embedding yields the same values as multiplying
        # a one-hot vector for i by the embedding matrix, without ever building
        # the one-hot vectors.
        user_vecs = self.user_factors(user_idxs)
        movie_vecs = self.movie_factors(movie_idxs)
        # Both lookups have one row per row of `x` and `n_factors` columns, so
        # they multiply element-wise; summing over dim 1 leaves one dot product
        # per row.
        return (user_vecs * movie_vecs).sum(dim=1)

In [None]:
# extract the first batch of data from the DataLoaders object
x,y = dls.one_batch()
# x is an array of shape (64,2), where each row is a pair of user and movie indices; note that our dataloader has re-indexed the users and movies
x.shape

In [None]:
# instantiate our model
model = DotProduct(n_users, n_movies, 50)
# instantiate a learner on our model and data
learn = Learner(dls, model, loss_func=MSELossFlat())

In [None]:
# train our model for 5 epochs, using a maximum learning rate of 5e-3
learn.fit_one_cycle(5, 5e-3)

In [None]:
# we revise the model to pass the output through a sigmoid function bounded between 0 and 5.5 (by default)
class DotProduct(Module):
    "Dot-product model whose output is passed through `sigmoid_range` using `y_range`."

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        # Bounds that `forward` unpacks into `sigmoid_range`.
        self.y_range = y_range

    def forward(self, x):
        user_vecs = self.user_factors(x[:,0])
        movie_vecs = self.movie_factors(x[:,1])
        # One raw (unbounded) score per row of `x`.
        raw_scores = (user_vecs * movie_vecs).sum(dim=1)
        return sigmoid_range(raw_scores, *self.y_range)

In [None]:
# we try training this model. Observe that our loss is lower than the previous one, until the model starts to overfit
model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

In [None]:
# we now capture in our model the intuition that some users are more critical than others, and some movies are better than others as bias vectors
# I wonder if without these bias vectors (and regularization), the model would learn quality of movie as a latent factor (and e.g. one latent factor represents how much a user likes highly-rated movies)
class DotProductBias(Module):
    "Dot-product model with an additional learned bias per user and per movie."

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Layers are created in the order: user factors, user bias, movie factors, movie bias.
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        user_idxs, movie_idxs = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idxs)
        movie_vecs = self.movie_factors(movie_idxs)
        # keepdim=True keeps a trailing dimension of size 1 so the dot products
        # line up with the width-1 bias embeddings.
        scores = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        bias_total = self.user_bias(user_idxs) + self.movie_bias(movie_idxs)
        scores += bias_total
        return sigmoid_range(scores, *self.y_range)

In [None]:
# this model quickly overfits
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3)

### Weight Decay

In [None]:
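# we illustrate the intuition behind L2 regularization (weight decay) by plotting y = a * x**2 for several values of a
# the larger the coefficient a, the narrower and steeper the parabola; weight decay adds the sum of the squares of the weights (here, the latent factors and biases) to the loss, so larger weights are penalized more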
x = np.linspace(-2,2,100)
a_s = [1,2,5,10,50] 
ys = [a * x**2 for a in a_s]
_,ax = plt.subplots(figsize=(8,6))
for a,y in zip(a_s,ys): ax.plot(x,y, label=f'a={a}')
ax.set_ylim([0,5])
ax.legend();

In [None]:
# we specify wd=0.1 to use L2 regularization
# note that the regularization contribution to the loss is not actually computed, but is instead added to the gradient
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

### Creating Our Own Embedding Module

In [None]:
# we now investigate what it takes to create a Module that replicates the Embedding layer

# the parameters of the Module need to be registered with the Module
#   but we see that we need to do more than naively initialize a tensor into a class member
class T(Module):
    "Toy module whose only attribute is a plain tensor."

    def __init__(self):
        # A bare tensor stored as an attribute (not wrapped in `nn.Parameter`).
        self.a = torch.ones(3)

L(T().parameters())

In [None]:
# we observe that we may register a tensor as a parameter of the Module by wrapping it in nn.Parameter
class T(Module):
    "Toy module whose only attribute is a tensor wrapped in `nn.Parameter`."

    def __init__(self):
        # Same tensor of ones as before, this time wrapped in `nn.Parameter`.
        ones = torch.ones(3)
        self.a = nn.Parameter(ones)

L(T().parameters())

In [None]:
# observe that predefined PyTorch Modules such as nn.Linear register parameters with the enclosing class where appropriate, when instantiated
# also note that t.a.parameters() returns an iterator over the same tensors as t.parameters() here, since t.a is the only member of t
class T(Module):
    "Toy module whose only attribute is a bias-free `nn.Linear` layer."

    def __init__(self):
        # 1 input feature, 3 output features, no bias term.
        self.a = nn.Linear(1, 3, bias=False)

t = T()
L(t.parameters())

In [None]:
# the Linear Module, as with other torch modules, defines the weight class member just as we did above in
#     class T(Module):
#         def __init__(self): self.a = nn.Parameter(torch.ones(3))
type(t.a.weight)

In [None]:
# we use a helper function for initializing the weights of our DotProductBias model to small random values, and registering them as parameters directly, without using nn.Embedding
def create_params(size):
    "Return an `nn.Parameter` of shape `size` filled with draws from a normal distribution (mean 0, std 0.01)."
    weights = torch.zeros(*size)
    # Overwrite the zeros in place with small random values.
    weights.normal_(0, 0.01)
    return nn.Parameter(weights)

In [None]:
# the definition is largely identical to our previous definition of DotProductBias
#   of note, the different way to specify the shape during instantiation,
#   and the use of indexing [] to extract the latent factors where in using nn.Embedding we used () (specifying the indices as an argument)
class DotProductBias(Module):
    "Dot-product model with biases, built from raw parameters created by `create_params`."

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Factors are 2-D (one row per user/movie); biases are 1-D (one value each).
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])
        self.y_range = y_range

    def forward(self, x):
        user_idxs, movie_idxs = x[:,0], x[:,1]
        # Rows are selected with plain indexing ([]) rather than by calling a layer.
        user_vecs = self.user_factors[user_idxs]
        movie_vecs = self.movie_factors[movie_idxs]
        # The biases are 1-D, so no keepdim is needed: every term has one entry
        # per row of `x`.
        scores = (user_vecs * movie_vecs).sum(dim=1)
        scores += self.user_bias[user_idxs] + self.movie_bias[movie_idxs]
        return sigmoid_range(scores, *self.y_range)

In [None]:
model = DotProductBias(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

## Interpreting Embeddings and Biases

In [None]:
# with regularization, the biases represent the criticality of users and the quality of movies
#   argsort sorts in ascending order by default, so here we list the 5 movies with the LOWEST bias, i.e. the least well-liked ones
movie_bias = learn.model.movie_bias.squeeze()
idxs = movie_bias.argsort()[:5]
[dls.classes['title'][i] for i in idxs]

In [None]:
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

In [None]:
# We now plot movies in a 2-D projection of their latent factors; movies with similar factors end up close together.
# NOTE(review): `learn.model.movie_factors[...]` uses indexing, so this cell presumably expects `learn` to hold the
#   `create_params` version of DotProductBias - confirm the cell order.
# Count the ratings per title and keep the (up to) 1000 most-rated titles.
g = ratings.groupby('title')['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
# Translate each title into the index assigned to it in the DataLoaders' 'title' classes.
top_idxs = tensor([learn.dls.classes['title'].o2i[m] for m in top_movies])
# Pull those movies' factor rows out of the model as a detached CPU tensor.
movie_w = learn.model.movie_factors[top_idxs].cpu().detach()
# Reduce the factors to 3 components (assumes `pca` is a tensor method provided by fastai - TODO confirm).
movie_pca = movie_w.pca(3)
fac0,fac1,fac2 = movie_pca.t()
# Only the first 50 entries (the most-rated movies) are drawn, using components 0 and 2; fac1 is unused.
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(12,12))
plt.scatter(X, Y)
# Label every point with its title in a random colour (each RGB channel is below 0.7).
# Note that the loop variables x and y overwrite any earlier x and y in the notebook namespace.
for i, x, y in zip(top_movies[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()

### Using fastai.collab

In [None]:
# collab_learner is a helper function that creates a model (with bias for either dimension) and learner for collaborative filtering
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))

In [None]:
learn.fit_one_cycle(5, 5e-3, wd=0.1)

In [None]:
# you can access the model from the learner using learn.model (u for user, i for item)
learn.model

In [None]:
movie_bias = learn.model.i_bias.weight.squeeze()
idxs = movie_bias.argsort(descending=True)[:5]
[dls.classes['title'][i] for i in idxs]

### Embedding Distance

In [None]:
# We want to compute how similar two movies are by comparing their latent factors.
# We use the cosine similarity, which is the cosine of the angle between the two vectors
#   (compare against taking the distance between the two vectors).
# Steps:
#   1. extract the movie latent factors from the model as movie_factors
#   2. obtain the index assigned to 'Silence of the Lambs, The (1991)' by the dataloader
#   3. compute the cosine similarity; dim=1 is in fact the default, and it means the similarity is taken across the
#      latent factors of each movie (as opposed to across movies for each latent factor).
#      In the second argument, [idx] extracts the latent factors of the chosen movie as a length-50 vector
#      (n_factors=50 in collab_learner), and [None] reshapes it to a 1x50 matrix so that it broadcasts over all movies
#   4. sort the movies by descending similarity and take position 1; position 0 is expected to be the chosen movie
#      itself, since a vector has cosine similarity 1 with itself
#   5. look up the title of that most similar movie
# NOTE: despite its name, `distances` holds similarities (higher means more alike), hence descending=True.

movie_factors = learn.model.i_weight.weight
idx = dls.classes['title'].o2i['Silence of the Lambs, The (1991)']
distances = nn.CosineSimilarity(dim=1)(movie_factors, movie_factors[idx][None])
idx = distances.argsort(descending=True)[1]
dls.classes['title'][idx]

## Bootstrapping a Collaborative Filtering Model

## Deep Learning for Collaborative Filtering

In [None]:
# to convert our classic collaborative filtering model to a deep learning model, we replace the dot product with a neural network
# the inputs are the latent factors of the user and movie, concatenated
# note that with this we can be more flexible with the number of latent factors; the users and movies can have different numbers of latent factors
# here, we estimate a good number of latent factors for our set of users and movies
# this cell may print [(944, 74), (1635, 101)], which means that there are 944 users and 1635 movies, and we estimate that 74 latent factors for users and 101 latent factors for movies is a good number
embs = get_emb_sz(dls)
embs

In [None]:
# user_sz is a shape tuple for a matrix of the number of users by the number of latent factors for users
# and similarly for item_sz
# the nn is
#   a linear layer with input size equal to the sum of the latent factors for users and movies, mapping to n_act (100 by default) hidden activations
#   a ReLU activation function
#   a linear layer with input size equal to n_act mapping to 1 output
# the output is passed through a sigmoid function bounded between 0 and 5.5 (by default)
class CollabNN(Module):
    "Collaborative filtering model that feeds concatenated user and item embeddings through a small neural net."

    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        # `user_sz` and `item_sz` are unpacked straight into `Embedding`; their
        # second entry is used below as the embedding width.
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        # The first linear layer takes both embeddings side by side.
        n_in = user_sz[1] + item_sz[1]
        hidden = nn.Linear(n_in, n_act)
        act = nn.ReLU()
        head = nn.Linear(n_act, 1)
        self.layers = nn.Sequential(hidden, act, head)
        self.y_range = y_range

    def forward(self, x):
        user_emb = self.user_factors(x[:,0])
        item_emb = self.item_factors(x[:,1])
        # Concatenate along dim 1 so each row holds one user's and one item's factors.
        joined = torch.cat((user_emb, item_emb), dim=1)
        return sigmoid_range(self.layers(joined), *self.y_range)

In [None]:
# observe how our model instantiation params are meant to accept the output of get_emb_sz
model = CollabNN(*embs)

In [None]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.01)

In [None]:
# use_nn=True in fast.ai's collab_learner uses this model instead of the classic collaborative filtering model
# layers specifies the number of layers and their sizes
learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
learn.fit_one_cycle(5, 5e-3, wd=0.1)

In [None]:
learn.model

In [None]:
# the EmbeddingNN class is used by fast.ai's collab_learner when use_nn=True
# observe that 176 = 74 + 102, which is the sum of the latent factors for users and movies
# also observe that this is a specialization of TabularModel with n_cont=0 (no continuous variables) and out_sz=1 (one output),
# since we may view collaborative filtering as a tabular problem with two categorical variables (user and movie) and one continuous variable (rating), where we have first converted the sparse categorical variables to embeddings (c.f. word embeddings). Note that TabularModel expects to first embed the categorical variables, so the shape given for each categorical variable includes not only the number of categories, but also the number of latent factors; the size of the embedding is estimated under the hood when we use collab_learner
# EmbeddingNN(
#   (embeds): ModuleList(
#     (0): Embedding(944, 74)
#     (1): Embedding(1665, 102)
#   )
#   (emb_drop): Dropout(p=0.0, inplace=False)
#   (bn_cont): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#   (layers): Sequential(
#     (0): LinBnDrop(
#       (0): Linear(in_features=176, out_features=100, bias=False)
#       (1): ReLU(inplace=True)
#       (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#     )
#     (1): LinBnDrop(
#       (0): Linear(in_features=100, out_features=50, bias=False)
#       (1): ReLU(inplace=True)
#       (2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#     )
#     (2): LinBnDrop(
#       (0): Linear(in_features=50, out_features=1, bias=True)
#     )
#     (3): fastai.layers.SigmoidRange(low=0, high=5.5)
#   )
# )
@delegates(TabularModel)
class EmbeddingNN(TabularModel):
    "Subclass of `TabularModel` fixed to no continuous inputs (`n_cont=0`) and a single output (`out_sz=1`)."
    def __init__(self, emb_szs, layers, **kwargs):
        # Pass the embedding sizes, the layer sizes and any extra keyword arguments through to `TabularModel`.
        super().__init__(emb_szs, layers=layers, n_cont=0, out_sz=1, **kwargs)

### Sidebar: kwargs and Delegates

### End sidebar

## Conclusion

## Questionnaire

1. What problem does collaborative filtering solve?
1. How does it solve it?
1. Why might a collaborative filtering predictive model fail to be a very useful recommendation system?
1. What does a crosstab representation of collaborative filtering data look like?
1. Write the code to create a crosstab representation of the MovieLens data (you might need to do some web searching!).
1. What is a latent factor? Why is it "latent"?
1. What is a dot product? Calculate a dot product manually using pure Python with lists.
1. What does `pandas.DataFrame.merge` do?
1. What is an embedding matrix?
1. What is the relationship between an embedding and a matrix of one-hot-encoded vectors?
1. Why do we need `Embedding` if we could use one-hot-encoded vectors for the same thing?
1. What does an embedding contain before we start training (assuming we're not using a pretrained model)?
1. Create a class (without peeking, if possible!) and use it.
1. What does `x[:,0]` return?
1. Rewrite the `DotProduct` class (without peeking, if possible!) and train a model with it.
1. What is a good loss function to use for MovieLens? Why? 
1. What would happen if we used cross-entropy loss with MovieLens? How would we need to change the model?
1. What is the use of bias in a dot product model?
1. What is another name for weight decay?
1. Write the equation for weight decay (without peeking!).
1. Write the equation for the gradient of weight decay. Why does it help reduce weights?
1. Why does reducing weights lead to better generalization?
1. What does `argsort` do in PyTorch?
1. Does sorting the movie biases give the same result as averaging overall movie ratings by movie? Why/why not?
1. How do you print the names and details of the layers in a model?
1. What is the "bootstrapping problem" in collaborative filtering?
1. How could you deal with the bootstrapping problem for new users? For new movies?
1. How can feedback loops impact collaborative filtering systems?
1. When using a neural network in collaborative filtering, why can we have different numbers of factors for movies and users?
1. Why is there an `nn.Sequential` in the `CollabNN` model?
1. What kind of model should we use if we want to add metadata about users and items, or information such as date and time, to a collaborative filtering model?

### Further Research

1. Take a look at all the differences between the `Embedding` version of `DotProductBias` and the `create_params` version, and try to understand why each of those changes is required. If you're not sure, try reverting each change to see what happens. (NB: even the type of brackets used in `forward` has changed!)
1. Find three other areas where collaborative filtering is being used, and find out what the pros and cons of this approach are in those areas.
1. Complete this notebook using the full MovieLens dataset, and compare your results to online benchmarks. See if you can improve your accuracy. Look on the book's website and the fast.ai forum for ideas. Note that there are more columns in the full dataset—see if you can use those too (the next chapter might give you ideas).
1. Create a model for MovieLens that works with cross-entropy loss, and compare it to the model in this chapter.