In [None]:
!curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip --output ml-latest-small.zip 
!mkdir -p ml-latest-small
!unzip ml-latest-small.zip -d sample_data/ml-latest-small
!pip install fastai==0.7.0
import sys
!{sys.executable} -m pip install torchtext==0.2.3

# Collaborative Filtering
- Using the MovieLens dataset (`ml-latest-small`)

In [1]:
from fastai.learner import *
from fastai.column_data import *

In [None]:
PATH = 'sample_data/ml-latest-small/ml-latest-small/'

In [None]:
!ls {PATH}

We're working with the movielens data, which contains one rating per row, like below.  
We will use `userId` (categorical), `movieId`(categorical) and `rating` (dependent) for modeling.

In [None]:
ratings = pd.read_csv(PATH+'ratings.csv')
ratings.head()

Just for display purposes, let's read in the movie names too.

In [None]:
movies = pd.read_csv(PATH+'movies.csv')
movies.head()

### Create subset for Excel

We create a crosstab of the most popular movies and most movie-addicted users which we'll copy into Excel for creating a simple example. This isn't necessary for any of the modeling below however.

In [None]:
# The 15 users who have rated the most movies (rating count per user, descending).
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False).head(15)

# The 15 movies that have received the most ratings.
g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False).head(15)

# Inner joins keep only the ratings given by a top user to a top movie.
top_r = (
    ratings
    .join(topUsers, on='userId', how='inner', rsuffix='_r')
    .join(topMovies, on='movieId', how='inner', rsuffix='_r')
)

# Users as rows, movies as columns, the rating as the cell value.
pd.crosstab(index=top_r.userId, columns=top_r.movieId, values=top_r.rating, aggfunc=np.sum)

To begin with, we will use matrix factorization/decomposition instead of building a neural net.

Each prediction is the dot product of a movie embedding vector and a user embedding vector. In linear-algebra terms, this is equivalent to a matrix product, since one vector is a row and the other is a column. If there is no actual rating, we set the prediction to zero (think of this as test data — not training data).

### Collaborative filtering

We create a validation set by picking a random set of IDs. `wd` is the weight decay used for L2 regularization, and `n_factors` is how big an embedding we want for each user and movie.

In [None]:
val_idxs = get_cv_idxs(len(ratings))
wd = 2e-4
n_factors = 50

We create a model data object from CSV file:

In [None]:
# Build the collaborative-filtering dataset from ratings.csv, using userId and
# movieId as the two id columns and rating as the value to predict.
cf = CollabFilterDataset.from_csv(PATH, 'ratings.csv', 'userId', 'movieId', 'rating')
# Create a learner with n_factors-wide embeddings, holding out val_idxs for validation
# and optimizing with Adam.
# NOTE(review): 64 is presumably the batch size — confirm against get_learner's signature.
learn = cf.get_learner(n_factors, val_idxs, 64, opt_fn=optim.Adam)

We then get a learner that is suitable for the model data, and fit the model

In [None]:
learn.fit(1e-2, 2, wds=wd, cycle_len=1, cycle_mult=2)

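Since the reported loss is mean squared error (MSE), take its square root to get the RMSE. The value below (0.765) is an example validation loss; substitute the value printed by your own run: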

In [None]:
math.sqrt(0.765)

Let's take a look at how the predictions compare to actuals for this model.

In [None]:
preds = learn.predict()

You can also plot the predictions against the actual ratings using seaborn (`sns`):

In [None]:
y = learn.data.val_y
sns.jointplot(preds, y, kind='hex', stat_func=None)

### Movie bias

In [None]:
# movieId -> title lookup, used only to label results.
movie_names = movies.set_index('movieId')['title'].to_dict()
# Number of ratings per movie.
g = ratings.groupby('movieId')['rating'].count()
# Ids of the (up to) 3000 most-rated movies, most popular first.
topMovies = g.sort_values(ascending=False).index.values[:3000]
# Translate the raw movie ids through cf.item2idx.
# NOTE(review): presumably these are the row indices of the model's item embeddings — confirm.
topMoviesIdx = np.array([cf.item2idx[o] for o in topMovies])

In [None]:
m = learn.model; m.cuda()

First, we'll look at the movie bias term. Here, our input is the movie id (a single id), and the output is the movie bias (a single float).

In [None]:
m.ib(V(topMoviesIdx))

In [None]:
movie_bias = to_np(m.ib(V(topMoviesIdx)))

In [None]:
movie_bias

In [None]:
movie_ratings = [(b[0], movie_names[i]) for i, b in zip(topMovies, movie_bias)]

In [None]:
sorted(movie_ratings, key=lambda o: o[0])[:15]

In [None]:
sorted(movie_ratings, key=itemgetter(0))[:15]

In [None]:
sorted(movie_ratings, key=lambda o: o[0], reverse=True)[:15]

### Embedding interpretation

We can now do the same thing for the embeddings.

In [None]:
movie_emb = to_np(m.i(V(topMoviesIdx)))
movie_emb.shape

Because it's hard to interpret 50 embedding dimensions, we use PCA to reduce them to just 3 components.

In [None]:
from sklearn.decomposition import PCA
# Keep only the 3 strongest principal components.
pca = PCA(n_components=3)
# Fit on the transpose so that each row of components_ holds one value per movie.
# Assumes movie_emb is (n_movies, n_factors) — check movie_emb.shape in the cell above.
movie_pca = pca.fit(movie_emb.T).components_

In [None]:
movie_pca.shape

In [None]:
fac0 = movie_pca[0]
movie_comp = [(f, movie_names[i]) for f, i in zip(fac0, topMovies)]

Here's the 1st component. It seems to be 'easy watching' vs 'serious'.

In [None]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
fac1 = movie_pca[1]
movie_comp = [(f, movie_names[i]) for f, i in zip(fac1, topMovies)]

In [None]:
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(movie_comp, key=itemgetter(0))[:10]

In [None]:
# Plot a random sample of 50 of the top movies on the first two PCA components.
idxs = np.random.choice(len(topMovies), 50, replace=False)
X = fac0[idxs]
Y = fac1[idxs]
plt.figure(figsize=(15, 15))
plt.scatter(X, Y)
# Use loop names distinct from the notebook-level `x`/`y`: `y` holds the validation
# targets set earlier (`y = learn.data.val_y`), and rebinding it here would silently
# break that plotting cell if it were re-run afterwards.
for movie_id, x_pos, y_pos in zip(topMovies[idxs], X, Y):
    # A random, slightly darkened colour keeps neighbouring labels distinguishable.
    plt.text(x_pos, y_pos, movie_names[movie_id], color=np.random.rand(3)*0.7, fontsize=11)
plt.show()

## Collab filtering from scratch

### Dot product example

In [4]:
a = T([[1., 2], [3, 4]])
b = T([[2., 2], [10, 10]])

When we apply a mathematical operator between tensors in numpy or PyTorch, it operates element-wise, assuming that both tensors have the same dimensionality. Below is how you would calculate the dot product of two vectors (e.g. (1, 2)⋅(2, 2) = 6 — the first rows of matrices `a` and `b`):

In [5]:
(a*b).sum(1)

tensor([ 6., 70.])

In [None]:
class DotProduct(nn.Module):
    """Row-wise dot product of two equally shaped 2-D tensors."""

    def forward(self, u, m):
        """Multiply `u` and `m` element-wise and sum each row (dimension 1)."""
        elementwise = u * m
        return elementwise.sum(1)

Now we can call it and get the expected result. Notice that we do not need to write `model.forward(a, b)` — calling a PyTorch `nn.Module` instance directly invokes its `forward` method for us.

In [None]:
model = DotProduct()
model(a, b)

## Building more complex module

This implementation has two additions to the DotProduct class:

- Two nn.Embedding matrices
- Look up our users and movies in above embedding matrices

It is quite possible that user IDs are not contiguous, which makes them hard to use as indexes into an embedding matrix. So we start by creating indexes that start from zero and are contiguous, and replace the `ratings.userId` column with those indexes using pandas' `apply` function with an anonymous (`lambda`) function. We then do the same for `ratings.movieId`.

### Dot product model

In [None]:
# Replace raw user ids with contiguous 0-based indices so they can be used
# directly as row numbers of an embedding matrix.
u_uniq = ratings.userId.unique()
# Build the user lookup from the *user* ids (u_uniq); m_uniq does not exist yet
# at this point on a fresh kernel.
user2idx = {o: i for i, o in enumerate(u_uniq)}
ratings.userId = ratings.userId.apply(lambda x: user2idx[x])

# Same re-indexing for movie ids.
m_uniq = ratings.movieId.unique()
movie2idx = {o: i for i, o in enumerate(m_uniq)}
ratings.movieId = ratings.movieId.apply(lambda x: movie2idx[x])

# Number of distinct users / movies, i.e. the number of rows each embedding needs.
n_users = int(ratings.userId.nunique())
n_movies = int(ratings.movieId.nunique())

In [None]:
class EmbeddingDot(nn.Module):
    """Collaborative-filtering model: rating = dot(user embedding, movie embedding).

    The embedding width comes from the notebook-level `n_factors`.
    """

    def __init__(self, n_users, n_movies):
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        # Start both tables with small positive weights drawn from U(0, 0.05).
        for table in (self.u, self.m):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, cats, conts):
        """Return a (batch, 1) tensor of dot products.

        `cats` carries user indices in column 0 and movie indices in column 1;
        `conts` is accepted but not used.
        """
        user_ids = cats[:, 0]
        movie_ids = cats[:, 1]
        user_vecs = self.u(user_ids)
        movie_vecs = self.m(movie_ids)
        dots = (user_vecs * movie_vecs).sum(1)
        return dots.view(-1, 1)

Embedding is not a tensor but a variable. A variable does the exact same operations as a tensor but it also does automatic differentiation. To pull a tensor out of a variable, call data attribute.

In [None]:
x = ratings.drop(['rating', 'timestamp'], axis=1)
y = ratings['rating'].astype(np.float32)

In [None]:
data = ColumnarModelData.from_data_frame(PATH, val_idxs, x, y, ['userId', 'movieId'], 64)

We are reusing ColumnarModelData (from fast.ai library) from Rossmann notebook, and that is the reason behind why there are both categorical and continuous variables in def forward(self, cats, conts) function in EmbeddingDot

Since we do not have any continuous variables in this case, we ignore `conts` and use the first and second columns of `cats` as `users` and `movies`. Note that they are mini-batches of users and movies.

In [None]:
wd = 1e-5
model = EmbeddingDot(n_users, n_movies).cuda()
opt = optim.SGD(model.parameters(), 1e-1, weight_decay=wd, momentum=0.9)

`optim` is what gives us the optimizers in PyTorch. `model.parameters()` is one of the functions inherited from `nn.Module`; it gives us all the weights to be updated/learned.

In [None]:
fit(model, data, 3, opt, F.mse_loss)

In [None]:
set_lrs(opt, 0.01)

In [None]:
fit(model, data, 3, opt, F.mse_loss)

In [None]:
min_rating, max_rating = ratings.rating.min(), ratings.rating.max()
min_rating, max_rating

## Let’s improve our model

**Bias** — to adjust to generally popular movies or generally enthusiastic users.

In [None]:
def get_emb(ni, nf):
    """Create an `nn.Embedding` with `ni` rows of width `nf`, initialised from U(-0.01, 0.01)."""
    emb = nn.Embedding(ni, nf)
    weights = emb.weight.data
    weights.uniform_(-0.01, 0.01)
    return emb

In [None]:
class EmbeddingDotBias(nn.Module):
    """Dot-product collaborative filtering with per-user and per-movie bias terms.

    Relies on the notebook-level `n_factors`, `min_rating` and `max_rating`.
    """

    def __init__(self, n_users, n_movies):
        super().__init__()
        # u / m are the factor embeddings; ub / mb are 1-wide embeddings acting as biases.
        (self.u, self.m, self.ub, self.mb) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors), (n_users, 1), (n_movies, 1)
        ]]

    def forward(self, cats, conts):
        """Return a (batch, 1) tensor of predicted ratings.

        `cats` carries user indices in column 0 and movie indices in column 1;
        `conts` is accepted but not used.
        """
        users, movies = cats[:, 0], cats[:, 1]
        um = (self.u(users) * self.m(movies)).sum(1)
        # squeeze(1) removes only the width-1 bias dimension, leaving the batch dimension intact.
        res = um + self.ub(users).squeeze(1) + self.mb(movies).squeeze(1)
        # Squash into (min_rating, max_rating). torch.sigmoid replaces the deprecated F.sigmoid.
        res = torch.sigmoid(res) * (max_rating - min_rating) + min_rating
        return res.view(-1, 1)

In [None]:
wd=2e-4
model = EmbeddingDotBias(cf.n_users, cf.n_items).cuda()
opt = optim.SGD(model.parameters(), 1e-1, weight_decay=wd, momentum=0.9)

In [None]:
fit(model, data, 3, opt, F.mse_loss)

In [None]:
set_lrs(opt, 1e-2)

In [None]:
fit(model, data, 3, opt, F.mse_loss)

## Neural Net Version 

Rather than calculating the dot product of the user embedding vector and the movie embedding vector to get a prediction, we will concatenate the two and feed the result through a neural net.

In [None]:
class EmbeddingNet(nn.Module):
    """One-hidden-layer neural net over concatenated user and movie embeddings.

    Args:
        n_users: number of rows in the user embedding.
        n_movies: number of rows in the movie embedding.
        nh: number of hidden activations produced by the first linear layer.
        p1: dropout probability applied to the concatenated embeddings.
        p2: dropout probability applied after the hidden layer.

    Relies on the notebook-level `n_factors`, `min_rating` and `max_rating`.
    """

    def __init__(self, n_users, n_movies, nh=10, p1=0.05, p2=0.5):
        super().__init__()
        self.u = get_emb(n_users, n_factors)
        self.m = get_emb(n_movies, n_factors)
        # The input is the user and movie vectors side by side, hence n_factors*2.
        self.lin1 = nn.Linear(n_factors*2, nh)
        self.lin2 = nn.Linear(nh, 1)
        self.drop1 = nn.Dropout(p1)
        self.drop2 = nn.Dropout(p2)

    def forward(self, cats, conts):
        """Return a (batch, 1) tensor of predicted ratings.

        `cats` carries user indices in column 0 and movie indices in column 1;
        `conts` is accepted but not used.
        """
        users, movies = cats[:, 0], cats[:, 1]
        x = self.drop1(torch.cat([self.u(users), self.m(movies)], dim=1))
        x = self.drop2(F.relu(self.lin1(x)))
        # Scale the sigmoid to (min_rating - 0.5, max_rating + 0.5) so that the extreme
        # ratings are reachable. torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(self.lin2(x)) * (max_rating - min_rating + 1) + min_rating - 0.5

Notice that we no longer have bias terms, since the Linear layer in PyTorch already has a built-in bias. `nh` is the number of activations a linear layer creates (Jeremy calls it “num hidden”).

It only has one hidden layer, so maybe not “deep”, but this is definitely a neural network.

In [None]:
wd=1e-5
model = EmbeddingNet(n_users, n_movies).cuda()
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)

In [None]:
fit(model, data, 3, opt, F.mse_loss)

In [None]:
set_lrs(opt, 1e-3)

In [None]:
fit(model, data, 3, opt, F.mse_loss)

Notice that the loss functions are also in `F` (here, it's mean squared error loss).

Now that we have a neural net, there are many things we can try:

- Add dropouts
- Use different embedding sizes for user embedding and movie embedding
- Not only user and movie embeddings, but append movie genre embedding and/or timestamp from the original data.
- Increase/decrease number of hidden layers and activations
- Increase/decrease regularization

Currently, we are handing the weight updates off to PyTorch’s optimizer. What does an optimizer do, and what is momentum?