In [1]:
from fastai.collab import *
from fastai.tabular import *

## Collaborative filtering example

`collab` models use data in a `DataFrame` of users, items, and ratings.

In [2]:
# Column names shared by the ratings and movies tables used below.
user = 'userId'
item = 'movieId'
title = 'title'

In [3]:
# Resolve the MovieLens sample dataset through fastai's untar_data and keep the
# returned path; the bare `path` on the last line displays it as the cell output.
path = untar_data(URLs.ML_SAMPLE)
path

WindowsPath('C:/Users/cross-entropy/.fastai/data/movie_lens_sample')

In [4]:
# Load the sample ratings table; head() previews the first rows
# (columns userId, movieId, rating, timestamp in the saved output).
ratings = pd.read_csv(path/'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,73,1097,4.0,1255504951
1,561,924,3.5,1172695223
2,157,260,3.5,1291598691
3,358,1210,5.0,957481884
4,130,316,2.0,1138999234


That's all we need to create and train a model:

In [5]:
# Build the collab data bunch straight from the ratings frame with a fixed seed.
# NOTE(review): no column names are passed, so from_df uses its defaults —
# confirm they select userId / movieId / rating for this frame.
data = CollabDataBunch.from_df(ratings, seed=42)

In [6]:
# Range handed to collab_learner as `y_range` in the next cell.
# NOTE(review): the upper bound sits slightly above the 5.0 seen in the ratings
# preview — presumably intentional; confirm.
y_range = [0,5.5]

In [7]:
# Create the learner on the sample data with 50 factors and the y_range defined above.
learn = collab_learner(data, n_factors=50, y_range=y_range)

In [None]:
# Train with fit_one_cycle, passing 3 and 5e-3 (presumably epochs and max learning rate).
# NOTE(review): the saved output lists only epochs 0 and 1, so this run looks
# interrupted — re-run to confirm the final numbers.
learn.fit_one_cycle(3, 5e-3)

epoch,train_loss,valid_loss,time
0,1.623544,0.920729,00:52
1,0.846352,0.67129,00:52


## Movielens 100k

Let's try with the full Movielens 100k dataset, available from http://files.grouplens.org/datasets/movielens/ml-100k.zip

Download the data set, then unzip it into the directory `.fastai/data/`

In [None]:
# Point `path` at the manually unzipped ml-100k folder under fastai's data directory.
# This rebinds the `path` that referred to the sample dataset earlier in the notebook.
path=Config.data_path()/'ml-100k'

In [None]:
# u.data is read as tab-delimited with no header row, so column names are supplied
# here, reusing the `user` and `item` names defined at the top of the notebook.
# This rebinds `ratings`, which previously held the sample dataset.
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=[user,item,'rating','timestamp'])
ratings.head()

In [None]:
# u.item is read as pipe-delimited, latin-1 encoded, with no header row.
# The trailing 19 columns are named g0..g18
# (presumably one flag per genre — confirm against the dataset README).
movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1', header=None,
                    names=[item, 'title', 'date', 'N', 'url', *[f'g{i}' for i in range(19)]])
movies.head()

In [None]:
# Number of rating rows read from u.data.
len(ratings)

In [None]:
# Attach each movie's title to its ratings. The only column the two frames share
# is the movie id, so the join key is spelled out explicitly.
rating_movie = ratings.merge(movies[[item, title]], on=item)
rating_movie.head()

In [None]:
# Rebuild the data bunch from the merged frame: valid_pct=0.1 is passed for the
# validation split, and the title column is used as the item (item_name=title).
data = CollabDataBunch.from_df(rating_movie, seed=42, valid_pct=0.1, item_name=title)

In [None]:
# Display a sample batch from the data bunch as a sanity check.
data.show_batch()

In [None]:
# Same value as in the sample-data section; passed to collab_learner below.
y_range = [0,5.5]

### Create a collab_learner model with 40 embedded `factors` and fit to the `movielens100k` data
#### This means we are asking the neural network to learn `embeddings` of `movie titles` and of `users` into a 40 dimensional space.  `Embeddings` consist of a 40-dimensional `weight vector` and a `single bias` for each `user` and for each `movie title`. 
#### So the network will `discover` 40 `weights` and a `bias` for each `user` and 40 `weights` and a `bias` for each `movie title`, and it will be up to us to try to understand what these new `factors` mean.
#### `wd` is the `weight decay parameter`
#### We obtain validation loss of 0.81 after 5 epochs.

In [None]:
# Embedding size for both users and movie titles.
n_factors = 40
# wd=1e-1 is the weight decay passed to the learner.
learn = collab_learner(data, n_factors=n_factors, y_range=y_range, wd=1e-1)

## Loss seems fairly insensitive to the learning rate!

In [None]:
# Run the learning-rate finder, then plot what the recorder captured.
# skip_end=15 is passed to plot (presumably trimming the last 15 points — confirm).
learn.lr_find()
learn.recorder.plot(skip_end=15)

In [None]:
# NOTE(review): open question — the progress table labels its columns `train_loss`
# and `valid_loss` (see the saved output of the sample run). Presumably both are
# the learner's loss function, evaluated on the training and validation data
# respectively, rather than a separate validation-error metric — confirm against the fastai docs.
learn.fit_one_cycle(5, 5e-3)

### Fit another 5 epochs
#### We obtain validation loss of 0.86 after another 5 epochs, and training loss is still decreasing.

In [None]:
# Second call with the same arguments on the same `learn` object,
# i.e. five more epochs on top of the previous cell's training.
learn.fit_one_cycle(5, 5e-3)

In [None]:
# Save the learner under the name 'dotprod'; the Interpretation section loads it back.
learn.save('dotprod')

Here are [some benchmarks](https://www.librec.net/release/v1.3/example.html) on the same dataset for the popular Librec system for collaborative filtering. They show best results based on RMSE of 0.91, which corresponds to an MSE of `0.91**2 = 0.83`.

## Interpretation

### Setup

In [None]:
# Reload the 'dotprod' checkpoint saved earlier; the trailing semicolon suppresses the cell output.
learn.load('dotprod');

In [None]:
# Display the model held by the learner.
learn.model

In [None]:
# Count ratings per title, then keep up to 1000 titles with the most ratings,
# ordered from most- to least-rated. The last line previews the first ten.
g = rating_movie.groupby(title)['rating'].count()
top_movies = g.sort_values(ascending=False).index.values[:1000]
top_movies[:10]

### Movie bias

In [None]:
# Bias values from the learner for the top movies.
# NOTE(review): is_item=True presumably selects the item (movie) side rather than the user side — confirm.
movie_bias = learn.bias(top_movies, is_item=True)
movie_bias.shape

In [None]:
# Mean rating per title, then one (bias, title, mean rating) tuple per top movie.
mean_ratings = rating_movie.groupby(title)['rating'].mean()
movie_ratings = [
    (bias, name, mean_ratings.loc[name])
    for name, bias in zip(top_movies, movie_bias)
]

In [None]:
def item0(o):
    "Return the first element of `o`; used below as a sort key."
    return o[0]

In [None]:
# The 15 movies with the lowest learned bias. Tuples are (bias, title, mean rating),
# so the sort key is the bias, not the mean rating.
sorted(movie_ratings, key=item0)[:15]

In [None]:
# The 15 movies with the highest learned bias. Tuples are (bias, title, mean rating),
# so the sort key is the bias; `item0` is the same key used for the lowest-bias listing.
sorted(movie_ratings, key=item0, reverse=True)[:15]

### Movie weights

In [None]:
# Embedding weights from the learner for the top movies; the shape is displayed to check dimensions.
# NOTE(review): is_item=True presumably selects the item (movie) embeddings — confirm.
movie_w = learn.weight(top_movies, is_item=True)
movie_w.shape

### Use `PCA (principal components analysis)` to reduce the dimensionality of the embedding from 40 down to 3. `PCA` works by constructing orthogonal linear combinations of the embeddings (which we will call `factors`) and ranking them in order of importance. We can then examine the most influential factors.

In [None]:
# Number of principal components to keep. This uses its own name so the embedding
# size `n_factors` (set where the learner is created) is not overwritten; otherwise
# re-running the learner cell after this one would silently build a 3-factor model.
n_components = 3
movie_pca = movie_w.pca(n_components)
movie_pca.shape

In [None]:
# Transpose so each row is one principal component, then unpack the three components.
factor1,factor2,factor3 = movie_pca.t()
# One (factor1 value, title) pair per top movie.
movie_comp = list(zip(factor1, top_movies))

### Genre specified by factor1

In [None]:
# The 10 movies with the highest factor1 values (sorted descending on the first tuple element).
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]


In [None]:
# The 10 movies with the lowest factor1 values (sorted ascending on the first tuple element).
sorted(movie_comp, key=itemgetter(0))[:10]

### Genre specified by factor2

In [None]:
# Rebind movie_comp to (factor2 value, title) pairs, one per top movie.
movie_comp = list(zip(factor2, top_movies))
# The 10 movies with the highest factor2 values.
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
# The 10 movies with the lowest factor2 values (sorted ascending on the first tuple element).
sorted(movie_comp, key=itemgetter(0))[:10]

### Genre specified by factor3

In [None]:
# Rebind movie_comp to (factor3 value, title) pairs, one per top movie.
movie_comp = list(zip(factor3, top_movies))

In [None]:
# The 10 movies with the highest factor3 values (sorted descending on the first tuple element).
sorted(movie_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
# The 10 movies with the lowest factor3 values (sorted ascending on the first tuple element).
sorted(movie_comp, key=itemgetter(0))[:10]

### Genres `factor1` vs. `factor2`

In [None]:
# Scatter the first 50 entries of top_movies in the (factor1, factor2) plane and
# label each point with its title. top_movies is ordered by rating count, so these
# are the 50 most-rated titles. (A random sample was drawn here previously but was
# overwritten before use, so it has been dropped.)
idxs = list(range(50))
X = factor1[idxs]
Y = factor2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    # Random dark-ish color per label so overlapping titles stay distinguishable.
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
# Axis labels are set once, outside the loop, using the factor names as text
# rather than the factor tensors themselves.
plt.xlabel('factor1')
plt.ylabel('factor2')
plt.show()

### Genres `factor1` vs. `factor3`

In [None]:
# Scatter the first 50 entries of top_movies in the (factor1, factor3) plane and
# label each point with its title. top_movies is ordered by rating count, so these
# are the 50 most-rated titles. (A random sample was drawn here previously but was
# overwritten before use, so it has been dropped.)
idxs = list(range(50))
X = factor1[idxs]
Y = factor3[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    # Random dark-ish color per label so overlapping titles stay distinguishable.
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
# Axis labels are set once, outside the loop, using the factor names as text
# rather than the factor tensors themselves.
plt.xlabel('factor1')
plt.ylabel('factor3')
plt.show()

### Genres in `factor2` vs. `factor3`

In [None]:
# Scatter the first 50 entries of top_movies in the (factor2, factor3) plane and
# label each point with its title. top_movies is ordered by rating count, so these
# are the 50 most-rated titles. (A random sample was drawn here previously but was
# overwritten before use, so it has been dropped.)
idxs = list(range(50))
X = factor2[idxs]
Y = factor3[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_movies[idxs], X, Y):
    # Random dark-ish color per label so overlapping titles stay distinguishable.
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
# Axis labels are set once, outside the loop, using the factor names as text
# rather than the factor tensors themselves.
plt.xlabel('factor2')
plt.ylabel('factor3')
plt.show()