Inspired by [this collaborative-filtering notebook](https://github.com/yanneta/pytorch-tutorials/blob/master/collaborative-filtering-nn.ipynb) and its [associated video](https://www.youtube.com/watch?v=vrpbDpf4y98&t=1927s).

Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

The point of this notebook is to understand what is inside `ColumnarModelData.from_data_frame` from the [fastai](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb) implementation. I believe there are too many layers of abstraction when digging into `from_data_frame`. After completing this notebook, I should understand how to feed csv data into a neural network for training.

[This](https://github.com/devforfu/pytorch_playground/blob/master/movielens.ipynb) notebook will be used to explain how to set advanced hyperparameters such as learning rates.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import os
import math

import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.optim.lr_scheduler import _LRScheduler

from sklearn.model_selection import train_test_split

In [2]:
# Directory prefix for the MovieLens CSV files. It is concatenated directly with the
# file names below, so it must end with a path separator.
path='data/'

In [3]:
# Load the ratings table (one row per user/movie rating) and preview the first rows.
ratings = pd.read_csv(path+'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Load the movie metadata (movieId, title, genres) and preview the first rows.
movies = pd.read_csv(path+'movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Cross-tabulate the 15 users with the most ratings against the 15 movies with the
# most ratings, to get a dense corner of the otherwise sparse rating matrix.
g = ratings.groupby('userId')['rating'].count()
topUsers = g.sort_values(ascending=False)[:15]

g = ratings.groupby('movieId')['rating'].count()
topMovies = g.sort_values(ascending=False)[:15]

# The inner joins keep only ratings given by a top user to a top movie. The joined
# count series is itself named 'rating', hence the '_r' suffix to avoid a clash.
top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')

# Pass the aggregation by name: recent pandas versions emit a FutureWarning when a
# NumPy reduction such as np.sum is given as `aggfunc`. 'sum' selects the same
# aggregation, so the resulting table is unchanged.
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc='sum')

movieId,1,50,110,260,296,318,356,480,527,589,593,1196,2571,2858,2959
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
68,2.5,3.0,2.5,5.0,2.0,3.0,3.5,3.5,4.0,3.5,3.5,5.0,4.5,5.0,2.5
182,4.0,4.5,3.5,3.5,5.0,4.5,5.0,3.5,4.0,2.0,4.5,3.0,5.0,5.0,5.0
249,4.0,4.0,5.0,5.0,4.0,4.5,4.5,4.0,4.5,4.0,4.0,5.0,5.0,4.5,5.0
274,4.0,4.0,4.5,3.0,5.0,4.5,4.5,3.5,4.0,4.5,4.0,4.5,4.0,5.0,5.0
288,4.5,,5.0,5.0,5.0,5.0,5.0,2.0,5.0,4.0,5.0,4.5,3.0,,3.5
307,4.0,4.5,3.5,3.5,4.5,4.5,4.0,3.5,4.5,2.5,4.5,3.0,3.5,4.0,4.0
380,5.0,4.0,4.0,5.0,5.0,3.0,5.0,5.0,,5.0,5.0,5.0,4.5,,4.0
387,,4.5,3.5,4.5,5.0,3.5,4.0,3.0,,3.5,4.0,4.5,4.0,4.5,4.5
414,4.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,5.0
448,5.0,4.0,,5.0,5.0,,3.0,3.0,,3.0,5.0,5.0,2.0,4.0,4.0


In [6]:
# Split train and validation before encoding: every rating row is assigned to the
# training set with probability 0.8 and to the validation set otherwise.
np.random.seed(3)  # fixed seed so the random mask (and therefore the split) is repeatable
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk].copy()  # copies, so the two splits are independent of `ratings`
val = ratings[~msk].copy()

In [7]:
# A handy helper adapted from fast.ai: it maps every unique raw id in a column to a
# contiguous index running from 0 to (number of unique ids - 1).
def proc_col(col, train_col=None):
    """Map the raw ids of a pandas column onto contiguous integer codes.

    The id vocabulary is taken from ``train_col`` when it is given and from
    ``col`` itself otherwise. Ids of ``col`` that are not in the vocabulary
    (e.g. users that only occur in the validation split) are encoded as -1;
    removing those rows is left to the caller.

    Returns a tuple ``(name2idx, codes, n_unique)``:
      * ``name2idx`` -- dict from raw id to its code,
      * ``codes``    -- numpy array with one code per element of ``col``,
      * ``n_unique`` -- number of distinct ids in the vocabulary.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    # Codes follow the order of first appearance in the vocabulary column.
    name2idx = dict(zip(uniq, range(len(uniq))))
    codes = np.array([name2idx.get(raw_id, -1) for raw_id in col])
    return name2idx, codes, len(uniq)

In [8]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    Args:
        df: DataFrame with ``userId`` and ``movieId`` columns. It is not
            modified; an encoded copy is returned.
        train: optional training DataFrame. When provided, ``df`` is encoded
            with the ids found in ``train`` and rows whose user or movie does
            not occur in ``train`` are dropped, because the model has no
            embedding for ids it has never seen.

    Returns:
        A new DataFrame in which ``userId`` and ``movieId`` hold the
        contiguous codes produced by ``proc_col``.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None if train is None else train[col_name]
        _, codes, _ = proc_col(df[col_name], train_col)
        df[col_name] = codes
        # proc_col marks unknown ids with -1; keep only rows with a valid code.
        # The explicit copy makes the filtered frame independent, so assigning
        # the next column does not trigger pandas' SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [9]:
# The training ids define the encoding. The validation set reuses that encoding, and
# validation rows whose user or movie never occurs in training are dropped by encode_data.
df_train = encode_data(train)
df_val = encode_data(val, train)

In [10]:
def train_epochs(model, epochs=10, lr=0.01, weight_decay=0.0, unsqueeze=False):
    """Train ``model`` on the module-level ``df_train`` with full-batch Adam steps.

    Each epoch performs exactly one optimizer step on the whole training set and
    prints the training MSE. After the last epoch the validation loss is printed
    through ``test_loss``.

    Args:
        model: module called as ``model(users, items)`` that returns rating predictions.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        weight_decay: Adam weight decay (L2 penalty).
        unsqueeze: when True the targets are reshaped from (N,) to (N, 1), for
            models whose output has a trailing singleton dimension.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # The batch is the entire training set and never changes between epochs, so the
    # tensors are built once instead of on every iteration.
    # (Append .cuda() to each tensor to run on a GPU.)
    users = torch.LongTensor(df_train.userId.values)
    items = torch.LongTensor(df_train.movieId.values)
    targets = torch.FloatTensor(df_train.rating.values)
    if unsqueeze:
        targets = targets.unsqueeze(1)
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [11]:
def test_loss(model, unsqueeze=False):
    """Print the MSE of ``model`` on the module-level validation set ``df_val``.

    The model is switched to evaluation mode (and left in it), which disables
    dropout for the forward pass.

    Args:
        model: module called as ``model(users, items)`` that returns rating predictions.
        unsqueeze: when True the targets are reshaped from (N,) to (N, 1), for
            models whose output has a trailing singleton dimension.
    """
    model.eval()
    # (Append .cuda() to each tensor to run on a GPU.)
    users = torch.LongTensor(df_val.userId.values)
    items = torch.LongTensor(df_val.movieId.values)
    targets = torch.FloatTensor(df_val.rating.values)
    if unsqueeze:
        targets = targets.unsqueeze(1)
    # Evaluation needs no gradients; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, targets)
    print("test loss %.3f " % loss.item())

## Embedding Dot

In [12]:
class EmbeddingDot(nn.Module):
    """Matrix-factorisation model: the predicted rating is the dot product of a
    user embedding and a movie embedding."""

    def __init__(self, n_users, n_movies, emb_size=100):
        super().__init__()
        self.u = nn.Embedding(n_users, emb_size)
        self.m = nn.Embedding(n_movies, emb_size)
        # Replace the default initialisation with small non-negative weights.
        for table in (self.u, self.m):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, m):
        """Return one prediction per (user index, movie index) pair."""
        user_vecs = self.u(u)
        movie_vecs = self.m(m)
        return (user_vecs * movie_vecs).sum(dim=1)

In [13]:
# Embedding table sizes: the number of distinct encoded user and movie ids in the
# training set.
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())

In [14]:
# Fit the plain dot-product model. train_epochs prints the training MSE after each
# epoch and finishes by printing the validation MSE.
model = EmbeddingDot(num_users, num_items)
train_epochs(model, epochs=10, lr=0.1, weight_decay=1e-5)

12.909295082092285
4.940009117126465
2.4366283416748047
3.0966758728027344
0.8569969534873962
1.8453092575073242
2.7096493244171143
2.1920366287231445
1.0986590385437012
0.9130606651306152
test loss 1.781 


## Dot Product with Bias

In [15]:
# Smallest and largest rating over the full ratings table. EmbeddingNet.forward reads
# these module-level values to rescale its sigmoid output.
min_rating,max_rating = ratings.rating.min(),ratings.rating.max()
min_rating,max_rating

(0.5, 5.0)

In [16]:
class EmbeddingDotBias(nn.Module):
    """Dot-product model with an additional learned bias per user and per movie."""

    def __init__(self, num_users, num_movies, emb_size=100):
        super().__init__()
        self.u = nn.Embedding(num_users, emb_size)
        self.u_bias = nn.Embedding(num_users, 1)
        self.m = nn.Embedding(num_movies, emb_size)
        self.m_bias = nn.Embedding(num_movies, 1)
        # Small non-negative factors, and biases centred on zero.
        for factors in (self.u, self.m):
            factors.weight.data.uniform_(0, 0.05)
        for bias in (self.u_bias, self.m_bias):
            bias.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, m):
        """Return dot(user, movie) + user bias + movie bias for every index pair."""
        interaction = (self.u(u) * self.m(m)).sum(1)
        # The bias lookups have a trailing singleton dimension; squeeze drops it.
        user_offset = self.u_bias(u).squeeze()
        movie_offset = self.m_bias(m).squeeze()
        return interaction + user_offset + movie_offset
    
    # modified from fast.ai
#     def forward(self, u, m):
#         um = (self.u(u)* self.m(m)).sum(1)
#         res = um + self.u_bias(u).squeeze() + self.m_bias(m).squeeze()
#         res = torch.sigmoid(res) * (max_rating-min_rating) + min_rating
#         return res.view(-1, 1)

In [17]:
# Fit the dot-product model with user and movie biases. As before, the per-epoch
# training MSE is printed, followed by the validation MSE.
model = EmbeddingDotBias(num_users, num_items)
train_epochs(model, epochs=10, lr=0.05, weight_decay=1e-5)

12.911107063293457
9.191313743591309
4.436343193054199
1.1668413877487183
2.4400174617767334
3.7634165287017822
2.4823460578918457
1.0890631675720215
0.8029473423957825
1.2979804277420044
test loss 2.059 


## Neural Network

In [18]:
class EmbeddingNet(nn.Module):
    """Neural collaborative filter: the user and movie embeddings are concatenated
    and passed through a one-hidden-layer MLP.

    The output has shape (batch, 1). It is squashed with a sigmoid and rescaled to
    the open interval (min_rating - 0.5, max_rating + 0.5), where ``min_rating``
    and ``max_rating`` are module-level globals that must exist before ``forward``
    is called.
    """

    def __init__(self, num_users, num_movies, n_hidden=10, emb_size=100):
        super().__init__()
        self.u = nn.Embedding(num_embeddings=num_users, embedding_dim=emb_size)
        self.m = nn.Embedding(num_embeddings=num_movies, embedding_dim=emb_size)
        self.lin1 = nn.Linear(in_features=emb_size*2, out_features=n_hidden)
        self.lin2 = nn.Linear(in_features=n_hidden, out_features=1)
        self.drop1 = nn.Dropout(p=0.1)

    # modified from fast.ai
    # (A simpler variant returns self.lin2(x) directly, without the sigmoid rescaling.)
    def forward(self, u, m):
        pair = torch.cat([self.u(u), self.m(m)], dim=1)
        hidden = self.drop1(F.relu(pair))
        # No dropout is applied after lin1 (the class defines no `drop2` layer).
        hidden = F.relu(self.lin1(hidden))
        # Widen the target range by half a rating on each side, so the extreme
        # ratings do not require a saturated sigmoid.
        span = max_rating - min_rating + 1
        scaled = torch.sigmoid(self.lin2(hidden)) * span
        return scaled + min_rating - 0.5

In [19]:
# Fit the neural-network model. unsqueeze=True reshapes the targets to (N, 1) so that
# they match the network's single-column output.
model = EmbeddingNet(num_users, num_items)
train_epochs(model, epochs=10, lr=0.05, weight_decay=1e-5, unsqueeze=True)

2.007108449935913
3.775191068649292
2.2173142433166504
1.1724269390106201
1.5189778804779053
1.556835412979126
1.4903018474578857
1.362932801246643
1.1984045505523682
1.0616257190704346
test loss 1.013 


In [20]:
# Continue training the same EmbeddingNet instance with a smaller learning rate.
# train_epochs creates a new Adam optimizer on every call, so no optimizer state is
# carried over from the previous run.
train_epochs(model, epochs=10, lr=0.01, weight_decay=1e-5, unsqueeze=True)

1.0458847284317017
1.0222350358963013
1.0176129341125488
1.004241704940796
0.9897863268852234
0.9820201396942139
0.9724491834640503
0.9584047794342041
0.9445868730545044
0.9345718622207642
test loss 0.908 


In [21]:
# One more round on the same model with the learning rate lowered again (to 0.001).
train_epochs(model, epochs=10, lr=0.001, weight_decay=1e-5, unsqueeze=True)

0.9223780632019043
0.9197331666946411
0.9174303412437439
0.9147195816040039
0.914498507976532
0.9123048782348633
0.9106485843658447
0.9088290929794312
0.9073420763015747
0.9059321880340576
test loss 0.891 
