# Building a Joke Recommendation System

TODO:

Run on whole training data.

Use valiation set to optimize parameters (such as # of activations)

Put model on gpu

In [None]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision

In [None]:
# Only needed on google colab
!pip install xlrd
!pip install ipdb

In [1]:
import torch
import numpy as np
import pandas as pd
import os
import shutil
import random
import ipdb

import joke_utils

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
import torch.optim as optim

## Settings

In [2]:
PATH = 'data/jester/'

test_probs = (0.1, 0.2, 0.05)  # numbers for new users, new jokes, existing users & jokes
valid_prob = 0.05

bs = 32  # mini-batch size

gauge_set = [7, 8, 13, 15, 16, 17, 18, 19]

In [None]:
if not os.path.exists(PATH):
    os.makedirs(PATH)

In [None]:
!wget http://eigentaste.berkeley.edu/dataset/jester_dataset_3.zip
!unzip jester_dataset_3.zip
shutil.move('jesterfinal151cols.xls', PATH+'jesterfinal151cols.xls')

## Format Data

In [3]:
rat = pd.read_excel(PATH+'jesterfinal151cols.xls', header = None)
rat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,62,99,99,99,99,0.21875,99,-9.28125,-9.28125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
1,34,99,99,99,99,-9.6875,99,9.9375,9.53125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
2,18,99,99,99,99,-9.84375,99,-9.84375,-7.21875,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
3,82,99,99,99,99,6.90625,99,4.75,-5.90625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
4,27,99,99,99,99,-0.03125,99,-9.09375,-0.40625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [4]:
np.any(np.array(rat[gauge_set] == 99))  # Check if any rating is missing

False

In [5]:
# These jokes have been removed
rem_list = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116]
rat.drop(rem_list, axis = 1, inplace=True)

In [6]:
rat.fillna(value = 99, inplace=True)  #The last row has some missing values at the end -- 

In [7]:
# Add user ID, name column 0 (indicating the # of rated movies)
rat['user_id'] = list(range(len(rat.index)))
rat.rename({0:'num_rated'}, axis = 1, inplace=True)
rat = rat.melt(id_vars=['user_id', 'num_rated'], var_name='joke_id', value_name='rating')
rat = rat[rat['rating'] != 99]
rat.head()

Unnamed: 0,user_id,num_rated,joke_id,rating
0,0,62,7,-9.28125
1,1,34,7,9.9375
2,2,18,7,-9.84375
3,3,82,7,4.75
4,4,27,7,-9.09375


In [8]:
(len(rat.index), rat['user_id'].max(), len(gauge_set), len(set(rat['joke_id'])))

(1725737, 50691, 8, 128)

In [13]:
assert ~rat.isnull().values.any()

Summary:
- 50k users
- 128 jokes, 8 are a gauge set that everyone responded to
- 1.7 million ratings

In [14]:
rat.to_pickle(PATH+'processed_data.pkl')

## Separate train/valid/test sets

In [15]:
rat = pd.read_pickle(PATH+'processed_data.pkl')

In [16]:
rat.reset_index(drop = True, inplace=True)

In [17]:
train_idxs, valid_idxs, test_idxs, tnu, tnj, tnuj = joke_utils.get_idxs(rat, gauge_set, 
                                                                        test_probs, valid_prob)

## Create a pytorch dataset / data loaders

In [18]:
u_uniq = rat['user_id'].unique()
user2idx = {o:i for i,o in enumerate(u_uniq)}
idx2user = {i:o  for i, o in enumerate(u_uniq)}
rat['user_id'] = rat['user_id'].apply(lambda x: user2idx[x])

j_uniq = rat['joke_id'].unique()
joke2idx = {o:i for i, o in enumerate(j_uniq)}
idx2joke = {i:o for i, o in enumerate(j_uniq)}
rat['joke_id'] = rat['joke_id'].apply(lambda x: joke2idx[x])

n_users=int(rat['user_id'].nunique())
n_jokes =int(rat['joke_id'].nunique())

In [19]:
def conv2Tens(df, idx=None):
    if idx != None:
        df = df.iloc[idx]
    
    x = torch.tensor(df[['user_id', 'joke_id']].values, dtype = torch.int64)
    y = torch.tensor(df['rating'].values, dtype = torch.float32)
    
    return x, y

In [33]:
assert False  #Note: take out the 10000 from training -- only used to make notebook run faster
train_ds = TensorDataset(*conv2Tens(rat, train_idxs[:10000]))
valid_ds = TensorDataset(*conv2Tens(rat, valid_idxs))
test_ds = TensorDataset(*conv2Tens(rat, test_idxs))

In [34]:
train_dl = DataLoader(train_ds, batch_size=bs, shuffle = True)
valid_dl = DataLoader(valid_ds, batch_size=bs)
test_dl = DataLoader(test_ds, batch_size=bs)

## Simple Collaborative Filtering Model

In [50]:
y_range = (-10, 10)

def get_emb(ni,nf):
    e = nn.Embedding(ni, nf)
    e.weight.data.uniform_(0, 0.05)
    return e

class ColabSimple(nn.Module):
    '''https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb'''
    def __init__(self, n_user, n_joke, n_factors = 10):
        super().__init__()        
        (self.u, self.j, self.ub, self.jb) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_jokes, n_factors), (n_users,1), (n_jokes,1)]]
        
    def forward(self, x):
        users, jokes = x[:, 0], x[:, 1]
        u, j = self.u(users), self.j(jokes)
        res = (u * j).sum(1)
        res = res + self.ub(users).squeeze() + self.jb(jokes).squeeze()
        res = torch.sigmoid(res) * (y_range[1]-y_range[0]) + y_range[0]
        return res.view(-1, 1).squeeze()

In [51]:
assert False
# Need to put on GPU  -- do the same for other models below!
# model = ColabSimple(n_users, n_jokes).cuda()
model = ColabSimple(n_users, n_jokes)
print(model)

ColabSimple(
  (u): Embedding(50692, 10)
  (j): Embedding(128, 10)
  (ub): Embedding(50692, 1)
  (jb): Embedding(128, 1)
)


In [52]:
def loss_batch(model, loss_func, xb, yb, opt=None):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    loss = loss_func(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
        
    return loss.item(), len(xb)

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    '''https://github.com/fastai/fastai_v1/blob/master/dev_nb/001a_nn_basics.ipynb'''
    for epoch in range(epochs):
        
        # Fit model to training data
        model.train()
        losses,nums = zip(*[loss_batch(model, loss_func, xb, yb) for xb,yb in train_dl])
        train_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)
        
        for xb,yb in train_dl: loss_batch(model, loss_func, xb, yb, opt)
            
        # Calculate loss on validation set
        model.eval()
        with torch.no_grad():
            losses,nums = zip(*[loss_batch(model, loss_func, xb, yb)
                                for xb,yb in valid_dl])
        val_loss = np.sum(np.multiply(losses,nums)) / np.sum(nums)

        print(f'Epoch {epoch}. Training loss: {train_loss}. Validation loss: {val_loss}.')

In [53]:
opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss_func = nn.MSELoss()

In [54]:
fit(3, model, loss_func, opt, train_dl, valid_dl)

Epoch 0. Training loss: 39.257692657470706. Validation loss: 29.778694172019136.
Epoch 1. Training loss: 29.538735485839844. Validation loss: 29.425833557322015.
Epoch 2. Training loss: 44.47094608764648. Validation loss: 29.64731864300534.


## Fit Neural Net with 1 Hidden Layer

In [74]:
class ColabDeep(nn.Module):
    def __init__(self, n_user, n_joke, nhs = [10], ps = [0.25], n_factors = 10, emb_ps = 0.05):
        super().__init__() 
        
        assert len(nhs) > 0
        assert len(nhs) == len(ps)
        
        (self.u, self.j) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_jokes, n_factors)]]
        
        self.num_hidden = len(nhs) 
        self.hidden_layers = []
        self.drops = []
        self.emb_drop = nn.Dropout(emb_ps)
        
        for i in range(self.num_hidden):
            if i == 0:
                self.hidden_layers.append(nn.Linear(n_factors*2, nhs[0]))
            else:
                self.hidden_layers.append(nn.Linear(nhs[i-1], nhs[i]))
            self.drops.append(nn.Dropout(ps[i]))
        
        self.last_layer = nn.Linear(nhs[self.num_hidden - 1], 1)
                
    def forward(self, x):
        users, jokes = x[:, 0], x[:, 1]
        u, j = self.u(users), self.j(jokes)
        
        X = self.emb_drop(torch.cat([u, j], dim = 1))
        
        for i in range(self.num_hidden):
            drop = self.drops[i]
            layer = self.hidden_layers[i]
            X = drop(F.relu(layer(X)))
        
        res = self.last_layer(X)
        res = torch.sigmoid(res) * (y_range[1]-y_range[0]) + y_range[0]
        return res.view(-1, 1).squeeze()

In [75]:
model2 = ColabDeep(n_users, n_jokes)

In [76]:
wd=1e-5
opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=wd)
loss_func = nn.MSELoss()

In [77]:
fit(1, model2, loss_func, opt, train_dl, valid_dl)

Epoch 0. Training loss: 34.12640700683594. Validation loss: 33.93807824631768.


In [78]:
model3 = ColabDeep(n_users, n_jokes, nhs = [10, 5], ps = [0.25, 0.25])

In [79]:
fit(1, model3, loss_func, opt, train_dl, valid_dl)

Epoch 0. Training loss: 38.935981842041016. Validation loss: 30.00822083715675.


## Predict on test set

In [103]:
def predict(m, dl, as_torch = False):
    
    m.eval()
    preds = []
    ys = []
    
    with torch.no_grad():     
        for xb, yb in dl:
            preds.append(m(xb))
            ys.append(yb)
            
    preds = torch.cat(preds, dim = 0)
    ys = torch.cat(ys, dim = 0)
    
    if ~as_torch: 
        preds = np.array(preds)
        ys = np.array(ys)
            
    return preds, ys

In [104]:
preds, targets = predict(model, test_dl)

In [108]:
def mse(pred, target):
    return np.mean((pred - target)**2)

In [109]:
mse(preds, targets)

29.637838