# Building a Joke Recommendation System

In [None]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision

In [None]:
# Only needed on google colab
!pip install xlrd
#!pip install ipdb

In [1]:
import torch
import numpy as np
import pandas as pd
import os
import shutil
import random
import ipdb

import joke_utils

import torch.nn as nn
import torch.nn.functional as F

## Settings

In [2]:
PATH = 'data/jester/'

test_probs = (0.1, 0.2, 0.05)  # numbers for new users, new jokes, existing users & jokes
valid_prob = 0.05

gauge_set = [7, 8, 13, 15, 16, 17, 18, 19]

In [3]:
if not os.path.exists(PATH):
    os.makedirs(PATH)

In [None]:
!wget http://eigentaste.berkeley.edu/dataset/jester_dataset_3.zip
!unzip jester_dataset_3.zip
shutil.move('jesterfinal151cols.xls', PATH+'jesterfinal151cols.xls')

## Format Data

In [None]:
rat = pd.read_excel(PATH+'jesterfinal151cols.xls', header = None)
rat.head()

In [None]:
np.any(np.array(rat[gauge_set] == 99))  # Check if any rating is missing

In [None]:
# These jokes have been removed
rem_list = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116]
rat.drop(rem_list, axis = 1, inplace=True)

In [None]:
# Add user ID, name column 0 (indicating the # of rated movies)
rat['user_id'] = list(range(len(rat.index)))
rat.rename({0:'num_rated'}, axis = 1, inplace=True)
rat = rat.melt(id_vars=['user_id', 'num_rated'], var_name='joke_id', value_name='rating')
rat = rat[rat['rating'] != 99]
rat.head()

In [None]:
(len(rat.index), rat['user_id'].max(), len(gauge_set), len(set(rat['joke_id'])))

Summary:
- 50k users
- 128 jokes, 8 are a gauge set that everyone responded to
- 1.7 million ratings

In [None]:
rat.to_pickle(PATH+'processed_data.pkl')

## Separate train/valid/test sets

In [4]:
rat = pd.read_pickle(PATH+'processed_data.pkl')

In [5]:
rat.reset_index(drop = True, inplace=True)

In [6]:
train_idxs, valid_idxs, test_idxs, tnu, tnj, tnuj = joke_utils.get_idxs(rat, gauge_set, 
                                                                        test_probs, valid_prob)

## Basic Model

In [7]:
u_uniq = rat['user_id'].unique()
user2idx = {o:i for i,o in enumerate(u_uniq)}
idx2user = {i:o  for i, o in enumerate(u_uniq)}
rat['user_id'] = rat['user_id'].apply(lambda x: user2idx[x])

j_uniq = rat['joke_id'].unique()
joke2idx = {o:i for i, o in enumerate(j_uniq)}
idx2joke = {i:o for i, o in enumerate(j_uniq)}
rat['joke_id'] = rat['joke_id'].apply(lambda x: joke2idx[x])

n_users=int(rat['user_id'].nunique())
n_jokes =int(rat['joke_id'].nunique())

In [8]:
class ColabSimple(nn.Module):
    def __init__(self, n_user, n_joke, n_factor = 10):
        super().__init__()
        self.u = nn.Embedding(n_user, n_factor)
        self.j = nn.Embedding(n_joke, n_factor)
        
        self.u.weight.data.uniform_(0, 0.05)
        self.j.weight.data.uniform_(0, 0.05)
        
    def forward(self, x):
        users, jokes = x[:, 0], x[:, 1]
        u, j = self.u(users), self.j(jokes)
        return (u * j).sum(1).view(-1, 1)

In [9]:
model = ColabSimple(n_users, n_jokes)
print(model)

ColabSimple(
  (u): Embedding(50692, 10)
  (j): Embedding(128, 10)
)


In [10]:
# example input feed forward

In [14]:
inp = rat.loc[:32, ['user_id', 'joke_id']]
inp = torch.tensor(inp.values)
inp

TypeError: 'module' object is not callable

In [19]:
torch.tensor([2, 3])

TypeError: 'module' object is not callable

In [18]:
import torch

In [21]:
!conda install pytorch

Solving environment: done


  current version: 4.4.6
  latest version: 4.5.11

Please update conda by running

    $ conda update -n base conda



## Package Plan ##

  environment location: /home/krisztian/anaconda3

  added / updated specs: 
    - pytorch


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    nccl-1.3.5                 |        cuda9.0_0         8.6 MB
    cudnn-7.1.2                |        cuda9.0_0       367.8 MB
    certifi-2018.8.24          |           py36_1         140 KB
    openssl-1.0.2p             |       h14c3975_0         3.5 MB
    numpy-base-1.15.0          |   py36h3dfced4_0         4.2 MB
    pytorch-0.4.1              |   py36ha74772b_0       215.8 MB
    mkl_fft-1.0.4              |   py36h4414c95_1         150 KB
    mkl-2018.0.3               |                1       198.7 MB
    cudatoolkit-9.0            |       h13b8566_0       340.4 MB
    ninja-1