# Building a Joke Recommendation System

In [0]:
# Install a PyTorch wheel matching this runtime (instructions from http://pytorch.org/).
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Tag used in the wheel filename, assembled from the three `wheel.pep425tags`
# helpers in the form "<abbr_impl><impl_ver>-<abi_tag>".
# NOTE(review): `wheel.pep425tags` appears to have been removed from newer `wheel`
# releases - confirm this import still resolves in the target environment.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Use the 'cu80' wheel directory when nvidia-smi exists at this path, else 'cpu'.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# {accelerator} and {platform} are interpolated by IPython into the shell command.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [22]:
# Only needed on Google Colab: installs xlrd, which pandas uses to read the .xls file loaded below.
!pip install xlrd

Collecting xlrd
[?25l  Downloading https://files.pythonhosted.org/packages/07/e6/e95c4eec6221bfd8528bcc4ea252a850bffcc4be88ebc367e23a1a84b0bb/xlrd-1.1.0-py2.py3-none-any.whl (108kB)
[K    100% |████████████████████████████████| 112kB 4.4MB/s 
[?25hInstalling collected packages: xlrd
Successfully installed xlrd-1.1.0


In [0]:
import numpy as np
import pandas as pd
import os
import shutil

In [0]:
import random

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
# Directory (relative to the working directory) where the Jester data is stored.
PATH = 'data/jester/'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() before creating the directory.
os.makedirs(PATH, exist_ok=True)

In [18]:
!wget http://eigentaste.berkeley.edu/dataset/jester_dataset_3.zip
!unzip jester_dataset_3.zip
shutil.move('jesterfinal151cols.xls', PATH+'jesterfinal151cols.xls')


Redirecting output to ‘wget-log.1’.
Archive:  jester_dataset_3.zip
  inflating: jesterfinal151cols.xls  


'data/jester/jesterfinal151cols.xls'

## Format Data

In [24]:
# Read the raw spreadsheet. header=None keeps the first row as data, so the
# columns are labelled by integer position.
xls_file = os.path.join(PATH, 'jesterfinal151cols.xls')
rat = pd.read_excel(xls_file, header=None)
rat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,62,99,99,99,99,0.21875,99,-9.28125,-9.28125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
1,34,99,99,99,99,-9.6875,99,9.9375,9.53125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
2,18,99,99,99,99,-9.84375,99,-9.84375,-7.21875,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
3,82,99,99,99,99,6.90625,99,4.75,-5.90625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
4,27,99,99,99,99,-0.03125,99,-9.09375,-0.40625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [26]:
# Column labels of the gauge-set jokes.
gauge_set = [7, 8, 13, 15, 16, 17, 18, 19]
# 99 is the "missing rating" marker; True would mean some gauge-set rating is missing.
(rat[gauge_set] == 99).values.any()

False

In [0]:
# These jokes have been removed from the dataset, so their columns are dropped.
rem_list = [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116]
# Reassign rather than drop in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
rat = rat.drop(rem_list, axis = 1, errors='ignore')

In [28]:
# Reshape the wide table to long format: one row per (user, joke) rating.
#   user_id   - the row position in the wide table
#   num_rated - column 0, the number of jokes the user rated
# Rows whose rating is 99 (missing) are dropped.
rat = (
    rat
    .assign(user_id=range(len(rat.index)))
    .rename({0: 'num_rated'}, axis=1)
    .melt(id_vars=['user_id', 'num_rated'], var_name='joke_id', value_name='rating')
    .loc[lambda long_df: long_df['rating'] != 99]
)
rat.head()

Unnamed: 0,user_id,num_rated,joke_id,rating
0,0,62,7,-9.28125
1,1,34,7,9.9375
2,2,18,7,-9.84375
3,3,82,7,4.75
4,4,27,7,-9.09375


In [29]:
# (number of ratings, largest user_id, gauge-set size, number of distinct jokes)
(rat.shape[0], rat['user_id'].max(), len(gauge_set), rat['joke_id'].nunique())

(1725765, 50691, 8, 128)

Summary:
- 50k users
- 128 jokes, 8 of which form a gauge set that every user rated
- 1.7 million ratings

## Separate train/valid/test sets

In [95]:
# Held-out fractions for the three test scenarios:
# (new users, new jokes, existing users & jokes)
test_probs = (0.1, 0.2, 0.05)
valid_prob = 0.05  # validation fraction

user_idxs = set(rat['user_id'].unique())
# Gauge-set jokes are excluded from the candidates for held-out jokes.
joke_idxs = set(rat['joke_id'].unique()) - set(gauge_set)

# random.sample() needs a sequence: sampling straight from a set is deprecated
# since Python 3.9 and raises TypeError from 3.11. Sorting also fixes the
# population order, so the seeded draw does not depend on set iteration order.
random.seed(101)
test_user_idxs = random.sample(sorted(user_idxs), int(len(user_idxs)*test_probs[0]))
random.seed(101)
test_joke_idxs = random.sample(sorted(joke_idxs), int(len(joke_idxs)*test_probs[1]))
print(test_user_idxs[:10])
print(test_joke_idxs)

[38086, 12763, 35338, 23506, 30614, 3179, 43481, 32998, 14058, 39455]
[104, 139, 48, 147, 98, 72, 88, 28, 114, 93, 53, 107, 54, 63, 91, 119, 131, 140, 69, 85, 32, 58, 148, 44]


In [0]:
# Boolean masks over the ratings: rows belonging to held-out users / held-out jokes.
user_flag = rat['user_id'].isin(test_user_idxs)
joke_flag = rat['joke_id'].isin(test_joke_idxs)

# Invert masks with ~ (logical NOT); unary minus on a boolean Series is rejected
# with a TypeError by current NumPy/pandas.
test_user = rat[user_flag & ~joke_flag]  # held-out users, non-held-out jokes
test_joke = rat[~user_flag & joke_flag]  # non-held-out users, held-out jokes

In [0]:
# Ratings that involve neither a held-out user nor a held-out joke.
# ~ is the logical NOT for boolean masks; unary minus raises TypeError on them.
rem = rat[~user_flag & ~joke_flag]
X = rem[['user_id', 'num_rated', 'joke_id']]
y = rem['rating']

# First split off the "existing users & jokes" test set, then take the
# validation set out of what remains. The fixed random_state keeps both splits repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_probs[2], 
                                                    random_state=101)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, 
                                                      test_size=valid_prob, 
                                                      random_state=101)

In [100]:
# The train/valid/test targets together must account for every row of `rem`.
split_sizes = (len(y_train), len(y_valid), len(y_test))
assert sum(split_sizes) == len(rem.index)
split_sizes

(1147716, 60407, 63586)

In [101]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,user_id,num_rated,joke_id
1949131,22835,58,56
388665,33821,128,19
118158,16774,8,13
4238341,30905,27,105
3731749,31233,128,94
