# Building a Joke Recommendation System

In [1]:
# http://pytorch.org/
# Colab-style bootstrap: build the wheel platform tag for the running interpreter,
# then install a pinned PyTorch wheel (0.3.0.post4) together with torchvision.
from os import path
# NOTE(review): `wheel.pep425tags` appears to be missing from recent `wheel` releases — confirm before re-running.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Formatted as '<impl><version>-<abi>'; interpolated into the wheel filename below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Use the 'cu80' wheel directory (presumably the CUDA 8.0 build) when nvidia-smi
# exists at this path, otherwise fall back to the 'cpu' directory.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# {accelerator} and {platform} are filled in from the Python variables above.
# NOTE(review): the wheel is fetched over plain http and torchvision is unpinned — confirm this is acceptable.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision

[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
# Only needed on google colab
!pip install xlrd

[33mYou are using pip version 10.0.1, however version 18.0 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [77]:
import torch
import numpy as np
import pandas as pd
import os
import shutil
import random

import joke_utils

In [4]:
from sklearn.model_selection import train_test_split

## Settings

In [16]:
# Directory that holds the downloaded and processed Jester files.
PATH = 'data/jester/'

# Hold-out settings handed to the train/valid/test split.
test_probs = (
    0.1,   # new users
    0.2,   # new jokes
    0.05,  # existing users & jokes
)
valid_prob = 0.05

# Column labels of the gauge-set jokes in the raw ratings table.
gauge_set = [
    7, 8,
    13, 15, 16, 17, 18, 19,
]

In [5]:
# Create the data directory (and any missing parents). exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race of a manual test.
os.makedirs(PATH, exist_ok=True)

In [6]:
!wget http://eigentaste.berkeley.edu/dataset/jester_dataset_3.zip
!unzip jester_dataset_3.zip
shutil.move('jesterfinal151cols.xls', PATH+'jesterfinal151cols.xls')

--2018-09-12 17:21:44--  http://eigentaste.berkeley.edu/dataset/jester_dataset_3.zip
Resolving eigentaste.berkeley.edu (eigentaste.berkeley.edu)... 128.32.192.72
Connecting to eigentaste.berkeley.edu (eigentaste.berkeley.edu)|128.32.192.72|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5339014 (5.1M) [application/zip]
Saving to: ‘jester_dataset_3.zip’


2018-09-12 17:21:54 (585 KB/s) - ‘jester_dataset_3.zip’ saved [5339014/5339014]

Archive:  jester_dataset_3.zip
  inflating: jesterfinal151cols.xls  


'data/jester/jesterfinal151cols.xls'

## Format Data

In [8]:
# Load the raw ratings spreadsheet. header=None means no row is used as a
# header, so the columns are labelled by integer position.
xls_path = PATH + 'jesterfinal151cols.xls'
rat = pd.read_excel(xls_path, header=None)
rat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,62,99,99,99,99,0.21875,99,-9.28125,-9.28125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
1,34,99,99,99,99,-9.6875,99,9.9375,9.53125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
2,18,99,99,99,99,-9.84375,99,-9.84375,-7.21875,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
3,82,99,99,99,99,6.90625,99,4.75,-5.90625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
4,27,99,99,99,99,-0.03125,99,-9.09375,-0.40625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [9]:
# Sanity check: True if any gauge-set rating equals 99, the marker for a missing rating.
(rat[gauge_set] == 99).values.any()

False

In [10]:
# Column labels of jokes that have been removed; their columns are dropped
# from the ratings table.
rem_list = [
    1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14,
    20, 27, 31, 43, 51, 52, 61, 73, 80, 100, 116,
]
rat = rat.drop(columns=rem_list)

In [11]:
# Reshape the wide table (one row per user, one column per joke) into long
# format with one row per (user, joke) rating:
#   1. add a sequential user_id,
#   2. name column 0 'num_rated' (the number of jokes the user rated),
#   3. melt the remaining joke columns into 'joke_id' / 'rating' pairs,
#   4. discard entries equal to 99, the marker for a missing rating.
rat = (
    rat
    .assign(user_id=list(range(len(rat))))
    .rename(columns={0: 'num_rated'})
    .melt(id_vars=['user_id', 'num_rated'], var_name='joke_id', value_name='rating')
    .loc[lambda frame: frame['rating'] != 99]
)
rat.head()

Unnamed: 0,user_id,num_rated,joke_id,rating
0,0,62,7,-9.28125
1,1,34,7,9.9375
2,2,18,7,-9.84375
3,3,82,7,4.75
4,4,27,7,-9.09375


In [12]:
# Dataset size at a glance:
# (number of ratings, largest user id, gauge-set size, number of distinct jokes)
(
    len(rat),
    rat['user_id'].max(),
    len(gauge_set),
    rat['joke_id'].nunique(),
)

(1725765, 50691, 8, 128)

Summary:
- About 50.7k users (user IDs run from 0 to 50,691)
- 128 jokes, 8 of which form a gauge set that every user rated
- About 1.7 million ratings (1,725,765)

In [14]:
# Cache the long-format ratings on disk so they can be reloaded without
# repeating the download and reshaping steps.
rat.to_pickle(PATH+'processed_data.pkl')

In [15]:
# Optional shortcut: uncomment to reload the cached ratings instead of rebuilding them.
# rat = pd.read_pickle(PATH+'processed_data.pkl')

## Separate train/valid/test sets

In [57]:
# Renumber the rows 0..n-1 and discard the old index, which has gaps after
# the rows with rating 99 were filtered out.
rat = rat.reset_index(drop=True)

In [78]:
# Split the ratings into train / validation / test index sets using the
# settings defined earlier (gauge_set, test_probs, valid_prob).
# NOTE(review): `get_idxs` is not defined or imported in the visible cells; only
# `import joke_utils` is present, so on a fresh kernel this presumably needs to be
# `joke_utils.get_idxs(...)` — confirm against joke_utils.
# NOTE(review): tnu / tnj / tnuj presumably relate to the three test_probs
# categories — confirm against the helper's return values.
# NOTE(review): no random seed is set in the visible cells; if the split is
# randomized, seed the RNGs first so the split is reproducible — confirm.
train_idxs, valid_idxs, test_idxs, tnu, tnj, tnuj = get_idxs(rat, gauge_set, test_probs, valid_prob)