**Context**

Jester is a joke recommender system developed at UC Berkeley to study social information filtering. Users of the system are presented a joke and then they rate them. This dataset is a collection of those ratings.

http://eigentaste.berkeley.edu/

Eigentaste: A Constant Time Collaborative Filtering Algorithm. Ken Goldberg, Theresa Roeder, Dhruv Gupta, and Chris Perkins. Information Retrieval, 4(2), 133-151. July 2001.

**Content**

Notes from the source:

Each row is a user (Row 1 = User #1)

Each column is a joke (Column 1 = Joke #1)

Ratings are given as real values from -10.00 to +10.00

99 corresponds to a null rating

As of May 2009, the jokes 7, 8, 13, 15, 16, 17, 18, 19 are the "gauge set" (as discussed in the Eigentaste paper)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
import numpy as np
import random

In [2]:
jokes_df = pd.read_csv('jester_items.tsv', sep=":\t", header=None, engine='python').rename(columns={0: "jokeID", 1: "title"})
jokes_df.head()
jokes_df.shape

(149, 2)

In [3]:
columns = ['userID'] + range(1,151)
df = pd.read_csv('jesterfinal151cols.csv', header = None, names = columns)
print df.describe()
df.head(10)

             userID        1        2        3        4             5  \
count  50692.000000  50692.0  50692.0  50692.0  50692.0  50692.000000   
mean      34.104967     99.0     99.0     99.0     99.0     97.871901   
std       33.519225      0.0      0.0      0.0      0.0     10.631768   
min        8.000000     99.0     99.0     99.0     99.0    -10.000000   
25%       11.000000     99.0     99.0     99.0     99.0     99.000000   
50%       20.000000     99.0     99.0     99.0     99.0     99.000000   
75%       42.000000     99.0     99.0     99.0     99.0     99.000000   
max      140.000000     99.0     99.0     99.0     99.0     99.000000   

             6             7             8        9     ...       \
count  50692.0  50692.000000  50692.000000  50692.0     ...        
mean      99.0     -1.952510     -0.716500     99.0     ...        
std        0.0      5.370893      5.153371      0.0     ...        
min       99.0    -10.000000    -10.000000     99.0     ...        
25

Unnamed: 0,userID,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,62,99,99,99,99,0.21875,99,-9.28125,-9.28125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
1,34,99,99,99,99,-9.6875,99,9.9375,9.53125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
2,18,99,99,99,99,-9.84375,99,-9.84375,-7.21875,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
3,82,99,99,99,99,6.90625,99,4.75,-5.90625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
4,27,99,99,99,99,-0.03125,99,-9.09375,-0.40625,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
5,46,99,99,99,99,-2.90625,99,-2.34375,-0.5,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
6,99,99,99,99,99,6.21875,99,-7.4375,-0.8125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
7,15,99,99,99,99,8.25,99,9.0,8.875,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
8,104,99,99,99,99,-5.75,99,0.28125,0.78125,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
9,24,99,99,99,99,-7.15625,99,-5.90625,-0.09375,99,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0


In [4]:
df = pd.melt(df, id_vars=['userID'], value_name = 'rating', var_name = 'jokeID')
df.head(10)

Unnamed: 0,userID,jokeID,rating
0,62,1,99.0
1,34,1,99.0
2,18,1,99.0
3,82,1,99.0
4,27,1,99.0
5,46,1,99.0
6,99,1,99.0
7,15,1,99.0
8,104,1,99.0
9,24,1,99.0


In [5]:
df = df[df.rating != 99]
df = df[df.rating > 0]
df = df[df.duplicated(subset=['userID','jokeID'], keep=False)]
print (df)
df.head(10)


         userID jokeID   rating
202768       62      5  0.21875
202774       99      5  6.21875
202775       15      5  8.25000
202781      109      5  0.46875
202782       42      5  6.28125
202787       16      5  4.28125
202789       16      5  5.12500
202790       16      5  1.84375
202796       80      5  9.68750
202802       49      5  9.43750
202804       23      5  2.56250
202808      113      5  3.06250
202810      110      5  2.21875
202814       17      5  5.15625
202815       50      5  7.68750
202816      107      5  1.56250
202817      120      5  9.87500
202818       22      5  4.25000
202819       14      5  4.25000
202820       80      5  0.50000
202824       17      5  3.25000
202825       16      5  0.46875
202826       45      5  5.87500
202828       64      5  4.56250
202829       16      5  6.59375
202831       13      5  3.18750
202835      100      5  6.28125
202838       15      5  7.28125
202839       99      5  1.87500
202845       37      5  1.31250
...     

Unnamed: 0,userID,jokeID,rating
202768,62,5,0.21875
202774,99,5,6.21875
202775,15,5,8.25
202781,109,5,0.46875
202782,42,5,6.28125
202787,16,5,4.28125
202789,16,5,5.125
202790,16,5,1.84375
202796,80,5,9.6875
202802,49,5,9.4375


In [6]:
df = df.groupby(['userID','jokeID'], as_index=False)['rating'].mean()

In [7]:
interactions_train_df, interactions_test_df = train_test_split(df,
                                   stratify=df['userID'], 
                                   test_size=0.20)

In [8]:
interactions_full_indexed_df = df.set_index('userID')
interactions_train_indexed_df = interactions_train_df.set_index('userID')
interactions_test_indexed_df = interactions_test_df.set_index('userID')

def get_items_interacted(person_id, interactions_df):
    interacted_items = interactions_df.loc[person_id]['jokeID']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [15]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 10

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(jokes_df['jokeID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['jokeID']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['jokeID'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['jokeID'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          )#seed=item_id%(2**32)

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['jokeID'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['jokeID'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()

In [10]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot_table(index='userID', 
                                                          columns='jokeID', 
                                                          values='rating',
                                                         aggfunc='mean').fillna(0)

users_items_pivot_matrix_df.head(10)

jokeID,5,7,8,13,15,16,17,18,19,20,...,141,142,143,144,145,146,147,148,149,150
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,4.091291,4.135887,4.470871,4.322814,4.391224,4.643686,4.248032,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,6.380208,4.038043,0.0,4.584327,0.0,4.316548,4.531909,4.016176,4.661265,0.0,...,0.0,6.848958,3.008929,0.0,0.0,0.0,3.484375,0.0,0.0,0.0
10,6.28125,3.805186,3.684135,0.0,3.913157,4.220669,4.502888,4.030304,0.0,5.578125,...,0.0,5.858902,0.0,4.413603,4.600962,8.585938,2.93125,4.285511,0.0,3.496691
11,4.3125,0.0,3.956073,0.0,4.026832,4.265765,4.603171,4.097025,4.614371,0.0,...,0.0,6.483333,4.585478,5.5125,2.527344,5.240625,5.039062,4.460938,0.0,3.867457
12,0.0,3.891404,3.955154,4.254602,0.0,4.038962,4.486666,3.980464,4.493597,0.0,...,7.729167,0.0,4.62367,3.555556,0.0,0.0,4.253906,4.515625,5.78125,3.823798
13,6.979167,0.0,0.0,4.314942,4.164765,4.246611,4.653186,4.354419,4.657867,8.3125,...,0.0,0.0,5.295625,4.480114,4.43099,5.029687,4.514323,4.57711,5.809659,4.399635
14,6.59375,0.0,0.0,4.194284,4.019511,4.153017,4.483002,0.0,4.582685,6.84375,...,4.286458,0.0,4.601695,4.741935,3.894097,4.366667,0.0,4.910601,6.31901,0.0
15,4.310096,3.662999,3.754416,4.204579,3.760106,3.991612,0.0,0.0,0.0,4.590774,...,2.3125,5.652841,5.005,0.0,0.0,6.098958,4.735938,4.884328,0.0,4.497673
16,3.965278,3.674548,3.876853,4.212318,3.866745,3.865888,4.399863,3.969268,4.359865,4.957813,...,3.5,5.591631,4.739081,0.0,0.0,4.832386,0.0,4.231924,5.567587,4.355699
17,3.984375,3.689776,3.816903,0.0,4.084627,0.0,4.430173,3.860795,4.403116,3.519531,...,0.0,6.477319,0.0,0.0,4.708576,0.0,4.602679,0.0,5.2425,4.564474


In [11]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix[:10]

array([[ 0.        ,  4.0912913 ,  4.13588715, ...,  0.        ,
         0.        ,  0.        ],
       [ 6.38020833,  4.03804348,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 6.28125   ,  3.80518617,  3.68413462, ...,  4.28551136,
         0.        ,  3.49669118],
       ..., 
       [ 4.31009615,  3.66299854,  3.75441628, ...,  4.88432836,
         0.        ,  4.49767287],
       [ 3.96527778,  3.67454848,  3.87685338, ...,  4.23192402,
         5.56758721,  4.35569853],
       [ 3.984375  ,  3.68977649,  3.81690292, ...,  0.        ,
         5.2425    ,  4.56447368]])

In [12]:
users_ids = list(users_items_pivot_matrix_df.index)
print users_ids[:10]

#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
print U.shape
print Vt.shape
sigma = np.diag(sigma)
print sigma.shape 

all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
print all_user_predicted_ratings

#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
print cf_preds_df.head(10)

print len(cf_preds_df.columns)

[8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
(123, 15)
(15, 140)
(15, 15)
[[ 0.10539886  0.56998632  0.59446534 ...,  0.35604752 -0.16617487
   0.46131212]
 [ 2.319156    3.00086398  3.90323006 ...,  2.10714551  0.40308356
   2.61657549]
 [ 3.5872394   3.6942116   3.08566356 ...,  3.84362859 -1.94617486
   3.18356044]
 ..., 
 [ 1.17751307  2.55193503  3.03593908 ...,  4.32626162  2.98286869
   3.41382531]
 [-0.87046675  0.30552533  0.32391183 ...,  2.91240475  2.30118467
   2.65728455]
 [ 2.05643737  1.22172049  1.27447691 ...,  4.39137047  4.97590894
   3.72311746]]
             8         9         10        11        12        13        14   \
jokeID                                                                         
5       0.105399  2.319156  3.587239  4.409903  1.804997  6.493925  2.776867   
7       0.569986  3.000864  3.694212  1.208492  0.758726  0.928671  0.827853   
8       0.594465  3.903230  3.085664  4.827711  4.219290  1.187831  1.963818   
13      0.679457  3.057601  3.30

In [13]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        #self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['jokeID'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)
        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df)

'''
        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'jokeID', 
                                                          right_on = 'jokeID')[['recStrength', 'jokeID', 'title', 'url', 'lang']]
'''

In [16]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
122 users processed

Global metrics:
{'recall@5': 0.9737263697532842, 'modelName': 'Collaborative Filtering', 'recall@10': 1.0}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
14,120,28,28,28,1.0,1.0
36,113,28,28,28,1.0,1.0
94,110,28,28,28,1.0,1.0
108,107,28,28,28,1.0,1.0
0,37,27,27,27,1.0,1.0
31,109,27,27,27,1.0,1.0
27,102,27,27,27,1.0,1.0
41,80,27,25,27,1.0,0.925926
42,96,27,27,27,1.0,1.0
74,99,27,27,27,1.0,1.0
