In [2]:
# Loading the dataset from the file 'jester-data-1.csv'
# IMPORTANT! Number 99 describes no label for that user of that joke so we won't take it in account for trainning

import pandas as pd
df = pd.read_csv("jester-data-1.csv", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [3]:
# Labeling 10% of the dataset cells as 99, to use as validation set
# We create an explicit copy of the original pandas df since we will need it in the near future
df_validation = df.copy()
df_validation.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [4]:
# Gather some intuition on the data we have
df_validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24983 entries, 0 to 24982
Columns: 101 entries, 0 to 100
dtypes: float64(100), int64(1)
memory usage: 19.3 MB


In [9]:
# Now we have to change the 10% of cells to '99' to use later as Validation Data
# We will do it randomly

import random as rand

# Some basic calculations on the data set to perform random replacement 
users = df_validation.shape[0]
jokes = df_validation.shape[1]
cells = users * jokes

cells_not_specified = (df_validation.iloc[:,:] == 99).sum().sum()
cells_not_specified_percent = (cells_not_specified / cells) * 100

cells_specified = cells - cells_not_specified
cells_to_change = round(cells_specified * 0.1).astype(int)

row = 0
col = 0
for x in range(0, cells_to_change):  
    while (df_validation.iloc[row, col]) == 99:
        row = rand.randint(0, users - 1)
        col = rand.randint(0, jokes - 1)
    df_validation.at[row, col] = 99    

df_validation.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,99,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,99.0,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,99,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,99.0
2,49,99.0,99.0,99.0,99.0,9.03,9.27,99.0,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,99.0,99.0,99.0,99.0,8.16,-2.82,99.0,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,99.0,5.73,1.55,3.11,6.55,1.8,1.6


In [10]:
# We are going to use the latent factor modeling to infer the hidden ratings of the users

# Need to create some random data first
import numpy as np

factors = 3

latent_item_features = np.random.random((jokes, factors))
latent_user_preferences = np.random.random((users, factors))

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Latent item features = array([[0.24648801, 0.8062602 , 0.82916327],
       [0.13581723, 0.57488177, 0.83246305],
       [0.38862293, 0.54284224, 0.8306284 ],
       [0.30408785, 0.14884232, 0.35308191],
       [0.82922707, 0.01008174, 0.47614702],
       [0.58293305, 0.51703404, 0.48319445],
       [0.57035424, 0.87531721, 0.59895565],
       [0.69499892, 0.39209428, 0.78534527],
       [0.03327705, 0.61187985, 0.23635725],
       [0.38677776, 0.44407993, 0.72213727],
       [0.24532639, 0.45941139, 0.89729986],
       [0.89170995, 0.65880554, 0.60315192],
       [0.89316177, 0.90951637, 0.79096795],
       [0.78782115, 0.3704574 , 0.72801482],
       [0.49165233, 0.65475199, 0.26055282],
       [0.59897425, 0.2004878 , 0.70184668],
       [0.76732527, 0.51374434, 0.29766803],
       [0.85832668, 0.0925216 , 0.08109104],
       [0.10421209, 0.12243861, 0.6497973 ],
       [0.74864205, 0.77517526, 0.34329873],
       [0.35170643, 0.05437815, 0.10039473],
       [0.06446889, 0.6983753 , 

In [11]:
# Predict ratings function.
def predict_rating(user_row, joke_col):
    """ Predict a rating given a user_row and an item_row. """
    
    user_values = latent_user_preferences[user_row]
    item_values = latent_item_features[joke_col]
    return user_values.dot(item_values)

In [12]:
# Training function.
def train(user_row, item_row, rating, alpha = 0.001):
    """ Adapt the values of user_preferences and item_factors to match
    the ones predicted by the users. """
    
    err = alpha * (rating - predict_rating(user_row, item_row))
    latent_user_preferences[user_row] += err * latent_item_features[item_row]
    latent_item_features[item_row] += err * latent_user_preferences[user_row]
    return err

In [13]:
# Train function. Ignore '99' values which indicates
# Either that the item has not been specified or it is part of the
# validation set.

def sgd_svd(iterations = 100):
    for i in range(0, iterations):
        training_errors = []
        for user_row in range(0, 100):
            for joke_col in range(0, jokes):
                rating = df_validation.iloc[user_row][joke_col]
                if (not np.isnan(rating) and rating < 99):
                    training_errors.append(train(user_row, joke_col, rating))
        if (i % 10 == 0):
            print("Training MSE, " + str(i) + ": " + str((np.array(training_errors) ** 2).mean()))

In [14]:
# Run the training function and print out updated tables.

sgd_svd()

print('Latent item features = ' + repr(latent_item_features))
print('Latent user preferences = ' + repr(latent_user_preferences))

Training MSE, 0: 5.9026064800130336e-05
Training MSE, 10: 2.3239108982807207e-05
Training MSE, 20: 2.215275870885048e-05
Training MSE, 30: 1.97953201709586e-05
Training MSE, 40: 1.739871077692763e-05
Training MSE, 50: 1.6588560400672865e-05
Training MSE, 60: 1.6294984725892056e-05
Training MSE, 70: 1.6157358698871053e-05
Training MSE, 80: 1.6079415009072958e-05
Training MSE, 90: 1.6029131914393527e-05
Latent item features = array([[ 1.21798795e+01,  1.82883850e+01,  1.28222846e+01],
       [ 1.07630741e+00, -1.16790761e+00,  1.69968765e+00],
       [ 7.99966614e-01, -1.11431727e+00,  1.45719532e+00],
       [-3.67566677e-01, -4.87277852e-01,  1.99924381e+00],
       [-5.77426708e-01, -1.58711097e+00,  2.04806406e+00],
       [-5.36786939e-01, -7.40079734e-01,  1.75572722e+00],
       [ 8.68211575e-01, -2.65334465e-01,  1.39879976e+00],
       [ 1.19802751e+00, -1.65965912e+00,  1.25505665e+00],
       [-1.11438034e+00,  4.27323677e-03,  1.22289343e+00],
       [-2.23295919e+00, -4.9393

In [15]:
# 4. Calculate the performance of the algorithm on the validation dataset.

validation_error = []
for user in range(0, 100):
    for joke in range(0, jokes):
        rating_training_data = df.iloc[user][joke]
        rating_validation_data = df_validation.iloc[user][joke]
        if (not rating_training_data == rating_validation_data):
            prediction = predict_rating(user, joke)
            validation_error.append(rating_training_data - predict_rating(user, joke))
            print("Expected: " + str(rating_training_data) + "\tActual: " + str(predict_rating(user, joke)))
print("Validation MSE: " + str((np.array(validation_error) ** 2).mean()))

Expected: 74.0	Actual: 72.115740705778
Expected: -8.5	Actual: -5.062537970249537
Expected: -6.75	Actual: -1.0926462478773031
Expected: -9.37	Actual: 2.397528681926924
Expected: -8.5	Actual: -4.718133813524545
Expected: 1.12	Actual: -2.568892970464135
Expected: 2.86	Actual: -1.9945892699981846
Expected: -4.08	Actual: 3.686009286099946
Expected: -9.08	Actual: -4.454220678601544
Expected: -8.4	Actual: -4.329505647261833
Expected: -7.14	Actual: -4.233087722463573
Expected: -6.26	Actual: -1.6890993148729239
Expected: 4.13	Actual: 6.376558572730386
Expected: 0.92	Actual: 1.68644925032447
Expected: -8.64	Actual: -8.144606760012142
Expected: 2.82	Actual: 0.3076534529581978
Expected: -1.36	Actual: 0.7053369009273762
Expected: -9.08	Actual: -8.188811135522773
Expected: 2.82	Actual: 1.126678684858807
Expected: 100.0	Actual: 17.390774817483134
Expected: 4.56	Actual: 1.3363123907957761
Expected: -0.68	Actual: 2.10873291261539
Expected: 9.17	Actual: 5.318186770279783
Expected: 4.71	Actual: 0.5400156

Expected: 6.84	Actual: 3.1825480004931395
Expected: -8.16	Actual: 2.6508619952509664
Expected: 7.52	Actual: -1.6415752615093249
Expected: -4.85	Actual: -0.9185117761727243
Expected: 4.85	Actual: 0.5462558397932588
Expected: -7.96	Actual: 0.2937236820889446
Expected: -4.95	Actual: -0.5117030795948538
Expected: -3.79	Actual: -0.5850847018902006
Expected: -9.51	Actual: -0.5398936620061009
Expected: -8.01	Actual: -0.04849552981368549
Expected: -3.69	Actual: -1.0787678480483065
Expected: 8.54	Actual: -0.7026417380046677
Expected: -3.74	Actual: 0.016034322535682155
Expected: 3.35	Actual: -0.9233385614669513
Expected: 5.63	Actual: -1.1765433059412926
Expected: -7.04	Actual: -0.7325628364234802
Expected: 5.87	Actual: -2.062746873603398
Expected: 4.03	Actual: -0.4498563466040504
Expected: 2.72	Actual: -1.36807980954249
Expected: 8.69	Actual: -0.2543221395428821
Expected: -5.97	Actual: -1.2247355547017462
Expected: 1.99	Actual: -1.9786457307454386
Expected: -4.66	Actual: -2.142470633040295
Expec

Expected: 4.76	Actual: 3.0996746375916095
Expected: 3.88	Actual: 4.194273010395745
Expected: 2.96	Actual: 1.492581365879441
Expected: 1.94	Actual: 2.7752755663055906
Expected: 2.52	Actual: 1.4803398909379735
Expected: -3.4	Actual: 2.004666752284974
Expected: 5.19	Actual: 1.6154497369153402
Expected: 1.46	Actual: 2.228702071916177
Expected: 3.74	Actual: 2.355964055979432
Expected: 1.5	Actual: 1.9548628468784255
Expected: 1.89	Actual: 1.9573710631364012
Expected: 2.18	Actual: 0.048055639003258044
Expected: 1.5	Actual: 0.756406975936543
Expected: 1.5	Actual: 0.37206340252405545
Expected: 4.08	Actual: 1.552530951630955
Expected: 1.65	Actual: 1.6302890959005905
Expected: -0.34	Actual: 1.2538100518338706
Expected: -6.07	Actual: 0.2588067314711151
Expected: -5.97	Actual: -0.05151538201987094
Expected: -7.09	Actual: 1.8157097682148728
Expected: -7.18	Actual: 2.249356310319721
Expected: -7.48	Actual: 2.7885238360131295
Expected: 8.98	Actual: 2.493314401670572
Expected: -3.88	Actual: 1.311151162

Expected: -0.58	Actual: -2.8762799290960017
Expected: -3.74	Actual: -5.139508046800598
Expected: -0.39	Actual: -0.2952311154078169
Expected: 2.91	Actual: -1.3153239487975579
Expected: -6.31	Actual: 1.7787455873984555
Expected: 1.7	Actual: 0.13211966062205205
Expected: 3.74	Actual: 6.16335230924884
Expected: -7.04	Actual: -2.084211704405913
Expected: -7.33	Actual: 4.402266348901637
Expected: 3.83	Actual: 2.7958340211422845
Expected: 73.0	Actual: 120.6139719519385
Expected: -9.56	Actual: 3.5746323241836464
Expected: 5.05	Actual: 1.4006140631078043
Expected: 6.12	Actual: 9.11322106282295
Expected: 7.96	Actual: 11.532897684201757
Expected: 9.22	Actual: 7.336169898021943
Expected: 8.5	Actual: 3.5492585201662497
Expected: 8.4	Actual: 7.489536693323995
Expected: 8.98	Actual: 9.583330772097261
Expected: -9.37	Actual: 1.3214450399708566
Expected: 8.93	Actual: 7.637033457787569
Expected: 7.14	Actual: 10.210645702765595
Expected: 8.79	Actual: 8.959847745988196
Expected: 3.4	Actual: -2.26894472355

Expected: 1.26	Actual: 1.9004972681912116
Expected: 0.24	Actual: 1.0693267473105357
Expected: 3.79	Actual: 2.823423052527344
Expected: 3.79	Actual: 1.9384730777695807
Expected: 6.55	Actual: 4.724221828623965
Expected: 6.36	Actual: -0.3480293927935215
Expected: -0.73	Actual: -3.464749643151563
Expected: 2.48	Actual: -0.0779494500836835
Expected: -2.82	Actual: 2.6419797253512614
Expected: 0.63	Actual: -0.032428510491128326
Expected: 2.28	Actual: 0.4449408486087133
Expected: -9.37	Actual: -2.230900354577789
Expected: 8.69	Actual: 5.7793933334643475
Expected: 8.59	Actual: 2.065696785919569
Expected: 3.98	Actual: 2.2617582672161722
Expected: 6.12	Actual: 5.366841170506978
Expected: 5.78	Actual: 2.0892121192738746
Expected: -4.71	Actual: -0.9980498868678144
Expected: 1.75	Actual: -1.1631742192491734
Expected: -3.5	Actual: -2.0054325470605403
Expected: -3.35	Actual: 1.1449465096005262
Expected: -3.11	Actual: -0.8131006712962354
Expected: -4.03	Actual: -3.961681911318902
Expected: -3.35	Actual

Expected: 7.52	Actual: 1.118014178670465
Expected: 8.35	Actual: 4.546423452196166
Expected: -8.45	Actual: -0.7053383099383084
Expected: 8.5	Actual: -1.279806707207841
Expected: 6.94	Actual: 0.4400851331795821
Expected: -8.01	Actual: 1.9389339838395245
Expected: 8.5	Actual: -0.30027526828494494
Expected: -8.01	Actual: -4.869687390545765
Expected: -7.86	Actual: 1.229702807508252
Expected: 6.07	Actual: 4.02174268406609
Expected: 8.01	Actual: 0.5503370290674717
Expected: 6.89	Actual: 2.6480152298904556
Expected: 7.14	Actual: -0.6570909226796654
Expected: 6.17	Actual: 2.4816198247283476
Expected: 5.39	Actual: 2.015378334722114
Expected: -4.61	Actual: -0.7030997579909775
Expected: -8.01	Actual: -0.7486569320775689
Expected: 7.57	Actual: 0.35466489497383097
Expected: 7.38	Actual: 2.221659235646272
Expected: 4.85	Actual: 4.622148571560501
Expected: 1.94	Actual: 5.347918001382036
Expected: 7.09	Actual: 3.754096931631489
Expected: 8.69	Actual: 4.616455798709542
Expected: 8.3	Actual: 3.5733277161

Expected: -5.24	Actual: -1.7488528645909067
Expected: 2.09	Actual: -0.3582399038560888
Expected: -7.18	Actual: -0.0266707974215441
Expected: 3.64	Actual: -2.2338757164704943
Expected: -1.21	Actual: -3.1205751301481675
Expected: 3.69	Actual: -1.4860367080475274
Expected: -1.26	Actual: 3.3456873206090902
Expected: 6.02	Actual: 2.5949388543395253
Expected: 0.34	Actual: 2.8454659939857914
Expected: -3.88	Actual: 1.5658844341623586
Expected: -2.04	Actual: -0.33251565014234197
Expected: 4.66	Actual: 3.1995394265975814
Expected: -3.3	Actual: 2.2429595162376637
Expected: 7.48	Actual: 2.9320977605982197
Expected: 6.02	Actual: 2.156258423283115
Expected: 0.49	Actual: 4.218951367485055
Expected: 2.91	Actual: 2.4758303039177765
Expected: 3.2	Actual: 2.7462383182640604
Expected: 6.6	Actual: 1.502951407818442
Expected: 0.19	Actual: 2.2599766267133603
Expected: 5.92	Actual: 3.371538329775037
Expected: 5.44	Actual: 2.6643167918785085
Expected: 9.13	Actual: 4.91084058896301
Expected: 1.94	Actual: 7.337

# TODO: Use pandas to find the best and the worst rated jokes

