In [1]:
from __future__ import (absolute_import, division, print_function,             
                        unicode_literals)                                      
import pickle
import os

import pandas as pd

from surprise import KNNBasic
from surprise import Dataset                                                     
from surprise import Reader                                                      
from surprise import dump
from surprise.accuracy import rmse

In [2]:
# We will train and test on the u1.base and u1.test files of the
# movielens-100k dataset. If you haven't already, you need to download the
# movielens-100k dataset. You can do it manually, or by running:

#Dataset.load_builtin('ml-100k')

# Now, let's load the dataset. The paths are assembled with os.path.join from
# a single base directory, so the home directory is expanded only once and the
# separators are right on every platform.
ml100k_dir = os.path.join(os.path.expanduser('~'), 'Documents', 'ml-100k')
train_file = os.path.join(ml100k_dir, 'u1.base')
test_file = os.path.join(ml100k_dir, 'u1.test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

# We'll use a basic nearest neighbor approach, where similarities are computed
# between users.
algo = KNNBasic()

# Only one (train, test) pair was given above, so this loop body runs once.
for trainset, testset in data.folds():
    algo.train(trainset)
    predictions = algo.test(testset)
    rmse(predictions)

    # Save the predictions, the trainset and the algorithm so that they can
    # be reloaded and analysed later. The file is rewritten on every fold.
    dump('./dump_file', predictions, trainset, algo)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9889
The dump has been saved as file ./dump_file


In [3]:


# The dump has been saved and we can now use it whenever we want.
# Let's load it and see what we can do.
# NOTE: unpickling can execute arbitrary code, so only load dump files that
# you created yourself (here: the file written by the training cell).
# The `with` block makes sure the file handle is closed once loading is done.
with open('./dump_file', 'rb') as dump_fh:
    dump_obj = pickle.load(dump_fh)



In [4]:


# Pull the three stored objects back out of the loaded dump.
algo = dump_obj['algo']
trainset = dump_obj['trainset']
predictions = dump_obj['predictions']

# `algo` is indexed by key here; print the fields that identify the model.
algo_summary = 'algo: {0}, k = {1}, min_k = {2}'.format(
    algo['name'], algo['k'], algo['min_k'])
print(algo_summary)



algo: KNNBasic, k = 40, min_k = 1


In [5]:
# Let's build a pandas dataframe with all the predictions

def get_Iu(uid):
    """Count the items that a user has rated in the trainset.

    The raw id is translated with ``trainset.to_inner_uid`` and the length
    of that user's entry in ``trainset.ur`` is returned.

    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user, or 0 when a ``ValueError``
        signals that the user was not part of the trainset.
    """
    n_rated = 0  # stays 0 if the lookup below fails
    try:
        inner_uid = trainset.to_inner_uid(uid)
        n_rated = len(trainset.ur[inner_uid])
    except ValueError:  # user was not part of the trainset
        pass
    return n_rated
    
def get_Ui(iid):
    """Count the users that have rated an item in the trainset.

    The raw id is translated with ``trainset.to_inner_iid`` and the length
    of that item's entry in ``trainset.ir`` is returned.

    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item, or 0 when a
        ``ValueError`` signals that the item was not part of the trainset.
    """
    n_raters = 0  # stays 0 if the lookup below fails
    try:
        inner_iid = trainset.to_inner_iid(iid)
        n_raters = len(trainset.ir[inner_iid])
    except ValueError:  # item was not part of the trainset
        pass
    return n_raters

# One row per prediction, with the five prediction fields as labelled columns.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df = pd.DataFrame(predictions, columns=prediction_columns)

# Add per-user and per-item rating counts (0 when unknown to the trainset).
df['Iu'] = df['uid'].apply(get_Iu)
df['Ui'] = df['iid'].apply(get_Ui)

# Absolute difference between the estimated rating and the `rui` column.
df['err'] = (df['est'] - df['rui']).abs()

In [6]:
# Preview the first five rows of the predictions dataframe.
df.head(n=5)

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,1,6,5.0,3.468613,"{'was_impossible': False, 'actual_k': 20}",135,20,1.531387
1,1,10,3.0,3.86629,"{'was_impossible': False, 'actual_k': 40}",135,73,0.86629
2,1,12,5.0,4.538194,"{'was_impossible': False, 'actual_k': 40}",135,211,0.461806
3,1,14,5.0,4.235741,"{'was_impossible': False, 'actual_k': 40}",135,140,0.764259
4,1,17,3.0,3.228002,"{'was_impossible': False, 'actual_k': 40}",135,72,0.228002


In [7]:
# Sort by absolute error once, then take both ends of the sorted frame:
# the 10 smallest errors (best) and the 10 largest errors (worst).
# Sorting a single time avoids repeating the same full sort of `df`.
df_by_err = df.sort_values(by='err')
best_predictions = df_by_err.head(10)
worst_predictions = df_by_err.tail(10)

In [8]:


# Let's take a look at the best predictions of the algorithm, i.e. the
# 10 rows of `df` with the smallest absolute error (`err` column).
best_predictions



Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
272,5,439,1.0,1.0,"{'was_impossible': False, 'actual_k': 3}",91,3,0.0
886,13,314,1.0,1.0,"{'was_impossible': False, 'actual_k': 2}",373,2,0.0
156,2,314,1.0,1.0,"{'was_impossible': False, 'actual_k': 2}",40,2,0.0
926,13,437,1.0,1.0,"{'was_impossible': False, 'actual_k': 3}",373,3,0.0
9276,206,314,1.0,1.0,"{'was_impossible': False, 'actual_k': 1}",33,2,0.0
19118,405,437,1.0,1.0,"{'was_impossible': False, 'actual_k': 3}",582,3,0.0
8032,181,1334,1.0,1.0,"{'was_impossible': False, 'actual_k': 1}",218,1,0.0
8041,181,1354,1.0,1.0,"{'was_impossible': False, 'actual_k': 1}",218,1,0.0
9202,201,1424,3.0,3.0,"{'was_impossible': False, 'actual_k': 1}",215,1,0.0
3018,60,1123,4.0,4.0,"{'was_impossible': False, 'actual_k': 1}",119,1,0.0
