In [2]:
from surprise import Dataset
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import train_test_split

In [3]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/newtype/.surprise_data/ml-100k


In [7]:
data.

<surprise.dataset.DatasetAutoFolds at 0x7fbf1ba096a0>

In [4]:
# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

In [5]:
# We'll use the famous SVD algorithm.
algo = SVD()

In [6]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9389


0.938935046586064

# Evaluation by Hit Rate

In [14]:
import pandas as pd
import numpy as np
import os

In [15]:
def read_data_ml100k():
    data_dir = 'ml-100k/'
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(data_dir, 'u.data'), '\t', names=names,
                       engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items

In [17]:
data, num_users, num_items = read_data_ml100k()
data_30 = data

In [18]:
# Leave one out split
data_30['rank_latest'] = data_30.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)

train_ratings = data_30[data_30['rank_latest'] != 1]
test_ratings = data_30[data_30['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['user_id', 'item_id', 'rating']]
test_ratings = test_ratings[['user_id', 'item_id', 'rating']]

In [22]:
num_users = data_30['user_id'].max()+1
num_items = data_30['item_id'].max()+1
all_movieIds = data['item_id'].unique()

In [47]:
def find_hit_rate(at=10):
    # User-item pairs for testing
    test_user_item_set = set(zip(test_ratings['user_id'], test_ratings['item_id']))

    # Dict of all items that are interacted with by each user
    user_interacted_items = data_30.groupby('user_id')['item_id'].apply(list).to_dict()

    hits = []
    for (u,i) in test_user_item_set:
        interacted_items = user_interacted_items[u]
        not_interacted_items = set(all_movieIds) - set(interacted_items)
        selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
        test_items = selected_not_interacted + [i]
        
        predicted_labels = []
        for j in range(100):
            rating_pred = algo.predict(u, test_items[j])[3]
            predicted_labels.append(rating_pred)
        #print(predicted_labels)
        top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:at].tolist()]
        
        if i in top10_items:
            hits.append(1)
        else:
            hits.append(0)

    print("\nThe Hit Ratio @ {} is {:.4f}".format(at, np.average(hits)))

In [48]:
find_hit_rate(at=1)
find_hit_rate(at=5)
find_hit_rate(at=10)


The Hit Ratio @ 1 is 1.0000

The Hit Ratio @ 5 is 1.0000

The Hit Ratio @ 10 is 1.0000
