In [6]:
import random
from collections import defaultdict

import numpy as np
import pandas as pd
import surprise as sp
from sklearn import cross_validation as cv
from surprise import Dataset, Reader, SVD, evaluate

# Column names for the '::'-separated ratings file, which has no header row.
ratingscol = ['UserID', 'MovieID', 'Rating', 'Timestamp']

# Read the file with the python parser engine and discard the Timestamp
# column straight away, leaving UserID / MovieID / Rating.
ratings = pd.read_csv(
    './ratings100k.dat',
    sep='::',
    names=ratingscol,
    engine='python',
).drop('Timestamp', axis=1)

def sample(ratings, n, m):
    """
    Restrict the ratings to the most active users and the most-rated movies.

    @param ratings pandas DataFrame of ratings with 'UserID' and 'MovieID' columns
    @param n number of users to keep, picked by how many ratings they have
    @param m number of movies to keep, picked by how many ratings they have
    @returns DataFrame with the rows of `ratings` whose user is among the top n
             users AND whose movie is among the top m movies. It is a row
             subset in the same long format (same columns, original index
             labels), not an n-by-m user/item matrix.
    """
    # value_counts() orders ids by descending frequency, so head(k).index
    # holds the k ids that occur most often.
    top_users = ratings['UserID'].value_counts().head(n).index
    top_movies = ratings['MovieID'].value_counts().head(m).index

    # One combined mask over the full frame; avoids chaining two .loc calls
    # where the second mask is longer than the frame it is applied to.
    keep = ratings['UserID'].isin(top_users) & ratings['MovieID'].isin(top_movies)
    return ratings.loc[keep]
# Working subset: rows for the 1000 users and 100 movies with the most ratings.
data = sample(ratings, n=1000, m=100)

Unnamed: 0,UserID,MovieID,Rating
1,1,185,5.0
2,1,231,5.0
3,1,292,5.0
4,1,316,5.0
5,1,329,5.0
7,1,356,5.0
9,1,364,5.0
11,1,377,5.0
14,1,480,5.0
16,1,539,5.0


In [2]:
def train(ratings, folds, factors):
    """
    Fit an SVD model on 80% of the ratings and hold the other 20% out for testing.

    @param ratings pandas DataFrame to train on, with columns UserID, MovieID, Rating
                   (handed to Surprise's load_from_df as-is)
    @param folds number of cross-validation folds the training split is divided into
    @param factors number of latent factors for the SVD model (n_factors)
    @returns tuple (algo, testset): the trained Surprise algorithm and the held-out
             ratings converted to a Surprise testset, ready for `algo.test(testset)`
    """

    # Random 80/20 split; no seed is set, so the split differs between runs.
    train_data, test_data = cv.train_test_split(ratings, test_size=0.20)
    reader = sp.Reader(rating_scale=(1, 5))

    train_dataset = sp.Dataset.load_from_df(train_data, reader)
    test_dataset = sp.Dataset.load_from_df(test_data, reader)
    train_dataset.split(n_folds=folds)

    algo = sp.SVD(n_factors=factors)

    # NOTE(review): the same algo object is trained once per fold and the
    # per-fold validation part is ignored. Presumably each train() call refits
    # from scratch, so only the last fold's fit is returned -- confirm against
    # the Surprise documentation.
    for fold_trainset, _ in train_dataset.folds():
        algo.train(fold_trainset)

    # Surprise builds a testset from a trainset, hence the round trip.
    testset = test_dataset.build_full_trainset().build_testset()
    return (algo, testset)

In [4]:
def evaluate(algo, ratings, testset):
    """
    Print RMSE and MAE of a trained model on the held-out test set and on the
    data it was trained on.

    @param algo trained Surprise algorithm (must provide `test` and a `trainset` attribute)
    @param ratings the ratings as a pandas DataFrame; currently unused, it is only
                   needed by the disabled catalog-coverage check at the end
    @param testset Surprise testset passed to `algo.test`, i.e. the held-out data
    @returns None; the metrics are printed
    """
    # NOTE(review): the first cell also imports a function called `evaluate`
    # from surprise. Whichever of the two was executed last is the one a later
    # `evaluate(...)` call reaches, so run the cells top to bottom.
    test_predictions = algo.test(testset)
    # Predict the training ratings as well, to compare train vs. test error;
    # the trainset comes with the algo object.
    trainset = algo.trainset.build_testset()
    train_predictions = algo.test(trainset)

    # Every line starts with "Evaluate:" so it can be grepped out of the
    # verbose training output. Single-argument print(...) produces the same
    # text under Python 2 and Python 3.
    print("Evaluate: RMSE of the testset is {}".format(sp.accuracy.rmse(test_predictions)))
    print("Evaluate: RMSE of the trainset is {}".format(sp.accuracy.rmse(train_predictions)))

    print("Evaluate: MAE of the testset is {}".format(sp.accuracy.mae(test_predictions)))
    print("Evaluate: MAE of the trainset is {}".format(sp.accuracy.mae(train_predictions)))

#     # Hackish, baseline does not have a sense of "neighbors"
#     if (algo.__module__ == "surprise.prediction_algorithms.knns"):
#         print "Evaluate: CC of the model is {}".format(calculate_catalog_coverage(ratings, algo, algo.k))

In [8]:
# Run the matrix-factorisation model on several dataset sizes and factor counts.
# Larger configurations, disabled for now:
# samples = [ [1000, 10], [5000, 50], [100000, 1000], [5000000, 2000] ]
samples = [ [1000, 10], [5000, 50] ]
factors = [5, 10, 20, 40, 50, 75]
n_folds = 5

# NOTE(review): the AttributeError recorded under this cell ('tuple' object has
# no attribute 'upper') looks like `evaluate` resolved to the function imported
# from surprise in the first cell rather than the notebook's own evaluate() --
# confirm by running the cells top to bottom on a fresh kernel.

# The loop variables must not be called `sample`: that would rebind the
# sample() function to a list and make the call below fail.
for n_users, n_items in samples:
    _dataset = sample(ratings, n_users, n_items)

    # Baseline / KNN comparison, disabled:
    # print "Evaluating Baseline and KNN on the dataset with {} users and {} items".format(n_users, n_items)
    # print "Evaluating baseline"
    # base, base_test = train_baseline(_dataset)
    # evaluate(base, _dataset, base_test)

    for f in factors:
        # Keyword arguments: train() takes (ratings, folds, factors), so passing
        # `f, 5` positionally would use f as the fold count.
        MF, MF_test = train(_dataset, folds=n_folds, factors=f)

        print("Evaluating MF with f of {}".format(f))
        evaluate(MF, _dataset, MF_test)

Evaluating MF with f of 5


AttributeError: 'tuple' object has no attribute 'upper'

In [None]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Requires ``defaultdict`` from ``collections`` to be imported at the top of
    the notebook.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each entry is
            unpacked as (user id, item id, true rating, estimate, details).
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A defaultdict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] holding at most n entries,
        sorted by estimation from highest to lowest.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, _true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    # Only values of existing keys are replaced, so iterating items() is safe.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# Fit an SVD model on the complete built-in MovieLens 100k dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD()
algo.train(trainset)

# Then predict ratings for every (user, item) pair that is NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Show the recommended item ids for each user.
for uid, user_ratings in top_n.items():
    recommended = [iid for (iid, _) in user_ratings]
    print(uid, recommended)