In [11]:
import pandas as pd 
import numpy as np
import random
import surprise as sp
from surprise import Dataset, Reader, SVD, evaluate
from sklearn import cross_validation as cv

ratingscol = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('./ratings100k.dat', sep='::', names = ratingscol, engine='python')
ratings = ratings.drop('Timestamp', axis=1)

def sample(ratings, n, m):
    """
    Return a smaller matrix with top n users and top m items only
    @param ratings the ratings dataset 
    @param n number of users with most ratings
    @param m number of movies with most ratings
    @returns NxM matrix of USERxITEM ratings
    """

    n_users = ratings['UserID'].nunique()
    n_items = ratings['MovieID'].nunique()

    user_sample = ratings['UserID'].value_counts().head(n).index
    movie_sample = ratings['MovieID'].value_counts().head(m).index

    subset = ratings.loc[ratings['UserID'].isin(user_sample)].loc[ratings['MovieID'].isin(movie_sample)]
    return subset

In [12]:
def train_matrix(ratings, factor):
    """
    Train a model and return it. Then we can use the model and evaluate it elsewhere
    @param ratings dataframe pandas dataframe to train on, with columns UserId, MovieId, Ratings
    @param n_folds number of folds for cross validation
    @returns List of (algo, test data)
    We can call methods such as `test` and `evaluate` on this object 
    """

    train_data, test_data = cv.train_test_split(ratings, test_size = 0.20)
    reader = sp.Reader(rating_scale=(1, 5))

    trainset = sp.Dataset.load_from_df(train_data, reader)
    testset = sp.Dataset.load_from_df(test_data, reader)
    trainset.split(n_folds = 5)

    algo = sp.SVD(n_factors = factor)

    for trainset, _ in trainset.folds():
        algo.train(trainset)
        
    testset = testset.build_full_trainset().build_testset()
    return (algo, testset)


In [13]:
def group_predictions_by_user(predictions):
    """
    @param List of Surprise predictions objects
    @returns Dict {uid: [P1, P2, ...PN]} hash mapping user id to top n predictions
    """
    p = sorted(predictions, key = lambda x: x.uid)

    groups = {}
    for k, g in it.groupby(p, lambda x: x.uid):
        groups[k] = sorted(list(g), key = lambda x: x.est, reverse = True)

    return groups


# For every item that a user would be rated, in top k
def calculate_catalog_coverage(ratings, predictions, k):
    """
    Calculate the catalog coverage of a model over a dataset
    @param ratings pandas dataframe with UserId, MovieId, Ratings. Must be the same set the model was trained on
    @param List Surprise predictions
    @oaram k Int the top k recommendations size
    @returns Float percentage of items recommended to at least one user
    """
    n_movies = ratings['MovieId'].nunique()

    movies_reccommended = set() # keep track of which movies are recommended. Note we only care about the number

    recommendations = group_predictions_by_user(predictions)
    for u_id, recs in recommendations.iteritems():
        movies_reccommended.update(map(lambda x: x.iid, recs[0:3]))

    return len(movies_reccommended) / float(n_movies)

In [14]:
def evaluate(algo, ratings, testset, top_k):
    """
    @param algo Surprise algorithm the model that was trained
    @oaram ratings The ratings it was trained on, in pandas Dataframe form (so we can calculate coverage)
    @param testset Surprise testset object, the data held out during cross-validation
    @returns Nested Dictionary {test: {rmse, mae}, train: {rmse, mae, cc}}
    We can use these to build up arrays for plotting.
    """

    ret = {}
    ret['test'] = {}
    ret['train'] = {}

    test_predictions = algo.test(testset)
    # see how it would do on the trainset to compare, comes with the algo object
    trainset = algo.trainset.build_testset()
    train_predictions = algo.test(trainset)

    # sticking evaluate in everything for grep, training is verbose
    ret['test']['rmse'] = sp.accuracy.rmse(test_predictions)
    ret['train']['rmse'] = sp.accuracy.rmse(train_predictions)

    ret['test']['mae'] = sp.accuracy.mae(test_predictions)
    ret['train']['mae'] = sp.accuracy.mae(train_predictions)

    # Hackish, baseline does not have a sense of "neighbors"
    if (algo.__module__ == "surprise.prediction_algorithms.knns"):
        ret['test']['cc'] = calculate_catalog_coverage(ratings, test_predictions, top_k)

    return ret

In [15]:
# run models with some different parameters and sizes
# samples = [ [1000, 10], [5000, 50], [100000, 1000], [5000000, 2000] ]
samples = [ [1000, 10], [5000, 50], [10000,100], [20000,200] ]
factors = [5, 10, 20, 40, 50, 75]

for _sample in samples:
    i, j = _sample
    _dataset = sample(ratings, i, j)
    for f in factors:
        MF, MF_test = train_matrix(_dataset, f)

        print "Evaluating MF with f of {}".format(f)
        evaluate(MF, _dataset, MF_test, 5)

Evaluating MF with f of 5
RMSE: 0.8913
RMSE: 0.7613
MAE:  0.6980
MAE:  0.5935
Evaluating MF with f of 10
RMSE: 0.8181
RMSE: 0.7762
MAE:  0.6593
MAE:  0.6065
Evaluating MF with f of 20
RMSE: 0.8875
RMSE: 0.7449
MAE:  0.6809
MAE:  0.5851
Evaluating MF with f of 40
RMSE: 0.8224
RMSE: 0.7292
MAE:  0.6640
MAE:  0.5719
Evaluating MF with f of 50
RMSE: 0.8981
RMSE: 0.7027
MAE:  0.7031
MAE:  0.5516
Evaluating MF with f of 75
RMSE: 0.9062
RMSE: 0.6469
MAE:  0.6933
MAE:  0.5098
Evaluating MF with f of 5
RMSE: 0.8463
RMSE: 0.7787
MAE:  0.6609
MAE:  0.6074
Evaluating MF with f of 10
RMSE: 0.8495
RMSE: 0.7635
MAE:  0.6596
MAE:  0.5983
Evaluating MF with f of 20
RMSE: 0.8384
RMSE: 0.7609
MAE:  0.6531
MAE:  0.5942
Evaluating MF with f of 40
RMSE: 0.8476
RMSE: 0.7268
MAE:  0.6625
MAE:  0.5658
Evaluating MF with f of 50
RMSE: 0.8664
RMSE: 0.7033
MAE:  0.6809
MAE:  0.5475
Evaluating MF with f of 75
RMSE: 0.8519
RMSE: 0.6598
MAE:  0.6638
MAE:  0.5152
Evaluating MF with f of 5
RMSE: 0.8515
RMSE: 0.7945
MA