In [2]:
import pandas as pd 
import numpy as np
import random
import surprise as sp
import scipy
from surprise import Dataset, Reader, SVD, evaluate
from sklearn import cross_validation as cv
from scipy import linalg


ratingscol = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('./ratings100k.dat', sep='::', names = ratingscol, engine='python')
ratings = ratings.drop('Timestamp', axis=1)

def sample(ratings, n, m):
    """
    Return a smaller matrix with top n users and top m items only
    @param ratings the ratings dataset 
    @param n number of users with most ratings
    @param m number of movies with most ratings
    @returns NxM matrix of USERxITEM ratings
    """

    n_users = ratings['UserID'].nunique()
    n_items = ratings['MovieID'].nunique()

    user_sample = ratings['UserID'].value_counts().head(n).index
    movie_sample = ratings['MovieID'].value_counts().head(m).index

    subset = ratings.loc[ratings['UserID'].isin(user_sample)].loc[ratings['MovieID'].isin(movie_sample)]
    return subset



In [55]:
# %load 'funcs.py'
import surprise as sp
from surprise import AlgoBase
import pandas as pd
import numpy as np
import scipy
from sklearn import cross_validation as cv
import funcs as F
import itertools as it
from scipy import linalg

def sample(ratings, user_counts, movie_counts, n, m):
    """
    Return a smaller matrix with top n users and top m items only
    @param ratings the ratings dataset 
    @param user_counts Count values of user ratings
    @param movie_counts Movie count values
    @param n number of users with most ratings
    @param m number of movies with most ratings
    @returns NxM matrix of USERxITEM ratings
    """
    n_users = ratings['UserId'].nunique()
    n_items = ratings['MovieId'].nunique()

    user_sample = user_counts.head(n).index
    movie_sample = movie_counts.head(m).index

    print len(user_sample)
    print len(movie_sample)

    subset = ratings.loc[ratings['UserId'].isin(user_sample)].loc[ratings['MovieId'].isin(movie_sample)]
    # we don't need the timestamp
    del subset['Timestamp']
    return subset

def normalize_user_means(ratings):
    """
    @param Ratings pandas dataframe
    @returns user mean normalized dataframe
    """
    u_i_df = F.build_user_item_matrix(ratings)
    means = u_i_df.mean(axis=1)

    def f(row):
        u_id = row["UserId"]
        new_r = row["Rating"] - means[u_id]
        return new_r

    ratings['Rating'] = ratings.apply(f, axis = 1)
    return ratings


ratingscol = ['UserId', 'MovieId', 'Rating', 'Timestamp']
ratings = pd.read_csv('./ratings100k.dat', sep='::', names = ratingscol, engine='python')
ratings = ratings.drop('Timestamp', axis=1)
ratings = normalize_user_means(ratings)
ratings_matrix = F.build_user_item_matrix(ratings)
ratings_matrix = ratings_matrix.values

In [4]:
def svd(matrix):
    U, s, Vh = scipy.linalg.svd(matrix)
    return U.shape,  s.shape, Vh.shape
    
svd(ratings_matrix)

((730, 730), (730,), (6373, 6373))

In [60]:
class MF():
    
    def __init__(self, R, k, lr, reg, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent factors
        - lr (float) : learning rate
        - reg (float)  : regularization parameter
        """

        self.R = R
        self.num_users, self.num_items = R.shape
        self.k = k
        self.lr = lr
        self.reg = reg
        self.iterations = iterations
        
    def train(self):
        # Initialize user and item latent feature matrice
        # P is a k x n matrix(n = number of users)
        # Q is a m x k matrix(m = number of items)
        self.P = np.random.normal(scale=1./self.k, size=(self.num_users, self.k))
        self.Q = np.random.normal(scale=1./self.k, size=(self.num_items, self.k))

        # Initialize the user/item biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)

        # Don't include ratings of 0 into mean because those items are unrated
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
            ]

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 1 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def sgd(self):
        """
        Perform stochastic gradient descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            # i = user index, j = item index, r = preexisting rating
            prediction = self.get_rating(i, j)
            err = (r - prediction)

            # Update biases
            self.b_u[i] += self.lr * (err - self.reg * self.b_u[i])
            self.b_i[j] += self.lr * (err - self.reg * self.b_i[j])

            # Update user and item latent feature matrices
            self.P[i, :] += self.lr * (err * self.Q[j, :] - self.reg * self.P[i,:])
            self.Q[j, :] += self.lr * (err * self.P[i, :] - self.reg * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        error = error/np.count_nonzero(self.R)
        return np.sqrt(error)

    def full_matrix(self):
        """
        Compute the full matrix using the resultant biases, P and Q
        """
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [61]:
mf = MF(ratings_matrix, k=50, lr=0.005, reg=0.02, iterations=50)
mf.train()



Iteration: 1 ; error = 0.9240
Iteration: 2 ; error = 0.8966
Iteration: 3 ; error = 0.8828
Iteration: 4 ; error = 0.8736
Iteration: 5 ; error = 0.8668
Iteration: 6 ; error = 0.8616
Iteration: 7 ; error = 0.8570
Iteration: 8 ; error = 0.8533
Iteration: 9 ; error = 0.8499
Iteration: 10 ; error = 0.8471
Iteration: 11 ; error = 0.8444
Iteration: 12 ; error = 0.8419
Iteration: 13 ; error = 0.8397
Iteration: 14 ; error = 0.8374
Iteration: 15 ; error = 0.8351
Iteration: 16 ; error = 0.8328
Iteration: 17 ; error = 0.8304
Iteration: 18 ; error = 0.8278
Iteration: 19 ; error = 0.8250
Iteration: 20 ; error = 0.8221
Iteration: 21 ; error = 0.8186
Iteration: 22 ; error = 0.8146
Iteration: 23 ; error = 0.8101
Iteration: 24 ; error = 0.8052
Iteration: 25 ; error = 0.7996
Iteration: 26 ; error = 0.7934
Iteration: 27 ; error = 0.7868
Iteration: 28 ; error = 0.7796
Iteration: 29 ; error = 0.7722
Iteration: 30 ; error = 0.7643
Iteration: 31 ; error = 0.7562
Iteration: 32 ; error = 0.7479
Iteration: 33 ; e

[(0, 0.92395593525316222),
 (1, 0.89659636004759691),
 (2, 0.88276115229152685),
 (3, 0.87355977563968434),
 (4, 0.86675428379635555),
 (5, 0.86156965717716893),
 (6, 0.85700386491259906),
 (7, 0.85329696072338235),
 (8, 0.84991729085037526),
 (9, 0.84708518920789699),
 (10, 0.84437319983672421),
 (11, 0.84194408504496399),
 (12, 0.83970041526244532),
 (13, 0.83736200373750957),
 (14, 0.83513256607418529),
 (15, 0.83279649727022687),
 (16, 0.83044011637848481),
 (17, 0.82783102047263313),
 (18, 0.82502252730261005),
 (19, 0.82205759504139797),
 (20, 0.81859884543792272),
 (21, 0.81455575584524975),
 (22, 0.81012706402438583),
 (23, 0.80521301087670061),
 (24, 0.79955036438483773),
 (25, 0.7933848512811239),
 (26, 0.78680848413499926),
 (27, 0.77964500673498849),
 (28, 0.77215945797084784),
 (29, 0.76430445246944634),
 (30, 0.75617049455666707),
 (31, 0.7478541619401855),
 (32, 0.73930810476296716),
 (33, 0.73063164894876853),
 (34, 0.72167397717389947),
 (35, 0.71267454782383843),
 (36

In [12]:
def train_matrix(ratings, factor):
    """
    Train a model and return it. Then we can use the model and evaluate it elsewhere
    @param ratings dataframe pandas dataframe to train on, with columns UserId, MovieId, Ratings
    @param n_folds number of folds for cross validation
    @returns List of (algo, test data)
    We can call methods such as `test` and `evaluate` on this object 
    """

    train_data, test_data = cv.train_test_split(ratings, test_size = 0.20)
    reader = sp.Reader(rating_scale=(1, 5))

    trainset = sp.Dataset.load_from_df(train_data, reader)
    testset = sp.Dataset.load_from_df(test_data, reader)
    trainset.split(n_folds = 5)

    algo = sp.SVD(n_factors = factor)

    for trainset, _ in trainset.folds():
        algo.train(trainset)
        
    testset = testset.build_full_trainset().build_testset()
    return (algo, testset)


In [13]:
def group_predictions_by_user(predictions):
    """
    @param List of Surprise predictions objects
    @returns Dict {uid: [P1, P2, ...PN]} hash mapping user id to top n predictions
    """
    p = sorted(predictions, key = lambda x: x.uid)

    groups = {}
    for k, g in it.groupby(p, lambda x: x.uid):
        groups[k] = sorted(list(g), key = lambda x: x.est, reverse = True)

    return groups


# For every item that a user would be rated, in top k
def calculate_catalog_coverage(ratings, predictions, k):
    """
    Calculate the catalog coverage of a model over a dataset
    @param ratings pandas dataframe with UserId, MovieId, Ratings. Must be the same set the model was trained on
    @param List Surprise predictions
    @oaram k Int the top k recommendations size
    @returns Float percentage of items recommended to at least one user
    """
    n_movies = ratings['MovieId'].nunique()

    movies_reccommended = set() # keep track of which movies are recommended. Note we only care about the number

    recommendations = group_predictions_by_user(predictions)
    for u_id, recs in recommendations.iteritems():
        movies_reccommended.update(map(lambda x: x.iid, recs[0:3]))

    return len(movies_reccommended) / float(n_movies)

In [14]:
def evaluate(algo, ratings, testset, top_k):
    """
    @param algo Surprise algorithm the model that was trained
    @oaram ratings The ratings it was trained on, in pandas Dataframe form (so we can calculate coverage)
    @param testset Surprise testset object, the data held out during cross-validation
    @returns Nested Dictionary {test: {rmse, mae}, train: {rmse, mae, cc}}
    We can use these to build up arrays for plotting.
    """

    ret = {}
    ret['test'] = {}
    ret['train'] = {}

    test_predictions = algo.test(testset)
    # see how it would do on the trainset to compare, comes with the algo object
    trainset = algo.trainset.build_testset()
    train_predictions = algo.test(trainset)

    # sticking evaluate in everything for grep, training is verbose
    ret['test']['rmse'] = sp.accuracy.rmse(test_predictions)
    ret['train']['rmse'] = sp.accuracy.rmse(train_predictions)

    ret['test']['mae'] = sp.accuracy.mae(test_predictions)
    ret['train']['mae'] = sp.accuracy.mae(train_predictions)

    # Hackish, baseline does not have a sense of "neighbors"
    if (algo.__module__ == "surprise.prediction_algorithms.knns"):
        ret['test']['cc'] = calculate_catalog_coverage(ratings, test_predictions, top_k)

    return ret

In [15]:
# run models with some different parameters and sizes
# samples = [ [1000, 10], [5000, 50], [100000, 1000], [5000000, 2000] ]
samples = [ [1000, 10], [5000, 50], [10000,100], [20000,200] ]
factors = [5, 10, 20, 40, 50, 75]

for _sample in samples:
    i, j = _sample
    _dataset = sample(ratings, i, j)
    for f in factors:
        MF, MF_test = train_matrix(_dataset, f)

        print "Evaluating MF with f of {}".format(f)
        evaluate(MF, _dataset, MF_test, 5)

Evaluating MF with f of 5
RMSE: 0.8913
RMSE: 0.7613
MAE:  0.6980
MAE:  0.5935
Evaluating MF with f of 10
RMSE: 0.8181
RMSE: 0.7762
MAE:  0.6593
MAE:  0.6065
Evaluating MF with f of 20
RMSE: 0.8875
RMSE: 0.7449
MAE:  0.6809
MAE:  0.5851
Evaluating MF with f of 40
RMSE: 0.8224
RMSE: 0.7292
MAE:  0.6640
MAE:  0.5719
Evaluating MF with f of 50
RMSE: 0.8981
RMSE: 0.7027
MAE:  0.7031
MAE:  0.5516
Evaluating MF with f of 75
RMSE: 0.9062
RMSE: 0.6469
MAE:  0.6933
MAE:  0.5098
Evaluating MF with f of 5
RMSE: 0.8463
RMSE: 0.7787
MAE:  0.6609
MAE:  0.6074
Evaluating MF with f of 10
RMSE: 0.8495
RMSE: 0.7635
MAE:  0.6596
MAE:  0.5983
Evaluating MF with f of 20
RMSE: 0.8384
RMSE: 0.7609
MAE:  0.6531
MAE:  0.5942
Evaluating MF with f of 40
RMSE: 0.8476
RMSE: 0.7268
MAE:  0.6625
MAE:  0.5658
Evaluating MF with f of 50
RMSE: 0.8664
RMSE: 0.7033
MAE:  0.6809
MAE:  0.5475
Evaluating MF with f of 75
RMSE: 0.8519
RMSE: 0.6598
MAE:  0.6638
MAE:  0.5152
Evaluating MF with f of 5
RMSE: 0.8515
RMSE: 0.7945
MA