In [4]:
import numpy as np
import pandas as pd
import statistics

In [5]:
#Define header information of input entry data

In [6]:
header = ['user_id', 'movie_id', 'rating', 'timestamp']

In [7]:
#Read data from file

In [8]:
rating_data = pd.read_csv('u.data', sep='\t', names=header)

In [9]:
#Print out the top 30 rows of data

In [10]:
rating_data.head(30)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [11]:
# count the number of users
n_users = rating_data.user_id.unique().shape[0]

In [12]:
# count the number of movies
n_items = rating_data.movie_id.unique().shape[0]

In [15]:
from sklearn import cross_validation as cv

In [16]:
train_data, test_data = cv.train_test_split(rating_data, test_size=0.20)

In [17]:
train_data_matrix = np.zeros((n_users, n_items))

In [18]:
# Re-arrange the input training rating matrix
for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]

In [19]:
test_data_matrix = np.zeros((n_users, n_items))

In [20]:
# Re-arrange the input testing rating matrix
for line in test_data.itertuples():
        test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [21]:
from sklearn.metrics.pairwise import pairwise_distances

In [25]:
# get user-user cosine similarity
user_similarity_cosine = 1 - pairwise_distances(train_data_matrix, metric='cosine')

In [26]:
# get item-item cosine similarity
item_similarity_cosine = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')

In [27]:
# get user-user jaccard similarity
user_similarity_jaccard = 1 - pairwise_distances(train_data_matrix, metric='jaccard')

In [28]:
# get item-item jaccard similarity
item_similarity_jaccard = 1 - pairwise_distances(train_data_matrix.T, metric='jaccard')

In [29]:
# define the function to build the prediction results
def predict(ratings, similarity, type='user'):
        if type == 'user':
            mean_user_rating = ratings.mean(axis=1)
            ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
            pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        elif type == 'item':
            pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
        return pred

In [30]:
#user-user cosine similarity based results
user_prediction_cosine = predict(train_data_matrix, user_similarity_cosine, type='user')

In [31]:
#item-item cosine similarity based results
item_prediction_cosine = predict(train_data_matrix, item_similarity_cosine, type='item')

In [32]:
#user-user jaccard similarity based results
user_prediction_jaccard = predict(train_data_matrix, user_similarity_jaccard, type='user')

In [33]:
#item-item jaccard similarity based results
item_prediction_jaccard = predict(train_data_matrix, item_similarity_jaccard, type='item')

In [34]:
#define the function to evaluate the results
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    where_are_NaNs = np.isnan(prediction)
    prediction[where_are_NaNs] = 0
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [35]:
print 'RMSE of User-user cosine similarity based Filtering: ' + str(rmse(user_prediction_cosine, test_data_matrix))
print 'RMSE of Item-item cosine similarity based Filtering: ' + str(rmse(item_prediction_cosine, test_data_matrix))

RMSE of User-user cosine similarity based Filtering: 2.92439473758
RMSE of Item-item cosine similarity based Filtering: 3.14488136012


In [36]:
print 'RMSE of User-user jaccard similarity based Filtering: ' + str(rmse(user_prediction_jaccard, test_data_matrix))
print 'RMSE of Item-item jaccard similarity based Filtering: ' + str(rmse(item_prediction_jaccard, test_data_matrix))

RMSE of User-user jaccard similarity based Filtering: 2.91977378425
RMSE of Item-item jaccard similarity based Filtering: 3.06072703352


In [37]:
# Start doing Model-based SVD approach
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [38]:
# using svd function to generate three matrices
u, s, vt = svds(train_data_matrix, k = 20)

In [40]:
s_diag_matrix=np.diag(s)

In [41]:
#calculate the prediction results
svd_prediction = np.dot(np.dot(u, s_diag_matrix), vt)

In [45]:
print 'User-based CF MSE: ' + str(rmse(svd_prediction, test_data_matrix))

User-based CF MSE: 2.64975016914
