In [1]:
import pandas as pd
import numpy as np

In [4]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('~/Downloads/ml-100k/u.data', sep='\t', names=header)

In [14]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(n_users, n_item)

943 1682


In [15]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)

In [19]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
    
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [22]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine') # matrix transpose

In [33]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T

    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    
    return pred

In [34]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [40]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rsme(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [43]:
print(rsme(user_prediction, test_data_matrix))
print(rsme(item_prediction, test_data_matrix))

3.122269490851989
3.4484202196493206


In [46]:
sparsity = round(1.0-len(df)/float(n_users*n_items), 3)
print('sparsity : ', sparsity*100, '%')

sparsity :  93.7 %


In [48]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

u, s, vt = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print(rsme(X_pred, test_data_matrix))

2.719781208585213
