In [1]:
import numpy as np
import pandas as pd
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('./testData/ml-100k/u.data', sep='\t', names=header)


In [2]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [3]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



In [13]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [25]:
print(train_data_matrix.shape)
print(train_data_matrix)

print(test_data_matrix.shape)
print(test_data_matrix)

print(train_data.shape)
print(train_data.head())

print(test_data.shape)
print(test_data.head())

(943, 1682)
[[ 5.  0.  4. ...,  0.  0.  0.]
 [ 4.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  5.  0. ...,  0.  0.  0.]]
(943, 1682)
[[ 0.  3.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
(75000, 4)
       user_id  item_id  rating  timestamp
79452      837      151       5  875721734
19432       55      273       5  878176047
30549       62      222       5  879372480
96696      601      473       3  876347665
57359      846       60       4  883948606
(25000, 4)
       user_id  item_id  rating  timestamp
43591      374      192       5  880395665
89845      773     1170       3  888539711
68102      646      349       2  888529127
67214      486      304       3  879874186
2715        56      215       5  892678547


In [44]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [45]:
print(user_similarity.shape)
print(user_similarity)
print('------------------------')
print(item_similarity.shape)
print(item_similarity)

(943, 943)
[[ 0.          0.85909705  0.96705435 ...,  0.88635092  0.91089101
   0.6979636 ]
 [ 0.85909705  0.          0.91614603 ...,  0.8654595   0.87344595
   0.89824814]
 [ 0.96705435  0.91614603  0.         ...,  0.89787388  0.89659186
   0.9794676 ]
 ..., 
 [ 0.88635092  0.8654595   0.89787388 ...,  0.          0.93244014
   0.91699792]
 [ 0.91089101  0.87344595  0.89659186 ...,  0.93244014  0.          0.82045138]
 [ 0.6979636   0.89824814  0.9794676  ...,  0.91699792  0.82045138  0.        ]]
------------------------
(1682, 1682)
[[ 0.          0.72913604  0.76189606 ...,  1.          1.          0.9447895 ]
 [ 0.72913604  0.          0.7439058  ...,  1.          1.          1.        ]
 [ 0.76189606  0.7439058   0.         ...,  1.          1.          1.        ]
 ..., 
 [ 1.          1.          1.         ...,  0.          1.          1.        ]
 [ 1.          1.          1.         ...,  1.          0.          1.        ]
 [ 0.9447895   1.          1.         ...,  1.  

In [46]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [47]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [51]:
print(item_prediction.shape)
print(item_prediction)
print('------------------------------')
print(train_data_matrix.shape)
print(train_data_matrix)

(943, 1682)
[[ 0.37196934  0.38669486  0.39229869 ...,  0.44021416  0.44021416
   0.42960576]
 [ 0.08344479  0.09968649  0.09551048 ...,  0.10113028  0.10113028
   0.10102746]
 [ 0.06631246  0.07037581  0.068484   ...,  0.06900654  0.06900654
   0.06949897]
 ..., 
 [ 0.03177041  0.04047538  0.03918103 ...,  0.04580607  0.04580607
   0.04490941]
 [ 0.12452105  0.13166281  0.13767954 ...,  0.14396193  0.14396193
   0.14374452]
 [ 0.20263842  0.19574595  0.21630666 ...,  0.24866151  0.24866151
   0.2412066 ]]
------------------------------
(943, 1682)
[[ 5.  0.  4. ...,  0.  0.  0.]
 [ 4.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  5.  0. ...,  0.  0.  0.]]


In [52]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [91]:
print(test_data_matrix.nonzero())
print('-------------------')
print(train_data_matrix.shape)
print(train_data_matrix)
print('-------------------')
print(train_data_matrix[test_data_matrix.nonzero()])
print(user_prediction[test_data_matrix.nonzero()])

(array([  0,   0,   0, ..., 942, 942, 942]), array([   1,    4,   13, ..., 1027, 1043, 1046]))
-------------------
(943, 1682)
[[ 5.  0.  4. ...,  0.  0.  0.]
 [ 4.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 5.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  5.  0. ...,  0.  0.  0.]]
-------------------
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.56876779  0.50407637  0.85458174 ...,  0.4189301   0.17890428
  0.39310633]


In [92]:
print('User-based CF RMSE: %2f' % rmse(user_prediction, test_data_matrix))
print('Item-based CF RMSE: %2f' % rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 3.137320
Item-based CF RMSE: 3.464914


In [95]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

print('User-based CF MSE: %2f' % rmse(X_pred, test_data_matrix))

User-based CF MSE: 2.733553


In [101]:
print(u.shape)
print(s.shape)
print(vt.shape)
s_diag_matrix

(943, 20)
(20,)
(20, 1682)


array([[  69.35629979,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ],
       [   0.        ,   69.81753592,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ],
       [   0.        ,    0.        ,   71.00036503,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.        ,    0.        ,
           0.        ,    0.        ,    0.   