In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import norm

In [2]:
# Background reading: http://blog.ethanrosenthal.com/2015/11/02/intro-to-collaborative-filtering/
# Load the four CSV tables of the "ml-latest-small" dataset (presumably MovieLens);
# the paths are relative to the notebook's working directory.
movies = pd.read_csv('ml-latest-small/movies.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
links = pd.read_csv('ml-latest-small/links.csv')

In [3]:
# movieId values are not contiguous, so give every movie a dense 0-based `id`
# (its row position in the freshly loaded table) to use as a matrix index.
# The guard keeps the cell safe to re-run: once movieId has become the index,
# running it again would overwrite `id` with movieId and then fail in set_index.
if 'movieId' in movies.columns:
    movies['id'] = movies.index
    movies = movies.set_index('movieId')
movies.head(5)

Unnamed: 0_level_0,title,genres,id
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
2,Jumanji (1995),Adventure|Children|Fantasy,1
3,Grumpier Old Men (1995),Comedy|Romance,2
4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
5,Father of the Bride Part II (1995),Comedy,4


In [4]:
# look at the end of the table: movieId grows much faster than the condensed id
movies.tail(30)

Unnamed: 0_level_0,title,genres,id
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
159690,Teenage Mutant Ninja Turtles: Out of the Shado...,Action|Adventure|Comedy,9095
159755,Popstar: Never Stop Never Stopping (2016),Comedy,9096
159858,The Conjuring 2 (2016),Horror,9097
159972,Approaching the Unknown (2016),Drama|Sci-Fi|Thriller,9098
160080,Ghostbusters (2016),Action|Comedy|Horror|Sci-Fi,9099
160271,Central Intelligence (2016),Action|Comedy,9100
160438,Jason Bourne (2016),Action,9101
160440,The Maid's Room (2014),Thriller,9102
160563,The Legend of Tarzan (2016),Action|Adventure,9103
160565,The Purge: Election Year (2016),Action|Horror|Sci-Fi,9104


In [5]:
# attach the condensed movie id to every rating by looking up its movieId in the movies index
ratings['id'] = movies.loc[ratings.movieId].id.values

# count the distinct rated movies; fewer than the rows in `movies` means some movies were never rated
print(ratings.movieId.unique().shape)

(9066,)


In [6]:
# preview the tags table (loaded above but not used in the similarity computation below)
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [7]:
# preview the ratings, now carrying the condensed `id` column next to movieId
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,id
0,1,31,2.5,1260759144,30
1,1,1029,3.0,1260759179,833
2,1,1061,3.0,1260759182,859
3,1,1129,2.0,1260759185,906
4,1,1172,4.0,1260759205,931


In [8]:
# user x movie rating matrix: entry (userId, id) holds that user's rating for that movie
# No explicit shape is passed, so SciPy sizes the matrix from the largest row/column index present.
# NOTE(review): userId looks 1-based in ratings.head(), which would leave row 0 empty — confirm
user_movie = sparse.coo_matrix((ratings.rating, (ratings.userId, ratings.id)))
# transpose so that rows are movies and columns are users
movie_user = user_movie.transpose()
movie_user.shape

(9123, 672)

In [9]:
# Row-wise cosine similarity of a rating matrix.
# Works for user-user similarity (rows are users) and, on the transposed
# matrix, for movie-movie similarity (rows are movies).
def compute_similarity(rating_matrix):
    """Return the pairwise cosine similarity between the rows of ``rating_matrix``.

    Parameters
    ----------
    rating_matrix : scipy.sparse matrix or array-like, shape (N, M)
        ``rating_matrix[i, j]`` is the rating that row ``i`` gives to column
        ``j``; missing ratings are zero.

    Returns
    -------
    numpy.matrix, shape (N, N)
        Dense matrix whose ``[i, j]`` entry is the cosine similarity of rows
        ``i`` and ``j``. A row without any rating has similarity 0 with every
        row, itself included.
    """
    # CSR construction accepts sparse and dense input alike
    rows = sparse.csr_matrix(rating_matrix)

    # all pairwise dot products; the diagonal holds each row's squared L2 norm
    inner = rows.dot(rows.transpose())
    norms = np.sqrt(inner.diagonal().astype(np.float64))

    # an all-zero row has norm 0; use 1.0 instead so that 0/0 becomes 0 rather than NaN
    norms[norms == 0.0] = 1.0

    # every norm is non-zero now, so the denominator has no zero entries:
    # a plain dense outer product is cheaper than a sparse matrix that stores N*N values
    similarity = inner.toarray() / np.outer(norms, norms)

    # dividing sparse by sparse used to hand callers a numpy.matrix; keep that type
    return np.asmatrix(similarity)

In [None]:
# user-user cosine similarity (rows of user_movie are users)
user_sim = compute_similarity(user_movie)

In [None]:
# movie-movie cosine similarity (rows of movie_user are movies)
movie_sim = compute_similarity(movie_user)

In [None]:
# top N by similarity: check the shape of the movie-movie similarity matrix
# (the name `sim` is never assigned in this notebook, so use movie_sim explicitly)
movie_sim.shape

In [None]:
def getTopN_mapped(mapped_id, sim, n=10):
    """Return the row indices of the ``n`` items most similar to ``mapped_id``.

    Parameters
    ----------
    mapped_id : int
        Row index of the query item in ``sim``.
    sim : 2-D numpy array or numpy.matrix
        Square similarity matrix; larger values mean more similar.
    n : int, default 10
        Number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        At most ``n`` row indices, ordered from most to least similar.
        ``sim`` itself is left unmodified.
    """
    # np.array copies the row: sim[mapped_id, :] is a view, so masking it in
    # place would overwrite the diagonal of the caller's similarity matrix
    s = np.array(sim[mapped_id, :], dtype=float).ravel()
    s[mapped_id] = -np.inf  # the query itself must rank behind every other item
    topn_index = np.argsort(s)[::-1][:n]  # [::-1] reverses to descending order
    return topn_index

def getTopN(movie_id, movies, sim, n=10):
    """Return the rows of ``movies`` for the ``n`` movies most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : int
        Original movieId (a label of the ``movies`` index) of the query movie.
    movies : pandas.DataFrame
        Movie table indexed by movieId with a condensed ``id`` column.
    sim : 2-D array or numpy.matrix
        Movie-movie similarity matrix indexed by the condensed ``id``.
    n : int, default 10
        Number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies``, ordered from most to least similar.
    """
    # map the movie_id to its condensed id (row index of sim)
    mapped_id = movies.loc[movie_id].id
    topn_index = getTopN_mapped(mapped_id, sim, n)

    # isin() alone returns the hits in table order, which hides the ranking;
    # reorder them by their position in topn_index
    rank_of = {mapped: rank for rank, mapped in enumerate(topn_index)}
    hits = movies[movies.id.isin(topn_index)]
    return hits.iloc[np.argsort(hits.id.map(rank_of).values)]

In [None]:
# choose a query movie by its original movieId and show its row
movie_id=90603
print("Query movie is: ")
movies.loc[[movie_id]]

In [None]:
# movies most similar to the query movie; movie lookups need the movie-movie
# matrix (the name `sim` is never assigned in this notebook)
getTopN(movie_id, movies, movie_sim)

In [None]:
# sanity check: pick two movies (by movieId) and compute their similarity manually
m1 = 12
m2 = 34
# translate the movieIds into condensed ids, which index the movie-level matrices
m1id = movies.loc[m1].id
m2id = movies.loc[m2].id
movies.loc[[m1, m2]]

In [None]:
# similarity computed by compute_similarity; the two condensed ids index the
# movie-movie matrix (the name `sim` is never assigned in this notebook)
movie_sim[m1id, m2id]

In [None]:
# compute similarity by hand
# convert to LIL so single rows can be indexed, then pull the two movies' rating rows
tmp = movie_user.tolil()
v1 = tmp[m1id]
v2 = tmp[m2id]

In [None]:
# cosine similarity = dot product of the two rating rows / product of their norms
numerator = v1.dot(v2.transpose()).toarray().reshape(-1)
denom = (sparse.linalg.norm(v1) * sparse.linalg.norm(v2))

In [None]:
# hand-computed cosine similarity; the author noted that it matches the matrix entry above
numerator[0] / denom # matches!!

In [None]:
# `sim` is never assigned in this notebook; movie lookups need movie_sim
getTopN(356, movies, movie_sim) # forrest gump

In [None]:
# query for movieId 10 (the old "forrest gump" label was a copy of the movieId 356 query);
# `sim` is never assigned in this notebook, movie lookups need movie_sim
getTopN(10, movies, movie_sim)

In [None]:
# number of non-zero entries divided by the total number of cells of user_movie
# NOTE(review): this is the filled fraction (density) of the matrix, although it is labelled "Sparsity"
sparsity = 1.0*len(user_movie.nonzero()[0])/(user_movie.shape[0] * user_movie.shape[1])
print("Sparsity = ", sparsity)

In [17]:
# enable IPython's autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to helper modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
# import only the helpers this notebook uses; a wildcard import could silently
# shadow notebook-level names such as compute_similarity
from evaluation import train_test_split, predict_similarity, get_mse
# split the rating matrix into train and test matrices
# NOTE(review): no seed is passed, so the split may differ between runs — confirm how train_test_split samples
train, test = train_test_split(user_movie)

In [11]:
# show the reprs of both splits to compare their shapes and stored-element counts
train, test

(<672x9123 sparse matrix of type '<class 'numpy.float64'>'
 	with 99757 stored elements in LInked List format>,
 <672x9123 sparse matrix of type '<class 'numpy.float64'>'
 	with 247 stored elements in LInked List format>)

In [64]:
# user-user cosine similarity computed from the training ratings only
train_user_sim = compute_similarity(train)

For user-based collaborative filtering, we predict user $u$'s rating for item $i$ as the weighted sum of all other users' ratings for item $i$, where each weight is the cosine similarity between that other user $u'$ and the input user $u$, normalized by the sum of the absolute similarities:

$$
\hat{r}_{ui} = \frac{\sum_{u'}sim(u, u')r_{u'i}}{\sum_{u'}\lvert sim(u, u')\rvert}
$$

In [65]:
# check that the similarity matrix is square with one row per row of `train`, and see its type
train.shape, train_user_sim.shape, type(train_user_sim)

((672, 9123), (672, 672), numpy.matrixlib.defmatrix.matrix)

In [66]:
# predict ratings from the training matrix and the user-user similarities
# (predict_similarity is not defined in this notebook; presumably it comes from the evaluation module)
user_prediction = predict_similarity(train, train_user_sim)

In [67]:
# report the error of the predictions against the densified test matrix
# (get_mse is not defined in this notebook; presumably it comes from the evaluation module)
print('User-based CF MSE: ' + str(get_mse(user_prediction, test.todense())))

User-based CF MSE: 15.9119433198
