In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from scipy.sparse.linalg import norm

In [2]:
# Directory that holds the ml-latest-small CSV files, relative to the
# notebook's working directory. Change this single constant to relocate the data.
DATA_DIR = 'ml-latest-small'

# Load the four tables of the dataset into DataFrames.
movies = pd.read_csv(DATA_DIR + '/movies.csv')
tags = pd.read_csv(DATA_DIR + '/tags.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')
links = pd.read_csv(DATA_DIR + '/links.csv')

In [None]:
# Preview the first five rows of the movies table.
movies.head(5)

In [None]:
# Preview the first five rows of the tags table.
tags.head(5)

In [52]:
# Only the last expression of a cell is rendered, so this preview is evaluated
# but its result is not shown.
ratings.head(5)
# Per-movie sums of the remaining columns of ratings (including `rating`).
movie_sum_score = ratings.groupby('movieId').sum()
# Rows whose summed rating is exactly 0; the result is neither stored nor shown.
movie_sum_score[movie_sum_score.rating == 0]
# The value actually displayed by this cell: the movies table.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [3]:
# Sparse rating matrix: the entry at (userId, movieId) is that user's rating.
# No explicit shape is given, so the ids themselves are used as row/column indices.
user_movie = sparse.coo_matrix(
    (ratings.rating, (ratings.userId, ratings.movieId))
)
# Same ratings with movies as rows and users as columns.
movie_user = user_movie.T

In [None]:
# build user to user similarity matrix
def user2user_fast(ratings, user_movie):
    """Compute the cosine similarity between every pair of users.

    Args:
        ratings: DataFrame with a ``userId`` column; only its maximum is used,
            to size the result. User ids are expected to start from 1.
        user_movie: scipy sparse matrix whose row ``u`` holds the ratings of
            user ``u`` (row 0 is an unused placeholder). It must have at least
            ``ratings.userId.max() + 1`` rows.

    Returns:
        Dense ``(nusers + 1, nusers + 1)`` numpy array where entry ``[i][j]``
        is the cosine similarity of users ``i`` and ``j``. The diagonal and
        the placeholder row/column 0 are 1.0. A user without any ratings gets
        similarity 0.0 to every other user.
    """
    nusers = int(ratings.userId.max())  # user id starts from 1
    user_similarity = np.ones((nusers + 1, nusers + 1))

    # All pairwise dot products in a single sparse product; the diagonal holds
    # the squared norm of each user's rating vector.
    user_movie = user_movie.tocsr()
    inner = user_movie.dot(user_movie.transpose()).toarray()
    inner = inner[1:nusers + 1, 1:nusers + 1]  # drop the placeholder user 0
    norms = np.sqrt(inner.diagonal())
    denom = np.outer(norms, norms)

    # Divide only where both norms are non-zero, so users without ratings get
    # 0.0 instead of a NaN from 0/0.
    cosine = np.zeros(denom.shape)
    np.divide(inner, denom, out=cosine, where=denom > 0)
    np.fill_diagonal(cosine, 1.0)

    # Every user 1..nusers is filled in, including the last one.
    user_similarity[1:, 1:] = cosine
    return user_similarity

In [None]:
# user2user_fast takes both the ratings table and the sparse user x movie matrix.
user_similarity = user2user_fast(ratings, user_movie)

In [None]:
# Write the dense user similarity matrix to user_similarity.npy.
np.save('user_similarity.npy', user_similarity)

In [None]:
# query top k similar items from top N users
def getUserRec(userid, user_movie, user_similarity, N=10, k=10):
    """Recommend movies to a user from the ratings of the most similar users.

    Args:
        userid: Row index of the target user in both matrices.
        user_movie: scipy sparse user x movie rating matrix.
        user_similarity: Dense 2-D array where ``[i][j]`` is the similarity of
            users ``i`` and ``j``. It is only read, never modified.
        N: Number of most similar users whose ratings are used.
        k: Maximum number of movie indices to return.

    Returns:
        1-D numpy array with up to ``k`` movie column indices, best first.
        A movie's score is the similarity-weighted sum of the ratings given by
        the top ``N`` users. Movies the target user has already rated are
        never returned.
    """
    # Rank the other users by similarity. Index 0 (placeholder) and the user
    # itself are filtered out instead of being overwritten with a marker, so
    # the caller's similarity matrix is left untouched.
    similarity_row = np.asarray(user_similarity[userid, :], dtype=float).ravel()
    ranked_users = np.argsort(similarity_row)[::-1]  # [::-1] -> descending
    is_candidate = (ranked_users != 0) & (ranked_users != userid)
    topn_index = ranked_users[is_candidate][:N]
    topn_similarity = similarity_row[topn_index]

    # Similarity-weighted sum of the top N users' ratings, one score per movie.
    user_movie = user_movie.tocsr()  # CSR supports row slicing
    topn_ratings = user_movie[topn_index, :]
    weighted_scores = np.asarray(
        topn_ratings.transpose().dot(topn_similarity), dtype=float
    ).ravel()

    # Drop movies the user already rated. Zeroing their score is not enough:
    # they would tie with movies nobody rated and could still be returned.
    user_watched_indices = user_movie[userid, :].nonzero()[1]
    unseen = np.ones(weighted_scores.shape[0], dtype=bool)
    unseen[user_watched_indices] = False
    ranked_movies = np.argsort(weighted_scores)[::-1]
    topk_index = ranked_movies[unseen[ranked_movies]][:k]

    return topk_index

In [None]:
# TODO: sort favorite genres for each user
userId = 400
# Ids of every movie this user has rated, then the matching rows of `movies`.
movie_liked = ratings.loc[ratings.userId == userId, 'movieId']
movie_liked_names = movies.loc[movies.movieId.isin(movie_liked)]
movie_liked_names.head(10)

In [None]:
# Get recommended movie indices for the selected user and show the matching
# rows of `movies`.
# NOTE: the isin() mask keeps the row order of `movies`, so the displayed rows
# are not in the order returned by getUserRec.
topk = getUserRec(userId, user_movie, user_similarity)
movie_names = movies[movies.movieId.isin(topk)]
movie_names

In [23]:
# Build the ingredients of a row-to-row similarity matrix.
# This generalizes user2user_fast above to any rating matrix.
# @param rating_matrix: rating_matrix[i][j] is user i's rating of movie j
def compute_similarity(rating_matrix):
    """Compute the numerator and denominator of row-wise cosine similarity.

    Args:
        rating_matrix: scipy sparse matrix; each row is one rating vector.

    Returns:
        Tuple ``(dots, denom)`` of sparse matrices, both of shape
        ``(rows, rows)``. ``dots[i, j]`` is the dot product of rows ``i`` and
        ``j``; ``denom[i, j]`` is the product of their norms, with a zero norm
        replaced by 1.0. Because no norm is zero after that replacement,
        ``denom`` stores an entry for every pair of rows.
    """
    n_rows, n_cols = rating_matrix.shape
    print("Shape=", n_rows, n_cols)
    csc_ratings = rating_matrix.tocsc()

    # Pairwise dot products between rows.
    print("dot product")
    dots = csc_ratings * csc_ratings.transpose()

    # Row norms come from the diagonal; all-zero rows get a norm of 1.0.
    row_norms = np.sqrt(dots.diagonal())
    row_norms[row_norms == 0.0] = 1.0
    norm_column = sparse.csc_matrix(row_norms.reshape((n_rows, 1)))
    denom = norm_column * norm_column.transpose()

    print("dots", type(dots), "denom", type(denom))
    print("return")
    return (dots, denom)

In [24]:
# Rows of movie_user are movies, so this yields movie-to-movie dot products and
# norm products; pass user_movie instead to get the user-level equivalents.
dots, denom = compute_similarity(movie_user)

Shape= 163950 672
dot product
(163950,) (9066,)
dots <class 'scipy.sparse.csc.csc_matrix'> denom <class 'scipy.sparse.csc.csc_matrix'>
return


In [30]:
# Movie-to-movie dot products: (movies x users) times (users x movies).
dots = movie_user.dot(movie_user.T)

In [None]:
# Some movieIds are never used; their rows are all zero, so their norm is
# replaced by 1.0 and they act as "dummy" entries.
N, M = movie_user.shape
norms = np.sqrt(dots.diagonal())
norms[norms == 0.0] = 1.0
# Column vector of norms as a sparse matrix; its outer product with itself
# gives the pairwise norm products.
norms = sparse.csc_matrix(norms.reshape((N, 1)))
denom = norms * norms.transpose()

In [10]:
# Toy check: outer product of a column vector with itself, then keep only the
# diagonal of the result.
a = np.arange(1, 6)
print(a)
a = a.reshape((5, 1))
print(a.shape)
m = a * a.T
n = np.diag(np.diag(m))
n

[1 2 3 4 5]
(5, 1)


array([[ 1,  0,  0,  0,  0],
       [ 0,  4,  0,  0,  0],
       [ 0,  0,  9,  0,  0],
       [ 0,  0,  0, 16,  0],
       [ 0,  0,  0,  0, 25]])

In [11]:
# Toy check: dividing two dense arrays with `/` is element-wise.
m = np.full((5, 5), 2.0)
print(n)
print(m)
n / m

[[ 1  0  0  0  0]
 [ 0  4  0  0  0]
 [ 0  0  9  0  0]
 [ 0  0  0 16  0]
 [ 0  0  0  0 25]]
[[ 2.  2.  2.  2.  2.]
 [ 2.  2.  2.  2.  2.]
 [ 2.  2.  2.  2.  2.]
 [ 2.  2.  2.  2.  2.]
 [ 2.  2.  2.  2.  2.]]


array([[  0.5,   0. ,   0. ,   0. ,   0. ],
       [  0. ,   2. ,   0. ,   0. ,   0. ],
       [  0. ,   0. ,   4.5,   0. ,   0. ],
       [  0. ,   0. ,   0. ,   8. ,   0. ],
       [  0. ,   0. ,   0. ,   0. ,  12.5]])

In [None]:
# Indices of the non-zero entries of n, as a (row indices, column indices) tuple.
n.nonzero()