In [1]:
import numpy as np
import pandas as pd

In [2]:
rating_df = pd.read_csv('the-movies-dataset/ratings_small.csv')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
rating_df.shape

(100004, 4)

In [4]:
rating_df.nunique()

userId         671
movieId       9066
rating          10
timestamp    78141
dtype: int64

In [5]:
# sort data frame by timestemp for splitting
rating_df = rating_df.sort_values('timestamp')
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [6]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of movies)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df.userId)
movie_ids = movie_encoder.fit_transform(rating_df.movieId)

In [7]:
num_train = int(len(user_ids) * 0.8)
train_user_ids = user_ids[:num_train]
train_movie_ids = movie_ids[:num_train]
train_ratings = rating_df.rating.values[:num_train]
val_user_ids = user_ids[num_train:]
val_movie_ids = movie_ids[num_train:]
val_ratings = rating_df.rating.values[num_train:]

In [8]:
# set up user-movie matrix
num_users = user_ids.max() + 1
num_movies = movie_ids.max() + 1
user2movie = np.zeros([num_users, num_movies])
user2movie[train_user_ids, train_movie_ids] = train_ratings
user2movie

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [9]:
# credit to https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    '''
    Compute DCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed dcg
        
    '''
    r = r[:k]
    dcg = np.sum(r / np.log2(np.arange(2, len(r) + 2)))
    return dcg


In [10]:
def ndcg_at_k(r, k, method=0):
    '''
    Compute NDCG
    args:
        r: np.array, to be evaluated
        k: int, number of entries to be considered
    
    returns:
        dcg: float, computed ndcg
        
    '''
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)

    return dcg_at_k(r, k) / dcg_max

In [11]:
def hit_rate(predicted, true, k):
    """
    predicted : np.array
        預測結果
    true : np.array
        正確答案
    k : int
        取前k項evaluate. i.e. 如果k=10 就是看預測的前十個結果與正確答案是否有相符之商品.
    Returns:
      hr : float
        hit rate
    ============================
    範例:
    predicted = [[1,2,4], [3,5,1], [4,1,3],[2,3,1],[4,1,3], [1,2,5],[4,1,3]]
    true = [1,5,4,3,1,1,2]
    hit_rate(predicted, true, 3)
    >>> 0.8571428571428571
    """
    assert len(true) == len(predicted), "預測長度與正確答案不同"
    return sum([t in pred[:k] for t, pred in zip(true, predicted)]) / len(true)

In [12]:
# compute average hit rate for all users
def evaluate_prediction(predictions, k):
    '''
    Return the average ndcg for each users
    args:
        predictions: np.array user-item predictions
    returns:
        hit_rate: float, computed hit rate
    '''

    hit = 0
    
    # iterate
    for target_user in np.unique(val_user_ids):
        # get movie ids and ratings associated with the target user.
        target_val_movie_ids = val_movie_ids[val_user_ids == target_user] 
        target_val_ratings = val_ratings[val_user_ids == target_user] 
            
        clicked_movie_id = target_val_movie_ids[np.argmax(target_val_ratings)]
        predicted_order = np.argsort(-predictions[target_user])

        # if clicked id is within the top k prediction, hit += 1
        if np.where(predicted_order == clicked_movie_id)[0][0] < k:
            hit+=1
    
    
    return hit / len(val_user_ids)


In [13]:
from scipy.sparse import csr_matrix

user2movie = csr_matrix(user2movie)
user2movie

<671x9066 sparse matrix of type '<class 'numpy.float64'>'
	with 80003 stored elements in Compressed Sparse Row format>

In [14]:
from scipy.sparse.linalg import svds

u, s, vt = svds(user2movie, k=30)
u.shape, s.shape, vt.shape

((671, 30), (30,), (30, 9066))

In [15]:
predictions = np.matmul(u * np.expand_dims(s, 0), vt)
evaluate_prediction(predictions, 40)

0.0042997850107494625