In [2]:
import sys

In [3]:
import pandas as pd 
# Load the user table; the column names are supplied explicitly via `names`.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# The row count of the table is used as the number of users from here on.
n_users = len(users)
print('Number of users:', n_users)
# users.head()  # uncomment to preview a few rows

Number of users: 943


In [4]:
# Load the pre-split rating files (tab-separated); both share the same columns.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv(
    'ml-100k/ua.base',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings_test = pd.read_csv(
    'ml-100k/ua.test',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Plain NumPy views of the two splits; columns stay in r_cols order.
rate_train = ratings_base.values
rate_test = ratings_test.values

print('Number of traing rates:', rate_train.shape)
print('Number of test rates:', rate_test.shape)

Number of traing rates: (90570, 4)
Number of test rates: (9430, 4)


In [5]:
# Load the item (movie) table; column names are supplied explicitly.
# The first five names are descriptive fields, the remaining nineteen are genre columns.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

# The row count of the table is used as the number of items from here on.
n_items = len(items)
print('Number of items:', n_items)

Number of items: 1682


In [6]:
# All training rows whose user_id is 2, shown as a plain array.
ratings_base[ratings_base['user_id'] == 2].values

array([[        2,         1,         4, 888550871],
       [        2,        10,         2, 888551853],
       [        2,        14,         4, 888551853],
       [        2,        19,         3, 888550871],
       [        2,        25,         4, 888551648],
       [        2,       100,         5, 888552084],
       [        2,       111,         4, 888551853],
       [        2,       127,         5, 888552084],
       [        2,       237,         4, 888552017],
       [        2,       242,         5, 888552084],
       [        2,       255,         4, 888551341],
       [        2,       257,         4, 888551062],
       [        2,       258,         3, 888549961],
       [        2,       269,         4, 888550774],
       [        2,       272,         5, 888979061],
       [        2,       273,         4, 888551647],
       [        2,       274,         3, 888551497],
       [        2,       275,         5, 888550939],
       [        2,       276,         4, 88855

In [7]:
# Item table as a NumPy array (every column, so the dtype is mixed).
X0 = items.values
# Keep only the last 19 columns, i.e. the genre columns that close i_cols.
X_train_counts = X0[:, -19:]

In [8]:
# Spot-check: the 19 genre values of the item stored at row index 1011.
X_train_counts[1011]

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=object)

In [9]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting with smoothed idf; norm='l2' rescales each output row.
transformer = TfidfTransformer(norm='l2', smooth_idf=True)

# Build the item feature matrix from the 19 genre columns of every item.
# X_train_counts is turned into nested Python lists before fitting
# (presumably because the slice has dtype object), and the sparse result is
# expanded to a dense array.
tfidf = (
    transformer
    .fit_transform(X_train_counts.tolist())
    .toarray()
)

In [10]:
# Sanity check: one row per item, one column per genre feature.
tfidf.shape

(1682, 19)

In [11]:
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
    """
    Look up everything a single user has rated.

    Each row of rate_matrix holds: user_id, item_id, rating (score), time_stamp.
    Only the first three columns are read.

    user_id is a 0-based user index. The ids stored in rate_matrix start from 1,
    so rows are matched against user_id + 1.

    Returns a tuple (item_ids, scores) of two arrays in matching order:
    item_ids are 0-based item indices (stored id minus 1) and scores are the
    ratings given to those items.
    """
    # Rows that belong to the requested user (stored ids are 1-based).
    is_this_user = rate_matrix[:, 0] == user_id + 1
    row_idx = np.nonzero(is_this_user)[0]

    # Shift item ids down by one so they can be used directly as array indices.
    item_ids = rate_matrix[row_idx, 1] - 1
    scores = rate_matrix[row_idx, 2]
    return item_ids, scores

In [12]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

# Fit one ridge-regression model per user on the TF-IDF features of the items
# that user rated in the training split.
d = tfidf.shape[1]  # number of item features
W = np.zeros((d, n_users))  # column n: coefficients of user n's model
b = np.zeros((1, n_users))  # entry n: intercept of user n's model

for n in range(n_users):
    # n is a 0-based user index; ids are 0-based item indices into tfidf.
    ids, scores = get_items_rated_by_user(rate_train, n)
    # Xhat has exactly one row per entry of scores because both are selected
    # by the same ids, so no length check is needed before fitting.
    Xhat = tfidf[ids, :]
    clf = Ridge(alpha=0.01, fit_intercept=True)
    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [13]:
# Coefficients of the most recently fitted Ridge model still bound to clf.
clf.coef_

array([ 0.        ,  0.50472501, -0.17880425,  0.62374606, -0.64407185,
       -0.48569057,  1.14410443,  0.        ,  0.68732031, -1.01966702,
        0.        ,  0.34051813,  0.71185349, -0.03314575, -0.39809336,
       -0.84995181,  0.5628234 ,  0.36875069, -0.75671318])

In [14]:
# Raise NumPy's summarisation threshold so the array below is printed in full.
# NOTE(review): set_printoptions changes a global setting, so it also applies
# to every array displayed by later cells.
np.set_printoptions(threshold=sys.maxsize)
# Feature rows currently bound to Xhat (the last value assigned in the fitting loop).
Xhat

array([[0.        , 0.53676706, 0.65097024, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.53676706, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.78940732, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.61386976, 0.        , 0.        ],
       [0.        , 0.53804896, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.34165721, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.   

In [15]:
# Predicted score for every (item, user) pair:
# (n_items, d) @ (d, n_users) -> (n_items, n_users); b broadcasts over the rows.
Yhat = tfidf @ W + b

In [17]:
# Compare true and predicted test ratings for a single user.
n = 172  # 0-based user index (stored user_id is n + 1)
np.set_printoptions(precision=2)  # 2 digits after the decimal point
ids, scores = get_items_rated_by_user(rate_test, n)
# Yhat is indexed as [item_index, user_index]; ids are 0-based item indices.
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [241 259 285 288 293 299 321 322 327 937]
True ratings     : [5 4 5 4 5 4 4 5 5 3]
Predicted ratings: [4.   4.56 4.2  3.84 4.   3.96 4.79 3.96 4.85 4.33]


In [22]:
from math import sqrt
def evaluate(Yhat, rates, W, b):
    """
    Root-mean-square error of the predictions in Yhat over the ratings in rates.

    Yhat  : 2-D array indexed as Yhat[item_index, user_index] (0-based).
    rates : 2-D array whose rows start with (user_id, item_id, rating);
            ids are 1-based and are assumed to fall inside Yhat's shape.
    W, b  : not used; kept so existing calls keep working.

    Every row of rates contributes to the error. The lookup is done in one
    vectorised step, so the function does not depend on any global user count
    and does not rescan rates once per user.

    Returns the RMSE as a Python float.
    Raises ZeroDivisionError when rates has no rows.
    """
    # Stored ids start at 1, array indices start at 0.
    user_idx = rates[:, 0].astype(np.intp) - 1
    item_idx = rates[:, 1].astype(np.intp) - 1

    e = rates[:, 2] - Yhat[item_idx, user_idx]
    se = float((e * e).sum())
    cnt = e.size
    return sqrt(se / cnt)

# Report RMSE on both splits. W and b are passed only because evaluate's
# signature takes them; the predictions themselves are read from Yhat.
print ('RMSE for training:', evaluate(Yhat, rate_train, W, b))
print ('RMSE for test    :', evaluate(Yhat, rate_test, W, b))

RMSE for training: 0.908980456282672
RMSE for test    : 1.2703282700393037
