In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd

# Load the user table (pipe-separated, no header row).
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
n_users = len(users)
print("Number of users: ", n_users)

# Load the train / test rating splits (tab-separated, no header row).
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base, ratings_test = (
    pd.read_csv(path, sep='\t', names=r_cols)
    for path in ('ml-100k/ua.base', 'ml-100k/ua.test')
)

# Plain NumPy copies of both splits; columns follow the order of r_cols.
rate_train, rate_test = np.array(ratings_base), np.array(ratings_test)

print(ratings_base.shape)
print(ratings_test.shape)

Number of users:  943
(90570, 4)
(9430, 4)


In [33]:
# First training record: user_id, movie_id, rating, unix_timestamp.
print(rate_train[0])

[        1         1         5 874965758]


In [2]:
# Load the item (movie) table: five descriptive columns followed by
# 19 binary genre indicator columns.
i_cols = [
    'movie_id', 'movie_title', 'realease_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# NOTE(review): the stock MovieLens u.item file is reportedly Latin-1 encoded;
# if this read raises UnicodeDecodeError, pass encoding='latin-1' -- confirm
# against the local copy of the data.
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols)

n_items = len(items)
print("Number of items: ", n_items)

# Keep only the trailing 19 genre columns as the raw per-movie count matrix.
X0 = np.array(items)
X_train_counts = X0[:, -19:]

print(X_train_counts)



Number of items:  1682
[[0 0 0 ... 0 0 0]
 [0 1 1 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
# Scratch check: a negative start in a column slice keeps the trailing columns.
a = np.array([
    [1, 2, 3, 4, 5, 6],
    [2, 3, 4, 5, 6, 7],
])
a[:, -2:]

array([[5, 6],
       [6, 7]])

In [3]:
# Build the feature matrix for the movies from their genre indicators,
# using TfidfTransformer from scikit-learn (smoothed idf, l2 norm).

from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
genre_rows = X_train_counts.tolist()
X = transformer.fit_transform(genre_rows).toarray()

# Feature vector of the first movie.
print(X[0])

[0.         0.         0.         0.74066017 0.57387209 0.34941857
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.        ]


In [4]:
def get_items_rated_by_user(rate_matrix, user_id):
    """Return the items rated by one user together with the ratings given.

    Parameters
    ----------
    rate_matrix : 2-D array
        One rating per row; column 0 is the 1-based user id, column 1 the
        1-based item id and column 2 the rating.
    user_id : int
        0-based user index (the user whose id in ``rate_matrix`` is
        ``user_id + 1``).

    Returns
    -------
    tuple
        ``(items_ids, scores)``: the 0-based item indices rated by the user
        and the matching ratings, in row order of ``rate_matrix``.
    """
    # Ids in the data start at 1 while Python indices start at 0, hence the +1.
    rated_by_user = rate_matrix[:, 0] == user_id + 1
    items_ids = rate_matrix[rated_by_user, 1] - 1  # shift item ids to 0-based
    scores = rate_matrix[rated_by_user, 2]
    return items_ids, scores

In [5]:
# Now find a weight vector (and bias) for every user with Ridge Regression.
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = X.shape[1]  # feature dimension
W = np.zeros((d, n_users))  # column n holds the coefficients of user n
b = np.zeros(n_users)       # entry n holds the intercept of user n

for n in range(n_users):
    # Fit one regression per user on the movies that user rated in training.
    ids, scores = get_items_rated_by_user(rate_train, n)
    Xhat = X[ids, :]
    model = Ridge(alpha=0.1, fit_intercept=True).fit(Xhat, scores)
    W[:, n], b[n] = model.coef_, model.intercept_

# Predicted rating of every movie (rows) for every user (columns).
Yhat = np.dot(X, W) + b


In [6]:
# Spot-check one user (0-based index) on the training ratings.
n = 121
np.set_printoptions(precision=2)
ids, scores = get_items_rated_by_user(rate_train, n)
for label, values in (
    ('Rated movie ids: ', ids),
    ('True ratings: ', scores),
    ('Predicted ratings: ', Yhat[ids, n]),
):
    print(label, values)

Rated movie ids:  [  10   27   45   56   68   69   82   85  126  134  174  179  186  189
  190  192  196  213  238  268  356  381  402  422  428  463  468  469
  510  512  518  552  569  581  659  660  672  698  723  726  735  736
  791  955 1043 1044 1073 1112 1118 1167 1266]
True ratings:  [1 4 5 2 2 5 5 5 5 4 5 5 4 4 5 4 5 2 4 5 3 3 4 4 3 5 5 3 5 4 4 3 3 5 3 4 3
 5 4 4 4 4 3 4 5 4 4 5 3 4 4]
Predicted ratings:  [1.8  3.84 4.12 4.12 3.93 4.07 4.07 4.12 4.29 3.88 4.33 3.96 4.29 3.96
 5.15 4.12 4.08 2.35 3.4  4.11 4.12 4.04 4.34 4.03 4.25 4.12 4.12 3.36
 4.12 3.8  4.37 4.08 3.36 4.08 4.12 3.36 3.02 4.12 4.08 4.08 4.08 4.04
 4.11 4.12 4.04 4.12 4.04 4.12 4.08 4.12 4.12]


In [7]:
# Evaluate the model
def evaluate(Yhat, rates, W=None, b=None):
    """Return the root-mean-square error of ``Yhat`` on the ratings in ``rates``.

    Parameters
    ----------
    Yhat : 2-D array
        Predicted ratings indexed as ``Yhat[item_index, user_index]`` with
        0-based indices.
    rates : 2-D array
        One rating per row; column 0 is the 1-based user id, column 1 the
        1-based item id and column 2 the true rating.
    W, b : optional
        Unused. Accepted only so existing calls of the form
        ``evaluate(Yhat, rates, W, b)`` keep working.

    Returns
    -------
    float
        The RMSE over all rows of ``rates`` whose user has a column in
        ``Yhat``; ``nan`` when there is no such row.
    """
    rates = np.asarray(rates)
    if rates.size == 0:
        return float('nan')

    # Ids in the data start at 1; convert to 0-based indices into Yhat.
    user_idx = rates[:, 0] - 1
    item_idx = rates[:, 1] - 1

    # Only score users that have a prediction column. The number of users is
    # taken from Yhat itself instead of a notebook-level global.
    known_user = (user_idx >= 0) & (user_idx < Yhat.shape[1])
    if not known_user.any():
        return float('nan')

    # One vectorised lookup replaces a full scan of `rates` per user.
    err = rates[known_user, 2] - Yhat[item_idx[known_user], user_idx[known_user]]
    return np.sqrt(np.mean(err * err))

# Report the error on both splits with the same prediction matrix.
for _report_fmt, _split in (
    ("RMSE for training: %.2f", rate_train),
    ("RMSE for test: %.2f", rate_test),
):
    print(_report_fmt % evaluate(Yhat, _split, W, b))

RMSE for training: 0.91
RMSE for test: 1.14


In [34]:
# Scratch check: np.where on a boolean mask returns a tuple of index arrays.
a = np.array([1, 2, 3, 4, 5])
is_even = a % 2 == 0
print(np.where(is_even))

(array([1, 3]),)
