In [1]:
import pandas as pd
import numpy as np

#Read user file:
u_cols=['user_id', 'age', 'sex', 'occupation', 'zip_code']
users=pd.read_csv('ml-100k/u.user',sep="|",names=u_cols,encoding='latin-1')
n_users = users.shape[0]
print('Number of users: ', n_users)

#Read ratings file:
r_cols=['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.to_numpy()
rate_test = ratings_test.to_numpy()

print('Number of traing rates: ', rate_train.shape[0])
print('Number of test rates: ', rate_test.shape[0])

Number of users:  943
Number of traing rates:  90570
Number of test rates:  9430


In [2]:
#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

n_items = items.shape[0]
print('Number of items: ', n_items)

Number of items:  1682


In [3]:
# Build Item Profile with Movie Genres, they are the last 19 columns
X0 = np.asmatrix(items)
X_train_counts = X0[:,-19:]

In [4]:
print(rate_train[:4, :])

[[        1         1         5 874965758]
 [        1         2         3 876893171]
 [        1         3         4 878542960]
 [        1         4         3 876893119]]


In [5]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
X = transformer.fit_transform(X_train_counts.tolist()).toarray()

In [6]:
def get_items_rated_by_users(rate_matrix,user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    """
    y=rate_matrix[:,0] # all users
    # item indices rated by user_id
    # we need to +1 to user_id since in the rate_matrix, id starts from 1 
    # while index in python starts from 0
    ids = np.where(y == user_id +1)[0] 
    item_ids = rate_matrix[ids, 1] - 1 # index starts from 0 
    scores = rate_matrix[ids, 2]
    return (item_ids, scores)

In [7]:
from sklearn.linear_model import Ridge
from sklearn import linear_model
d=X.shape[1] #data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))
for n in range(n_users):
    ids, scores = get_items_rated_by_users(rate_train,n)
    if(len(ids)==0):
        W[:,n]=0
        b[0,n]=0
        continue
    clf=Ridge(alpha=0.01, fit_intercept=True)
    Xhat=X[ids,:]
    clf.fit(Xhat,scores)
    W[:,n]=clf.coef_
    b[0,n]=clf.intercept_

In [8]:
#predicted scores
Yhat=X.dot(W)+b

In [10]:
n = 30
np.set_printoptions(precision=2) # 2 digits after .
ids, scores = get_items_rated_by_users(rate_test, n)
print('Rated movies ids : ', ids )
print('True ratings : ', scores)
print('Predicted ratings: ', Yhat[ids, n])

Rated movies ids :  [134 301 320 483 492 497 503 681 704 885]
True ratings :  [4 4 4 5 5 4 5 2 5 2]
Predicted ratings:  [3.42 3.56 4.7  3.82 3.66 3.53 4.19 3.67 4.03 3.64]


In [14]:
from math import sqrt
def evaluate(Yhat, rates, W, b):
    se=0
    cnt=0
    for n in range(n_users):
        ids,scores_truth=get_items_rated_by_users(rates,n)
        scores_pred=Yhat[ids,n]
        e=scores_truth-scores_pred
        se+=np.linalg.norm(e)
        cnt+=e.size
    return sqrt(se/cnt)

print('RMSE for training: ', evaluate(Yhat, rate_train, W, b))
print('RMSE for test : ', evaluate(Yhat, rate_test, W, b))

RMSE for training:  0.27453050104193866
RMSE for test :  0.6113579566395051
