In [1]:
import pandas as pd

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [2]:
users['occupation'].nunique()

21

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
rating_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols)
rating_test = pd.read_csv('ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = rating_base.values
rate_test = rating_test.values

print(rate_train.shape)
print(rate_test.shape)

(90570, 4)
(9430, 4)


In [6]:
genre = pd.read_csv('ml-100k/u.genre')
genre

Unnamed: 0,unknown|0
0,Action|1
1,Adventure|2
2,Animation|3
3,Children's|4
4,Comedy|5
5,Crime|6
6,Documentary|7
7,Drama|8
8,Fantasy|9
9,Film-Noir|10


In [8]:
i_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 
          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols)
items

Unnamed: 0,movie_id,movie_title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X0 = items.values
Xtr_counts = X0[:, -19:]
Xtr_counts.shape

(1682, 19)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = transformer.fit_transform(Xtr_counts.tolist()).toarray()

In [16]:
import numpy as np
def get_items_rates_by_user(rate_matrix, user_id):
    """
    in each line of rate_matrix, we have infor: user_id, item_id, rating (scores), time_stamp
    we care about the first three values
    return(item_ids, scores) rated by user_id
    """
    y = rate_matrix[:, 0] # all users
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return(item_ids, scores)

In [17]:
tfidf.shape

(1682, 19)

In [20]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

d = tfidf.shape[1]
W = np.zeros((d, users.shape[0]))
b = np.zeros((1, users.shape[0]))

for n in range(users.shape[0]):
    ids, scores = get_items_rates_by_user(rate_train, n)
    clf = Ridge(alpha=0.01, fit_intercept=True)
    Xhat = tfidf[ids, :]
    
    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

In [21]:
# after computed W and b parameter, ratings for each items that predictable by:
Yhat = tfidf.dot(W) + b

In [22]:
n = 10
np.set_printoptions(precision=2)
ids, scores = get_items_rates_by_user(rate_test, n)
Yhat[n, ids]

print('Rated movies ids: ', ids)
print('True ratings: ', scores)
print('Predicted ratings: ', Yhat[ids, n])

Rated movies ids:  [ 37 109 110 226 424 557 722 724 731 739]
True ratings:  [3 3 4 3 4 3 5 3 3 4]
Predicted ratings:  [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]


In [25]:
import math
# use Root Mean Squared Error (RMSE)
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(users.shape[0]):
        ids, scores_truth = get_items_rates_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred # error
        se += (e*e).sum(axis=0)
        cnt += e.size
    return math.sqrt(se/cnt)

print('RMSE for training: ', evaluate(Yhat, rate_train, W, b))
print('RMSE for test: ', evaluate(Yhat, rate_test, W, b))

RMSE for training:  0.908980456282672
RMSE for test:  1.270328270039304
