In [44]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge
from sklearn import linear_model

# Read dataset

In [45]:
# Load the user profile table (pipe-separated; column names are supplied here)
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)  # DataFrame, shape = (943, 5)
n_users = len(users)  # one row per user
print('Number of users:', n_users)
users.head()  # preview of the first rows

Number of users: 943


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [46]:
# Load the rating splits (tab-separated): ua.base for training, ua.test for testing
r_cols = ['users_id', 'movie_id', 'rating', 'unix_timestamp']

rating_base = pd.read_csv(
    'ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1'
)
rating_test = pd.read_csv(
    'ml-100k/ua.test', sep='\t', names=r_cols, encoding='latin-1'
)

# Plain ndarrays are what the helper functions below index into
rate_train = rating_base.values  # ndarray, shape = (90570, 4)
rate_test = rating_test.values  # ndarray, shape = (9430, 4)

print('Number of traing rates:', len(rate_train))
print('Number of test rates:', len(rate_test))

rating_base.head()

Number of traing rates: 90570
Number of test rates: 9430


Unnamed: 0,users_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


# Item profiles

In [47]:
# Load the item table: five descriptive columns followed by 19 genre flags
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

items = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)  # DataFrame, shape = (1682, 24)
n_items = len(items)  # one row per movie
print('Number of items:', n_items)
items.head()

Number of items: 1682


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [48]:
# Keep only the trailing 19 columns of the item table: the genre indicator flags
X0 = items.values
X_train_counts = X0[:, -19:]  # shape = (1682, 19)
first_item_genres = X_train_counts[0]  # genre flags of the first movie
print(first_item_genres)

[0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [57]:
def get_items_rated_by_user(rate_matrix, user_id):
    """
    Collect the items one user has rated, together with the ratings given.

    Parameters:
        rate_matrix (2-d array): shape = (total no. of ratings, 4)
            cols = (user_id, item_id, rating, time_stamp); only the first
            3 columns are used, and both ids are 1-based in this matrix
        user_id (int): 0-based index of the user (stored as user_id + 1)
    Returns:
        item_ids (1-d array): 0-based indices of the items rated by the user
        scores (1-d array): the ratings matching item_ids, in row order
    """
    # rows of the rating matrix that belong to this user
    # (+1 because user ids in rate_matrix start from 1)
    rated_by_user = rate_matrix[:, 0] == user_id + 1

    # shift item ids to 0-based so they can index arrays directly
    item_ids = rate_matrix[rated_by_user, 1] - 1
    scores = rate_matrix[rated_by_user, 2]

    return (item_ids, scores)

In [50]:
# Turn the genre indicator counts into TF-IDF feature vectors (L2-normalised rows)
transformer = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf_sparse = transformer.fit_transform(X_train_counts.tolist())  # csr_matrix, shape = (1682, 19)
print(tfidf_sparse[0])
# Dense copy, used for row indexing and matrix products further down
tfidf = tfidf_sparse.toarray()  # ndarray, shape = (1682, 19)
print(tfidf[0])

  (0, 5)	0.3494185741533606
  (0, 4)	0.5738720889486926
  (0, 3)	0.7406601687429238
[0.         0.         0.         0.74066017 0.57387209 0.34941857
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.        ]


# Models for Users

In [62]:
d = tfidf.shape[1]  # feature dimension, = 19
W = np.zeros((d, n_users))  # one weight column per user, shape = (19, 943)
b = np.zeros((1, n_users))  # one intercept per user, shape = (1, 943)

# Fit an independent ridge regression for every user on the items they rated
for user_idx in range(n_users):
    rated_items, rated_scores = get_items_rated_by_user(rate_train, user_idx)
    user_model = Ridge(alpha=0.01, fit_intercept=True)
    user_model.fit(tfidf[rated_items, :], rated_scores)

    W[:, user_idx] = user_model.coef_
    b[0, user_idx] = user_model.intercept_

# Predicted rating of every item by every user; b broadcasts over the item rows
Yhat = tfidf.dot(W) + b  # shape = (n_items, n_users) = (1682, 943)

In [52]:
# Example for the user at 0-based index 10 (stored as users_id 11 in the rating files)
n = 10
item_ids_ex, scores_ex = get_items_rated_by_user(rate_test, n)
predicted_ex = Yhat[item_ids_ex, n]  # model output for the same held-out items
print('Rated movie ids:', item_ids_ex)
print('True ratings:', scores_ex)
print('Predicted ratings:', predicted_ex)

Rated movie ids: [ 37 109 110 226 424 557 722 724 731 739]
True ratings: [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18348495 3.1328256  3.42107302 3.091764   3.34744746 5.20450987
 4.01304057 3.34744746 3.42107302 3.71623166]


# Model Evaluation

In [64]:
def evaluate(Yhat, rate_matrix):
    """
    Compute the RMSE between predicted and observed ratings.

    Parameters:
        Yhat (2d-array): shape = (n_items, n_users) = (1682, 943);
            Yhat[i, u] is the predicted rating of 0-based item i by 0-based user u
        rate_matrix (2-d array): shape = (total_ratings, 4)
            cols = (user_id, item_id, rating, time_stamp), with 1-based ids
    Returns:
        RMSE (scalar) between Yhat and true scores from rate_matrix.
        Ratings whose user has no column in Yhat are ignored; nan is
        returned when there is no rating left to score.
    """
    Yhat = np.asarray(Yhat)
    rate_matrix = np.asarray(rate_matrix)

    # The number of users is taken from Yhat itself, so the function does not
    # rely on a notebook-level global being defined and in sync.
    n_users_pred = Yhat.shape[1]

    # Convert the 1-based ids to 0-based indices in a single pass over the
    # ratings instead of rescanning the whole matrix once per user.
    user_idx = rate_matrix[:, 0].astype(int) - 1
    in_range = (user_idx >= 0) & (user_idx < n_users_pred)
    user_idx = user_idx[in_range]
    item_idx = rate_matrix[in_range, 1].astype(int) - 1
    scores_truth = rate_matrix[in_range, 2]

    cnt = user_idx.shape[0]
    if cnt == 0:
        # nothing to evaluate: make the undefined result explicit
        return np.nan

    e = scores_truth - Yhat[item_idx, user_idx]
    return np.sqrt((e * e).sum() / cnt)

# Score the fitted per-user models on the training split, then on the held-out split
rmse_train = evaluate(Yhat, rate_train)
rmse_test = evaluate(Yhat, rate_test)
print('RMSE for training:', rmse_train)
print('RMSE for test:', rmse_test)

RMSE for training: 0.908980456282672
RMSE for test: 1.2703282700393035
