In [26]:
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.linear_model import Ridge
from tqdm import tqdm

In [6]:
path = Path.home() / 'OneDrive - Seagroup/ai/kaggle_dataset/movie_lens/ml-100k'

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(path / 'u.user', sep='|', names=u_cols, encoding='latin-1')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_base = pd.read_csv(path / 'ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv(path / 'ua.test', sep='\t', names=r_cols, encoding='latin-1')
ratings_base.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [8]:
i_cols = [
    'movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
    'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

items = pd.read_csv(path / 'u.item', sep='|', names=i_cols, encoding='latin-1')
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
x_train_item = items.to_numpy()[:, -19:]

transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf_item = transformer.fit_transform(x_train_item).toarray()
tfidf_item.shape

(1682, 19)

In [24]:
def get_items_rated_by_user(rate_matrix, user_id):
    y = rate_matrix[:, 0]
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    return item_ids, scores

get_items_rated_by_user(ratings_base.to_numpy(), 2)

(array([180, 257, 259, 263, 267, 270, 271, 287, 298, 299, 301, 302, 306,
        316, 317, 318, 319, 320, 321, 323, 324, 325, 326, 328, 329, 332,
        335, 337, 338, 339, 340, 341, 343, 344, 345, 346, 347, 348, 349,
        350, 351, 352, 353, 354], dtype=int64),
 array([4, 2, 4, 2, 3, 3, 2, 2, 3, 2, 2, 3, 3, 2, 4, 2, 5, 5, 3, 2, 1, 2,
        4, 4, 2, 2, 1, 2, 3, 5, 1, 4, 4, 3, 5, 5, 4, 3, 3, 3, 2, 1, 3, 3],
       dtype=int64))

In [27]:
n_users = users.shape[0]
d = tfidf_item.shape[1]
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in tqdm(range(n_users)):
    ids, scores = get_items_rated_by_user(ratings_base.to_numpy(), n)
    clf = Ridge(alpha=0.01, fit_intercept=True)
    Xhat = tfidf_item[ids, :]

    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

100%|██████████| 943/943 [00:00<00:00, 2660.94it/s]


In [32]:
W.shape, b.shape

((19, 943), (1, 943))

In [50]:
n = 10
Yhat = tfidf_item.dot(W) + b

ids, scores = get_items_rated_by_user(ratings_test.to_numpy(), n)
name = items[items['movie id'].isin(ids.tolist())]['movie title'].to_numpy().tolist()

print(f'Rated movies: {name}')
print(f'True ratings: {scores}')
print(f'Predicted ratings: {Yhat[ids, n]}')

Rated movies: ['Nadja (1994)', 'Mystery Science Theater 3000: The Movie (1996)', 'Operation Dumbo Drop (1995)', 'Die Hard 2 (1990)', 'Children of the Corn: The Gathering (1996)', 'Farinelli: il castrato (1994)', 'Nine Months (1995)', 'Circle of Friends (1995)', 'Corrina, Corrina (1994)', 'Pretty Woman (1990)']
True ratings: [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]
