In [None]:
# -*- coding: utf-8 -*-

import torch
import pandas as pd
import numpy as np

# Tensor settings used when the parameter tensors are created further down.
device = torch.device('cpu')
dtype = torch.float32


In [None]:
# Load the rating rows plus the first 100 entries of the user and movie arrays.
rating = np.load('the-movies-dataset/numpy/small_ratings.npy')
users = np.load('the-movies-dataset/numpy/users.npy')[:100]
movies = np.load('the-movies-dataset/numpy/movies.npy')[:100]

# Problem sizes; k is the width of each row of the factor matrices p and q.
k = 5
users_num = len(users)
movies_num = len(movies)
rating_len = len(rating)

# Sanity output, one value per line: max of column 2, max of column 0,
# the last valid position in `users`, and the number of rating rows.
print(rating[:, 2].max(), rating[:, 0].max(), len(users) - 1, rating_len, sep='\n')

In [None]:
# normalization: shift column 2 by 2.5 and scale by 2.5, so a value of 0 maps
# to -1 and a value of 5 maps to +1.
# NOTE: this edits `rating` in place, so re-running this cell rescales the
# already-rescaled values again; reload `rating` before re-running.
rating[:, 2] -= 2.5
rating[:, 2] /= 2.5

# latent factors: one k-wide row per user (p) and per movie (q),
# drawn from a standard normal and shrunk by a factor of 10
p = torch.randn(users_num, k, device=device, dtype=dtype) / 10
q = torch.randn(movies_num, k, device=device, dtype=dtype) / 10

# bias: one entry per (user, movie) pair, created with the same device/dtype
# as p and q so all parameters live together and share one precision
b = torch.zeros(users_num, movies_num, device=device, dtype=dtype)
# global mean of the rescaled ratings; cast to `dtype` so that adding it to
# the other 0-dim terms does not promote predictions to float64
mean = torch.tensor(rating[:, 2].mean(), device=device, dtype=dtype)

print(rating[:, 2].max())
print(mean)

In [None]:
# r̂_ui = mean + b[u][i] + q_i · p_u
def predict(u, i):
    '''
    Predicted rating for one (user, movie) pair.

    Adds the global `mean`, the bias entry `b[u][i]` and the inner product
    of the factor rows `p[u]` and `q[i]`. All four are read from
    module-level state.

    params
        u: row index into `p` and `b`
        i: row index into `q`, column index into `b`
    returns
        0-dim tensor holding the prediction
    '''
    user_row = p[u][None, :]        # shape (1, k)
    movie_col = q[i][None, :].t()   # shape (k, 1)
    interaction = torch.mm(user_row, movie_col)[0, 0]
    return mean + b[u][i] + interaction

# Quick check: show the prediction for user index 3 and movie index 0.
predict(3, 0)

In [None]:
def loss(u, i, r, rr=0.02):
    '''
    Regularised squared error for one (user, movie, rating) sample:

        (r - predict(u, i))^2 + rr * (b[u][i]^2 + |p[u]|^2 + |q[i]|^2)

    The penalty uses *squared* L2 norms. An SGD step of the form
    `x += lr * (err * y - rr * x)` is the gradient of a squared-norm
    penalty, so this keeps the reported loss in line with such updates.
    `predict`, `b`, `p` and `q` are read from module-level state.

    params
        u: user id
        i: movie id
        r: rating
        rr: regularization_rate
    returns
        0-dim tensor holding the loss
    '''
    residual = r - predict(u, i)
    penalty = b[u][i] ** 2 + (p[u] ** 2).sum() + (q[i] ** 2).sum()
    return residual ** 2 + rr * penalty

# Quick check: show the loss for user index 0, movie index 0 against a rating value of 3.
loss(0, 0, 3)

In [None]:
def iter_rating(rating, max_len=None):
    '''
    Yield `(int(row[0]), int(row[1]), row[2])` for each row of `rating`.

    params
        rating: iterable of rows that can be indexed at positions 0, 1 and 2
        max_len: maximum number of rows to yield; None means no limit and
                 0 yields nothing
    '''
    for idx, row in enumerate(rating):
        # Compare against None explicitly so that max_len=0 is honoured
        # instead of being mistaken for "no limit".
        if max_len is not None and idx >= max_len:
            break
        yield int(row[0]), int(row[1]), row[2]
# Peek at the first (user, movie, rating) triple.
next(iter_rating(rating))

In [None]:
# Number of passes over the rating rows made by the training loop.
epoch = 250


def lr(epoch):
    '''
    Step-wise learning-rate schedule.

    params
        epoch: epoch index
    returns
        1e-4 when epoch > 200, 1e-2 when epoch > 100, otherwise 1e-1
    '''
    # Thresholds are checked from the highest down; the first match wins.
    for threshold, rate in ((200, 1e-4), (100, 1e-2)):
        if epoch > threshold:
            return rate
    return 1e-1

def train(p, q, b, lr_func, rr=0.02, verbose=False):
    '''
    Run stochastic gradient descent over the module-level `rating` rows for
    `epoch` passes, updating `p`, `q` and `b` in place.

    `predict` and `loss` read the module-level `p`, `q` and `b`, so the
    tensors passed in here should be those same objects; otherwise the error
    term is computed from parameters that are not the ones being updated.

    params
        p: user factor tensor, indexed as p[u]
        q: movie factor tensor, indexed as q[i]
        b: bias tensor, indexed as b[u][i]
        lr_func: callable mapping the epoch index to a learning rate
        rr: regularization_rate
        verbose: when true, print the mean sample loss every 10th epoch
    returns
        the same (p, q, b) objects that were passed in
    '''
    for ep in range(epoch):
        # Visit the samples in a fresh random order each pass (shuffles the
        # module-level array in place).
        np.random.shuffle(rating)
        loss_sum = 0.0
        seen = 0
        step = lr_func(ep)

        for u, i, r in iter_rating(rating):
            r = torch.tensor(r)
            # Loss is recorded before the parameters move; float() keeps the
            # running sum a plain Python number.
            loss_sum += float(loss(u, i, r, rr=rr))
            eui = r - predict(u, i)

            # Both factor rows must step from their pre-update values, so
            # keep a copy of p[u] for the q[i] update.
            pu_old = p[u].clone()
            p[u] += step * (eui * q[i] - rr * pu_old)
            q[i] += step * (eui * pu_old - rr * q[i])

            b[u][i] += step * (eui - rr * b[u][i])

            seen += 1

        # Test the loop counter `ep`, not the global epoch total, so that a
        # line is actually emitted every 10th pass.
        if verbose and (ep + 1) % 10 == 0:
            print('epoch=%03d, loss=%.4f' % (ep, loss_sum / max(seen, 1)))

    return p, q, b

# Train with the step-wise `lr` schedule and rebind p, q, b to the returned tensors.
p, q, b = train(p, q, b, lr, verbose=True)

In [None]:
# Compare stored ratings with model output for the first 20 rows of `rating`.
for u, i, r in iter_rating(rating, 20):
    predicted = float(predict(u, i))
    print('target: % .2f' % r, 'predict: % .2f' % predicted)

In [None]:
# predict user
user_id = 20


def _by_score(pair):
    """Sort key: the second element of an (index, score) pair."""
    return pair[1]


# (movie index, stored rating) for every row belonging to `user_id`, best first.
movie_rates = sorted(
    ((i, r) for u, i, r in iter_rating(rating) if u == user_id),
    key=_by_score,
    reverse=True,
)

# (movie index, predicted rating) for every movie index, best first.
movie_predict_rates = sorted(
    ((m, float(predict(user_id, m))) for m in range(movies_num)),
    key=_by_score,
    reverse=True,
)

In [None]:
import ast
import json  # no longer used in this cell; kept in case other cells rely on it — TODO confirm and drop

df = pd.read_csv('the-movies-dataset/movies_metadata.csv')


def _genre_names(raw):
    """Return the 'name' of each entry in a stringified list of dicts.

    The cell text is parsed with ast.literal_eval. Swapping single quotes
    for double quotes and parsing as JSON fails as soon as any value
    contains an apostrophe; literal_eval reads the single-quoted form
    directly (presumably the column holds Python-literal text — verify
    against the CSV).

    A non-string cell (for example NaN for a missing value) yields an empty
    list, so every row of the table still produces exactly one entry.
    """
    if not isinstance(raw, str):
        return []
    return [entry['name'] for entry in ast.literal_eval(raw)]


# One dict per row of `df`, in row order, so the list can be indexed by row
# position. Columns are selected by name rather than by position.
movie_data = [
    {'title': title, 'genres': _genre_names(genres)}
    for title, genres in zip(df['original_title'], df['genres'])
]


In [None]:
# Movies this user rated above 0.5 on the rescaled scale, shown on the original scale.
print('User ', users[user_id])
print('from rating, he/she likes:')
print('%s %16s %25s %43s' % ('movie_id', 'rating', 'movie_title', 'movie_genres'))
for m, r in movie_rates:
    if not (r > 0.5):
        continue
    # movies[m] - 1 is used as a row position in movie_data
    # (presumably the stored ids are 1-based — verify against the data files)
    mid = movies[m]-1
    r = r * 2.5 + 2.5  # undo the rescaling
    info = movie_data[mid]
    print('%8s %16s %25s %43s' % (mid, r, info['title'], info['genres']))

# Top five predictions for the same user, again on the original scale.
print('')
print('from rating, he/she might like:')
print('%s %16s %25s %43s' % ('movie_id', 'rating_predict', 'movie_title', 'movie_genres'))
for m, r in movie_predict_rates[:5]:
    mid = movies[m]-1
    r = r * 2.5 + 2.5
    info = movie_data[mid]
    shown = '%.2f' % r
    print('%8s %16s %25s %43s' % (mid, shown, info['title'], info['genres']))