In [2]:
# -*- coding: utf-8 -*-

import torch
import pandas as pd
import numpy as np

dtype = torch.float
device = torch.device('cpu')


In [3]:
rating = np.load('the-movies-dataset/numpy/small_ratings.npy')
users = np.load('the-movies-dataset/numpy/users.npy')[:100]
movies = np.load('the-movies-dataset/numpy/movies.npy')[:100]

users_num, movies_num, k = len(users), len(movies), 5
rating_len = len(rating)

print(rating[:, 2].max())
print(rating[:, 0].max())
print(len(users)-1)
print(rating_len)

5.0
99.0
99
624


In [4]:
# normalization
rating[:, 2] -= 2.5
rating[:, 2] /= 2.5

# thita
p = torch.randn(users_num, k, device=device, dtype=dtype) / 10
q = torch.randn(movies_num, k, device=device, dtype=dtype) / 10

# bias
b = torch.zeros(users_num, movies_num)
mean = torch.tensor(rating[:, 2].mean())

print(rating[:, 2].max())
print(mean)

1.0
tensor(0.4282)


In [5]:
# r̂ ui = mean + bu + bi + qTipu
def predict(u, i):
    pu = p[u].view(1, p[u].size()[0])
    qi = q[i].view(1, q[i].size()[0])
    pq = torch.mm(pu, qi.t())[0][0]
    return  mean + b[u][i] + pq

predict(3, 0)

tensor(0.4000)

In [6]:
def loss(u, i, r, rr=0.02):
    '''
    params
        u: user id
        i: movie id
        r: rating
        rr: regularization_rate
    '''
    return (r - predict(u, i)) ** 2 + rr * (
        torch.norm(b[u][i], 2) + torch.norm(p[u], 2) + torch.norm(q[i],2))

loss(0, 0, 3)

tensor(6.6527)

In [7]:
def iter_rating(rating, max_len=None):
    for idx, i in enumerate(rating):
        if max_len and idx >= max_len:
            break
        yield int(i[0]), int(i[1]), i[2]
iter_rating(rating).__next__()

(0, 30, 0.0)

In [9]:
epoch = 250

def lr(epoch):
    if epoch > 200:
        return 1e-4
    if epoch > 100:
        return 1e-2
    return 1e-1

def train(p, q, b, lr_func, rr=0.02, verbose=False):
    
    for ep in range(epoch):
        np.random.shuffle(rating)
        loss_sum = 0
        lr = lr_func(ep)

        for u, i, r in iter_rating(rating):
            r = torch.tensor(r)
            loss_ui = loss(u, i, torch.tensor(r), rr=rr)
            eui = r - predict(u, i)

            p[u] += lr * (eui * q[i] - rr * p[u])
            q[i] += lr * (eui * p[u] - rr * q[i])

            b[u][i] += lr * (eui - rr * b[u][i])
            
            loss_sum += loss_ui

        if verbose:
            if (epoch + 1) % 10 == 0:
                print('epoch=%03d, loss=%.4f' % (ep, loss_sum / rating_len))
    
    return p, q, b

p, q, b = train(p, q, b, lr, verbose=True)     

epoch=000, loss=0.1756
epoch=001, loss=0.1411
epoch=002, loss=0.1142
epoch=003, loss=0.0928
epoch=004, loss=0.0761
epoch=005, loss=0.0629
epoch=006, loss=0.0528
epoch=007, loss=0.0450
epoch=008, loss=0.0390
epoch=009, loss=0.0345
epoch=010, loss=0.0310
epoch=011, loss=0.0283
epoch=012, loss=0.0263
epoch=013, loss=0.0246
epoch=014, loss=0.0234
epoch=015, loss=0.0224
epoch=016, loss=0.0215
epoch=017, loss=0.0209
epoch=018, loss=0.0203
epoch=019, loss=0.0199
epoch=020, loss=0.0195
epoch=021, loss=0.0191
epoch=022, loss=0.0189
epoch=023, loss=0.0186
epoch=024, loss=0.0184
epoch=025, loss=0.0182
epoch=026, loss=0.0180
epoch=027, loss=0.0178
epoch=028, loss=0.0176
epoch=029, loss=0.0175
epoch=030, loss=0.0173
epoch=031, loss=0.0172
epoch=032, loss=0.0170
epoch=033, loss=0.0169
epoch=034, loss=0.0167
epoch=035, loss=0.0166
epoch=036, loss=0.0165
epoch=037, loss=0.0164
epoch=038, loss=0.0162
epoch=039, loss=0.0161
epoch=040, loss=0.0160
epoch=041, loss=0.0159
epoch=042, loss=0.0158
epoch=043, 

In [10]:
for u, i, r in iter_rating(rating, 20):
    print('target: % .2f' % r, 'predict: % .2f' % float(predict(u, i)))

target:  0.40 predict:  0.40
target:  0.20 predict:  0.21
target:  0.40 predict:  0.40
target:  1.00 predict:  0.98
target:  0.60 predict:  0.60
target:  0.00 predict:  0.01
target:  0.20 predict:  0.20
target:  0.60 predict:  0.60
target:  0.20 predict:  0.20
target:  0.20 predict:  0.20
target:  0.60 predict:  0.59
target:  0.60 predict:  0.60
target:  0.20 predict:  0.21
target:  0.00 predict:  0.01
target:  0.60 predict:  0.60
target:  1.00 predict:  0.99
target:  0.60 predict:  0.59
target:  0.00 predict:  0.01
target:  0.40 predict:  0.40
target:  0.20 predict:  0.21


In [11]:
# predict user
user_id = 20
movie_rates = []
movie_predict_rates = []

for u, i, r in iter_rating(rating):
    if u == user_id:
        movie_rates.append((i, r))

for m in range(movies_num):
    movie_predict_rates.append((m, float(predict(user_id, m))))
    
movie_rates.sort(key=lambda x: x[1], reverse=True)
movie_predict_rates.sort(key=lambda x: x[1], reverse=True)

In [12]:
import json
movie_data = []
df = pd.read_csv('the-movies-dataset/movies_metadata.csv')

for index, row in df.iloc[:, [3, 8]].iterrows():
    movie_data += [{'title': row['original_title'], 'genres': [x['name'] for x in json.loads(row['genres'].replace('\'', '"'))]}]
# movie_data


  interactivity=interactivity, compiler=compiler, result=result)


In [41]:
print('User ', users[user_id])
print('from rating, he/she likes:')
print('%s %16s %25s %43s' % ('movie_id', 'rating', 'movie_title', 'movie_genres'))
for m, r in movie_rates:
    if r > 0.5:
        r = r * 2.5 + 2.5
        mid = movies[m]-1
        print('%8s %16s %25s %43s' % (mid, r, movie_data[mid]['title'], movie_data[mid]['genres']))

print('')
print('from rating, he/she might like:')
print('%s %16s %25s %43s' % ('movie_id', 'rating_predict', 'movie_title', 'movie_genres'))
for m, r in movie_predict_rates[:5]:
    mid = movies[m]-1
    r = r * 2.5 + 2.5
    print('%8s %16s %25s %43s' % (mid, '%.2f' % r, movie_data[mid]['title'], movie_data[mid]['genres']))

User  21
from rating, he/she likes:
movie_id           rating               movie_title                                movie_genres
      33              4.0                      Babe    ['Fantasy', 'Drama', 'Comedy', 'Family']
      31              4.0            Twelve Monkeys  ['Science Fiction', 'Thriller', 'Mystery']
      46              4.0                     Se7en            ['Crime', 'Mystery', 'Thriller']

from rating, he/she might like:
movie_id   rating_predict               movie_title                                movie_genres
      29             4.47                 摇啊摇，摇到外婆桥                          ['Drama', 'Crime']
      72             4.32            Les misérables                        ['Drama', 'History']
      49             4.11        The Usual Suspects              ['Drama', 'Crime', 'Thriller']
      39             4.10  Cry, the Beloved Country                                   ['Drama']
      15             4.08                    Casino                

In [36]:
0.2 * 2.5 + 2.5

3.0