In [1]:
# -*- coding: utf-8 -*-

import torch
import pandas as pd
import numpy as np

dtype = torch.float
device = torch.device('cpu')


In [2]:
rating = np.load('the-movies-dataset/numpy/small_ratings.npy')
users = np.load('the-movies-dataset/numpy/users.npy')[:100]
movies = np.load('the-movies-dataset/numpy/movies.npy')[:100]

users_num, movies_num, k = len(users), len(movies), 10
rating_len = len(rating)

print(rating[:, 2].max())
print(rating[:, 0].max())
print('users:', len(users))
print('movies:', len(movies))
print('ratings:', rating_len)

5.0
99.0
users: 100
movies: 100
ratings: 624


In [3]:
# normalization
rating[:, 2] -= 2.5
rating[:, 2] /= 2.5

# thita
p = torch.randn(users_num, k, device=device, dtype=dtype) / 10
q = torch.randn(movies_num, k, device=device, dtype=dtype) / 10
y = torch.randn(movies_num, k, device=device, dtype=dtype) / 10
w = torch.randn(movies_num, movies_num, device=device, dtype=dtype) / 10
c = torch.randn(movies_num, movies_num, device=device, dtype=dtype) / 10

# bias
bu = torch.zeros(users_num)
bi = torch.zeros(movies_num)
mean = torch.tensor(rating[:, 2].mean())

print(rating[:, 2].max())
print(mean)

1.0
tensor(0.4282)


# Backup Thita

In [5]:
# thita
p_b = p.clone()
q_b = q.clone()
y_b = y.clone()
w_b = w.clone()
c_b = c.clone()

# bias
bu_b = bu.clone()
bi_b = bi.clone()

# Recover Thita

In [119]:
# thita
p = p_b
q = q_b
y = y_b
w = w_b
c = c_b

# bias
bu = bu_b
bi = bi_b

# Start Training

In [6]:
def iter_rating(rating, max_len=None):
    for idx, i in enumerate(rating):
        if max_len and idx >= max_len:
            break
        yield int(i[0]), int(i[1]), i[2]
iter_rating(rating).__next__()

(0, 30, 0.0)

In [7]:
# generate N (implicit feedback)
N = torch.zeros(users_num, movies_num, device=device, dtype=dtype)
R = torch.zeros(users_num, movies_num, device=device, dtype=dtype)
for u, i, r in iter_rating(rating):
    N[u][i] = 1
    R[u][i] = r

number_of_nu = []
for u in range(users_num):
    number_of_nu.append(torch.nonzero(N[u]).size(0))
N

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 1., 0., 0.],
        [1., 0., 1.,  ..., 0., 0., 0.]])

In [8]:
def predict(u, i):
    ''' predict user u to item i's rating
    
    r̂ ui = mean + bu + bi + qTi (pu + sum(yi)) + sum_k(w) + sum_k(c)
    
    params
        u: user id
        i: movie id
    
    '''
    prediction = mean + bu[u] + bi[i]
    
    pu = p[u].view(1, p[u].size()[0])
    qi = q[i].view(1, q[i].size()[0])
    prediction += torch.mm(pu, qi.t())[0][0]
    
    sum_y = torch.zeros(k)
    sum_w = 0
    sum_c = 0
    for j in range(movies_num):
        if N[u][j] == 0: 
            # user didn't rate the movie j
            sum_y += y[j]
            sum_c += c[i][j]
        else:
            sum_w += w[i][j] * (R[u][j] - bu[u] - bi[j])
                
    sum_y /= np.sqrt(number_of_nu[u])  
    sum_y = sum_y.view(1, sum_y.size()[0])
    prediction += torch.mm(pu, sum_y.t())[0][0]
    
    prediction += sum_w / np.sqrt((movies_num - number_of_nu[u]))
    prediction+= sum_c / np.sqrt(number_of_nu[u])
    
    return prediction

predict(3, 0)

tensor(0.2864)

In [9]:
def loss(u, i, r, rr=0.002):
    ''' Loss fuction

    rui - predicted rui + L2 Loss
    
    params
        u: user id
        i: movie id
        r: rating
        rr: regularization_rate   
        
    '''
    sum_y = 0
    sum_w = 0
    sum_c = 0
    
    for j in range(movies_num):
        if N[u][j] == 0: 
            # user didn't rate the movie j
            sum_y += torch.norm(y[j],2)
            sum_c += torch.norm(c[i][j],2)
        else:
            sum_w += torch.norm(w[i][j],2)

    return (r - predict(u, i)) ** 2 + rr * (
        torch.norm(bu[u], 2) + torch.norm(bi[i], 2) +
        torch.norm(q[i], 2) + torch.norm(p[u],2) + sum_y + sum_w + sum_c
    )

loss(0, 0, 3)

tensor(4.3912)

In [11]:
epoch = 200

def lr(epoch):
    if epoch > 300:
        return 1e-4
    if epoch > 200:
        return 5e-3
    return 1e-2

def train(p, q, bi, bu, lr_func, rr=0.2, verbose=False):
    
    for ep in range(epoch):
        np.random.shuffle(rating)
        loss_sum = 0
        lr = lr_func(ep)

        for u, i, r in iter_rating(rating):
            r = torch.tensor(r)
            loss_ui = loss(u, i, torch.tensor(r), rr=rr)
            eui = r - predict(u, i)
#             print(loss_ui, eui)

            sum_y = torch.zeros(k)
            for j in range(movies_num):
                if N[u][j] == 0: 
                    # user didn't rate the movie j
                    sum_y += y[j]
                    c[i][j] += lr * (eui / np.sqrt(number_of_nu[u]) - rr * c[i][j])
                else:
                    w[i][j] += lr * (eui * (R[u][j] - bu[u] -bi[j]) / np.sqrt(movies_num - number_of_nu[u]) - rr * w[i][j])


            q[i] += lr * (eui * (p[u] + sum_y) - rr * q[i]) # TODO
            p[u] += lr * (eui * q[i] - rr * p[u])
            

            bu[u] += lr * (eui - rr * bu[u])
            bi[i] += lr * (eui - rr * bi[i])
            
            loss_sum += loss_ui

        if verbose:
            print('epoch = %3d, loss = %.4f' % (ep + 1, loss_sum / rating_len))
    
    return p, q, bi, bu

p, q, bi, bu = train(p, q, bi, bu, lr, verbose=True)

epoch =   1, loss = 7.2041
epoch =   2, loss = 7.1531
epoch =   3, loss = 7.1077
epoch =   4, loss = 7.0651
epoch =   5, loss = 7.0296
epoch =   6, loss = 6.9913
epoch =   7, loss = 6.9560
epoch =   8, loss = 6.9258
epoch =   9, loss = 6.8931
epoch =  10, loss = 6.8635
epoch =  11, loss = 6.8362
epoch =  12, loss = 6.8069
epoch =  13, loss = 6.7823
epoch =  14, loss = 6.7601
epoch =  15, loss = 6.7360
epoch =  16, loss = 6.7126
epoch =  17, loss = 6.6915
epoch =  18, loss = 6.6704
epoch =  19, loss = 6.6513
epoch =  20, loss = 6.6326
epoch =  21, loss = 6.6151
epoch =  22, loss = 6.5989
epoch =  23, loss = 6.5818
epoch =  24, loss = 6.5650
epoch =  25, loss = 6.5506
epoch =  26, loss = 6.5366
epoch =  27, loss = 6.5211
epoch =  28, loss = 6.5065
epoch =  29, loss = 6.4957
epoch =  30, loss = 6.4842
epoch =  31, loss = 6.4715
epoch =  32, loss = 6.4588
epoch =  33, loss = 6.4492
epoch =  34, loss = 6.4392
epoch =  35, loss = 6.4276
epoch =  36, loss = 6.4189
epoch =  37, loss = 6.4088
e

In [163]:
for u, i, r in iter_rating(rating, 20):
    print('target: % .2f' % r, 'predict: % .2f' % float(predict(u, i)))

target:  0.40 predict:  0.42
target:  0.80 predict:  0.57
target: -0.60 predict: -0.43
target:  0.60 predict:  0.60
target:  0.20 predict:  0.26
target: -0.60 predict: -0.16
target:  0.60 predict:  0.56
target:  0.60 predict:  0.63
target:  0.20 predict:  0.30
target:  0.40 predict:  0.46
target:  1.00 predict:  0.80
target:  0.60 predict:  0.66
target:  1.00 predict:  0.79
target:  0.60 predict:  0.57
target:  0.60 predict:  0.62
target:  0.00 predict:  0.07
target:  0.20 predict:  0.31
target:  0.60 predict:  0.40
target: -0.20 predict: -0.05
target: -0.80 predict: -0.59


In [164]:
# predict user
user_id = 20
movie_rates = []
movie_predict_rates = []

for u, i, r in iter_rating(rating):
    if u == user_id:
        movie_rates.append((i, r))

for m in range(movies_num):
    movie_predict_rates.append((m, float(predict(user_id, m))))
    
movie_rates.sort(key=lambda x: x[1], reverse=True)
movie_predict_rates.sort(key=lambda x: x[1], reverse=True)

In [165]:
import json
movie_data = []
df = pd.read_csv('the-movies-dataset/movies_metadata.csv')

for index, row in df.iloc[:, [3, 8]].iterrows():
    movie_data += [{'title': row['original_title'], 'genres': [x['name'] for x in json.loads(row['genres'].replace('\'', '"'))]}]
# movie_data


In [166]:
print('User ', users[user_id])
print('from rating, he/she likes:')
print('%s %16s %25s %43s' % ('movie_id', 'rating', 'movie_title', 'movie_genres'))
for m, r in movie_rates:
    if r > 0.5:
        r = r * 2.5 + 2.5
        mid = movies[m]-1
        print('%8s %16s %25s %43s' % (mid, r, movie_data[mid]['title'], movie_data[mid]['genres']))

print('')
print('from rating, he/she might like:')
print('%s %16s %25s %43s' % ('movie_id', 'rating_predict', 'movie_title', 'movie_genres'))
for m, r in movie_predict_rates[:5]:
    mid = movies[m]-1
    r = r * 2.5 + 2.5
    print('%8s %16s %25s %43s' % (mid, '%.2f' % r, movie_data[mid]['title'], movie_data[mid]['genres']))

User  21
from rating, he/she likes:
movie_id           rating               movie_title                                movie_genres
      33              4.0                      Babe    ['Fantasy', 'Drama', 'Comedy', 'Family']
      31              4.0            Twelve Monkeys  ['Science Fiction', 'Thriller', 'Mystery']
      46              4.0                     Se7en            ['Crime', 'Mystery', 'Thriller']

from rating, he/she might like:
movie_id   rating_predict               movie_title                                movie_genres
      29             6.12                 摇啊摇，摇到外婆桥                          ['Drama', 'Crime']
      81             4.51                   Antonia                         ['Drama', 'Comedy']
      26             4.50              Now and Then               ['Comedy', 'Drama', 'Family']
      39             4.31  Cry, the Beloved Country                                   ['Drama']
      96             4.25                  Shopping ['Action', 'Adv