In [1]:
# Turn off Jedi in IPython's tab completer for this kernel session.
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
# from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import torch
from linetimer import CodeTimer

from utils import convert_ids_to_ordered

# Register tqdm with pandas (per tqdm's documentation this provides the
# `progress_apply` family of methods on pandas objects).
tqdm.pandas()

In [3]:
def _read_indexed(csv_path, index_column):
    """Read a CSV file and use `index_column` as the row index of the result."""
    return pd.read_csv(csv_path).set_index(index_column)


# Lookup tables, each keyed by its own id column.
aspects = _read_indexed('data/aspects.csv', "aspect_id")
features = _read_indexed('data/features.csv', 'feature_id')
organizations = _read_indexed('data/organisations.csv', 'org_id')
rubrics = _read_indexed('data/rubrics.csv', 'rubric_id')
test_users = _read_indexed('data/test_users.csv', 'user_id')
users = _read_indexed('data/users.csv', 'user_id')

# The reviews table is the only one that keeps pandas' default integer index.
reviews = pd.read_csv('data/reviews.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [4]:
# Keep only the reviews that actually carry a rating.
has_rating = reviews['rating'].notna()
reviews = reviews[has_rating]

In [5]:
# Cast the ratings to integers.
# `assign` returns a new frame instead of writing a column into `reviews`,
# which at this point is a boolean-mask selection of the loaded frame; writing
# into such a selection in place triggers pandas' SettingWithCopyWarning.
# `astype` performs the cast in one vectorised step rather than calling `int`
# once per row.
reviews = reviews.assign(rating=reviews['rating'].astype('int64'))

In [6]:
# Re-index users, organisations and reviews with the project helper.
# NOTE(review): presumably this maps the raw ids to dense 0-based positions
# (the result is used below to index rows of the factor matrices) -- confirm
# against utils.convert_ids_to_ordered.
ordered_tables = convert_ids_to_ordered(users, organizations, reviews)
users_ordered, orgs_ordered, reviews_ordered = ordered_tables

In [7]:
# Time-based split: reviews stamped before the cut-off go to the training set,
# reviews stamped at or after it are held out.
validation_split_day = 1050
review_ts = reviews_ordered['ts']
train_reviews = reviews_ordered[review_ts < validation_split_day]
test_reviews = reviews_ordered[review_ts >= validation_split_day]

### Training
P - latent vectors for users (clients)  
Q - latent vectors for organizations   
R - ratings

I minimize $\|R - PQ^T\|^2 + \mathrm{Reg}$, i.e. the squared reconstruction error plus a regularization term.

In [8]:
# Dimensionality of the latent space shared by user and organisation vectors.
latent_size = 8

# A dedicated, seeded generator makes the random initialisation reproducible
# under "Restart & Run All" without touching torch's global RNG state.
init_seed = 0
init_generator = torch.Generator().manual_seed(init_seed)

# One row of latent factors per user (Ps) and per organisation (Qs), drawn
# from a normal distribution scaled by 1/latent_size and shifted to mean 1.
# NOTE(review): the row counts come from `users` / `organizations`, while the
# training loop indexes rows by `ordered_id_user` / `ordered_id_org` -- confirm
# that those ordered ids always stay below these lengths.
Ps = torch.randn(len(users), latent_size, generator=init_generator) / latent_size + 1
Qs = torch.randn(len(organizations), latent_size, generator=init_generator) / latent_size + 1

In [9]:
# Baseline: predict the mean training rating for every held-out review and
# report the resulting mean squared error.
train_mean_rating = train_reviews['rating'].mean()
baseline_squared_errors = (test_reviews['rating'] - train_mean_rating) ** 2
mean_loss = np.mean(baseline_squared_errors)
print(f"Loss of the simplest baseline: {mean_loss}")

Loss of the simplest baseline: 1.4542272189284848


In [10]:
# Flatten the (user, organisation, rating) triples into a plain array so the
# training loop can iterate over rows cheaply.
# NOTE(review): this is built from all of `reviews_ordered`, i.e. it also
# contains the rows held out in `test_reviews` -- confirm that training on the
# full set is intended here.
training_columns = ['ordered_id_user', 'ordered_id_org', 'rating']
reviews_array = reviews_ordered[training_columns].values

In [11]:
# Stochastic gradient descent on the squared error of individual reviews:
# each review updates the latent vector of its user (a row of Ps) and of its
# organisation (a row of Qs).
# NOTE(review): no explicit regularisation term is applied in this loop.
learning_rate = 0.01
# Running loss estimate; starts at 20 and is smoothed over many reviews below.
average_loss = 20
for epoch in range(3):
    print(f"Epoch {epoch}")
    # Once the smoothed loss has dropped below 10, continue with a larger step.
    # This has to assign to `learning_rate` itself, otherwise the update steps
    # below keep using the initial value.
    if average_loss < 10:
        learning_rate = 0.05
    for i, review in enumerate(reviews_array):
        user_id, org_id, true_rating = review

        # Predicted rating is the dot product of the two latent vectors.
        pred_rating = Ps[user_id].dot(Qs[org_id])
        error = pred_rating - true_rating

        # Both steps are computed from the pre-update vectors before either
        # of them is applied.
        Ps_grad = learning_rate * error * Qs[org_id]
        Qs_grad = learning_rate * error * Ps[user_id]

        Ps[user_id] -= Ps_grad
        Qs[org_id] -= Qs_grad

        # Keep the factors non-negative: any component that went below zero
        # is reset to 0.01.
        Ps[user_id][Ps[user_id] < 0] = 0.01
        Qs[org_id][Qs[org_id] < 0] = 0.01

        loss = error ** 2
        # Exponentially smoothed running loss over the reviews seen so far.
        average_loss = 0.99999 * average_loss + 0.00001 * loss.item()
        if i % 100000 == 0:
            print(f"Iteration {i:07d}: Train loss", average_loss)

Epoch 0
Iteration 0000000: Train loss 20.000163085289003
Iteration 0100000: Train loss 12.60454071557804
Iteration 0200000: Train loss 7.95053707435249
Iteration 0300000: Train loss 5.614398251675314
Iteration 0400000: Train loss 4.350297332322247
Iteration 0500000: Train loss 3.6505616218140973
Iteration 0600000: Train loss 3.2212150131995063
Iteration 0700000: Train loss 2.942014774401203
Iteration 0800000: Train loss 2.7506332762177603
Iteration 0900000: Train loss 2.61221029781535
Iteration 1000000: Train loss 2.4732513404492735
Iteration 1100000: Train loss 2.3933822256187334
Iteration 1200000: Train loss 2.3089722850372754
Iteration 1300000: Train loss 2.216735360066854
Iteration 1400000: Train loss 2.153378510641417
Iteration 1500000: Train loss 2.0982054800993493
Iteration 1600000: Train loss 2.0573318578633173
Iteration 1700000: Train loss 2.0309325833584215
Iteration 1800000: Train loss 1.970761494841691
Iteration 1900000: Train loss 1.7820239596574619
Iteration 2000000: Trai