In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
# from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import torch
from linetimer import CodeTimer

from utils import convert_ids_to_ordered

tqdm.pandas()

In [3]:
aspects = pd.read_csv('data/aspects.csv').set_index("aspect_id")
features = pd.read_csv('data/features.csv').set_index('feature_id')
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = pd.read_csv('data/rubrics.csv').set_index('rubric_id')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [31]:
reviews = reviews[reviews.rating.notna()]

In [32]:
reviews['rating'] = reviews['rating'].apply(int)

In [33]:
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)

In [34]:
validation_split_day = 1050
train_reviews = reviews_ordered[reviews_ordered.ts < validation_split_day]
test_reviews = reviews_ordered[reviews_ordered.ts >= validation_split_day]

### Training
P - latent vectors for clients  
Q - latent vectors for organizations   
R - ratings

I minimize $||R - PQ^T||^2$ + Reg

In [149]:
latent_size = 32
Ps = torch.randn(len(users), latent_size) / latent_size
Qs = torch.randn(len(organizations), latent_size) / latent_size

In [150]:
nPs = np.random.rand(len(users), latent_size)
nQs = np.random.rand(len(organizations), latent_size)

In [151]:
mean_loss = np.mean((test_reviews.rating - train_reviews.rating.mean())**2)
print(f"Loss of the simplest baseline: {mean_loss}")

Loss of the simplest baseline: 1.4542272189284848


In [152]:
reviews_array = reviews_ordered[['ordered_id_user', 'ordered_id_org', 'rating']].values

In [153]:
learning_rate = 0.001
average_loss = mean_loss
while True:
    for i, review in enumerate(reviews_array):
        user_id, org_id, true_rating = review

        pred_rating = Ps[user_id].dot(Qs[org_id])
        error = pred_rating - true_rating
        Ps[user_id] -= learning_rate * error * Qs[org_id]
        Qs[org_id] -= learning_rate * error * Ps[user_id]

        loss = error ** 2
        average_loss = 0.99999 * average_loss + 0.00001 * loss.item()
        if i % 100000 == 0:
            print(f"Epoch {i}: Train loss ", average_loss)

Epoch 0: Train loss  1.4542523521805304
Epoch 100000: Train loss  13.123707902114448
Epoch 200000: Train loss  17.46911574531724
Epoch 300000: Train loss  19.002465330674973
Epoch 400000: Train loss  19.577737173429327
Epoch 500000: Train loss  19.81177523718375
Epoch 600000: Train loss  19.890437245407117
Epoch 700000: Train loss  19.903939041911837
Epoch 800000: Train loss  19.88792984664689
Epoch 900000: Train loss  19.905440696374995
Epoch 1000000: Train loss  19.92968972063316
Epoch 1100000: Train loss  19.93903185087784
Epoch 1200000: Train loss  19.922197601272387
Epoch 1300000: Train loss  19.92024778110268
Epoch 1400000: Train loss  19.930022136128983
Epoch 1500000: Train loss  19.937011952936093
Epoch 1600000: Train loss  19.923739640349588
Epoch 1700000: Train loss  19.901442790317798
Epoch 1800000: Train loss  19.944825229189224
Epoch 1900000: Train loss  19.98473263120452
Epoch 2000000: Train loss  20.157615361932393
Epoch 2100000: Train loss  20.011993736626163
Epoch 2200

KeyboardInterrupt: 