In [1]:
# Disable Jedi for IPython tab completion in this kernel.
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
from linetimer import CodeTimer

from utils import convert_ids_to_ordered, MovingAverage

# Enable tqdm's pandas integration (e.g. `progress_apply`); nothing in the
# visible part of this notebook calls it yet.
tqdm.pandas()

In [3]:
def _read_indexed(csv_path, key):
    """Read the CSV at `csv_path` and move column `key` into the index."""
    return pd.read_csv(csv_path).set_index(key)


# Every table except `reviews` is indexed by its id column; `reviews` keeps
# the default positional index.
aspects = _read_indexed('data/aspects.csv', "aspect_id")
features = _read_indexed('data/features.csv', 'feature_id')
organizations = _read_indexed('data/organisations.csv', 'org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = _read_indexed('data/rubrics.csv', 'rubric_id')
test_users = _read_indexed('data/test_users.csv', 'user_id')
users = _read_indexed('data/users.csv', 'user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [4]:
# Keep only the reviews that carry a rating. `.copy()` makes the result an
# independent frame, so later column assignments on `reviews` do not write
# into a slice of the loaded table (avoids pandas' SettingWithCopyWarning).
reviews = reviews[reviews.rating.notna()].copy()

In [5]:
# Cast ratings to integers (rows without a rating were dropped in the previous
# cell). `assign` builds a new frame instead of writing a column into a
# filtered slice, and `astype` converts the whole column at once rather than
# calling `int` once per row.
reviews = reviews.assign(rating=reviews['rating'].astype('int64'))

In [6]:
# Project helper imported from `utils`; its three return values are unpacked here.
# NOTE(review): presumably this is what adds the `ordered_id_user` /
# `ordered_id_org` columns that later cells use as row indices into Ps / Qs,
# which would require them to lie in [0, len(users)) and [0, len(organizations))
# respectively — confirm against utils.convert_ids_to_ordered.
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)

In [7]:
# Time-based hold-out: rows whose `ts` is below the threshold are used for
# training, rows at or above it for evaluation.
validation_split_day = 1050
is_before_split = reviews_ordered['ts'] < validation_split_day
is_from_split_on = reviews_ordered['ts'] >= validation_split_day
train_reviews = reviews_ordered.loc[is_before_split]
test_reviews = reviews_ordered.loc[is_from_split_on]

### Training
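P - latent vectors for users (clients)  
Q - latent vectors for organizations  
R - observed ratings

I minimize $\|R - PQ^T\|^2 + \mathrm{Reg}$ over the observed ratings, where Reg is a regularization term on $P$ and $Q$. Note that the training run below uses no regularization.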

In [8]:
# Initial factor matrices: standard-normal noise scaled by 1 / latent_size and
# shifted by 1, one row per user / organization.
latent_size = 16
n_users, n_orgs = len(users), len(organizations)
Ps = 1 + np.random.randn(n_users, latent_size) / latent_size
Qs = 1 + np.random.randn(n_orgs, latent_size) / latent_size

In [9]:
# Constant-prediction baseline: predict the mean training rating for every
# held-out review and report the mean squared error.
train_mean_rating = train_reviews['rating'].mean()
baseline_errors = test_reviews['rating'] - train_mean_rating
mean_loss = np.mean(baseline_errors ** 2)
print(f"Loss of the simplest baseline: {mean_loss}")

Loss of the simplest baseline: 1.4542272189284848


In [10]:
# (user index, organization index, rating) triplets as plain NumPy arrays,
# consumed row by row by the training / evaluation functions.
triplet_columns = ['ordered_id_user', 'ordered_id_org', 'rating']
train_reviews_array = train_reviews[triplet_columns].values
test_reviews_array = test_reviews[triplet_columns].values

In [14]:
def test_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
    ) -> float:
    
    losses = []
    for i, review in enumerate(test_reviews_array):
        user_id, org_id, true_rating = review
        pred_rating = Ps[user_id].dot(Qs[org_id])
        error = pred_rating - true_rating
        loss = error ** 2
        losses.append(loss)
    return np.mean(losses)

In [18]:
def train_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
        learning_rate: float = 0.01,
        epochs: int = 7,
        log_every: int = 1000000,
        reg: float = 0.0,
        reviews_array: "np.ndarray | None" = None,
    ) -> float:
    """Fit the latent factors with per-review stochastic gradient descent.

    ``Ps`` and ``Qs`` are updated in place. After every epoch the test loss
    from ``test_model(Ps, Qs)`` is printed.

    Args:
        Ps: Latent vectors, one row per user (modified in place).
        Qs: Latent vectors, one row per organization (modified in place).
        learning_rate: Step size applied to every gradient.
        epochs: Number of full passes over the training reviews.
        log_every: Print the running train loss every this many reviews.
        reg: Optional L2 regularization strength. The default 0.0 leaves the
            updates identical to the unregularized version.
        reviews_array: 2-D array whose columns are (user row index,
            organization row index, rating). When omitted, the notebook-level
            ``train_reviews_array`` is used.

    Returns:
        The test loss measured after the last epoch, or NaN if ``epochs`` is 0.
    """
    if reviews_array is None:
        reviews_array = train_reviews_array
    reviews_array = np.asarray(reviews_array)

    # Cast the index columns explicitly so a float-typed array still indexes.
    user_ids = reviews_array[:, 0].astype(np.intp)
    org_ids = reviews_array[:, 1].astype(np.intp)
    true_ratings = reviews_array[:, 2]

    test_loss = float("nan")
    # NOTE(review): MovingAverage comes from utils; judging by the first logged
    # value the second argument looks like its initial value — confirm there.
    average_loss = MovingAverage(1e-6, 20)
    for epoch in range(epochs):
        for i, (user_id, org_id, true_rating) in enumerate(
                zip(user_ids, org_ids, true_ratings)):
            user_vec = Ps[user_id]
            org_vec = Qs[org_id]

            pred_rating = user_vec.dot(org_vec)
            error = pred_rating - true_rating

            # Both steps are computed before either factor is modified, so
            # each one uses the pre-update value of the other factor.
            step = learning_rate * error
            Ps_grad = step * org_vec
            Qs_grad = step * user_vec
            if reg:
                Ps_grad += learning_rate * reg * user_vec
                Qs_grad += learning_rate * reg * org_vec

            Ps[user_id] -= Ps_grad
            Qs[org_id] -= Qs_grad

            # Organization factors that went negative are reset to 0.01;
            # user factors are left unconstrained.
            Qs[org_id][Qs[org_id] < 0] = 0.01

            loss = error ** 2
            average_loss.add(loss)
            if i % log_every == 0:
                print(f"Iteration {i:07d}: Train loss", average_loss)
        print()
        test_loss = test_model(Ps, Qs)
        print(f"Test loss: {test_loss}")
    return test_loss

In [19]:
# Re-initialise the factors with latent size 8 and train.
# A dedicated, seeded generator makes the initialisation (and therefore the
# whole run) repeatable after a kernel restart without touching NumPy's
# global random state.
latent_size = 8
rng = np.random.RandomState(42)
Ps = rng.randn(len(users), latent_size) / latent_size + 1
Qs = rng.randn(len(organizations), latent_size) / latent_size + 1
train_model(Ps, Qs)

Iteration 0000000: Train loss 20.000016212855694
Iteration 1000000: Train loss 9.482846645549255
Iteration 2000000: Train loss 4.511358547045575
Iteration 3000000: Train loss 2.671864147129732

Test loss: 2.645540458988994
Iteration 0000000: Train loss 2.6354127447499702
Iteration 1000000: Train loss 1.8690497784022628
Iteration 2000000: Train loss 1.3872248297382983
Iteration 3000000: Train loss 1.2924014166700613

Test loss: 2.3965364825833886
Iteration 0000000: Train loss 1.2947222716774913
Iteration 1000000: Train loss 1.2461426394380593
Iteration 2000000: Train loss 1.0709216803305397
Iteration 3000000: Train loss 1.087981259474203

Test loss: 2.302750856876137
Iteration 0000000: Train loss 1.092509638116143
Iteration 1000000: Train loss 1.1066394454478319
Iteration 2000000: Train loss 0.9712943458763917
Iteration 3000000: Train loss 0.998684494334877

Test loss: 2.253055170294532
Iteration 0000000: Train loss 1.0031913811285154
Iteration 1000000: Train loss 1.0297397743461776
Ite