In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
from linetimer import CodeTimer

from utils import convert_ids_to_ordered, MovingAverage

tqdm.pandas()

In [3]:
def _load_csv(path, index_column=None):
    """Read a CSV file; when a column name is given, make it the index."""
    frame = pd.read_csv(path)
    if index_column is None:
        return frame
    return frame.set_index(index_column)


# Input tables. Every table except `reviews` is indexed by its id column;
# `reviews` keeps the default positional index.
aspects = _load_csv('data/aspects.csv', "aspect_id")
features = _load_csv('data/features.csv', 'feature_id')
organizations = _load_csv('data/organisations.csv', 'org_id')
reviews = _load_csv('data/reviews.csv')
rubrics = _load_csv('data/rubrics.csv', 'rubric_id')
test_users = _load_csv('data/test_users.csv', 'user_id')
users = _load_csv('data/users.csv', 'user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [4]:
# Keep only the reviews that have a rating and store the rating as an integer.
# The filter returns a new frame and the chained `.astype` builds the converted
# column on that new frame, so nothing is assigned into a slice of the original
# (the original two-step version triggers pandas' SettingWithCopyWarning).
reviews = (
    reviews[reviews['rating'].notna()]
    .astype({'rating': 'int64'})
)

In [5]:
# Pass the three tables through the project helper and unpack its result.
# Later cells read `ordered_id_user` / `ordered_id_org` from the reviews frame
# returned here (presumably added by the helper - confirm in utils).
ordered_tables = convert_ids_to_ordered(users, organizations, reviews)
users_ordered, orgs_ordered, reviews_ordered = ordered_tables

In [6]:
# Time-based hold-out: reviews whose `ts` is strictly below the split value go
# to training, reviews at or above it go to the validation ("test") set.
validation_split_day = 1050
review_ts = reviews_ordered['ts']
train_reviews = reviews_ordered.loc[review_ts.lt(validation_split_day)]
test_reviews = reviews_ordered.loc[review_ts.ge(validation_split_day)]

### Training
P - latent vectors for clients  
Q - latent vectors for organizations   
R - ratings

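The model predicts $\hat{R}_{ui} = P_u \cdot Q_i + b$, where $b$ is a global bias, and I minimize $||R - (PQ^T + b)||^2 + C\,(||P||^2 + ||Q||^2)$ over the observed ratings with SGD.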

In [7]:
# Simplest baseline: predict the mean training rating for every held-out
# review and report the resulting mean squared error.
baseline_rating = train_reviews['rating'].mean()
baseline_errors = test_reviews['rating'] - baseline_rating
mean_loss = np.mean(baseline_errors ** 2)
print(f"Loss of the simplest baseline: {mean_loss}")

Loss of the simplest baseline: 1.4542272189284848


In [8]:
# Plain NumPy views of the columns the model needs, in the order
# (user index, organization index, rating) that the train/test loops unpack.
model_columns = ['ordered_id_user', 'ordered_id_org', 'rating']
train_reviews_array = train_reviews[model_columns].values
test_reviews_array = test_reviews[model_columns].values

In [9]:
def test_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,
    ) -> float:
    
    losses = []
    for i, review in enumerate(test_reviews_array):
        user_id, org_id, true_rating = review
        pred_rating = Ps[user_id].dot(Qs[org_id]) + bias
        error = pred_rating - true_rating
        loss = error ** 2
        losses.append(loss)
    return np.mean(losses)

In [10]:
def train_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,
        learning_rate: float = 0.01,
        C: float = 0.0,
        epochs: int = 7,
        log_every: int = 1000000,
    ) -> float:
    """Fit the latent vectors and the bias with per-review SGD.

    For every row ``(user index, org index, rating)`` of the notebook-level
    ``train_reviews_array`` the prediction ``Ps[user].dot(Qs[org]) + bias`` is
    compared with the rating and one gradient step is taken. After each epoch
    the hold-out loss from ``test_model`` is printed.

    Parameters
    ----------
    Ps, Qs : np.ndarray
        Latent vectors for users / organizations. Both are updated IN PLACE.
    bias : float
        Initial value of the scalar bias.
    learning_rate : float
        Step size applied to every gradient.
    C : float
        Weight of the L2 penalty on the latent vectors (the bias is not penalized).
    epochs : int
        Number of full passes over ``train_reviews_array``.
    log_every : int
        The running train loss is printed every ``log_every`` reviews.

    Returns
    -------
    float
        The learned bias. A Python float is passed by value, so the updates
        made here are invisible to the caller unless the return value is
        captured, e.g. ``bias = train_model(Ps, Qs, bias)``.
    """
    # Running train-loss tracker from utils; arguments are kept exactly as in
    # the original (their meaning is defined by MovingAverage, not visible here).
    average_loss = MovingAverage(1e-6, 20)
    for _ in range(epochs):
        for i, review in enumerate(train_reviews_array):
            user_id, org_id, true_rating = review

            pred_rating = Ps[user_id].dot(Qs[org_id]) + bias
            error = pred_rating - true_rating

            # Both steps are computed from the pre-update vectors before either
            # vector is modified. They are the gradients of
            # 0.5 * error**2 + 0.5 * C * (|p|^2 + |q|^2), scaled by the step size.
            Ps_grad = learning_rate * (error * Qs[org_id] + C * Ps[user_id])
            Qs_grad = learning_rate * (error * Ps[user_id] + C * Qs[org_id])
            bias_grad = learning_rate * error

            Ps[user_id] -= Ps_grad
            Qs[org_id] -= Qs_grad
            bias -= bias_grad

            # Keep organization factors positive: any component that went
            # negative is reset to the small constant 0.01.
            Qs[org_id][Qs[org_id] < 0] = 0.01

            loss = error ** 2
            average_loss.add(loss)
            if i % log_every == 0:
                print(f"Iteration {i:07d}: Train loss", average_loss)
        print(f"Test loss: {test_model(Ps, Qs, bias)}")
        print()
    return bias

In [None]:
latent_size = 4

# Seed NumPy's global RNG so the random initialization below (and therefore
# the training trace) is the same after every kernel restart.
np.random.seed(0)

# Latent vectors start as Gaussian noise scaled by 1 / latent_size and shifted
# to be centered on 1; the bias starts as Gaussian noise centered on 4.
Ps = np.random.randn(len(users), latent_size) / latent_size + 1
Qs = np.random.randn(len(organizations), latent_size) / latent_size + 1
bias = 4 + np.random.randn()

# Ps and Qs are updated in place by train_model; the scalar `bias` variable in
# this cell keeps its initial value because floats are passed by value.
train_model(Ps, Qs, bias, C=0.1, epochs=3)

Iteration 0000000: Train loss 19.99999390599476
Iteration 1000000: Train loss 8.348709156612847
Iteration 2000000: Train loss 3.8424197738816095
Iteration 3000000: Train loss 2.247600869425811
Test loss: 1.65428526271952

Iteration 0000000: Train loss 2.216695394338302
Iteration 1000000: Train loss 1.6306245272235604
Iteration 2000000: Train loss 1.263067787516948
Iteration 3000000: Train loss 1.2060928823406096
Test loss: 1.6365496724923827

Iteration 0000000: Train loss 1.2086355352504634
Iteration 1000000: Train loss 1.1948853229617484
