In [None]:
# Turn off Jedi-based tab completion in IPython; the plain built-in completer is used instead.
%config Completer.use_jedi = False

In [None]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
# from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import torch
from linetimer import CodeTimer

from utils import convert_ids_to_ordered, MovingAverage

# Register tqdm's pandas integration (makes `progress_apply` and friends available on pandas objects).
tqdm.pandas()

In [None]:
def _read_indexed(csv_path, id_column):
    """Load a CSV file and use `id_column` as the index of the resulting frame."""
    table = pd.read_csv(csv_path)
    return table.set_index(id_column)


# Raw input tables. Every table except `reviews` is keyed by its own id column;
# `reviews` keeps the default positional index.
aspects = _read_indexed('data/aspects.csv', "aspect_id")
features = _read_indexed('data/features.csv', 'feature_id')
organizations = _read_indexed('data/organisations.csv', 'org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = _read_indexed('data/rubrics.csv', 'rubric_id')
test_users = _read_indexed('data/test_users.csv', 'user_id')
users = _read_indexed('data/users.csv', 'user_id')

#### Preprocessing the reviews

In [None]:
# Keep only the reviews that actually carry a rating.
# The explicit .copy() makes the result an independent frame, so later column
# assignments on `reviews` do not raise SettingWithCopyWarning and do not
# depend on pandas' view-versus-copy rules.
reviews = reviews[reviews['rating'].notna()].copy()

In [None]:
# Cast ratings to 64-bit integers with a vectorised astype instead of calling
# Python's int() once per row. assign() builds a new frame, so nothing is
# written into a filtered slice (no SettingWithCopyWarning).
# NOTE(review): presumably the integer dtype is needed so ratings can share one
# integer array with the user/org ids used for indexing later - confirm.
reviews = reviews.assign(rating=reviews['rating'].astype('int64'))

In [None]:
# Re-key users, organizations and reviews through the project helper.
# NOTE(review): presumably this maps the raw ids to contiguous integer positions
# (the reviews result is later read via `ordered_id_user` / `ordered_id_org`) -
# confirm against utils.convert_ids_to_ordered.
ordered_tables = convert_ids_to_ordered(users, organizations, reviews)
users_ordered, orgs_ordered, reviews_ordered = ordered_tables

In [None]:
# Time-based hold-out: reviews with `ts` below the threshold form the training
# part, reviews at or above it form the validation ("test") part.
# NOTE(review): `ts` is presumably a day counter - confirm the unit.
validation_split_day = 1050
review_ts = reviews_ordered['ts']
train_reviews = reviews_ordered[review_ts < validation_split_day]
test_reviews = reviews_ordered[review_ts >= validation_split_day]

### Training
P - latent vectors for users (one row per user)  
Q - latent vectors for organizations (one row per organization)  
R - observed ratings (user × organization)

I minimize $\|R - PQ^T\|^2 + \mathrm{Reg}$ over the observed ratings.

In [None]:
# Dimensionality of the latent space shared by user and organization vectors.
latent_size = 8

# Seeded generator so the random initialisation (and therefore the whole
# training run) is reproducible under Restart Kernel -> Run All.
init_rng = np.random.default_rng(42)

# One row per user / organization. Every coordinate is drawn from a normal
# distribution with mean 1 and standard deviation 1 / latent_size.
Ps = init_rng.standard_normal((len(users), latent_size)) / latent_size + 1
Qs = init_rng.standard_normal((len(organizations), latent_size)) / latent_size + 1

In [None]:
# Constant-prediction baseline: predict the mean training rating for every
# held-out review and report the mean squared error on the held-out part.
baseline_prediction = train_reviews.rating.mean()
squared_errors = (test_reviews.rating - baseline_prediction) ** 2
mean_loss = squared_errors.mean()
print(f"Loss of the simplest baseline: {mean_loss}")

In [None]:
# Plain array of (user position, organization position, rating) rows for the
# Python-level training loop.
# NOTE(review): built from `reviews_ordered` (all reviews), not `train_reviews`,
# so rows at/after the validation split are included - confirm this is intended.
training_columns = ['ordered_id_user', 'ordered_id_org', 'rating']
reviews_array = reviews_ordered[training_columns].to_numpy()

In [None]:
# Stochastic gradient descent for the model  rating ~ Ps[user] . Qs[org].
# Each review triggers one update of the corresponding user and organization
# vectors; the constant factor 2 of the squared-error gradient is absorbed
# into the learning rate.
learning_rate = 0.01
# NOTE(review): the meaning of the two MovingAverage arguments is defined in
# utils.MovingAverage - confirm which one is the smoothing factor / start value.
average_loss = MovingAverage(1e-6, 20)
for epoch in range(3):
    # Once the smoothed loss has dropped below 10, continue with a larger step.
    # This must assign to `learning_rate` itself (the name read by the updates
    # below); a misspelled target would leave the rate at 0.01 forever.
    if average_loss.value() < 10:
        learning_rate = 0.05
    for i, review in enumerate(reviews_array):
        user_id, org_id, true_rating = review

        pred_rating = Ps[user_id].dot(Qs[org_id])
        error = pred_rating - true_rating
        # Both steps are computed from the pre-update vectors, so the user and
        # organization updates are simultaneous.
        Ps_grad = learning_rate * error * Qs[org_id]
        Qs_grad = learning_rate * error * Ps[user_id]

        Ps[user_id] -= Ps_grad
        Qs[org_id] -= Qs_grad

        # Keep the factors non-negative: any coordinate that went below zero
        # is reset to a small positive value.
        Ps[user_id][Ps[user_id] < 0] = 0.01
        Qs[org_id][Qs[org_id] < 0] = 0.01

        loss = error ** 2
        average_loss.add(loss)
        if i % 500000 == 0:
            print(f"Iteration {i:07d}: Train loss", average_loss)