In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
from linetimer import CodeTimer

from utils import convert_ids_to_ordered, MovingAverage

tqdm.pandas()

In [3]:
aspects = pd.read_csv('data/aspects.csv').set_index("aspect_id")
features = pd.read_csv('data/features.csv').set_index('feature_id')
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = pd.read_csv('data/rubrics.csv').set_index('rubric_id')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [8]:
reviews = reviews[reviews.rating.notna()]
reviews['rating'] = reviews['rating'].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['rating'] = reviews['rating'].apply(int)


In [9]:
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)

In [10]:
validation_split_day = 1050
train_reviews = reviews_ordered[reviews_ordered.ts < validation_split_day]
test_reviews = reviews_ordered[reviews_ordered.ts >= validation_split_day]

In [13]:
next(iter(train_reviews.values))

array([2, 105, nan, 212571, 49084], dtype=object)

In [15]:
fast_train_reviews = {}
for line in tqdm(train_reviews.values):
    fast_train_reviews[(line[3], line[4])] = line[0]

100%|████████████████████████████| 3037917/3037917 [00:02<00:00, 1204314.40it/s]


### Training
P - latent vectors for clients  
Q - latent vectors for organizations   
R - ratings

I minimize $||R - PQ^T||^2$ + Reg

In [16]:
train_reviews_array = train_reviews[['ordered_id_user', 'ordered_id_org', 'rating']].values
test_reviews_array = test_reviews[['ordered_id_user', 'ordered_id_org', 'rating']].values

In [27]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [28]:
def test_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,
    ) -> float:
    losses = []
    for i, review in enumerate(test_reviews_array):
        user_id, org_id, true_rating = review
        probability = sigmoid(Ps[user_id].dot(Qs[org_id]) + bias)
        loss = -(true_rating * np.log(probability) + (1 - true_rating) * np.log(1 - probability))
        losses.append(loss)
    return np.mean(losses)

In [59]:
def train_single_review(
        user_id: np.ndarray, 
        org_id: np.ndarray,       
        label: int,
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,  
        learning_rate: float = 0.01,
        C: float = 0.0,
        dropout_rate: float = 0.0,
    ):
    latent_size = Ps.shape[1]
    # dropout part
    bitmask = np.random.choice([True, False], size=(latent_size,), p=[1-dropout_rate, dropout_rate])
    multiplier = latent_size / max(1, np.sum(bitmask))
    P = Ps[user_id] * bitmask * multiplier
    Q = Qs[org_id] * bitmask
    # forward pass
    pivot = P.dot(Q) + bias
    prob = sigmoid(pivot)
    loss = -(label * np.log(prob) + (1 - label) * np.log(1 - prob))
    # backward pass
    pivot_grad = -label * (1 - prob) + (1 - label) * prob
    Ps_grad = learning_rate * pivot_grad * Q
    Qs_grad = learning_rate * pivot_grad * P
    bias_grad = learning_rate * pivot_grad
    # parameters update
    Ps[user_id] -= Ps_grad
    Qs[org_id] -= Qs_grad
    bias -= bias_grad
    
    Qs[org_id][Qs[org_id] < 0] = 0.01
    return loss

In [65]:
def train_model(
        epochs: int = 7,
        log_every: int = 1000000,
        **kwargs,
    ) -> float:
    average_loss = MovingAverage(1 / log_every, 1)
    for epoch in range(epochs):
        for i, review in enumerate(train_reviews_array):
            user_id, org_id, label = review
            label = int(label >= 4.0)
            loss = train_single_review(user_id, org_id, label, **kwargs)
            average_loss.add(loss)
            if i % log_every == 0:
                print(f"Iteration {i:07d}: Train loss", average_loss)
        print(f"Test loss: {test_model(Ps, Qs, bias)}")
        print()

In [None]:
latent_size = 8
# Ps = np.random.randn(len(users), latent_size) / latent_size + 1
# Qs = np.random.randn(len(organizations), latent_size) / latent_size + 1
# bias = np.random.randn()
train_model(Ps=Ps, Qs=Qs, bias=bias, C=0.0, dropout_rate=0.0, epochs=1, log_every=200000)

Iteration 0000000: Train loss 1.0000008141396022
Iteration 0200000: Train loss 0.8415996407896846
Iteration 0400000: Train loss 0.7888284009417137
