In [1]:
# Turn off the Jedi-based completer for this IPython session.
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
from linetimer import CodeTimer
from multiple_logging import setup_logger
import datetime

from utils import convert_ids_to_ordered, MovingAverage

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

In [3]:
# Load the raw tables from the local `data/` directory (paths are relative to the
# notebook's working directory). Every table except `reviews` is indexed by its id column.
# NOTE: the organisations file is spelled with an "s" on disk, the variable with a "z".
aspects = pd.read_csv('data/aspects.csv').set_index("aspect_id")
features = pd.read_csv('data/features.csv').set_index('feature_id')
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = pd.read_csv('data/rubrics.csv').set_index('rubric_id')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [4]:
# Keep only the reviews that carry a rating. The explicit .copy() makes the result an
# independent frame, so the column assignment below writes to it directly instead of
# to a slice of the original (which is what triggers pandas' SettingWithCopyWarning).
reviews = reviews[reviews.rating.notna()].copy()
# Ratings are compared against integer thresholds later on; cast them once, vectorized.
reviews['rating'] = reviews['rating'].astype('int64')

In [5]:
# Re-key users, organisations and reviews through the project helper.
# NOTE(review): presumably this maps native ids to contiguous 0-based indices that can
# be used as matrix row numbers — confirm against utils.convert_ids_to_ordered.
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)
# Sizes of the id spaces; used as exclusive upper bounds when sampling random pairs.
n_users = len(users_ordered)
n_orgs = len(orgs_ordered)

In [6]:
# Time-based hold-out: reviews with `ts` below the threshold are used for training,
# the remaining ones for validation.
# NOTE(review): `ts` looks like a day counter — confirm its units. Rows whose `ts` is
# NaN satisfy neither comparison and therefore end up in neither split.
validation_split_day = 1050
train_reviews = reviews_ordered[reviews_ordered.ts < validation_split_day]
test_reviews = reviews_ordered[reviews_ordered.ts >= validation_split_day]

In [7]:
# Lookup table of the training pairs, keyed by (column 3, column 4) of each row and
# holding column 0 as the value; later duplicates overwrite earlier ones.
# NOTE(review): columns 3 and 4 are presumably the ordered user / organisation ids —
# verify against the column order produced by convert_ids_to_ordered.
fast_train_reviews = {
    (row[3], row[4]): row[0]
    for row in tqdm(train_reviews.values)
}

100%|████████████████████████████| 3037917/3037917 [00:02<00:00, 1268425.06it/s]


In [8]:
# Plain NumPy views of the splits, one row per review: (ordered user id, ordered org id, rating).
# The training and evaluation loops iterate over these arrays row by row.
train_reviews_array = train_reviews[['ordered_id_user', 'ordered_id_org', 'rating']].values
test_reviews_array = test_reviews[['ordered_id_user', 'ordered_id_org', 'rating']].values

In [9]:
# Three loggers created through the project helper setup_logger(name, path):
#   main_logger     - experiment setup and one summary line per epoch
#   detailed_logger - the same plus periodic training-loss lines
#   pivot_logger    - min/max of the factor matrices at the start of each epoch
# NOTE(review): presumably the `logs/` directory must already exist — confirm in setup_logger.
main_logger = setup_logger('main_logger', 'logs/binary_als_experiments.log')
detailed_logger = setup_logger('detailed_logger', 'logs/binary_als_experiments_detailed.log')
pivot_logger = setup_logger('pivot_logger', 'logs/pivots_nans.log')

In [10]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), computed without overflow.

    Works for scalars and NumPy arrays (element-wise).

    The textbook form evaluates ``exp(-x)``, which overflows to ``inf`` (and emits a
    RuntimeWarning) for large negative ``x``. Here the value is obtained as
    ``exp(-log(1 + exp(-x)))``, where ``np.logaddexp(0, -x)`` evaluates
    ``log(1 + exp(-x))`` without ever forming the huge intermediate.
    """
    return np.exp(-np.logaddexp(0.0, -x))

In [19]:
def get_not_rated_pair():
    """Draw a random (user_id, org_id) pair that is not a key of `fast_train_reviews`.

    Both ids are sampled uniformly from [0, n_users) and [0, n_orgs) respectively
    (user first, then organisation) and the draw is repeated until the pair is not
    found among the training pairs.
    """
    while True:
        candidate = (np.random.randint(n_users), np.random.randint(n_orgs))
        if candidate not in fast_train_reviews:
            return candidate

In [20]:
def test_model(
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,
        neg_example_coef: int,
    ) -> float:
    """Return the mean binary cross-entropy of the model on the validation reviews.

    Two groups of examples are scored:
      * every row of ``test_reviews_array``, labelled 1 when its rating is >= 4
        and 0 otherwise;
      * ``neg_example_coef`` times as many random pairs drawn with
        ``get_not_rated_pair()``, all labelled 0.

    Args:
        Ps: user factor matrix, indexed by the user id of a review row.
        Qs: organisation factor matrix, indexed by the organisation id.
        bias: scalar added to every user/organisation dot product.
        neg_example_coef: number of sampled negatives per validation review.

    Returns:
        The average loss over all scored examples.
    """
    losses = []
    for user_id, org_id, rating in test_reviews_array:
        label = int(rating >= 4.0)
        logit = Ps[user_id].dot(Qs[org_id]) + bias
        # Cross-entropy computed straight from the logit:
        #   label 1: -log(sigmoid(z))     = log(1 + exp(-z))
        #   label 0: -log(1 - sigmoid(z)) = log(1 + exp(z))
        # Going through the probability instead yields 0 * log(0) = NaN as soon as
        # the sigmoid saturates to exactly 0.0 or 1.0, which turns the mean into NaN.
        losses.append(np.logaddexp(0.0, -logit if label else logit))

    for _ in range(len(test_reviews_array) * neg_example_coef):
        user_id, org_id = get_not_rated_pair()
        logit = Ps[user_id].dot(Qs[org_id]) + bias
        # Sampled pairs are always negatives (label 0).
        losses.append(np.logaddexp(0.0, logit))

    return np.mean(losses)

In [21]:
def train_single_review(
        user_id: int,
        org_id: int,
        label: int,
        Ps: np.ndarray,
        Qs: np.ndarray,
        bias: float,
        learning_rate: float = 0.2,
        C: float = 0.0,
        dropout_rate: float = 0.0,
    ):
    """Run one SGD step of the logistic factorization model on a single example.

    ``Ps[user_id]`` and ``Qs[org_id]`` are updated in place.

    Args:
        user_id: row index into ``Ps``.
        org_id: row index into ``Qs``.
        label: binary target, 0 or 1.
        Ps: user factor matrix, modified in place.
        Qs: organisation factor matrix, modified in place.
        bias: scalar offset added to the dot product. A plain Python float cannot
            be changed from inside this function, so it stays constant; pass a 0-d
            float ``np.ndarray`` to have the bias learned in place.
        learning_rate: SGD step size.
        C: L2 regularisation strength applied to both factor rows.
        dropout_rate: probability of zeroing each latent dimension for this step.

    Returns:
        The binary cross-entropy loss of the example before the update.
    """
    latent_size = Ps.shape[1]
    # dropout part: the same mask is applied to both rows; the user row is rescaled
    # so that the expected magnitude of the dot product is kept.
    bitmask = np.random.choice([True, False], size=(latent_size,), p=[1-dropout_rate, dropout_rate])
    multiplier = latent_size / max(1, np.sum(bitmask))
    P = Ps[user_id] * bitmask * multiplier
    Q = Qs[org_id] * bitmask
    # forward pass
    pivot = P.dot(Q) + bias
    # Overflow-free sigmoid: exp(-log(1 + exp(-pivot))).
    prob = np.exp(-np.logaddexp(0.0, -pivot))
    # Loss taken from the logit rather than from `prob`: once the sigmoid saturates
    # to exactly 0.0 or 1.0 the probability form evaluates 0 * log(0) = NaN.
    loss = np.logaddexp(0.0, -pivot if label else pivot)
    # backward pass: d(loss)/d(pivot) for the logistic loss
    pivot_grad = prob - label
    Ps_grad = learning_rate * (pivot_grad * Q + P * C)
    Qs_grad = learning_rate * (pivot_grad * P + Q * C)
    bias_grad = learning_rate * pivot_grad
    # parameters update
    Ps[user_id] -= Ps_grad
    Qs[org_id] -= Qs_grad
    if isinstance(bias, np.ndarray):
        # In-place update, visible to the caller. For an immutable float the
        # subtraction would only rebind the local name, so it is skipped.
        bias -= bias_grad

    # Keep organisation factors non-negative: negative entries are reset to 0.01.
    org_row = Qs[org_id]
    org_row[org_row < 0] = 0.01
    return loss

In [22]:
def train_model(
        epochs: int = 7,
        log_every: int = 1000000,
        neg_example_coef: int = 5,
        **kwargs,
    ) -> float:
    """Train the model with SGD over the training reviews and evaluate after each epoch.

    For every row of ``train_reviews_array`` one step is taken on the review itself
    (label 1 when rating >= 4, else 0) followed by ``neg_example_coef`` steps on
    random pairs from ``get_not_rated_pair()`` labelled 0. After each epoch the
    validation loss is logged and the parameters are pickled to
    ``logs/model{_id}.pickle`` (``_id`` is read from the notebook's global scope).

    Args:
        epochs: number of passes over the training reviews.
        log_every: a training-loss line is logged every ``log_every`` reviews; also
            sets the smoothing of the running training loss.
        neg_example_coef: sampled negatives per review, in training and evaluation.
        **kwargs: forwarded to ``train_single_review``. ``Ps``, ``Qs`` and ``bias``
            are required; ``learning_rate``, ``C`` and ``dropout_rate`` are optional.
            ``Ps`` and ``Qs`` are updated in place. A float ``bias`` is copied into
            an internal array (the learned value is written to the pickle); a float
            ndarray ``bias`` is updated in place.

    Returns:
        The validation loss of the last epoch (NaN when ``epochs`` is 0).
    """
    def _setting(name):
        # Hyper-parameters may be omitted, in which case train_single_review applies
        # its own default; report that instead of failing with a KeyError.
        return kwargs.get(name, 'default')

    message = (f"Experiments setup: Epochs {epochs}, log_every {log_every}, latent_size {kwargs['Ps'].shape[1]}, "
               f"learning rate {_setting('learning_rate')}, dropout {_setting('dropout_rate')}, C {_setting('C')}, "
               f"neg_example_coef {neg_example_coef}")
    main_logger.info(message)
    detailed_logger.info(message)

    params = dict(kwargs)
    # A Python float is passed by value, so the per-example `bias -= ...` could never
    # reach the caller and the bias would stay at its initial value for the whole
    # run. A 0-d float array is mutated in place by that same statement.
    params['bias'] = np.asarray(kwargs['bias'], dtype=float)

    average_loss = MovingAverage(1 / log_every, 1)
    test_loss = float('nan')
    for epoch in range(epochs):
        detailed_logger.info(f"Epoch {epoch}")
        pivot_logger.info(f"{np.max(params['Qs'])}, {np.min(params['Qs'])}, {np.max(params['Ps'])}, {np.min(params['Ps'])}")
        for i, (user_id, org_id, rating) in enumerate(train_reviews_array):
            label = int(rating >= 4.0)
            average_loss.add(train_single_review(user_id, org_id, label, **params))

            for _ in range(neg_example_coef):
                neg_user_id, neg_org_id = get_not_rated_pair()
                average_loss.add(train_single_review(neg_user_id, neg_org_id, 0, **params))

            if i % log_every == 0:
                detailed_logger.info(f"Iteration {i:07d}: Train loss {average_loss}")

        # Evaluate and save the parameters this call was given, not whatever the
        # notebook globals Ps / Qs / bias happen to hold at the moment.
        bias_value = params['bias'].item()
        test_loss = test_model(params['Ps'], params['Qs'], bias_value, neg_example_coef)
        main_logger.info(f"Epoch {epoch}: Train loss {average_loss}, Test loss {test_loss}")
        detailed_logger.info(f"Test loss {test_loss}")
        with open(f"logs/model{_id}.pickle", 'wb') as file:
            pickle.dump({
                "Ps": params["Ps"],
                "Qs": params["Qs"],
                "bias": bias_value,
            },
            file)
    return test_loss

In [26]:
# Random initial parameters: one latent row per user (Ps) and per organisation (Qs),
# plus a scalar bias. Qs is shifted by +1 so it starts out positive, in line with
# train_single_review resetting negative Qs entries to 0.01.
# NOTE(review): the matrices are sized with len(users) / len(organizations) while
# sampling uses n_users / n_orgs from the *_ordered frames — confirm the lengths agree.
# NOTE(review): no random seed is set, so a re-run starts from different parameters.
latent_size = 8
Ps = np.random.randn(len(users), latent_size) / latent_size
Qs = np.random.randn(len(organizations), latent_size) / latent_size + 1
bias = np.random.randn()

In [27]:
# Random identifier of this experiment; it appears in the log messages and in the
# names of the saved model pickle and the submission file.
_id = np.random.randint(1e16)

In [None]:
# Announce the run in both logs, then train. Ps and Qs are updated in place by the
# training steps, so the cells below see the trained matrices.
main_logger.info(f'\n\n\nStarting new experiment, ID: {_id}')
detailed_logger.info(f'\n\n\nStarting new experiment, ID: {_id}')
train_model(Ps=Ps, Qs=Qs, bias=bias, C=0.2, dropout_rate=0.0, epochs=50, log_every=1000000, learning_rate=0.1)

### Make submission

In [29]:
# Attach the columns of users_ordered to the test users (index-aligned join).
# NOTE: this rebinds `test_users`, so running the cell a second time without reloading
# the table attempts the join again on the already-joined frame.
test_users = test_users.join(users_ordered)
# Boolean array over organisations: True where the organisation's city is 'msk'.
msk_mask = np.array(orgs_ordered.city == 'msk')

In [30]:
# For every test user, score all organisations and keep the 20 best ones,
# written as a space-separated string of native ids with the best one first.
targets = []
for user_id, user_city in tqdm(zip(test_users.ordered_id, test_users.city)):
    pivots = Ps[user_id] @ Qs.T
    # Push every organisation located in the user's own city to the bottom of the ranking.
    if user_city == 'spb':
        pivots[~msk_mask] = -999999
    elif user_city == 'msk':
        pivots[msk_mask] = -999999
    # argsort is ascending: the last 20 positions are the top scores, worst of them first.
    best_orgs_ordered_ids = pivots.argsort()[-20:]
    best_orgs_native_ids = ' '.join(
        map(str, orgs_ordered.index[best_orgs_ordered_ids][::-1])
    )
    targets.append(best_orgs_native_ids)

16967it [01:08, 246.03it/s]


In [31]:
# One row per test user (indexed like test_users) with the recommendation string in `target`.
submission = pd.DataFrame({'target': targets}, index=test_users.index)

In [32]:
from scripts.mnap import compute_mnap

In [33]:
# Persist the submission under the experiment id so it can be matched to its log entries.
submission.to_csv(f'submissions/id{_id}.csv')

In [34]:
# Score the submission with the project's compute_mnap helper.
# NOTE(review): presumably a mean normalized average precision — confirm in scripts.mnap.
res = compute_mnap(submission)

16967it [00:00, 19001.52it/s]


In [35]:
# Record the final score next to the experiment's other entries in both logs.
main_logger.info(f'MNAP: {res}')
detailed_logger.info(f'MNAP: {res}')