In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from scipy import sparse as sp
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import datetime
from joblib import Parallel, delayed

from scripts.multiple_logging import setup_logger
from scripts.utils import convert_ids_to_ordered, MovingAverage
from scripts.mnap import compute_mnap

In [3]:
# Read the sparse model inputs (.npz) and the matching rating tables (.csv).
# Files are read in the same order as before: X train, X test, y train, y test.
X_train, X_test = map(sp.load_npz, ("sparse_data/X_train.npz", "sparse_data/X_test.npz"))
y_train, y_test = map(pd.read_csv, ('sparse_data/y_train.csv', "sparse_data/y_test.csv"))

In [4]:
# Convert ratings into signed class labels: rating >= 4 becomes +1, everything
# else becomes -1. Note that this rebinds y_train / y_test from the loaded
# frames to label Series, so this cell cannot be re-run without reloading them.
y_train, y_test = (
    frame.rating.ge(4).astype(int).mul(2).sub(1)
    for frame in (y_train, y_test)
)

## Train fastFM

In [5]:
from fastFM import mcmc, als, sgd
from sklearn.metrics import mean_squared_error

In [6]:
import optuna

In [7]:
# Raw tables; organisations and both user tables are indexed by their id column.
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

# Keep only rated reviews. The explicit .copy() makes the filtered frame
# independent of the original, so the column assignments below no longer
# trigger pandas' SettingWithCopyWarning.
reviews = reviews[reviews.rating.notna()].copy()
reviews['rating'] = reviews['rating'].astype(int)

# Ids are handled as strings everywhere, in columns and in indexes alike.
reviews['org_id'] = reviews['org_id'].astype(str)
reviews['user_id'] = reviews['user_id'].astype(str)
organizations.index = organizations.index.astype(str)
users.index = users.index.astype(str)
test_users.index = test_users.index.astype(str)

users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)
n_users = len(users_ordered)
n_orgs = len(orgs_ordered)

test_users = test_users.join(users_ordered)
msk_mask = np.array(orgs_ordered.city == 'msk')
# 'other_city' holds the opposite city label: 'spb' for 'msk' rows, 'msk' for
# every other row (vectorised instead of a per-row lambda).
orgs_ordered['other_city'] = np.where(orgs_ordered['city'] == 'msk', 'spb', 'msk')

# Row position -> original id (the index label at that position).
# NOTE(review): these are later looked up by ordered_id, which presumes that
# ordered_id equals the row position -- confirm against convert_ids_to_ordered.
ordered_to_initial_org = dict(enumerate(orgs_ordered.index))
ordered_to_initial_user = dict(enumerate(users_ordered.index))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
from scripts.mnap import compute_mnap

def get_recommendations(review_batch, top_n=20, org_lookup=None, user_lookup=None):
    """Build one recommendation row per user from a scored candidate frame.

    For every ``ordered_id_user`` the ``top_n`` rows with the largest
    ``rating`` are taken, best first. Their ``ordered_id_org`` values are
    translated through ``org_lookup`` and joined with single spaces.

    Parameters
    ----------
    review_batch : pandas.DataFrame
        Must contain the columns ``ordered_id_user``, ``ordered_id_org`` and
        ``rating``. Its index labels must be unique, because rows are fetched
        back by label.
    top_n : int, default 20
        Maximum number of organisations kept per user.
    org_lookup, user_lookup : mapping, optional
        Translate ordered ids into the ids written to the output. When omitted
        they fall back to the notebook-level ``ordered_to_initial_org`` and
        ``ordered_to_initial_user`` dictionaries. The values of ``org_lookup``
        must be strings, since they are joined with ``' '``.

    Returns
    -------
    list of [user, str]
        One ``[user, "org org ..."]`` pair per user, in the order in which the
        grouped result lists the users.
    """
    if org_lookup is None:
        org_lookup = ordered_to_initial_org
    if user_lookup is None:
        user_lookup = ordered_to_initial_user

    # Index entries are (user, row label) pairs, grouped by user and ordered
    # best-first inside each user.
    top_index = review_batch.groupby('ordered_id_user').rating.nlargest(top_n).index

    # Collect per user instead of cutting the index into fixed chunks of
    # top_n entries: with fixed chunks, a single user having fewer candidates
    # would shift every later chunk and mix organisations between users.
    orgs_by_user = {}
    for user_id, row_label in top_index:
        # Scalar lookup; avoids materialising a whole row just to read one cell.
        org_id = review_batch.at[row_label, 'ordered_id_org']
        orgs_by_user.setdefault(user_id, []).append(org_lookup[org_id])

    return [[user_lookup[user_id], ' '.join(orgs)]
            for user_id, orgs in orgs_by_user.items()]

def eval_single_batch(machine, index):
    """Score one block of 100 test users and return their recommendations.

    The block's users are paired with every organisation whose ``other_city``
    equals the user's ``city``. The candidate pairs are scored with
    ``machine.predict_proba`` on the feature matrix stored for this batch, and
    the scores are passed on to ``get_recommendations`` as the ``rating``
    column.

    Parameters
    ----------
    machine : fitted model exposing ``predict_proba``
    index : int
        Zero-based batch number; selects both the slice of ``test_users`` and
        the ``batch{index}.npz`` file.

    Returns
    -------
    Whatever ``get_recommendations`` returns for the scored candidates.
    """
    first_row = index * 100
    batch_users = test_users.iloc[first_row:first_row + 100]
    org_columns = orgs_ordered[['other_city', 'ordered_id']]
    candidates = batch_users.merge(
        org_columns,
        how='inner',
        left_on='city',
        right_on='other_city',
        suffixes=('_user', '_org'),
    )
    # NOTE(review): the stored matrix is presumably row-aligned with the merged
    # frame above -- confirm against the code that wrote the batch files.
    batch_features = sp.load_npz(f"sparse_data/submission_batches/batch{index}.npz")
    candidates['rating'] = machine.predict_proba(batch_features)
    return get_recommendations(candidates)

def eval_model(machine):
    """Score every test user in batches and return the MNAP of the result.

    Parameters
    ----------
    machine : fitted model exposing ``predict_proba``
        Passed through to ``eval_single_batch``.

    Returns
    -------
    The value returned by ``compute_mnap`` for the assembled submission frame,
    which is indexed by ``user_id`` (cast to uint64) and has one ``target``
    column.
    """
    # Must stay equal to the 100-row slices taken in eval_single_batch.
    batch_size = 100
    # Ceiling division. The previous `len // 100 + 1` asked for one extra,
    # empty batch (and its batch file) whenever the number of test users was
    # an exact multiple of the batch size.
    n_batches = (len(test_users) + batch_size - 1) // batch_size

    submissions = []
    for i in tqdm(range(n_batches)):
        submissions.extend(eval_single_batch(machine, i))

    result = pd.DataFrame(submissions, columns=['user_id', 'target']).set_index('user_id')
    result.index = result.index.astype(np.uint64)
    return compute_mnap(result)

In [9]:
def assess_model(trial):
    """Optuna objective: fit one fastFM classifier and return its MNAP.

    Every hyper-parameter is drawn from ``trial`` (``step_size`` is drawn even
    when the ALS solver is chosen, although only SGD receives it). The model
    is fitted on ``X_train`` / ``y_train`` and scored with ``eval_model``. A
    three-line record of the parameters, the score and both durations in
    minutes is appended to ``hyperparams_tune.log``.

    Parameters
    ----------
    trial : optuna trial object supplying the ``suggest_*`` methods

    Returns
    -------
    The MNAP value produced by ``eval_model``.
    """
    fit_started = time.time()

    # The order of the suggest_* calls is kept exactly as before.
    n_iter = trial.suggest_int('n_iter', 1, 100, log=True)
    model = trial.suggest_categorical('model', ['als', 'sgd'])
    rank = trial.suggest_int('rank', 2, 32, log=True)
    init_stdev = trial.suggest_float('init_stdev', 0.00001, 1, log=True)
    reg_w = trial.suggest_float('reg_w', 0.001, 100, log=True)
    reg_v = trial.suggest_float('reg_v', 0.001, 1000, log=True)
    step_size = trial.suggest_float('step_size', 0.00001, 1, log=True)

    # Arguments both solvers accept; SGD additionally takes the step size.
    shared_kwargs = dict(
        n_iter=n_iter,
        init_stdev=init_stdev,
        rank=rank,
        l2_reg_w=reg_w,
        l2_reg_V=reg_v,
    )
    if model == 'als':
        machine = als.FMClassification(**shared_kwargs)
    elif model == 'sgd':
        machine = sgd.FMClassification(step_size=step_size, **shared_kwargs)
    machine.fit(X_train, y_train)
    training_time = time.time() - fit_started

    eval_started = time.time()
    mnap = eval_model(machine)
    eval_time = time.time() - eval_started

    with open("hyperparams_tune.log", 'a') as log:
        log.writelines([
            f"Model: {model}, n_iter: {n_iter}, rank: {rank}, init_stdev: {init_stdev}, "
            f"reg_w: {reg_w}, reg_v: {reg_v}, step_size: {step_size}\n",
            f"MNAP: {mnap}\n",
            f"Time spent: {training_time / 60} {eval_time / 60}\n\n",
        ])
    return mnap

In [None]:
# Run 10 trials of a new study that maximises the value returned by assess_model.
study = optuna.create_study(direction='maximize')
study.optimize(assess_model, n_trials=10)

# One row per trial (value plus sampled parameters), indexed by trial number.
# set_index is chained rather than applied in place, and the file name is a
# plain string because it has no placeholders.
df = study.trials_dataframe(attrs=("number", "value", "params")).set_index("number")
df.to_csv("optuna.csv", index=True)

[32m[I 2021-11-26 17:34:05,552][0m A new study created in memory with name: no-name-7f04f251-2c8b-4ee5-989a-f20ca2a56c80[0m
 64%|██████▎   | 108/170 [24:06<12:05, 11.70s/it]