In [1]:
%config Completer.use_jedi = False

In [2]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from scipy import sparse as sp
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
import datetime

from scripts.multiple_logging import setup_logger
from scripts.utils import convert_ids_to_ordered, MovingAverage
from scripts.mnap import compute_mnap

In [3]:
X_train = sp.load_npz("sparse_data/X_train.npz")
X_test = sp.load_npz("sparse_data/X_test.npz")
y_train = pd.read_csv('sparse_data/y_train.csv')
y_test = pd.read_csv("sparse_data/y_test.csv")

In [4]:
y_train = 2 * (y_train.rating >=4).astype(int) - 1
y_test = 2 * (y_test.rating >= 4).astype(int) - 1

## Train fastFM

In [5]:
from fastFM import mcmc, als, sgd
from sklearn.metrics import mean_squared_error

In [6]:
rank = 4
seed = 15
step_size = 0.3
init_stdev = 0.1
l2_reg_w = 0.1
l2_reg_V = 0.1

In [7]:
# baseline to outperform
np.sqrt(np.mean((y_test - np.mean(y_train))**2))

0.7685089945056228

### SGD

In [8]:
# rmse_test_re = []
# rmse_train_re = [0]
# iterations = range(1, 2000, 50)
# for i in iterations:
#     fm = sgd.FMRegression(n_iter=i, l2_reg_w=l2_reg_w,l2_reg_V=l2_reg_V, rank=rank, random_state=seed, step_size=step_size, init_stdev=init_stdev)
#     rmse_test_re.append(np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))
#     rmse_train_re.append(np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))
#     print(rmse_train_re[-1], rmse_test_re[-1])

### MCMC

In [19]:
machine = mcmc.FMClassification(n_iter=20, rank=12, init_stdev=init_stdev)

In [20]:
rmse_test_re = []
machine.fit_predict(X_train, y_train, X_test)
for i in range(1):
    y_pred = machine.fit_predict(X_train, y_train, X_test)
    rmse_test_re.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    print(rmse_test_re[-1])

0.847867918789338


### ALS

In [21]:
from fastFM import als

In [22]:
machine = als.FMClassification(n_iter=0, init_stdev=0.1, rank=3, l2_reg_w=1, l2_reg_V=5)
machine.fit(X_train, y_train)

FMClassification(l2_reg_V=5, l2_reg_w=1, n_iter=0, rank=3)

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
accuracy_score(y_test, [1] * len(y_test))

0.820024034026597

In [25]:
rmse_test_metric = []
rmse_train_metric = []
for i in range(20):
    rmse_test_metric.append(accuracy_score(machine.predict(X_test) > 0.5, y_test))
    rmse_train_metric.append(accuracy_score(machine.predict(X_train) > 0.5, y_train))
    print(rmse_train_metric[-1], rmse_test_metric[-1])
    machine.fit(X_train, y_train, n_more_iter=1)

0.7215519713013884 0.717154743562255
0.8159301258065971 0.789087288725781
0.819178074977032 0.7909970464408532
0.8199276017086707 0.7903870542028798
0.820374618529736 0.7900097565515992
0.8206007603236033 0.7897255367349958
0.8207357212195067 0.7895510157949761
0.820834143921641 0.7893316180418085
0.8209019535425096 0.7891820286646489
0.820957912938372 0.7889609688072906
0.8209766757946316 0.7887565299918391
0.8210145306800679 0.7885155248841929
0.8210283559425752 0.7883609491944612
0.8210477771446686 0.7881914145670136
0.8210606148884252 0.7880534599191885
0.821083327819687 0.7878872495001221
0.821091557142608 0.7877243632894372
0.8210787193988512 0.7875481802452269
0.8210767443613503 0.7874185361183551
0.8210849736842711 0.7873138235543433


### Make predictions on submit dataset

In [26]:
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

reviews = reviews[reviews.rating.notna()]
reviews['rating'] = reviews['rating'].apply(int)
reviews['org_id'] = reviews['org_id'].astype(str)
reviews['user_id'] = reviews['user_id'].astype(str)
organizations.index = organizations.index.astype(str)
users.index = users.index.astype(str)
test_users.index = test_users.index.astype(str)

users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)
n_users = len(users_ordered)
n_orgs = len(orgs_ordered)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [27]:
test_users = test_users.join(users_ordered)
msk_mask = np.array(orgs_ordered.city == 'msk')
orgs_ordered['other_city'] = orgs_ordered['city'].apply(lambda x: 'spb' if x == 'msk' else 'msk')

In [28]:
ordered_to_initial_org = {}
for i in range(len(orgs_ordered)):
    ordered_to_initial_org[i] = orgs_ordered['ordered_id'].index[i]
    
ordered_to_initial_user = {}
for i in range(len(users_ordered)):
    ordered_to_initial_user[i] = users_ordered['ordered_id'].index[i]

In [29]:
def get_recommendations(review_batch):
    recommendations = []
    index = submission_review_batch.groupby('ordered_id_user').rating.nlargest(20).index
    for j in range(len(index)//20):
        a = index[j*20:(j+1)*20]
        array = np.array(list(map(list, a)))
        recommended_organizations = []
        for line_index in array[:, 1]:
            org = submission_review_batch.loc[line_index].ordered_id_org
            recommended_organizations.append(ordered_to_initial_org[org])
        user_recommendation = ' '.join(recommended_organizations)
        user = ordered_to_initial_user[array[0, 0]]
        recommendations.append([user, user_recommendation])
    return recommendations

In [30]:
submissions = []
for i in tqdm(range(len(test_users) // 100 + 1)):
    submission_review_batch = test_users.iloc[i*100:(i+1)*100].merge(
        orgs_ordered[['other_city', 'ordered_id']], how='inner',
        left_on='city', right_on='other_city', suffixes=('_user', '_org'))
    x_submit = sp.load_npz(f"sparse_data/submission_batches/batch{i}.npz")
    
    values = machine.predict(x_submit)
    submission_review_batch['rating'] = values
    submissions.extend(get_recommendations(submission_review_batch))

100%|██████████| 170/170 [08:53<00:00,  3.14s/it]


In [31]:
from datetime import datetime

In [33]:
result = pd.DataFrame(submissions, columns = ['user_id', 'target'])
result = result.set_index('user_id')
result.index = result.index.astype(np.uint64)
result.to_csv(f'submissions/libFM_{datetime.now()}.csv')

In [None]:
from scripts.mnap import compute_mnap
res = compute_mnap(result)
res