In [4]:
%config Completer.use_jedi = False

In [5]:
import os
import sys
import time
import pandas as pd
import numpy as np
import scipy
from matplotlib import pyplot as plt
import pickle
from tqdm import tqdm
from linetimer import CodeTimer
import datetime

from scripts.multiple_logging import setup_logger
from scripts.utils import convert_ids_to_ordered, MovingAverage
from scripts.mnap import compute_mnap
tqdm.pandas()

In [6]:
aspects = pd.read_csv('data/aspects.csv').set_index("aspect_id")
features = pd.read_csv('data/features.csv').set_index('feature_id')
organizations = pd.read_csv('data/organisations.csv').set_index('org_id')
reviews = pd.read_csv('data/reviews.csv')
rubrics = pd.read_csv('data/rubrics.csv').set_index('rubric_id')
test_users = pd.read_csv('data/test_users.csv').set_index('user_id')
users = pd.read_csv('data/users.csv').set_index('user_id')

  exec(code_obj, self.user_global_ns, self.user_ns)


#### Preprocessing the reviews

In [7]:
reviews = reviews[reviews.rating.notna()]
reviews['rating'] = reviews['rating'].apply(int)
reviews['org_id'] = reviews['org_id'].astype(str)
reviews['user_id'] = reviews['user_id'].astype(str)
organizations.index = organizations.index.astype(str)
users.index = users.index.astype(str)
test_users.index = test_users.index.astype(str)
rubrics.index = rubrics.index.astype(str)
aspects.index = aspects.index.astype(str)
### DONT FORGET TO CONVERT OTHER DATAFRAMES TO STR

In [8]:
rated = reviews.groupby('user_id').rating.count()
good_users = rated[rated > 10].index
rated_orgs = reviews.groupby('org_id').rating.count()
good_orgs = rated_orgs[rated_orgs > 10].index

In [9]:
cleaned_reviews = reviews.query('user_id in @good_users and org_id in @good_orgs')

In [10]:
users_ordered, orgs_ordered, reviews_ordered = convert_ids_to_ordered(users, organizations, reviews)
n_users = len(users_ordered)
n_orgs = len(orgs_ordered)

In [11]:
orgs_rubrics = orgs_ordered[['rubrics_id']]
for i in range(len(rubrics)):
    orgs_rubrics[rubrics.rubric_name.iloc[i]] = orgs_rubrics['rubrics_id'].apply(lambda x: str(rubrics.index[i]) in x.split())
orgs_rubrics = orgs_rubrics.drop('rubrics_id', axis=1)
orgs_rubrics = orgs_rubrics.join(orgs_ordered['ordered_id']).set_index('ordered_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orgs_rubrics[rubrics.rubric_name.iloc[i]] = orgs_rubrics['rubrics_id'].apply(lambda x: str(rubrics.index[i]) in x.split())


In [12]:
orgs_features = orgs_ordered[['features_id']].astype(str)
for i in range(len(features)):
    orgs_features[features.feature_name.iloc[i]] = orgs_features['features_id'].apply(lambda x: str(features.index[i]) in x.split())
orgs_features = orgs_features.drop('features_id', axis=1)
orgs_features = orgs_features.join(orgs_ordered['ordered_id']).set_index('ordered_id')

In [13]:
validation_split_day = 1050
train_reviews = reviews_ordered[reviews_ordered.ts < validation_split_day]
test_reviews = reviews_ordered[reviews_ordered.ts >= validation_split_day]

#### Convert to sparce matrix

In [14]:
import scipy.sparse as sp
last_user = max(reviews_ordered.ordered_id_user)

def sp_indeces(df):
    df['user_index'] = df['ordered_id_user']
    df['org_index'] = df['ordered_id_org'] + last_user + 1
    

sp_indeces(train_reviews)
sp_indeces(test_reviews)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_index'] = df['ordered_id_user']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['org_index'] = df['ordered_id_org'] + last_user + 1


In [15]:
def make_sparse(df):
    length = len(df)
    coo = sp.coo_matrix(
        ([1] * 2 * length, 
        (
            list(range(length)) + list(range(length)),
            list(df['org_index']) + list(df['user_index'])
        )),
        shape=(length, n_users+n_orgs)
    )
    sparse_rubrics = sp.coo_matrix(df.join(orgs_rubrics, on='ordered_id_org').iloc[:, 7:].astype(float))
    sparse_features = sp.coo_matrix(df.join(orgs_features, on='ordered_id_org').iloc[:, 7:].astype(float))
    return coo
#     return sp.hstack([coo, sparse_rubrics, sparse_features])
    
X_train = make_sparse(train_reviews)
X_test = make_sparse(test_reviews)
y_train = train_reviews['rating']
y_test = test_reviews['rating']

## Train fastFM

In [34]:
import fastFM as fm
from fastFM import mcmc, als, sgd
from sklearn.metrics import mean_squared_error

In [35]:
rank = 2
seed = 15
step_size = 0.3
init_stdev = 0.1
l2_reg_w = 0.1
l2_reg_V = 0.1

### SGD

In [31]:
# rmse_test_re = []
# rmse_train_re = [0]
# iterations = range(1, 2000, 50)
# for i in iterations:
#     fm = sgd.FMRegression(n_iter=i, l2_reg_w=l2_reg_w,l2_reg_V=l2_reg_V, rank=rank, random_state=seed, step_size=step_size, init_stdev=init_stdev)
#     rmse_test_re.append(np.sqrt(mean_squared_error(fm.predict(X_test), y_test)))
#     rmse_train_re.append(np.sqrt(mean_squared_error(fm.predict(X_train), y_train)))
#     print(rmse_train_re[-1], rmse_test_re[-1])

### MCMC

In [37]:
machine = mcmc.FMRegression(n_iter=0, rank=rank, init_stdev=init_stdev)

In [38]:
rmse_test_re = []
machine.fit_predict(X_train, y_train, X_test)
for i in range(10):
    y_pred = machine.fit_predict(X_train, y_train, X_test, n_more_iter=5)
    rmse_test_re.append(np.sqrt(mean_squared_error(y_pred, y_test)))
    print(rmse_test_re[-1])

4.534206714930456
4.524010299086885
4.519508027946975
4.517143260742467


KeyboardInterrupt: 

In [21]:
np.sqrt(np.mean((y_test - np.mean(y_train))**2))

1.2059134375768787

### ALS

In [22]:
from fastFM import als

In [28]:
rmse_test_re = []
machine = als.FMRegression(n_iter=100, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)

In [29]:
machine.fit(X_train, y_train)
y_pred = machine.predict(X_test)
np.sqrt(mean_squared_error(y_pred, y_test))

4.793449422626955

In [None]:
from matplotlib import pyplot as plt

x = list(iterations)

with plt.style.context('fivethirtyeight'):
    plt.plot(x, rmse_train_re, label='train')
    plt.plot(x, rmse_test_re, label='test')
plt.legend()
plt.show()