In [None]:
import pandas as pd
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load both splits. The first CSV column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
train_data = (
    pd.read_csv("static/data/train_data.csv", index_col=0)
    .reset_index(drop=True)
)
test_data = (
    pd.read_csv("static/data/test_data.csv", index_col=0)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the training table (rendered as the cell output).
train_data.head()

In [None]:
# Ground truth: one row per test user, with every product_id that appears for
# that user in test_data collected into a list under the 'recs' column.
true_recs = (
    test_data
    .groupby('user_id')['product_id']
    .apply(list)
    .reset_index(name='recs')
)
true_recs

In [None]:
# Distinct user ids present in each split.
train_users = set(train_data['user_id'])
test_users = set(test_data['user_id'])

In [None]:
# Distinct product ids present in each split.
train_products = set(train_data['product_id'])
test_products = set(test_data['product_id'])

In [None]:
# Union of ids across both splits. The iteration order of these sets is what
# the recommender later enumerates to assign matrix positions, so the
# expressions are intentionally left as-is.
all_user_ids = train_users | set(test_users)
all_product_ids = set(train_products) | set(test_products)

In [None]:
class BaseRecommender:
  """Common base for recommenders.

  Holds the id <-> matrix-position mappings, builds the sparse user x product
  interaction matrix and provides ranking metrics (HitRate@k, MAP@k, NDCG@k).
  ``fit`` and ``predict`` are no-op placeholders meant to be overridden.
  """

  def __init__(self):
    self.model = None
    self.user_pos = {}      # user id -> row position
    self.pos_user = {}      # row position -> user id
    self.product_pos = {}   # product id -> column position
    self.pos_product = {}   # column position -> product id

  def _set_user_pos_mapping(self, all_user_ids):
    """Give every user id a position, following the iteration order of ``all_user_ids``."""
    self.user_pos = {user: idx for idx, user in enumerate(all_user_ids)}
    self.pos_user = {idx: user for user, idx in self.user_pos.items()}

  def _set_product_pos_mapping(self, all_product_ids):
    """Give every product id a position, following the iteration order of ``all_product_ids``."""
    self.product_pos = {product: idx for idx, product in enumerate(all_product_ids)}
    self.pos_product = {idx: product for product, idx in self.product_pos.items()}

  def _get_counts_df(self, df, column_names):
    """Count non-null ``group_id`` values per (user, item) pair.

    ``column_names`` is a dict with the keys 'user', 'item' and 'group_id'
    naming the corresponding columns of ``df``. The result has one row per
    (user, item) pair and a ``count`` column.
    """
    return (df
            .groupby([column_names['user'], column_names['item']])[[column_names['group_id']]]
            .count()
            .reset_index()
            .rename(columns={column_names['group_id']: "count"}))

  def get_interactions_matrix(self, train_data, column_names):
    """Build the sparse (n_users x n_products) matrix of interaction counts.

    Every user/product in ``train_data`` must already be present in the
    position mappings; an unknown id raises ``KeyError``.
    """
    train_data_count = self._get_counts_df(train_data, column_names)
    rows = [self.user_pos[user] for user in train_data_count[column_names['user']].values]
    cols = [self.product_pos[product] for product in train_data_count[column_names['item']].values]
    return csr_matrix((train_data_count['count'].values, (rows, cols)),
                      shape=(len(self.pos_user), len(self.pos_product)))

  def fit(self):
    """Placeholder; subclasses provide the training logic."""
    pass

  def predict(self):
    """Placeholder; subclasses provide the recommendation logic."""
    pass

  def _p_k(self, y_true, y_pred, k):
    """Share of the first ``k`` positions covered by items common to ``y_pred[:k]`` and ``y_true[:k]``."""
    return len(np.intersect1d(y_pred[:k], y_true[:k])) / k

  def map_k(self, y_true, y_pred, k):
    """Average precision at ``k`` for a single user.

    For every relevant item at rank ``r`` in ``y_pred[:k]`` the precision at
    ``r`` is accumulated; the sum is divided by ``k``. Precision and relevance
    are both judged against the same relevant set, and a prediction list
    shorter than ``k`` is handled without raising.

    NOTE(review): the relevant set is ``y_true[:k]`` (ground truth truncated to
    its first k entries), as in the original implementation - confirm this
    truncation is intended.
    """
    relevant = set(y_true[:k])
    found = set()  # distinct relevant items seen so far (duplicates count once)
    total = 0.0
    for rank, item in enumerate(y_pred[:k], start=1):
      if item in relevant:
        found.add(item)
        total += len(found) / rank
    return total / k

  def hit_rate_k(self, y_true, y_pred, k):
    """Return 1 if any of the first ``k`` predictions is among ``y_true[:k]``, else 0."""
    relevant = set(y_true[:k])
    return 1 if any(item in relevant for item in y_pred[:k]) else 0

  def ndcg_k(self, y_true, y_pred, k):
    """Normalised discounted cumulative gain at ``k`` with binary relevance.

    A relevant item at rank ``r`` (1-based) contributes ``1 / log2(r + 1)``.
    The ideal DCG assumes ``k`` relevant items, matching the normaliser the
    original code used. A prediction list shorter than ``k`` is handled
    without raising.

    NOTE(review): relevance is judged against ``y_true[:k]``, as in the
    original implementation - confirm this truncation is intended.
    """
    relevant = set(y_true[:k])
    dcg_k = sum(1.0 / np.log2(rank + 1)
                for rank, item in enumerate(y_pred[:k], start=1)
                if item in relevant)
    idcg_k = np.sum([1 / np.log2(i + 2) for i in range(k)])
    return dcg_k / idcg_k

  def avg_metric(self, y_true, y_pred, metric, k=None):
    """Average ``metric`` over all users present in ``y_pred``.

    ``y_true`` and ``y_pred`` are DataFrames with ``user_id`` and ``recs``
    columns. Users without ground truth get an empty true list.
    """
    merged = pd.merge(y_true, y_pred, on='user_id', how='right', suffixes=('_true', '_pred'))
    # A right join leaves NaN for users missing from y_true; treat that as "no true items".
    true_lists = merged['recs_true'].apply(lambda x: x if isinstance(x, list) else [])
    scores = [metric(true_items, pred_items, k)
              for true_items, pred_items in zip(true_lists, merged['recs_pred'])]
    return np.mean(scores)

  def print_metrics(self, true_res, predicted_res, k_values=(3, 5, 10)):
    """Return a text report with HitRate@k, MAP@k and NDCG@k, one line per ``k``."""
    lines = []
    for k in k_values:
      hit_rate_avg = round(self.avg_metric(true_res, predicted_res, self.hit_rate_k, k), 3)
      map_avg = round(self.avg_metric(true_res, predicted_res, self.map_k, k), 3)
      ndcg_avg = round(self.avg_metric(true_res, predicted_res, self.ndcg_k, k), 3)
      lines.append(f'HitRate@{k}: {hit_rate_avg}; MAP@{k}: {map_avg}; NDCG@{k}: {ndcg_avg};')
    return ''.join(line + '\n' for line in lines)
    

In [None]:
class SvdRecommender(BaseRecommender):
  """Recommender that scores products via a truncated SVD of the interaction matrix."""

  def __init__(self, user_ids, product_ids, n_components=500):
    """Set up the id <-> position mappings for the given users and products.

    ``n_components`` is forwarded to ``TruncatedSVD`` when ``fit`` is called.
    """
    # Initialise the base attributes (notably ``self.model``) before anything else.
    super().__init__()
    self._set_user_pos_mapping(user_ids)
    self._set_product_pos_mapping(product_ids)
    self.n_components = n_components
    self.users_repres = None
    self.products_repres = None

  def fit(self, train_data, column_names):
    """Factorise the interaction matrix built from ``train_data``.

    Stores the transformed user matrix in ``users_repres`` and the SVD
    components in ``products_repres``, prints both shapes and returns the
    fitted ``TruncatedSVD`` model.
    """
    interactions_matrix = self.get_interactions_matrix(train_data, column_names)
    self.model = TruncatedSVD(random_state=0, n_components=self.n_components)
    self.users_repres = self.model.fit_transform(interactions_matrix)
    self.products_repres = self.model.components_
    print(f'User representaions have size: {self.users_repres.shape}')
    print(f'Item representaions have size: {self.products_repres.shape}')
    return self.model

  def predict(self, test_users, n_recs, batch_number=100):
    """Recommend ``n_recs`` products for every user in ``test_users``.

    Users are processed in at most ``batch_number`` batches to bound the size
    of the dense score matrix. Returns a DataFrame with ``user_id`` and
    ``recs`` columns sorted by ``user_id``; it is empty (with the same
    columns) when no users are given.
    """
    test_users = list(test_users)
    if not test_users:
      # from_records([]) has no 'user_id' column, so sort_values would raise.
      return pd.DataFrame(columns=['user_id', 'recs'])

    # Never ask for more batches than users, so no batch is empty.
    n_batches = max(1, min(batch_number, len(test_users)))
    result_list = []
    for batch in np.array_split(test_users, n_batches):
      # Pass the caller's n_recs through (previously a hard-coded 10).
      result_list += self._predict_batch(batch, n_recs)
    return pd.DataFrame.from_records(result_list).sort_values('user_id').reset_index(drop=True)

  def _predict_batch(self, test_users_batch, n_recs):
    """Return a list of ``{'user_id', 'recs'}`` dicts for one batch of users."""
    recs = []
    batch_users_pos = [self.user_pos[user] for user in test_users_batch]
    # Dense (batch x n_products) score matrix.
    batch_user_ratings = np.dot(self.users_repres[batch_users_pos, :], self.products_repres)
    # Ascending argsort reversed gives highest-scored products first; keep the top n_recs.
    sorted_recs = batch_user_ratings.argsort()[:, ::-1][:, :n_recs]
    for i, user in enumerate(batch_users_pos):
      recs_dict = {'user_id': self.pos_user[user],
                   'recs': [self.pos_product[rec] for rec in sorted_recs[i, :]]}
      recs.append(recs_dict)

    return recs

In [None]:
# Column roles expected by the recommender: who interacted, with what, and
# which column is counted to measure interaction strength.
svd_columns = dict(
    user='user_id',
    item='product_id',
    group_id='order_id',
)

In [None]:
# Register ids from both splits so every test user/product has a matrix position.
svd_recommender = SvdRecommender(
    user_ids=all_user_ids,
    product_ids=all_product_ids,
    n_components=500,
)

In [None]:
# Fit the SVD on the training interactions (the fitted model is the cell output).
svd_recommender.fit(train_data=train_data, column_names=svd_columns)

In [None]:
# Produce recommendations for every user that appears in the test split.
svd_recs = svd_recommender.predict(
    test_users=list(test_users),
    n_recs=10,
)
svd_recs

In [None]:
# Build the text report of the ranking metrics for the default k values.
metrics = svd_recommender.print_metrics(
    true_res=true_recs,
    predicted_res=svd_recs,
)

In [None]:
# print() is used on purpose so the embedded newlines render as separate lines.
print(metrics)

In [None]:
# Persist the metrics report; the file is overwritten on every run.
with open("static/metrics.txt", "w") as text_file:
    text_file.write(metrics)