In [28]:
import json
import os
import typing as tp
from datetime import date, datetime

import numpy as np
import optuna
import pandas as pd
# from common_metrics.metrics.recsys import MAP, HitRate, NDCG, PrecisionRecall
from loguru import logger
from scipy import sparse
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [29]:
train1 = pd.read_csv('../data/train1level.csv')
test1 = pd.read_csv('../data/test1level.csv')
holdout1 = pd.read_csv('../data/holdout1level.csv')

In [30]:
# train1 = train1[train1.rating >= 3]

In [31]:
train1.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,0,120,5.0,838985046
1,0,183,5.0,838983525
2,0,228,5.0,838983392
3,0,289,5.0,838983421
4,0,313,5.0,838983392


In [32]:
test1.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,0,581,3.0,1208670024
1,0,9753,5.0,1208669800
2,0,9435,2.0,1208669698
3,0,9392,5.0,1208669977
4,0,9391,0.5,1208669967


In [33]:
holdout1.head()

Unnamed: 0,userid,movieid,rating,timestamp
0,0,6157,3.0,1208670181
1,1,4448,4.0,1199221172
2,2,9503,4.5,1188713380
3,3,5907,3.0,1190232446
4,4,7635,2.5,1192190218


In [34]:
items = train1.movieid.unique()
test1 = test1[test1.movieid.isin(items)]
holdout1 = holdout1[holdout1.movieid.isin(items)]
users_test = test1.userid.unique()
holdout1 = holdout1[holdout1.userid.isin(users_test)]

In [35]:
from sklearn import preprocessing
item_encoder =  preprocessing.LabelEncoder()
train1.movieid = item_encoder.fit_transform(train1.movieid)
test1.movieid = item_encoder.transform(test1.movieid)
holdout1.movieid = item_encoder.transform(holdout1.movieid)

train_user_encoder = preprocessing.LabelEncoder()
test_hold_u_encod = preprocessing.LabelEncoder()
train1.userid = train_user_encoder.fit_transform(train1.userid)
test1.userid = test_hold_u_encod.fit_transform(test1.userid)
holdout1.userid = test_hold_u_encod.transform(holdout1.userid)

In [36]:
traint1 = train1.reset_index(drop=True)
test1 = test1.reset_index(drop=True)
holdout1 = holdout1.reset_index(drop=True)

In [37]:
def load_train_data(
        train_data: pd.DataFrame
) -> sparse.csr_matrix:
    """
    Creates csr_matrix for train
    """
    
    n_items = max(train_data.movieid) + 1
    n_users = max(train_data.userid) + 1
    rows, cols = train_data["userid"], train_data["movieid"]
    data = sparse.csr_matrix((np.ones_like(rows), (rows, cols)), dtype="float64", shape=(n_users, n_items))
    return data

In [38]:
# n_items = max(max(train1.movieid), max(test1.movieid)) + 1
# n_users = max(max(train1.userid), max(test1.userid)) + 1

data = load_train_data(train1)
test = load_train_data(test1)

logger.info("data ready")

2022-07-13 16:07:33.276 | INFO     | __main__:<cell line: 7>:7 - data ready


In [39]:
assert data.shape[1] == test.shape[1]

In [40]:
def tune(
     train: sparse.csr_matrix, test: np.ndarray, holdout: pd.DataFrame,
    n_trials_ease: int = 25
) -> tp.Tuple[np.matrix, int]:
    logger.info("start tuning")

    def objective(trial):
        params_for_tune = {
            "reg_weight": trial.suggest_int("reg_weight", low=1, high=500)
        }
    
        item_matrix = fit_ease(train, params_for_tune["reg_weight"])
        scores = predict(test.toarray(), item_matrix)
        metric = HR(scores, holdout)
        return metric

    study = optuna.create_study(study_name="ease", direction="maximize")
    study.optimize(objective, n_trials=n_trials_ease, n_jobs=1)
    logger.info("end tuning")
    trial = study.best_trial
    reg_weight = trial.params['reg_weight']
    item_matrix = fit_ease(train, reg_weight)
    return item_matrix, reg_weight

In [41]:
def fit_ease(
    train: sparse.csr_matrix, reg_weight: int
) -> np.ndarray:
    """
    Counts item_matrix for future predictions
    """
    logger.info("start fitting")
    X = train
#     if cfg.norm:
    logger.info("normalize")
    X = normalize(X, norm="l2", axis=1)
    X = normalize(X, norm="l2", axis=0)
    X = sparse.csr_matrix(X)
    # gram matrix
    logger.info("gram matrix")
    G = X.T @ X
    # add reg to diagonal
    G += reg_weight * sparse.identity(G.shape[0])
    # convert to dense because inverse will be dense
    G = G.todense()
    # invert. this takes most of the time
    logger.info("invert")
    P = np.linalg.inv(G)
    B = P / (-np.diag(P))
    # zero out diag
    np.fill_diagonal(B, 0.0)
    item_matrix = B
    return item_matrix

In [42]:
def predict(
        data: np.ndarray, item_matrix: np.ndarray, remove_seen: bool = True
) -> np.ndarray:
    """
    Counts scores
    """
    scores = data.dot(item_matrix)
#     if remove_seen:
#         scores[data > 0] = -1e13
    return scores

In [None]:
item_matrix = fit_ease(data, reg_weight=367)
scores = predict(test.toarray(), item_matrix)

# with tuning
# item_matrix, reg_weight = tune(data, test, holdout1)
# scores = predict(test.toarray(), item_matrix)

2022-07-13 16:07:33.306 | INFO     | __main__:fit_ease:7 - start fitting
2022-07-13 16:07:33.308 | INFO     | __main__:fit_ease:10 - normalize
2022-07-13 16:07:33.642 | INFO     | __main__:fit_ease:15 - gram matrix


In [None]:
def HR(scores, holdout1, count_of_rec = 10):
    count_of_true_rec = 0
    pred_array = np.argsort(-scores)[:, :count_of_rec]
    for index, row in holdout1.iterrows():
        movie = row.movieid
#         print(movie)
        recommend = pred_array[index]
        if movie in recommend:
            count_of_true_rec += 1
    return count_of_true_rec / len(holdout1)

In [27]:
hr = HR(scores, holdout1)
hr

0.02117783579924572

In [23]:
# def get_recommend(
#     scores: np.matrix,
#     count_of_rec: int = 10
# ) -> None:
#     """
#     Writes recommendations in json file
#     """
#     logger.info("start creating recommend")
#     scores = np.squeeze(np.asarray(scores))
#     recommend = {}

#     list_rec = np.argsort(-scores)[:, :count_of_rec]

#     users = test1.userid.unique()
#     items = list_rec

# #     for i in range(len(list_rec)):
# #         items.append([*map(id2item.get, list(list_rec[i]))])
#     recommend = dict(zip(users, items))
# #     path_to_data = os.path.join(cfg.json_path, "ease")
# #     if not os.path.exists(path_to_data):
# #         os.makedirs(path_to_data)
# #     with open(os.path.join(path_to_data, "recommend.json"), "w", encoding="utf-8") as f:
# #         json.dump(recommend, f, ensure_ascii=False, indent=2)
# #     logger.info("recommendations are ready")
#     return recommend

In [24]:
# recommend = get_recommend(scores)