## メモ

回帰分析の評価指標

* https://stats.biopapyrus.jp/glm/lm-evaluation.html

ユークリッド距離 vs コサイン類似度

* https://enjoyworks.jp/tech-blog/2242

ライブラリ使った方が楽そう

* https://github.com/NicolasHug/Surprise

レコメンドサンプル

* https://github.com/NicolasHug/Surprise/blob/master/examples/top_n_recommendations.py

Python surprise で作る らくらく「レコメンドエンジン」（その１）

* https://www.salesanalytics.co.jp/datascience/datascience180/

In [None]:
! pip install surprise

In [None]:
from surprise import NMF, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split, cross_validate
import numpy as np
import pandas as pd

In [None]:
# Toy ratings table, one (item, user, rating) triple per position.
# Entries equal to 0 are unknown.
ratings_dict = dict(
    itemID=[1, 1, 1, 2, 2],
    userID=[9, 32, 2, 45, 2],
    rating=[3, 2, 4, 3, 1],
)
df = pd.DataFrame(data=ratings_dict)

In [None]:
# Display the toy ratings table.
print(df)

In [None]:
# Wrap the toy table in a surprise Dataset and pick NMF as the model.
# Reader() is used with its defaults; load_from_df takes the columns in
# (user, item, rating) order, hence the explicit column selection.
reader = Reader()
algo = NMF()
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

In [None]:
# 5-fold cross-validation reporting RMSE and MAE. The toy table holds only
# five ratings, so each fold is evaluated on a single rating.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

---------

In [None]:
! pip install surprise

In [None]:
from surprise import NMF, SVD, SVDpp, Dataset, accuracy, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from google.colab import drive
from collections import defaultdict
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

In [None]:
# Load the three CSV exports (matrix, user map, game map) as UTF-8.
# The paths are relative, so the working directory has to be the one that
# contains the `drive/` mount point.
scores_df = pd.read_csv(
    'drive/My Drive/dev/20230424_recommend_erogame/userbase_matrix.csv',
    encoding='utf-8',
)
user_df = pd.read_csv(
    'drive/My Drive/dev/20230424_recommend_erogame/userbase_user_map.csv',
    encoding='utf-8',
)
game_df = pd.read_csv(
    'drive/My Drive/dev/20230424_recommend_erogame/userbase_game_map.csv',
    encoding='utf-8',
)

In [None]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best-rated items.

    Args:
        predictions: Iterable of 5-element records that unpack as
            ``(uid, iid, true_r, est, details)``. Only ``uid``, ``iid`` and
            ``est`` are used.
        n(int): Maximum number of items kept per user. Default is 10.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` tuples, ordered from the highest
        estimate to the lowest and holding at most ``n`` entries.
    """
    # Pass 1: bucket every (item, estimate) pair under its user.
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Pass 2: rank each bucket by estimate and truncate it. The sort is
    # stable, so equal estimates keep the order in which they arrived.
    for user_id in per_user:
        ranked = sorted(
            per_user[user_id],
            key=lambda pair: pair[1],
            reverse=True,
        )
        per_user[user_id] = ranked[:n]

    return per_user

In [None]:
def get_info(list, target_user_index):
    """Print one user's profile link followed by their recommended games.

    Args:
        list: Mapping from a user key to a sequence of
            ``(item key, estimated rating)`` entries, e.g. the result of
            ``get_top_n``. The name shadows the built-in ``list``; it is kept
            so that existing callers passing it by keyword keep working.
        target_user_index: Used both as the row position in ``user_df`` and
            as the key into ``list``.
            NOTE(review): this assumes the ids in the ratings data are row
            positions of ``user_df`` / ``game_df`` -- confirm against the CSVs.

    Returns:
        None. Everything is written to stdout.
    """
    # Resolve the row once and stringify explicitly, so a numeric uid no
    # longer breaks the string building (game ids were already str()-wrapped).
    uid = str(user_df.iloc[target_user_index].uid)
    print(
        f"user_id : {uid} - "
        "https://erogamescape.dyndns.org/~ap2/ero/toukei_kaiseki/"
        f"user_infomation.php?user={uid}"
    )

    for entry in list[target_user_index]:
        item_index, estimate = entry[0], entry[1]
        # One lookup per game instead of two identical ones.
        game_id = str(game_df.iloc[item_index].game_id)
        print(
            f"game_id : {game_id} - {estimate!s} - "
            "https://erogamescape.dyndns.org/~ap2/ero/toukei_kaiseki/"
            f"game.php?game={game_id}"
        )

In [None]:
# Peek at the uid stored in the first row of the user map.
print(user_df.iloc[0].uid)

In [None]:
# Reader() on its own assumes ratings lie in 1..5, and fitted models clip
# their estimates to the reader's scale. Take the scale from the data so
# estimates are not squeezed into the wrong range.
# NOTE(review): assumes `score` is numeric with no missing values -- confirm.
reader = Reader(
    rating_scale=(
        float(scores_df["score"].min()),
        float(scores_df["score"].max()),
    )
)
# load_from_df takes the columns in (user, item, rating) order.
data = Dataset.load_from_df(scores_df[["uid", "game_id", "score"]], reader)

In [None]:
# パラメータの調整
param_grid = {"n_factors":[20, 50, 100], "n_epochs": [10, 25, 50], "lr_all": [0.002, 0.005 ,0.01]}
gs = GridSearchCV(SVDpp, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

In [None]:
# SVD++ with fixed hyper-parameters.
# NOTE(review): presumably the winning values from the grid search -- confirm.
algo = SVDpp(
    n_factors=20,
    n_epochs=25,
    lr_all=0.005,
)

In [None]:
# 5-fold cross-validation of the configured model, reporting RMSE and MAE.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

In [None]:
# Train on every known rating (no hold-out split) ...
trainset = data.build_full_trainset()
algo.fit(trainset)

# ... then estimate a rating for the (user, item) pairs that the anti-testset
# yields, i.e. pairs with no rating in the trainset.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [None]:
# Keep at most 10 best-estimated items per user (get_top_n's default n).
top_10 = get_top_n(predictions)

In [None]:
# Print the recommendations stored under key 300 together with row 300 of user_df.
get_info(top_10, 300)