# MF

## 方針

* 学習データに登場しない(未知の)ユーザー(約20%)に対しては、階層ベイズでanimeの平均を割り当てる。
* その他のユーザーについては、MFで普通に推薦。

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
import seaborn as sns
from plotly.offline import init_notebook_mode

# Initialise plotly's offline notebook rendering.
init_notebook_mode(connected=True)

# Display limits for pandas and polars output (no column truncation, capped rows).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 50)
pl.Config.set_tbl_cols(999)
pl.Config.set_tbl_rows(30)

# plt.style.use("ggplot")
# Apply seaborn's default theme first, then override the font settings below.
sns.set()
# presumably chosen so that Japanese labels render in figures — requires the font to be installed
plt.rcParams["font.family"] = ["Noto Sans JP"]
plt.rcParams["mathtext.fontset"] = "cm"
# plt.rcParams["font.size"] = 10.5

# Render matplotlib figures inline, using the "retina" figure format.
%matplotlib inline
%config InlineBackend.figure_format = "retina"

In [2]:
import os

# Directories (relative to the notebook) for input CSVs and generated output.
INPUT_DIR = "../data/"
OUTPUT_DIR = "../output"

In [3]:
# Load the anime table plus the train / test tables from INPUT_DIR.
df_anime = pd.read_csv(os.path.join(INPUT_DIR, "anime.csv"))
df_train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

## Utils

In [56]:
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse ** 0.5

CV用の分割関数。

各foldの検証データのうち約20%(`group_split_rate`、レコード数ベース)は、そのfoldの学習データに一切現れない anime_id だけで構成されるように分割する。残りのレコードは通常の KFold で分割する。

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold


def myfold(
    X,
    groups,
    n_splits=5,
    group_split_rate=0.2,
    spliter=None,
    shuffle=True,
    random_state=None,
):
    """Yield ``(train_idx, val_idx)`` pairs mixing group-exclusive and row-wise folds.

    Whole groups are handed out greedily to the currently lightest fold until
    every fold holds more than ``len(X) / n_splits * group_split_rate`` rows
    assigned this way. All rows of such a group land in a single validation
    fold, so the group never appears in that fold's training indices. Every
    remaining row is distributed by ``spliter``.

    Parameters
    ----------
    X : sized
        Only ``len(X)`` is used.
    groups : array-like
        Group label of each row (same length as ``X``).
    n_splits : int
        Number of folds.
    group_split_rate : float
        Fraction of an average fold (``len(X) / n_splits``) that must be
        filled with group-exclusive rows before the greedy assignment stops.
    spliter : object with ``split(X)``, optional
        Splitter for the rows not assigned by group. It must yield
        ``n_splits`` ``(train, test)`` index pairs. Defaults to ``KFold``.
    shuffle : bool
        Shuffle the group order before the greedy assignment, and shuffle
        inside the default ``KFold``.
    random_state : int, optional
        Seed. When given, NumPy's *global* RNG is seeded with it (kept from
        the original implementation; later code using the global RNG is
        affected as well).

    Yields
    ------
    tuple of (np.ndarray, np.ndarray)
        Positional train and validation indices for each fold.
    """
    # `is not None` so that a seed of 0 is honoured too.
    if random_state is not None:
        np.random.seed(random_state)

    if spliter is None:
        # KFold raises ValueError when random_state is set but shuffle is False.
        spliter = KFold(
            n_splits,
            shuffle=shuffle,
            random_state=random_state if shuffle else None,
        )
    n_samples = len(X)

    # group_codes[i] is the integer code (0 .. n_groups - 1) of row i's group.
    _, group_codes = np.unique(groups, return_inverse=True)
    group_split_sample_num = (n_samples / n_splits) * group_split_rate

    # index: group code, value: number of rows in that group (largest first).
    group_sizes = pd.Series(group_codes).value_counts()
    if shuffle:
        # Uses the global NumPy RNG seeded above.
        group_sizes = group_sizes.sample(frac=1)

    # fold_of_group[code] is the fold owning that whole group, or -1 if the
    # group's rows are left to `spliter`. A plain array avoids the chained
    # assignment of the previous version, which is a no-op under pandas
    # Copy-on-Write.
    fold_of_group = np.full(len(group_sizes), -1, dtype=int)
    n_samples_per_fold = np.zeros(n_splits)
    for group_code, n_sample_in_group in group_sizes.items():
        lightest_fold = np.argmin(n_samples_per_fold)
        n_samples_per_fold[lightest_fold] += n_sample_in_group
        fold_of_group[group_code] = lightest_fold
        if n_samples_per_fold.min() > group_split_sample_num:
            break

    # Per-row fold id; rows still at -1 are split row-wise below.
    indices = fold_of_group[group_codes]
    not_group_indices = np.where(indices == -1)[0]
    if len(not_group_indices) > 0:
        for i, (_, index) in enumerate(spliter.split(not_group_indices)):
            indices[not_group_indices[index]] = i

    for i in range(n_splits):
        yield np.where(indices != i)[0], np.where(indices == i)[0]

In [None]:
import os

# NOTE(review): this unexecuted cell repeats the INPUT_DIR / OUTPUT_DIR
# definitions made near the top of the notebook — candidate for removal.
INPUT_DIR = "../data/"
OUTPUT_DIR = "../output"

In [None]:
# NOTE(review): this unexecuted cell repeats the CSV loading done near the
# top of the notebook — candidate for removal.
df_anime = pd.read_csv(os.path.join(INPUT_DIR, "anime.csv"))
df_train = pd.read_csv(os.path.join(INPUT_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(INPUT_DIR, "test.csv"))

## データ整備

In [5]:
def hook_user(df_train, df_test):
    """Build one summary row per user seen in train or test.

    Columns of the result:

    * ``user_id`` – every distinct user of ``df_train`` and ``df_test``,
      in order of first appearance (train rows first).
    * ``mean_score`` – mean ``score`` of the user in ``df_train``.
    * ``train_anime_num`` – number of distinct ``anime_id`` in ``df_train``.
    * ``test_anime_num`` – number of distinct ``anime_id`` in ``df_test``.

    Users missing from one side get NaN in that side's columns (left joins).
    """
    all_user_ids = pd.concat([df_train, df_test]).user_id.unique()

    train_stats = df_train.groupby("user_id").agg(
        mean_score=("score", "mean"),
        train_anime_num=("anime_id", "nunique"),
    )
    test_stats = df_test.groupby("user_id").agg(
        test_anime_num=("anime_id", "nunique"),
    )

    summary = pd.DataFrame({"user_id": all_user_ids})
    summary = summary.merge(train_stats.reset_index(), on="user_id", how="left")
    summary = summary.merge(test_stats.reset_index(), on="user_id", how="left")
    return summary


# Build the per-user summary; the bare expression renders it as the cell output.
df_user_summary = hook_user(df_train, df_test)
df_user_summary

Unnamed: 0,user_id,mean_score,train_anime_num,test_anime_num
0,0008e10fb39e55447333,6.951220,41.0,27.0
1,001a7aed2546342e2602,7.512500,160.0,122.0
2,003d4b0257cc7849ffe1,8.000000,35.0,24.0
3,0054e700b5be6e074fb7,8.166667,6.0,5.0
4,0059344eed7e8ca0b6c5,7.600000,10.0,7.0
...,...,...,...,...
1993,fa11453a6cca09c82953,,,29.0
1994,fa532dafc50ad8439e1d,,,448.0
1995,fcf79144bf18fdb90aa5,,,51.0
1996,fd64597be5e54f4ac9d3,,,1.0


In [6]:
def hook_anime(df_anime, df_train, df_test):
    """Enrich the anime table with rating statistics, airing dates and genres.

    Steps:

    1. Left-join per-anime aggregates: ``mean_score`` and ``train_user_num``
       from ``df_train``, ``test_user_num`` from ``df_test``.
    2. Split ``aired`` on the first ``" to "`` into ``start`` / ``end``
       datetime columns. ``"Unknown"`` and ``"?"`` become NaT, as does a
       missing end part.
    3. Split ``genres`` on commas and explode it, so the result has one row
       per (anime, genre) pair. Genre tokens are stripped of surrounding
       whitespace.

    Parameters
    ----------
    df_anime : pd.DataFrame
        Needs the columns ``anime_id``, ``aired`` and ``genres``.
    df_train : pd.DataFrame
        Needs the columns ``anime_id``, ``user_id`` and ``score``.
    df_test : pd.DataFrame
        Needs the columns ``anime_id`` and ``user_id``.

    Returns
    -------
    pd.DataFrame
        Long-format frame; the index repeats for anime with several genres.
    """
    df_train_agg = (
        df_train.groupby("anime_id")
        .agg(mean_score=("score", "mean"), train_user_num=("user_id", "nunique"))
        .reset_index()
    )
    df_test_agg = (
        df_test.groupby("anime_id")
        .agg(test_user_num=("user_id", "nunique"))
        .reset_index()
    )

    df_anime = df_anime.merge(df_train_agg, on="anime_id", how="left").merge(
        df_test_agg, on="anime_id", how="left"
    )

    # Split the airing period into a start and an end column.
    # reindex guarantees both columns exist even when no value contains
    # " to " (otherwise column 1 is absent and "end" could not be built).
    aired = (
        df_anime["aired"]
        .str.split(" to ", n=1, expand=True)
        .reindex(columns=[0, 1])
        .rename(columns={0: "start", 1: "end"})
    )
    # Placeholder strings mean "no date".
    aired = aired.mask(aired.isin(["Unknown", "?"]))
    df_anime = df_anime.join(
        aired.assign(
            start=lambda df: pd.to_datetime(df["start"]),
            end=lambda df: pd.to_datetime(df["end"]),
        )
    )

    # Split genres into long format (UNNEST / CROSS JOIN); strip the space
    # that follows each comma so that " Sports" and "Sports" are one genre.
    genres = df_anime["genres"].str.split(",").explode().str.strip()
    df_anime = df_anime.drop(columns=["genres"]).join(genres)
    return df_anime


# Build the enriched anime table from the raw anime / train / test frames.
df_anime_summary = hook_anime(df_anime, df_train, df_test)

In [92]:
from surprise import Dataset, SVDpp, SVD, Reader


def get_predict_df(predictins):
    """Convert a list of prediction tuples into a DataFrame.

    The first four positions of every element are read, in order, as
    ``user_id``, ``anime_id``, ``score`` and ``pred``; any further positions
    are ignored.
    """
    column_names = ("user_id", "anime_id", "score", "pred")
    return pd.DataFrame(
        {
            name: [prediction[position] for prediction in predictins]
            for position, name in enumerate(column_names)
        }
    )

In [93]:
# Cross-validated SVD++: collect one out-of-fold prediction per training row.

# Columns handed to surprise, in the (user, item, rating) order it expects.
RATING_COLUMNS = ["user_id", "anime_id", "score"]

# `reader` was used below without being defined anywhere in this notebook
# (NameError on a fresh kernel).
# NOTE(review): the rating scale is taken from the observed score range —
# confirm this matches the scale used originally.
reader = Reader(rating_scale=(df_train["score"].min(), df_train["score"].max()))

df_pred = df_train.copy()
df_pred["pred"] = np.nan  # float from the start; filled fold by fold
df_pred["only_val"] = False
df_pred["fold"] = -1


for i, (train_ind, val_ind) in enumerate(
    myfold(
        df_train["score"].values,
        df_train["anime_id"].values,
        n_splits=5,
        group_split_rate=0.2,
        shuffle=True,
        random_state=42,
    )
):
    df_train_train = df_train.iloc[train_ind, :]
    df_train_val = df_train.iloc[val_ind, :]
    # val_ind is positional; df_train carries the default RangeIndex from
    # read_csv, so labels and positions coincide here.
    df_pred.loc[val_ind, "fold"] = i

    # Flag rows whose anime_id never occurs in this fold's training part.
    val_only_anime = np.setdiff1d(
        df_train_val.anime_id.unique(), df_train_train.anime_id.unique()
    )
    df_pred.loc[df_pred.anime_id.isin(val_only_anime), "only_val"] = True

    train_dataset = Dataset.load_from_df(
        df_train_train[RATING_COLUMNS], reader
    ).build_full_trainset()
    # Build the testset directly from the validation rows so the predictions
    # come back in row order and can be written back positionally, without
    # merging on (user_id, anime_id).
    val_dataset = list(
        df_train_val[RATING_COLUMNS].itertuples(index=False, name=None)
    )

    svd = SVDpp()
    svd.fit(train_dataset)
    predictins = svd.test(val_dataset)
    df_pred_val = get_predict_df(predictins)
    print("rmse: ", rmse(df_pred_val["score"], df_pred_val["pred"]))
    df_pred.loc[val_ind, "pred"] = df_pred_val["pred"].to_numpy()

rmse:  1.227981189977481
rmse:  1.2216562853119104
rmse:  1.2342854056922643
rmse:  1.2485903660646969
rmse:  1.2598544564805756


rmse:  1.2398783149875572
rmse:  1.2304318719821288
rmse:  1.2444645304443789
rmse:  1.2533717015765762
rmse:  1.263766584339434

In [99]:
# Fit SVD++ on the whole training set and predict every test row.

# Defined here as well so that this cell does not rely on hidden kernel state.
# NOTE(review): the rating scale is taken from the observed score range —
# confirm this matches the scale used originally.
reader = Reader(rating_scale=(df_train["score"].min(), df_train["score"].max()))

train_dataset = Dataset.load_from_df(
    df_train[["user_id", "anime_id", "score"]], reader
).build_full_trainset()
# The test set has no scores; 7 is a dummy rating that only fills the slot.
# The testset is built directly from df_test so the predictions keep
# df_test's row order. build_full_trainset().build_testset() regroups rows
# by user, which is not guaranteed to preserve that order.
val_dataset = list(
    df_test[["user_id", "anime_id"]].assign(score=7).itertuples(index=False, name=None)
)
svd = SVDpp()
svd.fit(train_dataset)
predictins = svd.test(val_dataset)
df_pred_val = get_predict_df(predictins)
assert len(df_pred_val) == len(df_test), "one prediction per test row expected"

In [105]:
# Write the predictions as a single `score` column, in df_pred_val's row order.
# NOTE(review): the file goes to the current working directory, not OUTPUT_DIR — confirm intended.
df_pred_val["pred"].to_frame("score").to_csv("mf_01.csv", index=False)

In [102]:
# Load the sample submission (presumably to check the expected output format).
df_submission = pd.read_csv("../data/sample_submission.csv")
df_submission

Unnamed: 0,score
0,6.478691
1,2.513589
2,2.212736
3,6.608664
4,6.339157
...,...
117671,4.182134
117672,9.782259
117673,5.659402
117674,5.764207


In [94]:
# RMSE over all rows of df_pred.
rmse(df_pred["score"], df_pred["pred"])

1.2385553909055067

In [95]:
# RMSE restricted to rows flagged `only_val`.
rmse(df_pred.query("only_val")["score"], df_pred.query("only_val")["pred"])

1.3834241491204649

In [96]:
# RMSE on the complementary rows (`only_val` is False).
rmse(df_pred.query("~only_val")["score"], df_pred.query("~only_val")["pred"])

1.19904598346898

In [106]:
# RMSE restricted to rows whose true score is below 5.
rmse(df_pred.query("score < 5")["score"], df_pred.query("score < 5")["pred"])

3.138554936432211

In [39]:
from surprise import SVD

# NOTE(review): scratch cell (executed as In [39], out of order). Run top to
# bottom, `train_dataset` here is the trainset built from the full df_train
# above, and `svd` is rebound from SVDpp to a plain SVD model.
svd = SVD()
svd.fit(train_dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ff2671af580>

In [41]:
# NOTE(review): run top to bottom, `val_dataset` is the test set carrying the dummy score 7.
predictins = svd.test(val_dataset)

In [51]:
# Leftover inspection: first field (the user id) of the first prediction.
predictins[0][0]

'0008e10fb39e55447333'

In [54]:
def get_predict_df(predictins):
    """Convert a list of prediction tuples into a DataFrame.

    The first four positions of every element are read, in order, as
    ``user_id``, ``anime_id``, ``score`` and ``pred``; any further positions
    are ignored.

    NOTE(review): a function with this name and behaviour is also defined
    earlier in this notebook; this later definition shadows it.
    """
    column_names = ("user_id", "anime_id", "score", "pred")
    return pd.DataFrame(
        {
            name: [prediction[position] for prediction in predictins]
            for position, name in enumerate(column_names)
        }
    )

In [57]:
# NOTE(review): overwrites the `df_pred` built by the cross-validation cell above.
df_pred = get_predict_df(predictins)

In [58]:
# NOTE(review): run top to bottom, `score` here is the dummy 7 assigned to the
# test rows, so this value is not a validation RMSE. The recorded output
# comes from an earlier, out-of-order run.
rmse(df_pred["score"], df_pred["pred"])

1.2398783149875572