In [None]:
import numpy as np 
import pandas as pd 
import sys 
import os 
import pickle 
import seaborn as sns 
import matplotlib.pyplot as plt 
from pathlib import Path
import wandb
sns.set()

sys.path.append('../')
from src.utils import pickle_load, pickle_save, seed_everything

In [None]:
class Config():
    # Experiment settings. Every entry is a class attribute evaluated top to bottom when the
    # class body runs, so later entries (output_root_dir, output_dir, ...) reuse earlier ones.
    version = '039'
    comment = 'cat'
    input_dir = Path('../input/atmaCup15_dataset/')
    # The trailing index selects the active strategy; [1] picks 'stratified'.
    cv_strategy = ['group', 'stratified'][1]
    output_root_dir = Path(f'../output/{version}')
    output_dir = output_root_dir / f'{cv_strategy}'
    feature_dir = Path(f'../features/{cv_strategy}')
    seed = 42
    target_col = 'score'
    # Keyword arguments kept for wandb; "mode" is set to "disabled" here.
    wandb_init = {
        "project": "atma15",
        "entity": "kuto5046",
        "group": f"exp{version}",
        "dir": output_dir,
        "tags": [],
        "mode": "disabled", 
    }
    n_splits = 5
    use_fold = [0,1,2,3,4]  # use [0] to stop after a single fold, [0,1,2,3,4] to run every fold

    # Load the model configuration.
    # NOTE(review): HydraConfig is not imported in any cell visible here — confirm where it is
    # brought into scope, otherwise this class body raises NameError on a fresh kernel.
    model_config_name = 'cb_regression'  # change according to the task / model being used
    model_config = HydraConfig.get_cnf(config_path='../configs/model/', config_name=model_config_name)
    num_boost_round = model_config['num_boost_round']
    model_name = model_config.name
    model_params = dict(model_config['params'])


# Instantiate the settings object used by every later cell.
c = Config()
# c = HydraConfig.get_cnf(config_path='/home/user/work/configs/', config_name='config.yaml')
# Create the output and feature directories up front (no error if they already exist).
c.output_dir.mkdir(parents=True, exist_ok=True)
c.feature_dir.mkdir(parents=True, exist_ok=True)
# Seed using the configured value; seed_everything is imported from src.utils.
seed_everything(c.seed)

In [None]:
import logging
class StreamToLogger:
    """Minimal file-like object that forwards written text to a logger.

    Intended as a stand-in for ``sys.stdout`` / ``sys.stderr``: each ``write``
    becomes one log record at the level given at construction time.
    """

    def __init__(self, logger, level):
        self.level = level
        self.logger = logger

    def write(self, message):
        """Log ``message`` with trailing whitespace removed; skip blank writes."""
        text = message.rstrip()
        if text == "":
            # e.g. the lone newline that print() emits after its payload
            return
        self.logger.log(self.level, text)

    def flush(self):
        """No-op: records are handed to the logger immediately, nothing is buffered here."""
        return None

def get_logger(output_dir:Path):
    """Return the 'main' logger, writing to ``output_dir / 'result.log'`` and to the console.

    ``logging.getLogger('main')`` returns the same object on every call, so handlers
    left over from a previous call (for example when the notebook cell is re-run) are
    removed and closed first. Without this every message would be emitted once per call.

    Args:
        output_dir: existing directory in which ``result.log`` is created/appended.

    Returns:
        The configured ``logging.Logger`` with level INFO and exactly two handlers.
    """
    logger = logging.getLogger('main')
    logger.setLevel(logging.INFO)

    # Drop handlers installed by an earlier call; close them so the log file handle is released.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # File handler for outputting to a log file
    file_handler = logging.FileHandler(Path(output_dir) / 'result.log')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Stream handler for outputting to console
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # Redirect stdout and stderr
    # sys.stdout = StreamToLogger(logger, logging.INFO)
    # sys.stderr = StreamToLogger(logger, logging.ERROR)

    return logger 

# Module-level logger; it writes result.log into the experiment's output directory.
logger = get_logger(c.output_dir)

In [None]:
# Log the contents of the config.
# Config.__dict__ also holds Python's own bookkeeping entries (__module__, __dict__,
# __weakref__, __doc__); skip those so the log only lists the experiment settings.
for k, v in Config.__dict__.items():
    if k.startswith('__') and k.endswith('__'):
        continue
    logger.info(f'{k}: {v}')

### read data

# Load the three competition tables from the configured input directory.
train = pd.read_csv(c.input_dir /'train.csv')
test = pd.read_csv(c.input_dir / 'test.csv')
anime = pd.read_csv(c.input_dir / 'anime.csv')
# Last expression of the cell: show the (rows, columns) of train and test.
train.shape, test.shape

### preprocess

In [None]:
# Ids that occur on only one side of the train/test split, for animes and for users.
_anime_train, _anime_test = set(train['anime_id']), set(test['anime_id'])
_user_train, _user_test = set(train['user_id']), set(test['user_id'])

train_only_anime_ids = _anime_train.difference(_anime_test)
test_only_anime_ids = _anime_test.difference(_anime_train)
train_only_user_ids = _user_train.difference(_user_test)
test_only_user_ids = _user_test.difference(_user_train)

# Report how many ids fall into each bucket (same four lines, same order).
for _label, _ids in (
    ('train_only_anime_ids', train_only_anime_ids),
    ('test_only_anime_ids', test_only_anime_ids),
    ('train_only_user_ids', train_only_user_ids),
    ('test_only_user_ids', test_only_user_ids),
):
    print(f'{_label}: {len(_ids)}')

In [None]:
# Keep only the train rows whose anime_id AND user_id both also occur in test.
# NOTE: this rebinds `train`, so the original unfiltered frame is gone after this cell runs.
train = train.query('anime_id not in @train_only_anime_ids and user_id not in @train_only_user_ids')

In [None]:
from itertools import combinations
from scipy.cluster.hierarchy import DisjointSet
import Levenshtein

def get_original_work_name(df, threshold=0.3):
    """Group rows with similar ``japanese_name`` and label each group with one title.

    Every pair of rows is compared with the mean of two string distances
    (``1 - Levenshtein.ratio`` and ``1 - Levenshtein.jaro_winkler``). Pairs whose
    distance is below ``threshold`` are merged into the same group (union-find), so
    groups are the transitive closure of "similar enough". If either title of a pair
    is missing, the pair gets a fixed distance of 0.5.

    Args:
        df: frame with a ``japanese_name`` column. It is modified in place.
        threshold: maximum (exclusive) distance for two titles to be merged.

    Returns:
        The same ``df`` object with a new ``original_work_name`` column holding, for
        each row, the ``japanese_name`` of the first row (lowest position) in its group.

    Note:
        The comparison is O(n^2) in the number of rows.
    """
    names = df.japanese_name.tolist()
    n_rows = df.shape[0]

    # pd.isna recognises every missing-value flavour (np.nan, float('nan'), None, pd.NA).
    # An identity check against np.nan misses NaNs that are not that exact object, e.g.
    # the fresh Python floats produced by .tolist() on a float column.
    is_missing = [pd.isna(name) for name in names]

    groups = DisjointSet(list(range(n_rows)))
    for i, j in combinations(range(n_rows), 2):
        if is_missing[i] or is_missing[j]:
            lv_dist, jw_dist = 0.5, 0.5
        else:
            lv_dist = 1 - Levenshtein.ratio(names[i], names[j])
            jw_dist = 1 - Levenshtein.jaro_winkler(names[i], names[j])
        distance = (lv_dist + jw_dist) / 2

        if distance < threshold:
            groups.merge(i, j)

    labels = [None] * n_rows
    for subset in groups.subsets():
        # Use the lowest row position as representative so the label does not depend
        # on the iteration order of a set.
        label = names[min(subset)]
        for element in subset:
            labels[element] = label
    df["original_work_name"] = labels
    return df


# Turn the literal "Unknown" into NaN so it is handled as a missing title below.
anime["japanese_name"] = anime["japanese_name"].apply(lambda x:np.nan if x=="Unknown" else x)
# Adds the `original_work_name` column (pairwise comparison of all titles, so this can be slow).
anime = get_original_work_name(anime)

In [None]:
# 'Unknown' episode counts become NaN so that the column can be cast to float.
anime['episodes'] = anime['episodes'].replace('Unknown', np.nan).astype(float)

In [None]:
# Stack train and test into one frame, then left-join the anime metadata on anime_id.
whole = pd.concat([train, test]).reset_index(drop=True)
whole = whole.merge(anime, on='anime_id', how='left')
# Last expression of the cell: preview the first rows.
whole.head()

In [None]:
# These helpers are otherwise only imported further down in the "feature engineering"
# section; import them here so this cell also works on a fresh kernel (Restart & Run All).
from src.features.base import get_categorical_col, get_numerical_col

# Column lists used by the feature classes; the id and the target column are excluded.
cat_cols = get_categorical_col(whole, skip_cols=['id', c.target_col])
numerical_cols = get_numerical_col(whole, skip_cols=['id', c.target_col])

In [None]:
# Display the columns returned by get_categorical_col.
cat_cols

In [None]:
# Display the columns returned by get_numerical_col.
numerical_cols

In [None]:
# Split the merged frame back apart: rows with a target value are train, rows whose target
# is NaN are test (presumably test.csv carries no target column, so the concat left NaN there
# — verify). Both frames now include the anime metadata columns.
train = whole[~whole[c.target_col].isna()].reset_index(drop=True).copy()
test = whole[whole[c.target_col].isna()].reset_index(drop=True).copy()

### cv

In [None]:
from src.cv import get_kfold, get_stratifiedkfold, get_groupkfold, get_fold, split_train_valid

# Build the splitter for the configured strategy. The number of folds comes from
# Config.n_splits (it used to be hard-coded to 5 here, silently ignoring the config).
if c.cv_strategy == 'group':
    # group by user_id
    cv = get_groupkfold(train, c.target_col, 'user_id', n_splits=c.n_splits)
elif c.cv_strategy == 'stratified':
    cv = get_stratifiedkfold(train, c.target_col, n_splits=c.n_splits)
else:
    raise NotImplementedError(f'unsupported cv_strategy: {c.cv_strategy}')

# get_fold returns train with a 'fold' column (it is read on the next line).
train = get_fold(train, cv)
fold_list = sorted(train['fold'].unique())

### feature engineering

In [None]:
from src.features.base import Feature, generate_features, get_categorical_col, get_numerical_col, load_datasets
from src.features.encoder import count_encoder, ordinal_encoder, pp_for_categorical_encoding, target_encoder
from src.features.nlp import count_lda_vectorize, tfidf_svd_vectorize, UniversalSentenceEncoder, BertSequenceVectorizer, Sentence2Vec, SCDVEmbedder, get_embedding_model

In [None]:
from typing import Optional
import random
import gensim.downloader
from gensim.models import word2vec
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from surprise import (
    NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore,
    KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering
)
from surprise import Dataset, Reader
from sklearn.decomposition import PCA

class Target(Feature):
    """Feature block that carries only the target column."""

    def create_features(self, fold:Optional[int]=None):
        """Fill train/valid for a CV fold, or train/test when ``fold`` is None."""
        target_cols = [c.target_col]

        if fold is None:
            # for submission: test has no target, so a zero-filled placeholder is used
            self.train = train[target_cols].copy()
            self.test = pd.DataFrame(np.zeros(len(test)), columns=target_cols)
            return

        # for cv
        fold_train, fold_valid = split_train_valid(train, fold)
        self.train = fold_train[target_cols]
        self.valid = fold_valid[target_cols]

        
class Numerical(Feature):
    """Feature block that passes the numerical columns through unchanged."""

    def create_features(self, fold:Optional[int]=None):
        """Fill train/valid for a CV fold, or train/test when ``fold`` is None."""
        if fold is None:
            # for submission
            self.train = train[numerical_cols].copy()
            self.test = test[numerical_cols].copy()
            return

        # for cv
        fold_train, fold_valid = split_train_valid(train, fold)
        self.train = fold_train[numerical_cols]
        self.valid = fold_valid[numerical_cols]


class OrdinalEncode(Feature):
    """Ordinal-encode categorical columns, with the encoder fitted on train and test together."""

    def ordinal_encoder(self, train:pd.DataFrame, test:pd.DataFrame, cols:list[str], prefix:str):
        """Fit one OrdinalEncoder on train+test and return the encoded (train, test) frames.

        Output columns are ``prefix + original name`` with 'category' dtype.
        """
        stacked = pd.concat([train, test], axis=0).reset_index(drop=True)
        encoder = OrdinalEncoder()
        encoder.fit(stacked[cols])

        def _encode(frame):
            codes = encoder.transform(frame[cols])
            return pd.DataFrame(codes, columns=cols).add_prefix(prefix).astype('category')

        return _encode(train), _encode(test)

    def create_features(self, fold: Optional[int]=None):
        """Fill train/valid for a CV fold, or train/test when ``fold`` is None."""
        # columns encoded under every cv strategy
        shared_cols = [
            'genres',
            'type',
            'aired',
            'producers',
            'licensors',
            'studios',
            'source',
            'duration',
            'rating',
        ]
        if c.cv_strategy == 'group':
            # user_id / anime_id / original_work_name are left out for the group strategy
            use_cols = shared_cols
        else:
            use_cols = ['user_id', 'anime_id', *shared_cols, 'original_work_name']

        prefix= 'ordinal_enc'
        encoded_train, encoded_test = self.ordinal_encoder(train, test, use_cols, prefix)

        if fold is None:
            self.train = encoded_train.copy()
            self.test = encoded_test.copy()
            return

        # the encoded frame has a fresh index, so the fold labels are attached positionally
        encoded_train['fold'] = train['fold'].to_numpy()
        fold_train, fold_valid = split_train_valid(encoded_train, fold)
        self.train = fold_train.filter(like=prefix)
        self.valid = fold_valid.filter(like=prefix)


class CountEncode(Feature):
    """Count encoding: how often each id occurs in train + test."""

    def count_encoder(self, df, cols, reference=None):
        """Return a copy of ``df`` with one ``count_enc_<col>`` column per entry of ``cols``.

        Args:
            df: frame to encode (not modified).
            cols: columns to count-encode.
            reference: frame whose value counts are used. Defaults to ``df`` itself.
                (Previously the counts were silently taken from the notebook-global
                ``whole`` frame, whatever it happened to contain at call time.)

        Returns:
            Copy of ``df`` with the extra count columns; values unseen in ``reference``
            map to NaN.
        """
        counts_from = df if reference is None else reference
        _df = df.copy()
        prefix = 'count_enc'
        for col in cols:
            _df[f'{prefix}_{col}'] = _df[col].map(counts_from[col].value_counts())
        return _df

    def create_features(self, fold:Optional[int]=None):
        """
        How frequently each user / anime / original work appears.
        Counts that include test are also available when predicting test, so they are used.
        During validation the counts likewise include the validation rows.
        """
        use_cols = [
            'user_id',
            'anime_id',
            'original_work_name',
            ]

        # Counts always come from train + test, for both the cv and the submission path.
        _whole = pd.concat([train, test])

        if fold is not None:
            _train = self.count_encoder(train, use_cols, reference=_whole)
            _train, _valid = split_train_valid(_train, fold)
            self.train = _train.filter(like='count_enc')
            self.valid = _valid.filter(like='count_enc')
        else:
            # Encode train and test separately against the shared reference; this keeps each
            # frame's own index and row order without masking a duplicate-index concat.
            self.train = self.count_encoder(train, use_cols, reference=_whole).filter(like='count_enc')
            self.test = self.count_encoder(test, use_cols, reference=_whole).filter(like='count_enc')
  

class TargetEncode(Feature):
    """Target encoding (mean and std of the target) for id-like columns."""

    def create_features(self, fold:Optional[int]=None):
        """Fill train/valid for a CV fold, or train/test when ``fold`` is None."""
        cols_by_strategy = {
            # the group strategy learns to predict unseen users, so user_id is not used
            'group': [],
            'stratified': ['user_id', 'anime_id', 'original_work_name'],
        }
        if c.cv_strategy not in cols_by_strategy:
            raise NotImplementedError
        use_cols = cols_by_strategy[c.cv_strategy]
        methods = ['mean', 'std']

        if fold is None:
            self.train, self.test = target_encoder(train, test, use_cols, c.target_col, methods=methods)
            return

        fold_train, fold_valid = split_train_valid(train, fold)
        self.train, self.valid = target_encoder(fold_train, fold_valid, use_cols, c.target_col, methods=methods)


class MultiTagEncode(Feature):
    """Encode the comma-separated multi-label anime columns (genres, producers, ...)."""

    @staticmethod
    def create_one_hot_and_svd_features(anime, multilabel_cols:list[str], n_components:int = 10):
        """Build per-anime multi-hot features, SVD-compressed for high-cardinality columns.

        Args:
            anime: anime metadata frame containing ``anime_id`` and ``multilabel_cols``;
                each of those columns holds labels separated by ", ".
            multilabel_cols: columns to encode.
            n_components: number of SVD components for the compressed columns.

        Returns:
            Frame with ``ohe_<col>_<label>`` columns for 'genres' and 'licensors',
            ``svd_<col>_<i>`` columns for every other column, plus ``anime_id``.
        """
        multilabel_dfs = []
        # NOTE: the loop variable is deliberately not called `c`, which would shadow the
        # global config object that is needed below for the seed.
        for col in multilabel_cols:
            list_srs = anime[col].map(lambda x: x.split(", ")).tolist()
            # MultiLabelBinarizer makes this conversion straightforward
            mlb = MultiLabelBinarizer()
            ohe_srs = mlb.fit_transform(list_srs)
            if col in ("genres", "licensors"):
                # few unique labels: keep the one-hot representation
                col_df = pd.DataFrame(ohe_srs, columns=[f"ohe_{col}_{name}" for name in mlb.classes_])
            else:
                # many unique labels: compress with SVD. Seeded so the features do not
                # depend on the global RNG state / cell execution order.
                svd = TruncatedSVD(n_components=n_components, random_state=c.seed)
                svd_arr = svd.fit_transform(ohe_srs)
                col_df = pd.DataFrame(
                    svd_arr,
                    columns=[f"svd_{col}_{ix}" for ix in range(n_components)]
                )
            multilabel_dfs.append(col_df)

        multilabel_df = pd.concat(multilabel_dfs, axis=1)
        multilabel_df['anime_id'] = anime['anime_id'].to_numpy()
        return multilabel_df
    
    def create_features(self, fold: int | None = None): 
        """Fill train/valid for a CV fold, or train/test when ``fold`` is None."""
        multilabel_cols = ["genres", "producers", "licensors", "studios"]
        df = self.create_one_hot_and_svd_features(anime, multilabel_cols)

        def _attach(frame):
            # left join keeps the row order of `frame`; anime_id is only the join key
            return frame[['anime_id']].merge(df, on="anime_id", how="left").drop(columns=['anime_id'])

        if fold is not None:
            _train, _valid = split_train_valid(train, fold)
            self.train = _attach(_train)
            self.valid = _attach(_valid)
        else:
            self.train = _attach(train)
            self.test = _attach(test)


class Anime2Vec(Feature):
    # Word2Vec-based embeddings: each user's rated anime ids form one "sentence",
    # anime ids are the "words". Produces per-user and per-anime embedding columns.
    @staticmethod
    def create_model(title_sentence_list, vector_size):
        # Trains a Word2Vec model on the given sentences plus one shuffled copy of each.
        # random.sample draws from the global `random` state, so results depend on its seed.
        
        # build a shuffled copy of every user's list
        shuffled_sentence_list = [random.sample(sentence, len(sentence)) for sentence in title_sentence_list]

        # combine the original lists with the shuffled ones
        train_sentence_list = title_sentence_list + shuffled_sentence_list
        # train_sentence_list = title_sentence_list

        # word2vec parameters
        w2v_params = {
            "vector_size": vector_size,
            "seed": c.seed,
            # "window": 20,
            "min_count": 1,
            "workers": 1,
            # "epochs": 20,
        }

        # train the word2vec model
        model = word2vec.Word2Vec(train_sentence_list, **w2v_params)
        return model

    @staticmethod
    def create_embedding(model, user_anime_list_dict, anime_ids, user_emb_suffix, item_emb_suffix, vector_size):
        """
        During validation: the embedding is built from train only and merged into train and valid.
        For submission: the embedding is built from train and merged into train and test.

        Returns (user_factors_df, item_factors_df); columns are named
        "<i>_<suffix>" for i in range(vector_size), plus the id column.
        """
        # per-user vector: mean of the word vectors of the anime ids in that user's list
        user_factors = {user_id: np.mean([model.wv[anime_id] for anime_id in user_anime_list], axis=0) for user_id, user_anime_list in user_anime_list_dict.items()}

        # per-item vector: the word vector of each anime id
        item_factors = {aid: model.wv[aid] for aid in anime_ids}

        # build the data frames (one row per id)
        user_factors_df = pd.DataFrame(user_factors).T.reset_index().rename(columns={"index": "user_id"})
        item_factors_df = pd.DataFrame(item_factors).T.reset_index().rename(columns={"index": "anime_id"})
        user_factors_df.columns = ["user_id"] + [f"{i}_{user_emb_suffix}" for i in range(vector_size)]
        item_factors_df.columns = ["anime_id"] + [f"{i}_{item_emb_suffix}" for i in range(vector_size)]
        return user_factors_df, item_factors_df
    
    @staticmethod
    def create_sentence_list(_train:pd.DataFrame):
        # Returns (sentences, unique anime ids, {user_id: list of that user's anime ids}).
        whole = _train.copy()
        anime_ids = _train['anime_id'].unique().tolist()
        user_anime_list_dict = {user_id: anime_ids.tolist() for user_id, anime_ids in whole.groupby('user_id')['anime_id']}

        # Score-aware sentences:
        # ratings are 1-10 here, so an anime scored 5 is appended 5 times and one scored 10 is appended 10 times
        title_sentence_list = []
        for user_id, user_df in whole.groupby('user_id'):
            user_title_sentence_list = []
            for anime_id, anime_score in user_df[['anime_id', 'score']].values:
                for i in range(int(anime_score)):
                    user_title_sentence_list.append(anime_id)
            title_sentence_list.append(user_title_sentence_list)

        # whole = _train.copy()
        # # the embedding is built from score, so this may leak
        # title_sentence_list = whole.sort_values('score').groupby(['user_id'])['japanese_name'].unique().apply(lambda x: x.tolist()).to_list()
        # # title_sentence_list = whole.groupby(['user_id', 'score'])['japanese_name'].unique().apply(lambda x: x.tolist()).to_list()
        return title_sentence_list, anime_ids, user_anime_list_dict

    # def create_features(self, fold: int | None = None):
    #     pass
    def create_features(self, fold:Optional[int] = None):
        """ 
        Building the embedding on all of train leaks during validation (it did).
        -> So during cross validation the embedding is built on the post-split train only.

        However, if the embedding differs between train time and test time, the meaning of
        the dimensions changes. With cv, test has to be built with the embedding used in
        fold=0 and predicted, then built with the embedding used in fold=1 and predicted, etc.
        That requires building the test features once per fold.

        For unseen users, the user_emb built from train contains none of the valid users, so
        everything would be missing and the feature would have to be excluded.
        Actually, with some care it can be used:
        thinking about test, the anime emb is built from train only and the user emb applies it to test.
        So including the test users when building the user emb is not a problem.
        """
        vector_size = 64
        if fold is not None:
            _train, _valid = split_train_valid(train, fold)
            _test = test.copy()
            title_sentence_list, anime_ids, _ = self.create_sentence_list(_train)
            # as long as the animes used come from the post-split train, building the user emb from all of train is fine
            user_anime_list_dict = {user_id: anime_ids.tolist() for user_id, anime_ids in train.query('anime_id in @anime_ids').groupby('user_id')['anime_id']}
            model = self.create_model(title_sentence_list, vector_size)
            user_suffix = '_anime2vec_user_emb'
            item_suffix = '_anime2vec_item_emb'
            user_emb, item_emb = self.create_embedding(model, user_anime_list_dict, anime_ids, user_suffix, item_suffix, vector_size)


            # left joins: ids without an embedding end up with NaN in the embedding columns
            _train = _train.merge(user_emb, on='user_id', how='left')
            _train = _train.merge(item_emb, on='anime_id', how='left')
            _valid = _valid.merge(user_emb, on='user_id', how='left')
            _valid = _valid.merge(item_emb, on='anime_id', how='left')
            _test = _test.merge(user_emb, on='user_id', how='left')
            _test = _test.merge(item_emb, on='anime_id', how='left')

            # keep only the embedding columns
            self.train = _train.filter(like='_anime2vec_')
            self.valid = _valid.filter(like='_anime2vec_')
            self.test = _test.filter(like='_anime2vec_')
        else:
            _train = train.copy()
            _test = test.copy()
            _whole = pd.concat([_train, _test], axis=0)
            # the model is trained on train only; user lists below are taken from train + test
            title_sentence_list, anime_ids, _ = self.create_sentence_list(_train)
            user_anime_list_dict = {user_id: anime_ids.tolist() for user_id, anime_ids in _whole.query('anime_id in @anime_ids').groupby('user_id')['anime_id']}
            model = self.create_model(title_sentence_list, vector_size)
            user_suffix = '_anime2vec_user_emb'
            item_suffix = '_anime2vec_item_emb'
            user_emb, item_emb = self.create_embedding(model, user_anime_list_dict, anime_ids, user_suffix, item_suffix, vector_size)

            _train = _train.merge(user_emb, on='user_id', how='left')
            _train = _train.merge(item_emb, on='anime_id', how='left')
            _test = _test.merge(user_emb, on='user_id', how='left')
            _test = _test.merge(item_emb, on='anime_id', how='left')
            self.train = _train.filter(like='_anime2vec_')
            self.test = _test.filter(like='_anime2vec_')


class Anime2VecWithoutScore(Feature):
    """User embeddings from a Word2Vec model trained on watch lists only (scores ignored)."""

    @staticmethod
    def create_sentence_list(_whole:pd.DataFrame):
        """Return (sentences, unique anime ids, {user_id: that user's anime ids})."""
        per_user = _whole.groupby('user_id')['anime_id']
        anime_ids = _whole['anime_id'].unique().tolist()
        user_anime_list_dict = {uid: ids.tolist() for uid, ids in per_user}
        title_sentence_list = per_user.apply(list).tolist()
        return title_sentence_list, anime_ids, user_anime_list_dict

    def create_features(self, fold:Optional[int] = None):
        """ 
        Building the embedding on all of train leaks during validation (it did).
        -> So the embedding is built per cross-validation fold.

        However, if the embedding differs between train time and test time, the meaning of
        the dimensions changes. With cv, test has to be built with the embedding used in
        fold=0 and predicted, then with the one used in fold=1, and so on, which requires
        building the test features once per fold.

        Here the features for test are built from all of the data,
        so during validation the embedding built from train+valid is used.
        This also works when learning to predict unseen users.

        To restate:
        test has to be built per fold to keep the embeddings aligned,
        but since test may be included here, the embedding is built on the entire data.
        Might leak slightly in validation? (because test is included)
        """
        vector_size = 64
        user_suffix = '_anime2vec_user_emb_wo_score'
        item_suffix = '_anime2vec_item_emb_wo_score'

        combined = pd.concat([train, test], axis=0)
        sentences, anime_ids, user_anime_list_dict = self.create_sentence_list(combined)
        model = Anime2Vec.create_model(sentences, vector_size)
        user_emb, item_emb = Anime2Vec.create_embedding(
            model, user_anime_list_dict, anime_ids, user_suffix, item_suffix, vector_size
        )

        # Merging directly on the combined frame resets the index, so split first, merge after.
        has_no_score = combined['score'].isna()
        _train = combined[~has_no_score].sort_index().merge(user_emb, on='user_id', how='left')
        _test = combined[has_no_score].sort_index().merge(user_emb, on='user_id', how='left')

        # only the user embedding is merged; the item embedding is not used (open question)
        self.test = _test.filter(like='_wo_score')
        if fold is None:
            self.train = _train.filter(like='_wo_score')
            return

        fold_train, fold_valid = split_train_valid(_train, fold)
        self.train = fold_train.filter(like='_wo_score')
        self.valid = fold_valid.filter(like='_wo_score')


class NumericalUserProfile(Feature):
    """Per-user mean/std/min/max of the numerical columns."""

    def create_features(self, fold:Optional[int] = None):
        """ 
        When predicting test, all of the data is used,
        so during validation the statistics are built from train+valid.

        This also works for unseen users (test rows may be used at test time).
        """
        is_cv = fold is not None

        # cv: statistics over all of train; submission: over train + test (anime already merged)
        source = train if is_cv else pd.concat([train, test]).reset_index(drop=True)
        user_features = source.groupby('user_id')[numerical_cols].agg(['mean', 'std', 'min', 'max'])
        # flatten the (column, statistic) MultiIndex into "column_statistic"
        user_features.columns = user_features.columns.map('_'.join)
        user_features = user_features.reset_index()

        def _profile(frame):
            joined = frame[['user_id']].merge(user_features, on='user_id', how='left')
            return joined.drop('user_id', axis=1)

        if is_cv:
            fold_train, fold_valid = split_train_valid(train, fold)
            self.train = _profile(fold_train)
            self.valid = _profile(fold_valid)
        else:
            self.train = _profile(train.copy())
            self.test = _profile(test.copy())


class TfidfAllText(Feature):
    """Per-user TF-IDF + SVD embedding of all anime metadata text concatenated together."""

    def tfidf_svd_vectorize(self, input_df:pd.DataFrame, col:str, n_components:int = 50):
        """
        usage:
        tfidf_df = tfidf_svd_vectorize(df, col='abc', n_components=5)
        To skip the dimensionality reduction, remove the TruncatedSVD step from the pipeline.
        """
        steps = [
            ("TfidfVectorizer", TfidfVectorizer()),
            ("TruncatedSVD", TruncatedSVD(n_components=n_components, random_state=42)),
        ]
        reduced = Pipeline(steps=steps).fit_transform(input_df[col].fillna(""))
        return pd.DataFrame(reduced).add_prefix(f'tfidf_svd_{col}_')

    def create_features(self, fold: int | None = None):
        """Fill train/valid/test for a CV fold, or train/test when ``fold`` is None."""
        text_cols = ['genres', 'aired', 'producers', 'licensors', 'studios', 'source', 'duration', 'rating']

        # Open question: doesn't fitting on the whole data leak into validation?
        combined = pd.concat([train, test], axis=0)
        combined['text'] = combined[text_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

        # one document per user: all of that user's row texts joined, commas removed
        user_df = combined.groupby('user_id')['text'].apply(lambda texts: ' '.join(texts)).reset_index()
        user_df['text'] = user_df['text'].str.replace(",", "")
        user_emb_df = self.tfidf_svd_vectorize(user_df, 'text', n_components=50)
        user_emb_df['user_id'] = user_df['user_id'].to_numpy()

        def _attach(frame):
            return frame.merge(user_emb_df, on='user_id', how='left').filter(like='tfidf_svd_')

        if fold is None:
            # for submission
            self.train = _attach(train.copy())
            self.test = _attach(test.copy())
            return

        # for cv
        fold_train, fold_valid = split_train_valid(train, fold)
        self.train = _attach(fold_train)
        self.valid = _attach(fold_valid)
        self.test = _attach(test.copy())


class TfidfEachText(Feature):
    """Per-user TF-IDF + SVD embeddings, one embedding per multi-label metadata column."""

    def tfidf_svd_vectorize(self, input_df:pd.DataFrame, col:str, n_components:int = 50):
        """
        usage:
        tfidf_df = tfidf_svd_vectorize(df, col='abc', n_components=5)
        To skip the dimensionality reduction, remove the TruncatedSVD step from the pipeline.
        """
        steps = [
            ("TfidfVectorizer", TfidfVectorizer()),
            ("TruncatedSVD", TruncatedSVD(n_components=n_components, random_state=42)),
        ]
        reduced = Pipeline(steps=steps).fit_transform(input_df[col].fillna(""))
        return pd.DataFrame(reduced).add_prefix(f'tfidf_svd_{col}_')

    def create_features(self, fold: int | None = None):
        """ 
        Test features are built per fold to keep the meaning of the embedding dimensions aligned.
        That in turn means the embedding should be built including the test data,
        so validation sees test data it normally would not: there is some leakage.
        """ 
        combined = pd.concat([train, test], axis=0)
        by_user = combined.groupby('user_id')

        # start from a frame holding just the user ids (size() yields column 0, dropped again)
        user_emb_df = by_user.size().reset_index().drop(0, axis=1)
        for col in ['genres', 'producers', 'licensors', 'studios']:
            # one document per user and column, commas removed
            user_df = by_user[col].apply(lambda values: ' '.join(values)).reset_index()
            user_df[col] = user_df[col].str.replace(",", "")
            emb_df = self.tfidf_svd_vectorize(user_df, col, n_components=20)
            emb_df['user_id'] = user_df['user_id'].to_numpy()
            user_emb_df = user_emb_df.merge(emb_df, on='user_id', how='left')

        def _attach(frame):
            return frame.merge(user_emb_df, on='user_id', how='left').filter(like='tfidf_svd_')

        if fold is None:
            # for submission
            self.train = _attach(train.copy())
            self.test = _attach(test.copy())
            return

        # for cv
        fold_train, fold_valid = split_train_valid(train, fold)
        self.train = _attach(fold_train)
        self.valid = _attach(fold_valid)
        self.test = _attach(test.copy())


class Sentence2VecByUser(Feature):
    """Per-user sentence embedding: mean of pretrained word vectors over the user's metadata text."""

    def vectorize(self, x: str, ndim=160):
        """Return the mean word vector of the whitespace-separated words in ``x``.

        Words missing from ``self.model`` contribute a zero vector; an empty text
        yields a zero vector. ``ndim`` must equal the model's vector size.
        """
        embeddings = [
            self.model.get_vector(word)
            if self.model.key_to_index.get(word, None) is not None
            else np.zeros(ndim, dtype=np.float32)
            for word in x.split()
        ]
        if len(embeddings) == 0:
            return np.zeros(ndim, dtype=np.float32)
        else:
            return np.mean(embeddings, axis=0)

    def vectorize_to_df(self,input_df, col, prefix, ndim=160):
        """Vectorize ``input_df[col]`` row by row into a frame with ``<prefix>_<i>`` columns.

        NaN texts are treated as empty and newlines are removed before splitting.
        """
        texts = input_df[col].fillna("").str.replace("\n", "")
        # progress_apply only exists after tqdm.pandas() has been registered; fall back
        # to a plain apply so the feature does not depend on that hidden setup step.
        apply_fn = getattr(texts, "progress_apply", texts.apply)
        vector = np.stack(apply_fn(lambda x: self.vectorize(x, ndim)).to_numpy())
        # honour the caller's prefix (it used to be ignored in favour of a hard-coded one)
        output_df = pd.DataFrame(vector).add_prefix(f'{prefix}_')
        return output_df 

    def create_features(self, fold: int | None = None):
        """ 
        Test features are built per fold to keep the embedding dimensions aligned,
        which is why the user embedding is built from train and test concatenated.
        """
        self.model = gensim.downloader.load('glove-wiki-gigaword-50')
        vector_size = self.model.vector_size
        # Open question: doesn't using the whole data leak into validation?
        whole = pd.concat([train, test], axis=0)
        whole['text'] = whole[['genres', 'aired', 'producers', 'licensors', 'studios', 'source', 'duration', 'rating']].apply(lambda x: ' '.join(x.values.astype(str)), axis=1)
        # one document per user: all of that user's row texts joined, commas removed
        user_df = whole.groupby('user_id')['text'].apply(lambda x: ' '.join(x)).reset_index()
        user_df['text'] = user_df['text'].str.replace(",", "")

        # The spelling of this prefix is part of the existing feature column names; keep it.
        prefix = 'senentce2vec'
        user_emb_df = self.vectorize_to_df(user_df, 'text', prefix, ndim=vector_size)
        user_emb_df['user_id'] = user_df['user_id'].to_numpy()

        if fold is not None:
            # for cv
            _train, _valid = split_train_valid(train, fold)
            _test = test.copy()
            _train = _train.merge(user_emb_df, on='user_id', how='left')
            _valid = _valid.merge(user_emb_df, on='user_id', how='left')
            _test = _test.merge(user_emb_df, on='user_id', how='left')
            self.train = _train.filter(like=prefix)
            self.valid = _valid.filter(like=prefix)
            self.test = _test.filter(like=prefix)
        else:
            # for submission
            _train = train.copy()
            _test = test.copy()
            _train = _train.merge(user_emb_df, on='user_id', how='left')
            _test = _test.merge(user_emb_df, on='user_id', how='left')
            self.train = _train.filter(like=prefix)
            self.test = _test.filter(like=prefix)


class SVDppEmb(Feature):
    """User factors of an SVD++ model (surprise) fitted on the training ratings."""

    def get_user_emb(self, _train):
        """Return one row per user in ``_train``: user_id plus ``svdpp_user_emb_<i>`` columns."""
        factors = {
            uid: self.algo.pu[self.algo.trainset.to_inner_uid(uid)]
            for uid in _train['user_id'].unique()
        }
        emb = pd.DataFrame(factors).T.add_prefix('svdpp_user_emb_')
        return emb.reset_index().rename(columns={'index': 'user_id'})

    def get_anime_emb(self, _train):
        """Return one row per anime in ``_train``: anime_id plus ``svdpp_anime_emb_<i>`` columns."""
        factors = {
            aid: self.algo.qi[self.algo.trainset.to_inner_iid(aid)]
            for aid in _train['anime_id'].unique()
        }
        emb = pd.DataFrame(factors).T.add_prefix('svdpp_anime_emb_')
        return emb.reset_index().rename(columns={'index': 'anime_id'})

    def create_features(self, fold: int | None = None):
        """Fill train/valid/test for a CV fold, or train/test when ``fold`` is None."""
        is_cv = fold is not None
        if is_cv:
            _train, _valid = split_train_valid(train, fold)
        else:
            _train = train.copy()
        _test = test.copy()

        # fit SVD++ on the (fold-)train ratings only
        reader = Reader(rating_scale=(1, 10))
        ratings = Dataset.load_from_df(_train[['user_id', 'anime_id', 'score']], reader)
        self.algo = SVDpp()
        self.algo.fit(ratings.build_full_trainset())

        frames = {'train': _train, 'test': _test}
        if is_cv:
            frames['valid'] = _valid

        # user factors are skipped for the group strategy
        if c.cv_strategy != 'group':
            user_emb = self.get_user_emb(_train)
            frames = {name: frame.merge(user_emb, on='user_id', how='left') for name, frame in frames.items()}

        # The anime embedding (get_anime_emb) is intentionally not merged:
        # anime_emb = self.get_anime_emb(_train)
        # <frame> = <frame>.merge(anime_emb, on='anime_id', how='left')

        self.train = frames['train'].filter(like='svdpp')
        if is_cv:
            self.valid = frames['valid'].filter(like='svdpp')
        self.test = frames['test'].filter(like='svdpp')


class KNNBaselineSimEmb(Feature):
    """User / anime embeddings built from KNNBaseline similarity matrices.

    A ``KNNBaseline`` model with the ``pearson_baseline`` similarity is fitted
    on the ratings, its similarity matrix (``algo.sim``) is reduced with PCA,
    and each row of the reduced matrix is used as the embedding of one user
    (or one anime).
    """

    def _get_sim_emb(self, _train, user_based: bool, n_components: int = 50):
        """Shared implementation behind ``get_user_emb`` / ``get_anime_emb``.

        Args:
            _train: Frame with ``user_id``, ``anime_id`` and ``score`` columns.
            user_based: ``True`` for user-user similarities, ``False`` for
                item-item similarities.
            n_components: Number of PCA components kept per entity.

        Returns:
            DataFrame with ``n_components`` embedding columns prefixed
            ``knn_baseline_user`` / ``knn_baseline_item`` plus the matching id
            column (``user_id`` / ``anime_id``).
        """
        reader = Reader(rating_scale=(1, 10))
        trainset = Dataset.load_from_df(_train[['user_id', 'anime_id', 'score']], reader).build_full_trainset()
        algo = KNNBaseline(sim_options={'name': 'pearson_baseline', 'user_based': user_based})
        algo.fit(trainset)

        pca = PCA(n_components=n_components, random_state=0)
        reduced_matrix = pca.fit_transform(algo.sim)

        # Row i of the similarity matrix belongs to inner id i of the trainset.
        # Ask the trainset for the raw id instead of relying on the order of
        # `_train[col].unique()` happening to match the inner-id order.
        if user_based:
            prefix, id_col = 'knn_baseline_user', 'user_id'
            raw_ids = [trainset.to_raw_uid(inner_id) for inner_id in trainset.all_users()]
        else:
            prefix, id_col = 'knn_baseline_item', 'anime_id'
            raw_ids = [trainset.to_raw_iid(inner_id) for inner_id in trainset.all_items()]

        emb = pd.DataFrame(reduced_matrix).add_prefix(prefix)
        emb[id_col] = raw_ids
        return emb

    def get_user_emb(self, _train):
        """Return one PCA-reduced user-similarity embedding row per ``user_id``."""
        return self._get_sim_emb(_train, user_based=True)

    def get_anime_emb(self, _train):
        """Return one PCA-reduced item-similarity embedding row per ``anime_id``."""
        return self._get_sim_emb(_train, user_based=False)

    def create_features(self, fold: int | None = None):
        """Attach the embeddings to train / (valid) / test and keep ``knn_baseline`` columns.

        Args:
            fold: Fold number. When given, ``train`` is split with
                ``split_train_valid`` and ``self.valid`` is set as well. When
                ``None``, the full ``train`` frame is used and only
                ``self.train`` / ``self.test`` are set.

        User embeddings are skipped when ``c.cv_strategy`` is ``'group'``;
        anime embeddings are always added. Embeddings are always fitted on the
        (fold-)training rows only, so ids missing from them get NaN through the
        left merge.
        """
        with_valid = fold is not None
        if with_valid:
            _train, _valid = split_train_valid(train, fold)
            _test = test.copy()
        else:
            _train, _test = train.copy(), test.copy()

        if c.cv_strategy != 'group':
            user_emb = self.get_user_emb(_train)
            _train = _train.merge(user_emb, on='user_id', how='left')
            if with_valid:
                _valid = _valid.merge(user_emb, on='user_id', how='left')
            _test = _test.merge(user_emb, on='user_id', how='left')

        anime_emb = self.get_anime_emb(_train)
        _train = _train.merge(anime_emb, on='anime_id', how='left')
        if with_valid:
            _valid = _valid.merge(anime_emb, on='anime_id', how='left')
        _test = _test.merge(anime_emb, on='anime_id', how='left')

        self.train = _train.filter(like='knn_baseline')
        if with_valid:
            self.valid = _valid.filter(like='knn_baseline')
        self.test = _test.filter(like='knn_baseline')


class NumericalCatProfile(Feature):
    """Per-category statistics of the numerical anime columns.

    For each categorical anime column, the mean / std / min / max of
    ``numerical_cols`` is computed over the anime sharing that category value
    and attached to every anime, then to every train / valid / test row via
    ``anime_id``.
    """

    def aggregate_by_anime(self, anime):
        """Build the aggregate table: one row per row of ``anime``.

        Returns a frame holding only the ``*_agg`` columns plus ``anime_id``.
        """
        # An earlier variant also grouped by one-hot / SVD encoded 'genres'
        # (MultiTagEncode.create_one_hot_and_svd_features); it produced too many
        # columns, so for now only these four categorical columns are used.
        group_keys = ['original_work_name', 'type', 'source', 'rating']
        stats = ['mean', 'std', 'min', 'max']

        profiled = anime.copy()
        for key in group_keys:
            agg = profiled.groupby(key)[numerical_cols].agg(stats)
            # Flatten the (column, stat) MultiIndex into "<column>_<stat>".
            agg.columns = ['_'.join(pair) for pair in agg.columns]
            agg = agg.add_suffix(f'_{key}_agg')
            profiled = profiled.merge(agg, on=key, how='left')

        profiled = profiled.filter(like='_agg')
        # The left merges keep the row order of `anime`, so the ids can be
        # re-attached positionally.
        profiled['anime_id'] = anime['anime_id'].to_numpy()
        return profiled

    def create_features(self, fold:Optional[int] = None):
        """Join the aggregate table onto the rows of each split.

        With a fold number, ``self.train`` and ``self.valid`` are set; with
        ``fold=None``, ``self.train`` and ``self.test`` are set.

        NOTE(review): unlike the embedding features, ``self.test`` is not set
        in the fold branch — confirm the loader does not expect it.
        """
        profile = self.aggregate_by_anime(anime)

        def attach(frame):
            # Keep only the aggregate columns, aligned row-for-row with `frame`.
            joined = frame[['anime_id']].merge(profile, on='anime_id', how='left')
            return joined.drop('anime_id', axis=1)

        if fold is not None:
            fold_train, fold_valid = split_train_valid(train, fold)
            self.train = attach(fold_train)
            self.valid = attach(fold_valid)
        else:
            self.train = attach(train.copy())
            self.test = attach(test.copy())

In [None]:
# Hand the notebook's global namespace to generate_features.
# NOTE(review): generate_features is defined outside this section; presumably it
# builds the Feature classes found in that namespace and, with overwrite=False,
# skips features that already exist on disk — confirm.
generate_features(globals(), overwrite=False)

In [None]:
# Names of the feature sets passed to load_datasets for training and inference.
feats = [
    "Numerical",
    "OrdinalEncode",
    "CountEncode",
    "TargetEncode",
    "Anime2Vec",
    "MultiTagEncode",
    "NumericalUserProfile",
    "Anime2VecWithoutScore",
    "TfidfAllText",
    "TfidfEachText",
    "Sentence2VecByUser",
    "SVDppEmb",
    "KNNBaselineSimEmb",
    "NumericalCatProfile",
]


# Names of the datasets passed to load_datasets as the training target.
targets = [
    "Target"
]

In [None]:
def sanity_check(feats, fold=0):
    """Smoke-test the saved features using a single fold.

    Loads the train / valid / test features and the train / valid targets for
    ``fold``, prints every feature name and the frame shapes, and asserts that:

    * no feature is named like the target column or ``'fold'``,
    * train, valid and test have the same number of columns,
    * feature and target row counts agree, and test rows match ``test``.

    Finally prints the columns reported by ``get_categorical_col``.
    """
    banner = "##################"
    excluded_names = [c.target_col, 'fold']

    def _load(names, phase):
        return load_datasets(names, input_dir=c.feature_dir, phase=phase, fold=fold)

    train_feats = _load(feats, 'train')
    valid_feats = _load(feats, 'valid')
    test_feats = _load(feats, 'test')

    train_target = _load(targets, 'train')
    valid_target = _load(targets, 'valid')

    # Features in use (the target must never appear among them).
    print(banner)
    print('features')
    print(banner)
    for name in train_feats.columns:
        print(name)
        assert name not in excluded_names

    print(f'train:{train_feats.shape}')
    print(f'valid:{valid_feats.shape}')
    print(f'test:{test_feats.shape}')

    n_train_rows, n_train_cols = train_feats.shape
    n_valid_rows, n_valid_cols = valid_feats.shape
    n_test_rows, n_test_cols = test_feats.shape
    assert n_train_cols == n_valid_cols == n_test_cols
    assert n_train_rows == train_target.shape[0]
    assert n_valid_rows == valid_target.shape[0]
    assert n_test_rows == test.shape[0]

    print(banner)
    print('categorical features')
    print(banner)
    for col in get_categorical_col(train_feats):
        print(col)

In [None]:
# Smoke-test the configured feature set on fold 0 before training.
sanity_check(feats, fold=0)

### callback

In [None]:
# Build the training callbacks for the configured model type.
from src.models.gbdt import get_callbacks
from lightgbm import register_logger
# Send LightGBM's own log output to the notebook's logger.
register_logger(logger)
callbacks = get_callbacks(c.model_name)
# Bare expression: the notebook displays the callbacks as the cell output.
callbacks

### model

In [None]:
from src.models.gbdt import get_model # , LGBModel, XGBModel, CBModel 
# Load the full-data (fold=None) training features to detect the categorical columns.
sample_data = load_datasets(feats, input_dir=c.feature_dir, phase='train', fold=None)
cat_cols = get_categorical_col(sample_data)
# A single model wrapper is created here; the training loop calls
# model.train / model.save / model.predict on it for every fold.
model = get_model(c.model_name, c.model_params, c.num_boost_round, cat_cols, c.output_dir, callbacks)

### train

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error
def calc_score(true, pred):
    """Return the root mean squared error between ``true`` and ``pred``.

    ``mean_squared_error(..., squared=False)`` is deprecated since
    scikit-learn 1.4 and removed in 1.6, so the square root is taken here
    instead. It is taken per output column before averaging, which matches
    what ``squared=False`` computed.

    Args:
        true: Ground-truth target values.
        pred: Predicted values, same number of rows as ``true``.

    Returns:
        The RMSE as a Python float.
    """
    per_output_mse = mean_squared_error(true, pred, multioutput='raw_values')
    return float(np.mean(np.sqrt(per_output_mse)))

In [None]:

# Train one model per fold in c.use_fold. For every fold: fit, save the model,
# score the validation split, keep the out-of-fold (OOF) predictions and write
# the test predictions to disk.
oofs = []
preds = []
scores = []
for fold in c.use_fold:
    idx_train, idx_valid = cv[fold]
    wandb.init(**c.wandb_init, name=f'exp{c.version}-fold{fold}')

    X_train = load_datasets(feats, input_dir=c.feature_dir, phase='train', fold=fold)
    X_valid = load_datasets(feats, input_dir=c.feature_dir, phase='valid', fold=fold)
    X_test = load_datasets(feats, input_dir=c.feature_dir, phase='test', fold=fold)

    y_train = load_datasets(targets, input_dir=c.feature_dir, phase='train', fold=fold)
    y_valid = load_datasets(targets, input_dir=c.feature_dir, phase='valid', fold=fold)

    model.train(X_train, y_train, X_valid, y_valid)
    model.save(fold)
    pred = model.predict(X_valid)

    # evaluate
    score = calc_score(y_valid, pred)
    logger.info(f'fold-{fold} score: {score}')
    wandb.log({'CV': score})
    scores.append(score)

    # create oof: index by the validation row positions so the folds can be
    # concatenated and sorted back into the original row order
    oof_df = pd.DataFrame(pred, index=idx_valid)
    oofs.append(oof_df)

    # pred
    pred_test = model.predict(X_test)
    np.save(c.output_dir / f"pred_test_{fold}", pred_test)
    preds.append(pred_test)

    # Keep the last fold's run open so the total CV score can be logged to it.
    if fold!=c.use_fold[-1]:
        wandb.finish()

# Save the out-of-fold predictions
total_score = np.mean(scores)
logger.info(f'total score: {total_score}')
wandb.log({'TotalCV': total_score})
# Close the run that was kept open for the TotalCV log; in a notebook a run is
# otherwise left unfinished until the kernel exits.
wandb.finish()
oof = np.array(pd.concat(oofs).sort_index())
np.save(c.output_dir / "oof", oof)

In [None]:
from src.visualize import plot_importance
# Not supported for catboost.
# NOTE(review): c.model_config_name is 'cb_regression', so this cell may not
# apply to the currently configured model — confirm.
plot_importance(model.models, output_dir=c.output_dir)

### inference

# Average the per-fold test predictions written by the training loop.
# Only folds in c.use_fold are trained and saved, so iterate over those rather
# than over every fold in `cv`; otherwise a partial run fails on a missing
# file or silently averages stale predictions from an earlier run.
preds = []
for fold in c.use_fold:
    pred = np.load(c.output_dir / f'pred_test_{fold}.npy')
    preds.append(pred)
pred_test = np.mean(preds, axis=0)

In [None]:
# Reload the saved out-of-fold predictions as a flat 1-D array.
oof = np.load(f'{c.output_dir}/oof.npy').flatten()

In [None]:
# Boolean mask over the test rows, chosen by CV strategy:
#   'group'   -> rows whose user_id is in test_only_user_ids
#   otherwise -> rows whose user_id is NOT in test_only_user_ids
# NOTE(review): test_only_user_ids is defined outside this section; presumably
# the users that appear only in test — confirm.
if c.cv_strategy == 'group':
    idx = test['user_id'].isin(test_only_user_ids)
else:
    idx = ~test['user_id'].isin(test_only_user_ids)

In [None]:
# Overlay the distributions of the train target, the OOF predictions and the
# test predictions restricted to the rows selected by `idx`.
sns.histplot(train[c.target_col], label='train', color='blue', alpha=0.5, bins=50, kde=True)
sns.histplot(oof, label='oof', color='red', alpha=0.5, bins=50, kde=True)
sns.histplot(pred_test[idx], label='test', color='orange', alpha=0.5, bins=50, kde=True)
plt.legend();

### submission

# Write the submission for the current CV strategy: the sample submission with
# its score column replaced by the fold-averaged test predictions.
sub = pd.read_csv(c.input_dir / 'sample_submission.csv')
# Predictions are assigned positionally, so the row counts must match.
assert sub.shape[0] == pred_test.shape[0]
sub['score'] = pred_test
sub.to_csv(c.output_dir / f'submission_exp{c.version}.csv', index=False)

In [None]:
# Load the exp030 submission (compared against the final scores in the scatter
# plot) and this experiment's two per-strategy submissions.
# sub1: 'group' CV strategy, sub2: 'stratified' CV strategy; both files must
# already exist, i.e. the notebook has been run once per strategy.
base_sub = pd.read_csv('../output/030/submission_exp030.csv')
sub1 = pd.read_csv(f'../output/{c.version}/group/submission_exp{c.version}.csv')
sub2 = pd.read_csv(f'../output/{c.version}/stratified/submission_exp{c.version}.csv')

In [None]:
# Start from the stratified-CV predictions for every test row (assigned positionally).
_test = pd.read_csv(c.input_dir / 'test.csv')
_test['score'] = sub2['score'].to_numpy()

# For users that exist only in test, use the GroupKFold predictions instead.
idx = _test['user_id'].isin(test_only_user_ids)
_test.loc[idx, 'score'] = sub1.loc[idx, 'score'].to_numpy()

In [None]:
# Final submission: only the blended score column.
# Take an explicit copy so the column assignment below acts on an independent
# frame and does not trigger pandas' SettingWithCopyWarning.
sub = _test[['score']].copy()
# Clip to the valid rating range: lower bound 1, upper bound 10.
sub['score'] = sub['score'].clip(1, 10)
sub.to_csv(c.output_root_dir / f'submission_exp{c.version}.csv', index=False)

In [None]:
# Compare the final scores against the exp030 submission, one point per test row.
plt.scatter(base_sub['score'], sub['score'], alpha=0.1)