In [1]:
import cudf
from cuml import ForestInference
from cuml.preprocessing.TargetEncoder import TargetEncoder
from catboost import CatBoostClassifier, CatBoostRanker, Pool

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import gc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import datetime
import itertools
import os
from contextlib import redirect_stdout
from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

#カラム内の文字数。デフォルトは50だった
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", 500)

#列数
pd.set_option("display.max_columns", 300)

def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def apk_list(actual, predicted, k=10):
    return [apk(a,p,k) for a,p in zip(actual, predicted)]

# https://www.kaggle.com/tkm2261/fast-pandas-left-join-357x-faster-than-pd-merge
# add mulple index option
def fast_left_join(df, join_df, on):
    if isinstance(on, list):
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(pd.MultiIndex.from_tuples(tuple(map(tuple, df[on].values)), names=on)).reset_index(drop=True)
            ],
            axis=1,
        )        
    else:
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(df[on].values).reset_index(drop=True),
            ],
            axis=1,
        )

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for i, col in enumerate(df.columns):
        try:
            col_type = df[col].dtype

            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        df[col] = df[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        df[col] = df[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        df[col] = df[col].astype(np.int32)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float32)
        except ValueError:
            continue

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [2]:
def calc_mapk(preds, df_agg_val_1):
    tmp = preds.groupby("customer_id").head(12)
    tmp = tmp.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"preds"})
    tmp = tmp.merge(df_agg_val_1, how="left", on="customer_id").rename(columns={"article_id":"gts"})
    mapk_val = mapk(tmp["gts"], tmp["preds"])
    print(f"mapk:{mapk_val:.5f}")

In [3]:
tran_dtypes = {
    "t_dat": "str",
    "customer_id": "str",
    "article_id": "int",
    "product_code": "int",
    "price": "float",
    "sales_channel_id": "int",
}
art_dtypes = {
    "article_id": "int",
    "product_code": "int",
    "product_type_no": "int",
    "product_group_name": "str",
    "graphical_appearance_no": "int",
    "colour_group_code": "int",
    "perceived_colour_value_id": "int",
    "perceived_colour_master_id": "int",
    "department_no": "int",
    "index_code": "str",
    "index_group_no": "int",
    "section_no": "int",
    "garment_group_no": "int",
}
cust_dtypes = {"customer_id": "str"}

## Ensemble

In [4]:
df_agg_val_1 = pd.read_pickle("../exp_iwata/cache/v2//df_agg_val_1_2020-09-23.pkl")

In [5]:
df_agg_val_1 = df_agg_val_1.rename(columns={"article_id": "gts"})

# Read OOF

# Stacking

In [8]:
path = "../input/h-and-m-personalized-fashion-recommendations/"

df_trans = pd.read_csv(path + "transactions_train.csv", dtype=tran_dtypes)
df_trans["t_dat"] = pd.to_datetime(df_trans["t_dat"], format="%Y-%m-%d")
df_trans = df_trans.drop_duplicates(["customer_id", "article_id", "t_dat"])
df_trans["article_id"] = df_trans["article_id"].astype(str).str.zfill(10)

df_trans_week1 = df_trans[df_trans.t_dat > datetime.datetime(2020, 9, 15)]
df_trans_week1["target"] = 1
df_trans_week1 = df_trans_week1[['customer_id', 'article_id', "target"]]

In [9]:
df_art = pd.read_csv(path + "articles.csv", dtype=art_dtypes)
df_art["article_id"] = df_art["article_id"].astype(str).str.zfill(10)
le = LabelEncoder()
le.fit(df_art["index_code"].unique())
df_art["index_code"] = le.transform(df_art["index_code"])
le = LabelEncoder()
le.fit(df_art["product_group_name"].unique())
df_art["product_group_name"] = le.transform(df_art["product_group_name"])

df_cust = pd.read_csv(path + "customers.csv", dtype=cust_dtypes)
df_cust["age"] = df_cust["age"].fillna(df_cust["age"].mean())

# age_id
df_cust["age"] = df_cust["age"].astype(int)
customer_age_gorup = pd.read_csv("../save/customer_age_gorup.csv")
df_cust = df_cust.merge(customer_age_gorup, how="left", on=["age"])
df_cust["age_2"] = df_cust["age"]
df_cust.loc[df_cust["age"] >= 61, "age_2"] = 61
df_cust["age_id_1"] = df_cust["age_2"] // 10
df_cust["age_id_2"] = df_cust["age_2"] // 5
df_cust = df_cust.drop(columns=["age_2"])

df_cust[["FN", "Active"]] = df_cust[["FN", "Active"]].fillna(0)
df_cust["club_member_status"] = df_cust["club_member_status"].apply(
    lambda x: 1 if x == "ACTIVE" else 0
)
df_cust["fashion_news_frequency"] = df_cust["fashion_news_frequency"].apply(
    lambda x: 0 if x == "NONE" else 1
)

# postal_code_ce
df_cust["postal_code_ce"] = df_cust["postal_code"].map(
    df_cust["postal_code"].value_counts()
)

# postal_code, 10以下をまとめる
postal_code_cnt = df_cust["postal_code"].value_counts().reset_index()
postal_code_cnt.columns = ["postal_code", "cnt"]

code_map = {
    i: 0 for i in postal_code_cnt[postal_code_cnt.cnt <= 10]["postal_code"].values
}
for i, code in enumerate(
    postal_code_cnt[postal_code_cnt.cnt > 10]["postal_code"].values
):
    code_map[code] = i + 1

df_cust["postal_code"] = df_cust["postal_code"].map(code_map)

df_art.index = df_art.article_id
df_art.index.name = "article_id"
df_art = df_art.drop(columns=["article_id"])

df_cust.index = df_cust.customer_id
df_cust.index.name = "customer_id"
df_cust = df_cust.drop(columns=["customer_id"])

## Read OOF

In [10]:
# sub = pd.read_pickle("../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374_top30.pickle")
# sub = sub.groupby("customer_id").head(12)
# sub = sub.groupby("customer_id")["article_id"].apply(list)
# sub = sub.reset_index().rename(columns={"article_id":"prediction"})
# sub.to_csv("../sub/sub_exp_lgbm_007_20220505112349_0.0374.csv", index=False)

In [11]:
def read_oof(exp_name):
    if exp_name in [
        "myaun_cat_v4-3",
        "cat_v4-3_decay_popular_sample",
        "cat_v7-2_random_weighted",
        "cat_v4-3_short",
        "cat_v4-3-long-tail",
        "cat_v12"
    ]:
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "nari_exp_lgbm_007":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof":"pred"})
        oof = oof[~(oof.pred==-1.0)]
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "nari_exp_lgbm_007_fix":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof":"pred"})
        oof = oof.drop(columns=["oof_rank"])
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "iwata_exp05":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "minguin_v28": # フォーマットが違う, ブレンドには使える
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["LGBMRanker"] = oof["LGBMRanker"].str.split(" ")
    elif exp_name == "minguin_v34":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof = oof.rename(columns={"preds":"pred"})
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "moro_sub1":
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof = oof.drop(columns=["true"])
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    return oof

exps = {
    "myaun_cat_v4-3": {
        "lb":0.0307,
        "cv":0.03539,
        "oof":"../exp_iwata/cat_v4-3/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_0.03539.csv",
    },
    "cat_v4-3_decay_popular_sample": {
        "lb":0.0306,
        "cv":0.03542,
        "oof":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval.pkl",
        "sub_pred_top30": "../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_decay_popular_sample_0.03542.csv",
    },
    # "cat_v7-2_random_weighted": {
    #     "lb":None,
    #     "cv":0.03525,
    #     "oof":"../exp_iwata/cat_v7-2_random_weighted/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v4-3_short": {
    #     "lb":None,
    #     "cv":0.03529,
    #     "oof":"../exp_iwata/cat_v4-3_short/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v4-3-long-tail": {
    #     "lb":None,
    #     "cv":0.03443,
    #     "oof":"../exp_iwata/cat_v4-3-long-tail/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v12": {
    #     "lb":None,
    #     "cv":0.03493,
    #     "oof":"../exp_iwata/cat_v12/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    "nari_exp_lgbm_007": {
        "lb":0.0302,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366_top30.pickle",
        "sub":"../sub/exp_lgbm_007_20220425033232_0.0366.csv",
    },
    "nari_exp_lgbm_007_fix": {
        "lb":None,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374_top30.pickle",
        "sub": "../sub/sub_exp_lgbm_007_20220505112349_0.0374.csv",
    },
    "iwata_exp05": {
        "lb":0.0303,
        "cv":0.3559,
        "oof":"../input/iwata/oof_top200/oof_exp05_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp05_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp05_2020-09-23_top30.pickle",
        "sub":"../sub/submission_exp05.csv",
    },
    # "minguin_v28": {
    #     "lb":0.0294,
    #     "cv":0.03079,
    #     "oof":"../input/minguin/sub2/oof_h_m_v28_rerun_0294.csv", 
    #     "sub":"../input/minguin/sub2/submission_h_m_v28_rerun_0294.csv",
    # },
    "minguin_v34": {
        "lb":0.0293,
        "cv":0.0314,
        "oof": "../input/minguin/sub2/oof_single_preds_h_m_single_v34_0314.csv",
        "sub_pred": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314.csv",
        "sub_pred_top30": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314_top30.pickle",
        "sub": "../sub/submission_single_v34.csv",
    },
    # "moro_sub1": {
    #     "lb":0.0301,
    #     "cv":0.03249,
    #     "oof":"../input/moro/sub1_fix/train_oof_train123_all.pickle",
    #     "sub":"../input/moro/sub1/submission_lgb_train123_0.03249.csv",
    # },
}

In [143]:
os.makedirs("../share/oof_and_subs/", exist_ok=True)

In [None]:
for exp_name, paths in tqdm(exps.items()):
    oof_mod = read_oof(exp_name=exp_name)

In [12]:
oofs = {}
for exp_name in tqdm(exps.keys()):
    oofs[exp_name] = read_oof(exp_name=exp_name)

  0%|          | 0/6 [00:00<?, ?it/s]

## stacking exp

In [109]:
topN = 30
preds = []
for exp_name in tqdm(exps.keys()):
    oof = oofs[exp_name]
    oof_topN = oof.groupby("customer_id").head(topN).copy()
    # add pred rank
    oof_topN["pred_rank"] = oof_topN.groupby("customer_id")["pred"].rank(ascending=False, method="dense")
    oof_topN.loc[:, "model"] = exp_name
    preds.append(oof_topN)

preds = pd.concat(preds)

  0%|          | 0/6 [00:00<?, ?it/s]

In [110]:
preds_pv = preds.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv = preds_pv.reset_index()
preds_pv_columns = ['_'.join(col).strip() for col in preds_pv.columns.values]
preds_pv.columns = preds_pv_columns
preds_pv = preds_pv.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})

preds_pv_rank = preds.pivot_table(values=['pred_rank'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv_rank = preds_pv_rank.reset_index()
preds_pv_rank_columns = ['_'.join(col).strip() for col in preds_pv_rank.columns.values]
preds_pv_rank.columns = preds_pv_rank_columns
preds_pv_rank = preds_pv_rank.drop(columns=["customer_id_","article_id_"])

preds_pv = pd.concat([
    preds_pv, preds_pv_rank
], axis=1)

In [113]:
# agg score

pred_names = [f"pred_{exp_name}" for exp_name in exps.keys()]
preds_pv["preds_sum"] = preds_pv[pred_names].sum(axis=1)
preds_pv["preds_min"] = preds_pv[pred_names].min(axis=1)
preds_pv["preds_max"] = preds_pv[pred_names].max(axis=1)
preds_pv["preds_cnt"] = preds_pv[pred_names].count(axis=1)

pred_names = [f"pred_rank_{exp_name}" for exp_name in exps.keys()]
preds_pv["preds_sum_rank"] = preds_pv[pred_names].sum(axis=1)
preds_pv["preds_min_rank"] = preds_pv[pred_names].min(axis=1)
preds_pv["preds_max_rank"] = preds_pv[pred_names].max(axis=1)

In [114]:
# add art attrs

preds_pv = fast_left_join(
    preds_pv,
    df_art[
        [
            "product_code",
            "product_type_no",
            "product_group_name",  #
            "graphical_appearance_no",
            "colour_group_code",
            "perceived_colour_value_id",  #
            "perceived_colour_master_id",  #
            "department_no",
            "index_code",
            "index_group_no",
            "section_no",
            "garment_group_no",
        ]
    ],
    on="article_id",
)

preds_pv = fast_left_join(
    preds_pv,
    df_cust[
        [
            "age",
            "age_id",
            "FN",
            "Active",
            "club_member_status",
            "fashion_news_frequency",
            "postal_code",
            "postal_code_ce",
        ]
    ],
    on="customer_id",
)

In [115]:
%%time

# add hist features

len_hist = 366
dev = "cpu"
def stacking_feat_store(df_trans, l_cust, ds, de, dsr, der, dsh, deh):
    feat = {}
    
    df_trans_yesterday = df_trans.query("(t_dat == @der)")  # 1day
    df_trans_recent = df_trans.query("(t_dat >= @dsr) and (t_dat <= @der)")  # 1week
    df_trans_hist = df_trans.query("(t_dat >= @dsh) and (t_dat <= @deh)")  # 1year
    
    # make decay count
    df_trans_hist["t_delay"] = (
        df_trans_hist["t_dat"].max() - df_trans_hist["t_dat"]
    ).dt.days
    df_trans_hist["decay_count"] = 1 / (1 + df_trans_hist["t_delay"] // 7)
    
    # agg art
    feat["art_buy_hist"] = df_trans_hist.groupby(["article_id"])["t_dat"].agg(
        art_buy_hist="count"
    )
    feat["art_buy_recent"] = df_trans_recent.groupby(["article_id"])["t_dat"].agg(
        art_buy_recent="count"
    )
    feat["art_buy_yesterday"] = df_trans_yesterday.groupby(["article_id"])[
        "t_dat"
    ].agg(art_buy_yesterday="count")
    feat["art_buy_hist_decay"] = df_trans_hist.groupby(["article_id"])["decay_count"].agg(
        art_buy_hist_decay="sum"
    )

    for k in feat.keys():
        feat[k] = reduce_mem_usage(feat[k])
    if dev == "gpu":
        feat[k] = cudf.from_pandas(feat[k])
    return feat

def stacking_add_feat(df, feat):
    df = reduce_mem_usage(df)
    if dev == "gpu":
        df = cudf.from_pandas(df)

    # merge aid
    for col in [
        "art_buy_hist",
        "art_buy_recent",
        "art_buy_yesterday",
        "art_buy_hist_decay",
    ]:
        if dev == "gpu":
            df = df.merge(
                feat[col], how="left", left_on=["article_id"], right_index=True
            )
        else:
            df = fast_left_join(df, feat[col], on="article_id")
    
    fill_zero_columns = [
        "art_buy_hist",
        "art_buy_recent",
        "art_buy_yesterday",
        "art_buy_hist_decay",
    ]
    for col in fill_zero_columns:
        # print(col)
        df[col] = df[col].fillna(0)

    if dev == "gpu":
        df = df.to_pandas()
    return df

feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 16),
    de=datetime.datetime(2020, 9, 22),
    dsr=datetime.datetime(2020, 9, 9),
    der=datetime.datetime(2020, 9, 15),
    dsh=datetime.datetime(2019, 9, 15),
    deh=datetime.datetime(2020, 9, 15),
)

preds_pv = stacking_add_feat(preds_pv, feat)

CPU times: user 11.8 s, sys: 2.15 s, total: 14 s
Wall time: 13.9 s


In [116]:
# add target

preds_pv = preds_pv.merge(df_trans_week1, how="left", on=['customer_id', 'article_id'])
preds_pv["target"] = preds_pv["target"].fillna(0).astype(int)

In [117]:
preds_pv["target"].sum()

32913

In [118]:
cover_rate = preds_pv["target"].sum() / len(df_trans_week1)
user_cover_rate = (preds_pv.groupby("customer_id")["target"].sum() > 0).sum() / preds_pv["customer_id"].nunique()
cover_rate, user_cover_rate

(0.15181833277980739, 0.32984460164675866)

In [119]:
# !pip install -U scikit-learn

In [120]:
import sklearn
sklearn.__version__

'1.0.2'

In [121]:
from sklearn.model_selection import GroupKFold
# from sklearn.model_selection import StratifiedGroupKFold

FOLD_NUM = 5
RANDOM_STATE = 46
CAT_PARAMS = {
    "depth": 5,
    "learning_rate": 0.1,
    "boosting_type": "Plain",
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "reg_lambda": 0.001,
    "iterations": 10000,
    "od_type": "Iter",
    "od_wait": 30,
    "metric_period": 20,
    "random_seed": RANDOM_STATE,
    "task_type": "CPU",
    # "task_type": "GPU",
    # "gpu_ram_part": 0.95,
    # "devices": "1",
    "verbose": True,
    "loss_function": "YetiRank",
    "eval_metric": "MAP:top=12",
}

models = []
scores = []
oof = np.zeros(len(preds_pv))
y = preds_pv['target'].values
groups = preds_pv['customer_id'].values
kfold = GroupKFold(n_splits=FOLD_NUM)
# kfold = StratifiedGroupKFold(n_splits=FOLD_NUM, shuffle=True, random_state=RANDOM_STATE)
for fold, (trn, val) in enumerate(kfold.split(preds_pv, y, groups)):
    
    df_train = preds_pv.iloc[trn]
    df_val = preds_pv.iloc[val]
    
    df_train = df_train.sort_values("customer_id")
    X_train = df_train.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_train = df_train["target"]
    idx = df_train.groupby("customer_id")["customer_id"].count().values
    group_id_train = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    df_val = df_val.sort_values("customer_id")
    X_val = df_val.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_val = df_val["target"]
    idx = df_val.groupby("customer_id")["customer_id"].count().values
    group_id_val = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    cat_features_index = []
    train_pool = Pool(
        data=X_train,
        label=y_train,
        group_id=group_id_train,
        cat_features=cat_features_index,
    )
    val_pool = Pool(
        data=X_val,
        label=y_val,
        group_id=group_id_val,
        cat_features=cat_features_index,
    )

    model = CatBoostRanker(**CAT_PARAMS)
    model.fit(train_pool, eval_set=val_pool)
    gbdt_score = model.best_score_["validation"]["MAP:top=12"]
    
    # not sorted sample
    X_val = preds_pv.iloc[val].drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    preds = model.predict(X_val)
    oof[val] = preds
    scores.append(gbdt_score)
    models.append(model)



0:	learn: 0.0360347	test: 0.0362073	best: 0.0362073 (0)	total: 705ms	remaining: 1h 57m 25s
20:	learn: 0.0602230	test: 0.0614167	best: 0.0614167 (20)	total: 18.7s	remaining: 2h 27m 45s
40:	learn: 0.0612330	test: 0.0619053	best: 0.0620805 (33)	total: 36.4s	remaining: 2h 27m 29s
60:	learn: 0.0620687	test: 0.0618802	best: 0.0620805 (33)	total: 49.8s	remaining: 2h 15m 7s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06208048931
bestIteration = 33

Shrink model to first 34 iterations.




0:	learn: 0.0327782	test: 0.0308503	best: 0.0308503 (0)	total: 711ms	remaining: 1h 58m 29s
20:	learn: 0.0612153	test: 0.0579886	best: 0.0581957 (19)	total: 14.4s	remaining: 1h 54m 15s
40:	learn: 0.0620652	test: 0.0588823	best: 0.0588823 (40)	total: 32.3s	remaining: 2h 10m 44s
60:	learn: 0.0626560	test: 0.0592397	best: 0.0592397 (60)	total: 50s	remaining: 2h 15m 50s
80:	learn: 0.0632525	test: 0.0592756	best: 0.0595596 (64)	total: 1m 3s	remaining: 2h 10m 8s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.05955956944
bestIteration = 64

Shrink model to first 65 iterations.




0:	learn: 0.0336278	test: 0.0322803	best: 0.0322803 (0)	total: 696ms	remaining: 1h 56m 4s
20:	learn: 0.0613418	test: 0.0582335	best: 0.0583007 (19)	total: 14.5s	remaining: 1h 55m 7s
40:	learn: 0.0622274	test: 0.0589344	best: 0.0589344 (40)	total: 28.2s	remaining: 1h 54m 6s
60:	learn: 0.0630197	test: 0.0591795	best: 0.0592944 (50)	total: 41.9s	remaining: 1h 53m 50s
80:	learn: 0.0634829	test: 0.0593381	best: 0.0593466 (78)	total: 55.7s	remaining: 1h 53m 37s
100:	learn: 0.0639496	test: 0.0593317	best: 0.0595378 (91)	total: 1m 9s	remaining: 1h 53m 44s
120:	learn: 0.0644559	test: 0.0596213	best: 0.0596647 (116)	total: 1m 23s	remaining: 1h 53m 34s
140:	learn: 0.0650478	test: 0.0594503	best: 0.0597410 (130)	total: 1m 37s	remaining: 1h 53m 10s
160:	learn: 0.0654372	test: 0.0594877	best: 0.0597410 (130)	total: 1m 50s	remaining: 1h 52m 48s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.05974104393
bestIteration = 130

Shrink model to first 131 iterations.




0:	learn: 0.0327128	test: 0.0337367	best: 0.0337367 (0)	total: 695ms	remaining: 1h 55m 47s
20:	learn: 0.0603212	test: 0.0607300	best: 0.0607300 (20)	total: 14.3s	remaining: 1h 53m 20s
40:	learn: 0.0612845	test: 0.0614060	best: 0.0615468 (36)	total: 27.6s	remaining: 1h 51m 51s
60:	learn: 0.0619427	test: 0.0619552	best: 0.0619552 (60)	total: 40.9s	remaining: 1h 51m 10s
80:	learn: 0.0624050	test: 0.0619802	best: 0.0620272 (69)	total: 54.3s	remaining: 1h 50m 46s
100:	learn: 0.0628560	test: 0.0623763	best: 0.0623763 (100)	total: 1m 7s	remaining: 1h 50m 21s
120:	learn: 0.0633466	test: 0.0625359	best: 0.0625359 (120)	total: 1m 20s	remaining: 1h 50m 6s
140:	learn: 0.0636904	test: 0.0623731	best: 0.0625410 (129)	total: 1m 34s	remaining: 1h 49m 52s
160:	learn: 0.0640345	test: 0.0627659	best: 0.0627659 (160)	total: 1m 47s	remaining: 1h 49m 37s
180:	learn: 0.0645190	test: 0.0625758	best: 0.0627746 (161)	total: 2m 1s	remaining: 1h 49m 28s
Stopped by overfitting detector  (30 iterations wait)

bestT



0:	learn: 0.0336326	test: 0.0349974	best: 0.0349974 (0)	total: 704ms	remaining: 1h 57m 14s
20:	learn: 0.0600352	test: 0.0629624	best: 0.0630162 (17)	total: 15.8s	remaining: 2h 5m 18s
40:	learn: 0.0611502	test: 0.0635567	best: 0.0636598 (38)	total: 29.2s	remaining: 1h 58m 6s
60:	learn: 0.0615900	test: 0.0640662	best: 0.0641198 (59)	total: 44.2s	remaining: 2h 7s
80:	learn: 0.0622034	test: 0.0640925	best: 0.0641963 (77)	total: 58.8s	remaining: 1h 59m 59s
100:	learn: 0.0626800	test: 0.0642420	best: 0.0643998 (88)	total: 1m 12s	remaining: 1h 58m 35s
120:	learn: 0.0630438	test: 0.0645268	best: 0.0645275 (116)	total: 1m 27s	remaining: 1h 59m 25s
140:	learn: 0.0634451	test: 0.0646216	best: 0.0647180 (126)	total: 1m 41s	remaining: 1h 58m 2s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06471803513
bestIteration = 126

Shrink model to first 127 iterations.


In [122]:
# topN = 30
# group k=5, score + rank + agg, 
# 0.03843 -> (top18: 0.038666) -> (top24: 0.038783) -> (top30: 0.038646)


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06177
CV: 0.038646


In [108]:
# topN = 24
# group k=5, score + rank + agg, 
# 0.03843 -> (top18: 0.038666) -> 0.038783


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06305
CV: 0.038783


In [94]:
# topN = 18
# group k=5, score + rank + agg, 
# 0.03843 -> 0.038666


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06434
CV: 0.038666


In [25]:
# stratified group k=10 0.03843 -> 0.038396


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06464
CV: 0.038396


In [23]:
# stratified group k=5 0.03843 -> 0.038175(?)


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06426
CV: 0.038175


In [103]:
# transform pred_rank  0.03832 -> 0.03843


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06477
CV: 0.03843


In [79]:
# base


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv:.4f}")
print(f"CV: {cv:.4f}")

gbdt_cv: 0.06453897552478616
CV: 0.038322614957826344


## Inference

In [135]:
# read sub pred topN

topN = 30
sub_preds = []
for exp_name in tqdm(exps.keys()):
    sub = pd.read_pickle(exps[exp_name]["sub_pred_top30"])
    sub_topN = sub.groupby("customer_id").head(topN).copy()
    # add pred rank
    sub_topN["pred_rank"] = sub_topN.groupby("customer_id")["pred"].rank(ascending=False, method="dense")
    sub_topN.loc[:, "model"] = exp_name
    sub_preds.append(sub_topN)

sub_preds = pd.concat(sub_preds)

  0%|          | 0/6 [00:00<?, ?it/s]

In [136]:
feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 23),
    de=datetime.datetime(2020, 9, 29),
    dsr=datetime.datetime(2020, 9, 16),
    der=datetime.datetime(2020, 9, 22),
    dsh=datetime.datetime(2019, 9, 22),
    deh=datetime.datetime(2020, 9, 22),
)

In [137]:
customer_ids = sub_preds["customer_id"].unique()

In [138]:
size_block = 30000
list_slice = list(range(0, len(customer_ids), size_block))
if list_slice[-1] != len(customer_ids):
    list_slice.append(len(customer_ids))

In [139]:
sub_all = []
for batch_idx in tqdm(range(len(list_slice) - 1)):
    customer_ids_batch = customer_ids[list_slice[batch_idx] : list_slice[batch_idx + 1]]
    
    preds = sub_preds[sub_preds.customer_id.isin(customer_ids_batch)]
    
    # to pv    
    preds_pv = preds.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
    preds_pv = preds_pv.reset_index()
    preds_pv_columns = ['_'.join(col).strip() for col in preds_pv.columns.values]
    preds_pv.columns = preds_pv_columns
    preds_pv = preds_pv.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})

    preds_pv_rank = preds.pivot_table(values=['pred_rank'], index=['customer_id', 'article_id'], columns=['model'])
    preds_pv_rank = preds_pv_rank.reset_index()
    preds_pv_rank_columns = ['_'.join(col).strip() for col in preds_pv_rank.columns.values]
    preds_pv_rank.columns = preds_pv_rank_columns
    preds_pv_rank = preds_pv_rank.drop(columns=["customer_id_","article_id_"])

    preds_pv = pd.concat([
        preds_pv, preds_pv_rank
    ], axis=1)

    # agg score

    pred_names = [f"pred_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max"] = preds_pv[pred_names].max(axis=1)
    preds_pv["preds_cnt"] = preds_pv[pred_names].count(axis=1)

    pred_names = [f"pred_rank_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum_rank"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min_rank"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max_rank"] = preds_pv[pred_names].max(axis=1)
    
    # add art attrs
    preds_pv = fast_left_join(
        preds_pv,
        df_art[
            [
                "product_code",
                "product_type_no",
                "product_group_name",  #
                "graphical_appearance_no",
                "colour_group_code",
                "perceived_colour_value_id",  #
                "perceived_colour_master_id",  #
                "department_no",
                "index_code",
                "index_group_no",
                "section_no",
                "garment_group_no",
            ]
        ],
        on="article_id",
    )

    preds_pv = fast_left_join(
        preds_pv,
        df_cust[
            [
                "age",
                "age_id",
                "FN",
                "Active",
                "club_member_status",
                "fashion_news_frequency",
                "postal_code",
                "postal_code_ce",
            ]
        ],
        on="customer_id",
    )
    # add feat
    preds_pv = stacking_add_feat(preds_pv, feat)
    
    # predict (5fold)
    preds_tmp = np.zeros(len(preds_pv))
    for i in range(len(models)):
        preds_tmp += models[i].predict(preds_pv.drop(
            [
                "customer_id",
                "article_id",
            ],
            axis=1,
        ))
    preds_tmp /= len(models)
    
    sub_batch = preds_pv[["customer_id", "article_id"]].copy()
    sub_batch["pred"] = preds_tmp

    sub_batch = sub_batch.sort_values(["customer_id", "pred"], ascending=False)
    sub_batch = sub_batch.groupby("customer_id").head(12)
    sub_batch = sub_batch.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"prediction"})
    sub_batch["prediction"] = sub_batch["prediction"].apply(lambda x:' '.join(x))
    
    sub_all.append(sub_batch)

  0%|          | 0/46 [00:00<?, ?it/s]

In [140]:
sub_all = pd.concat(sub_all)
sub_all = sub_all.sort_values("customer_id")
sub_all = sub_all.reset_index(drop=True)

In [141]:
sub_all.to_csv(f"../sub/stacking-v3_{cv:.5f}.csv", index=False)

In [142]:
sub_all

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601006 0568601043 0568601044 0779781015 0568601007 0568601023 0568601030 0579541001 06736770...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0918522001 0902528006 0788575004 0924243002 0909059002 0448509014 0918292001 0866731001 07147900...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0794321011 0794321008 0866731001 0924243002 0918292001 0924243001 0915529005 09292750...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0866731001 0928206001 0918292001 0730683050 0852584001 0740519002 0850917001 0751471001 07514710...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0730683050 0896152002 0791587001 0730683062 0791587015 0791587021 0866731001 0927530004 09242430...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,0557599022 0791587001 0791587010 0791587015 0866731001 0804992014 0720125039 0822344001 09182920...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,0762846027 0762846031 0706016001 0762846026 0762846006 0762846029 0762846008 0706016003 07060160...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264,0762846027 0762846006 0762846026 0762846031 0762846029 0762846008 0706016002 0706016001 06736770...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38b2236865d949d4df6a,0714790020 0448509014 0714790028 0874110016 0821395005 0714790024 0914441004 0855706009 08934320...
