In [1]:
import cudf
from cuml import ForestInference
from cuml.preprocessing.TargetEncoder import TargetEncoder
from catboost import CatBoostClassifier, CatBoostRanker, Pool

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import gc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import datetime
import itertools
import os
from contextlib import redirect_stdout
from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

#カラム内の文字数。デフォルトは50だった
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", 500)

#列数
pd.set_option("display.max_columns", 300)

def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def apk_list(actual, predicted, k=10):
    return [apk(a,p,k) for a,p in zip(actual, predicted)]

# https://www.kaggle.com/tkm2261/fast-pandas-left-join-357x-faster-than-pd-merge
# add mulple index option
def fast_left_join(df, join_df, on):
    if isinstance(on, list):
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(pd.MultiIndex.from_tuples(tuple(map(tuple, df[on].values)), names=on)).reset_index(drop=True)
            ],
            axis=1,
        )        
    else:
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(df[on].values).reset_index(drop=True),
            ],
            axis=1,
        )

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for i, col in enumerate(df.columns):
        try:
            col_type = df[col].dtype

            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        df[col] = df[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        df[col] = df[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        df[col] = df[col].astype(np.int32)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float32)
        except ValueError:
            continue

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [2]:
def calc_mapk(preds, df_agg_val_1):
    tmp = preds.groupby("customer_id").head(12)
    tmp = tmp.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"preds"})
    tmp = tmp.merge(df_agg_val_1, how="left", on="customer_id").rename(columns={"article_id":"gts"})
    mapk_val = mapk(tmp["gts"], tmp["preds"])
    print(f"mapk:{mapk_val:.5f}")

In [3]:
tran_dtypes = {
    "t_dat": "str",
    "customer_id": "str",
    "article_id": "int",
    "product_code": "int",
    "price": "float",
    "sales_channel_id": "int",
}
art_dtypes = {
    "article_id": "int",
    "product_code": "int",
    "product_type_no": "int",
    "product_group_name": "str",
    "graphical_appearance_no": "int",
    "colour_group_code": "int",
    "perceived_colour_value_id": "int",
    "perceived_colour_master_id": "int",
    "department_no": "int",
    "index_code": "str",
    "index_group_no": "int",
    "section_no": "int",
    "garment_group_no": "int",
}
cust_dtypes = {"customer_id": "str"}

## Ensemble

In [4]:
df_agg_val_1 = pd.read_pickle("../exp_iwata/cache/v2//df_agg_val_1_2020-09-23.pkl")

In [5]:
df_agg_val_1 = df_agg_val_1.rename(columns={"article_id": "gts"})

# Read OOF

# Stacking

In [6]:
path = "../input/h-and-m-personalized-fashion-recommendations/"

df_trans = pd.read_csv(path + "transactions_train.csv", dtype=tran_dtypes)
df_trans["t_dat"] = pd.to_datetime(df_trans["t_dat"], format="%Y-%m-%d")
df_trans = df_trans.drop_duplicates(["customer_id", "article_id", "t_dat"])
df_trans["article_id"] = df_trans["article_id"].astype(str).str.zfill(10)

df_trans_week1 = df_trans[df_trans.t_dat > datetime.datetime(2020, 9, 15)]
df_trans_week1["target"] = 1
df_trans_week1 = df_trans_week1[['customer_id', 'article_id', "target"]]

In [7]:
df_art = pd.read_csv(path + "articles.csv", dtype=art_dtypes)
df_art["article_id"] = df_art["article_id"].astype(str).str.zfill(10)
le = LabelEncoder()
le.fit(df_art["index_code"].unique())
df_art["index_code"] = le.transform(df_art["index_code"])
le = LabelEncoder()
le.fit(df_art["product_group_name"].unique())
df_art["product_group_name"] = le.transform(df_art["product_group_name"])

df_cust = pd.read_csv(path + "customers.csv", dtype=cust_dtypes)
df_cust["age"] = df_cust["age"].fillna(df_cust["age"].mean())

# age_id
df_cust["age"] = df_cust["age"].astype(int)
customer_age_gorup = pd.read_csv("../save/customer_age_gorup.csv")
df_cust = df_cust.merge(customer_age_gorup, how="left", on=["age"])
df_cust["age_2"] = df_cust["age"]
df_cust.loc[df_cust["age"] >= 61, "age_2"] = 61
df_cust["age_id_1"] = df_cust["age_2"] // 10
df_cust["age_id_2"] = df_cust["age_2"] // 5
df_cust = df_cust.drop(columns=["age_2"])

df_cust[["FN", "Active"]] = df_cust[["FN", "Active"]].fillna(0)
df_cust["club_member_status"] = df_cust["club_member_status"].apply(
    lambda x: 1 if x == "ACTIVE" else 0
)
df_cust["fashion_news_frequency"] = df_cust["fashion_news_frequency"].apply(
    lambda x: 0 if x == "NONE" else 1
)

# postal_code_ce
df_cust["postal_code_ce"] = df_cust["postal_code"].map(
    df_cust["postal_code"].value_counts()
)

# postal_code, 10以下をまとめる
postal_code_cnt = df_cust["postal_code"].value_counts().reset_index()
postal_code_cnt.columns = ["postal_code", "cnt"]

code_map = {
    i: 0 for i in postal_code_cnt[postal_code_cnt.cnt <= 10]["postal_code"].values
}
for i, code in enumerate(
    postal_code_cnt[postal_code_cnt.cnt > 10]["postal_code"].values
):
    code_map[code] = i + 1

df_cust["postal_code"] = df_cust["postal_code"].map(code_map)

df_art.index = df_art.article_id
df_art.index.name = "article_id"
df_art = df_art.drop(columns=["article_id"])

df_cust.index = df_cust.customer_id
df_cust.index.name = "customer_id"
df_cust = df_cust.drop(columns=["customer_id"])

In [8]:
df_trans = fast_left_join(
    df_trans,
    df_art[
        [
            "product_code",
            "product_type_no",
            "product_group_name",  #
            "graphical_appearance_no",
            "colour_group_code",
            "perceived_colour_value_id",  #
            "perceived_colour_master_id",  #
            "department_no",
            "index_code",
            "index_group_no",
            "section_no",
            "garment_group_no",
        ]
    ],
    on="article_id",
)

df_trans = fast_left_join(
    df_trans,
    df_cust[
        [
            "age",
            "age_id",
            "age_id_1",
            "age_id_2",
            "FN",
            "Active",
            "club_member_status",
            "fashion_news_frequency",
            "postal_code",
            "postal_code_ce",
        ]
    ],
    on="customer_id",
)

In [9]:
df_trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,age,age_id,age_id_1,age_id_2,FN,Active,club_member_status,fashion_news_frequency,postal_code,postal_code_ce
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,663713001,0.050831,2,663713,283,16,1010016,9,4,5,1338,1,1,61,1017,24,4,2,4,0.0,0.0,1,0,3248,16
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,541518023,0.030492,2,541518,306,16,1010016,51,1,4,1334,1,1,61,1017,24,4,2,4,0.0,0.0,1,0,3248,16
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,505221004,0.015237,2,505221,252,7,1010010,52,2,4,5963,3,2,58,1003,32,7,3,6,1.0,1.0,1,1,0,5
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687003,0.016932,2,685687,252,7,1010010,52,7,4,3090,0,1,15,1023,32,7,3,6,1.0,1.0,1,1,0,5
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687004,0.016932,2,685687,252,7,1010010,93,4,19,3090,0,1,15,1023,32,7,3,6,1.0,1.0,1,1,0,5


## Read OOF

In [10]:
def read_oof(exp_name):
    if exp_name in [
        "myaun_cat_v4-3",
        "cat_v4-3_decay_popular_sample",
        "cat_v7-2_random_weighted",
        "cat_v4-3_short",
        "cat_v4-3-long-tail",
        "cat_v12"
    ]:
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "nari_exp_lgbm_007":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof":"pred"})
        oof = oof[~(oof.pred==-1.0)]
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "nari_exp_lgbm_007_fix":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof":"pred"})
        oof = oof.drop(columns=["oof_rank"])
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "iwata_exp05":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "minguin_v28": # フォーマットが違う, ブレンドには使える
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["LGBMRanker"] = oof["LGBMRanker"].str.split(" ")
    elif exp_name == "minguin_v34":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof = oof.rename(columns={"preds":"pred"})
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "moro_sub1":
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof = oof.drop(columns=["true"])
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    return oof

exps = {
    "myaun_cat_v4-3": {
        "lb":0.0307,
        "cv":0.03539,
        "oof":"../exp_iwata/cat_v4-3/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_0.03539.csv",
    },
    "cat_v4-3_decay_popular_sample": {
        "lb":0.0306,
        "cv":0.03542,
        "oof":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval.pkl",
        "sub_pred_top30": "../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_decay_popular_sample_0.03542.csv",
    },
    # "cat_v7-2_random_weighted_sample": {
    #     "lb":None,
    #     "cv":0.03525,
    #     "oof":"../exp_iwata/cat_v7-2_random_weighted_sample/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v4-3_short": {
    #     "lb":None,
    #     "cv":0.03529,
    #     "oof":"../exp_iwata/cat_v4-3-short/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v4-3-long-tail": {
    #     "lb":None,
    #     "cv":0.03443,
    #     "oof":"../exp_iwata/cat_v4-3-long-tail/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    # "cat_v12": {
    #     "lb":None,
    #     "cv":0.03493,
    #     "oof":"../exp_iwata/cat_v12/df_ans_eval_mid1_-1.pkl",
    #     "sub_pred":None,
    #     "sub_pred_top30":None,
    #     "sub":None,
    # },
    "nari_exp_lgbm_007": {
        "lb":0.0302,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366_top30.pickle",
        "sub":"../sub/exp_lgbm_007_20220425033232_0.0366.csv",
    },
    "nari_exp_lgbm_007_fix": {
        "lb":None,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374_top30.pickle",
        "sub": "../sub/sub_exp_lgbm_007_20220505112349_0.0374.csv",
    },
    "iwata_exp05": {
        "lb":0.0303,
        "cv":0.3559,
        "oof":"../input/iwata/oof_top200/oof_exp05_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp05_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp05_2020-09-23_top30.pickle",
        "sub":"../sub/submission_exp05.csv",
    },
    # "minguin_v28": {
    #     "lb":0.0294,
    #     "cv":0.03079,
    #     "oof":"../input/minguin/sub2/oof_h_m_v28_rerun_0294.csv", 
    #     "sub":"../input/minguin/sub2/submission_h_m_v28_rerun_0294.csv",
    # },
    "minguin_v34": {
        "lb":0.0293,
        "cv":0.0314,
        "oof": "../input/minguin/sub2/oof_single_preds_h_m_single_v34_0314.csv",
        "sub_pred": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314.csv",
        "sub_pred_top30": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314_top30.pickle",
        "sub": "../sub/submission_single_v34.csv",
    },
    "moro_sub1": {
        "lb":0.0301,
        "cv":0.03249,
        "oof":"../input/moro/sub1_fix/train_oof_train123_all.pickle",
        "sub":"../input/moro/sub1/submission_lgb_train123_0.03249.csv",
        "sub_pred_top30": "../input/moro/sub1_fix2/sub_pred_top30_train123_0.03249.pickle",
    },
}

In [11]:
oofs = {}
for exp_name in tqdm(exps.keys()):
    oofs[exp_name] = read_oof(exp_name=exp_name)

  0%|          | 0/7 [00:00<?, ?it/s]

## stacking exp

In [12]:
topN = 24
preds = []
for exp_name in tqdm(exps.keys()):
    oof = oofs[exp_name]
    oof_topN = oof.groupby("customer_id").head(topN).copy()
    # add pred rank
    oof_topN["pred_rank"] = oof_topN.groupby("customer_id")["pred"].rank(ascending=False, method="dense")
    # add pred norm
    oof_topN["pred_norm"] = oof_topN.groupby('customer_id')["pred"].transform(lambda x: (x - x.mean()) / x.std())
    oof_topN.loc[:, "model"] = exp_name
    preds.append(oof_topN)

preds = pd.concat(preds)

  0%|          | 0/7 [00:00<?, ?it/s]

In [13]:
preds_pv = preds.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv = preds_pv.reset_index()
preds_pv_columns = ['_'.join(col).strip() for col in preds_pv.columns.values]
preds_pv.columns = preds_pv_columns
preds_pv = preds_pv.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})

preds_pv_rank = preds.pivot_table(values=['pred_rank'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv_rank = preds_pv_rank.reset_index()
preds_pv_rank_columns = ['_'.join(col).strip() for col in preds_pv_rank.columns.values]
preds_pv_rank.columns = preds_pv_rank_columns
preds_pv_rank = preds_pv_rank.drop(columns=["customer_id_","article_id_"])

preds_pv_norm = preds.pivot_table(values=['pred_norm'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv_norm = preds_pv_norm.reset_index()
preds_pv_norm_columns = ['_'.join(col).strip() for col in preds_pv_norm.columns.values]
preds_pv_norm.columns = preds_pv_norm_columns
preds_pv_norm = preds_pv_norm.drop(columns=["customer_id_","article_id_"])

preds_pv = pd.concat([
    preds_pv, preds_pv_rank, preds_pv_norm
], axis=1)

In [14]:
# agg score

pred_names = [f"pred_{exp_name}" for exp_name in exps.keys()]
preds_pv["preds_sum"] = preds_pv[pred_names].sum(axis=1)
preds_pv["preds_min"] = preds_pv[pred_names].min(axis=1)
preds_pv["preds_max"] = preds_pv[pred_names].max(axis=1)
preds_pv["preds_cnt"] = preds_pv[pred_names].count(axis=1)

pred_names = [f"pred_rank_{exp_name}" for exp_name in exps.keys()]
preds_pv["preds_sum_rank"] = preds_pv[pred_names].sum(axis=1)
preds_pv["preds_min_rank"] = preds_pv[pred_names].min(axis=1)
preds_pv["preds_max_rank"] = preds_pv[pred_names].max(axis=1)


pred_names = [f"pred_norm_{exp_name}" for exp_name in exps.keys()]
preds_pv["preds_sum_norm"] = preds_pv[pred_names].sum(axis=1)
preds_pv["preds_min_norm"] = preds_pv[pred_names].min(axis=1)
preds_pv["preds_max_norm"] = preds_pv[pred_names].max(axis=1)

In [15]:
# add art attrs

preds_pv = fast_left_join(
    preds_pv,
    df_art[
        [
            "product_code",
            "product_type_no",
            "product_group_name",  #
            "graphical_appearance_no",
            "colour_group_code",
            "perceived_colour_value_id",  #
            "perceived_colour_master_id",  #
            "department_no",
            "index_code",
            "index_group_no",
            "section_no",
            "garment_group_no",
        ]
    ],
    on="article_id",
)

preds_pv = fast_left_join(
    preds_pv,
    df_cust[
        [
            "age",
            "age_id",
            "age_id_1",
            "age_id_2",
            "FN",
            "Active",
            "club_member_status",
            "fashion_news_frequency",
            "postal_code",
            "postal_code_ce",
        ]
    ],
    on="customer_id",
)

In [16]:
def rebuy_rate_v2(df_trans_hist, name, key="article_id", target="customer_id", sm=5):
    df_buy1 = (
        df_trans_hist.groupby(key)[target]
        .nunique()
        .reset_index()
        .rename(columns={target: "cnt_buy1"})
    )
    df_buy2 = df_trans_hist[df_trans_hist.duplicated([target, key])].copy()
    df_buy2 = df_buy2.drop_duplicates([target, key])
    df_buy2 = df_buy2.groupby(key)[key].agg(cnt_buy2="count").reset_index()
    df_buy = pd.merge(df_buy1, df_buy2, how="left", on=key).fillna(0)

    df_buy[name] = df_buy["cnt_buy2"] / (df_buy["cnt_buy1"] + sm)
    df_buy = df_buy[[key, name]]
    df_buy.index = df_buy[key]
    df_buy.index.name = key
    df_buy = df_buy.drop(columns=[key])
    return df_buy

# add hist features

len_hist = 366
dev = "cpu"
def stacking_feat_store(df_trans, l_cust, ds, de, dsr, der, dsh, deh):
    feat = {}
    
    df_trans_yesterday = df_trans.query("(t_dat == @der)")  # 1day
    df_trans_recent = df_trans.query("(t_dat >= @dsr) and (t_dat <= @der)")  # 1week
    df_trans_hist = df_trans.query("(t_dat >= @dsh) and (t_dat <= @deh)")  # 1year
    
    # art
    
    # make decay count
    df_trans_hist["t_delay"] = (
        df_trans_hist["t_dat"].max() - df_trans_hist["t_dat"]
    ).dt.days
    df_trans_hist["decay_count"] = 1 / (1 + df_trans_hist["t_delay"] // 7)
    
    
    # agg art
    feat["art_buy_hist"] = df_trans_hist.groupby(["article_id"])["t_dat"].agg(
        art_buy_hist="count"
    )
    feat["art_buy_recent"] = df_trans_recent.groupby(["article_id"])["t_dat"].agg(
        art_buy_recent="count"
    )
    feat["art_buy_yesterday"] = df_trans_yesterday.groupby(["article_id"])[
        "t_dat"
    ].agg(art_buy_yesterday="count")
    feat["art_buy_hist_decay"] = df_trans_hist.groupby(["article_id"])["decay_count"].agg(
        art_buy_hist_decay="sum"
    )
    
#     ## days
#     feat["art_days_after_buy"] = df_trans_hist.groupby(["article_id"])["t_dat"].agg(
#         art_days_after_buy=lambda x: (ds - max(x)).days
#     )
#     ## Price
#     feat["art_price_hist_agg"] = (
#         df_trans_hist.groupby(["article_id"])["price"]
#         .agg(["mean", "median", "max", "min"])
#         .add_prefix("art_price_hist_")
#     )
#     ## age
#     feat["art_age_hist_agg"] = (
#         df_trans_hist.groupby(["article_id"])["age"]
#         .agg(["mean", "median", "max", "min"])
#         .add_prefix("art_age_hist_")
#     )
    
#     ## rebuy_rate
#     df_buy = rebuy_rate_v2(
#         df_trans_hist, name="rebuy_rate", key="article_id", target="customer_id", sm=5
#     )
#     feat["rebuy_rate"] = df_buy

#     # >>> Customer Trans

#     df_trans_yesterday = df_trans_yesterday.query("(customer_id in @l_cust)")
#     df_trans_recent = df_trans_recent.query("(customer_id in @l_cust)")
#     df_trans_hist = df_trans_hist.query("(customer_id in @l_cust)")
    
#     # cust

#     feat["n_buy_hist_all"] = df_trans_hist.groupby(["customer_id"])["t_dat"].agg(
#         n_buy_hist_all="count"
#     )
#     feat["n_buy_hist_all_decay"] = df_trans_hist.groupby(["customer_id"])[
#         "decay_count"
#     ].agg(n_buy_hist_all_decay="sum")
#     feat["n_buy_recent_all"] = df_trans_recent.groupby(["customer_id"])["t_dat"].agg(
#         n_buy_recent_all="count"
#     )
#     feat["days_after_buy_all"] = df_trans_hist.groupby(["customer_id"])["t_dat"].agg(
#         days_after_buy_all=lambda x: (ds - max(x)).days
#     )
    
#     ## rebuy_rate
#     df_buy = rebuy_rate_v2(
#         df_trans_hist,
#         name="cust_rebuy_rate",
#         key="customer_id",
#         target="article_id",
#         sm=5,
#     )
#     feat["cust_rebuy_rate"] = df_buy
    
#     ## Channel
#     feat["rate_sales_channel_hist"] = df_trans_hist.groupby(["customer_id"])[
#         "sales_channel_id"
#     ].agg(rate_sales_channel_hist="mean")
#     feat["rate_sales_channel_recent"] = df_trans_recent.groupby(["customer_id"])[
#         "sales_channel_id"
#     ].agg(rate_sales_channel_recent="mean")
    
#     ## Price
#     feat["user_price_hist_agg"] = (
#         df_trans_hist.groupby(["customer_id"])["price"]
#         .agg(["mean", "median", "max", "min"])
#         .add_prefix("user_price_hist_")
#     )

#     # cust * art

#     feat["n_buy_hist"] = df_trans_hist.groupby(["customer_id", "article_id"])[
#         "t_dat"
#     ].agg(n_buy_hist="count")
#     feat["n_buy_hist_decay"] = df_trans_hist.groupby(["customer_id", "article_id"])[
#         "decay_count"
#     ].agg(n_buy_hist_decay="sum")
#     feat["n_buy_recent"] = df_trans_recent.groupby(["customer_id", "article_id"])[
#         "t_dat"
#     ].agg(n_buy_recent="count")
#     feat["n_buy_yesterday"] = df_trans_yesterday.groupby(["customer_id", "article_id"])[
#         "t_dat"
#     ].agg(n_buy_yesterday="count") 


    for k in feat.keys():
        feat[k] = reduce_mem_usage(feat[k])
    if dev == "gpu":
        feat[k] = cudf.from_pandas(feat[k])
    return feat

def custom_fillna(df):
    cnt_columns = [
        "n_buy_hist",
        "n_buy_hist_decay",
        "n_buy_hist_short",
        "n_buy_hist_mid",
        "n_buy_recent",
        "n_buy_hist_all",
        "n_buy_hist_all_decay",
        "n_buy_hist_short_all",
        "n_buy_hist_mid_all",
        "n_buy_recent_all",
        "n_buy_hist_prod",
        "n_buy_recent_prod",
        "n_buy_hist_ptype",
        "n_buy_recent_ptype",
        "n_buy_hist_graph",
        "n_buy_recent_graph",
        "n_buy_hist_col",
        "n_buy_recent_col",
        "n_buy_hist_dep",
        "n_buy_hist_short_dep",
        "n_buy_hist_mid_dep",
        "n_buy_recent_dep",
        "n_buy_hist_idx",
        "n_buy_recent_idx",
        "n_buy_hist_idxg",
        "n_buy_recent_idxg",
        "n_buy_hist_sec",
        "n_buy_recent_sec",
        "n_buy_hist_short_sec",
        "n_buy_hist_mid_sec",
        "n_buy_hist_garm",
        "n_buy_recent_garm",
        "art_buy_yesterday",
        "art_buy_recent",
        "art_buy_hist",
        "art_buy_hist_decay",
        "art_buy_hist_short",
        "art_buy_hist_mid",
        "art_buy_hist_ch1",
        "art_buy_hist_ch2",
        "art_buy_hist_ch1_decay",
        "art_buy_hist_ch2_decay",
        "art_buy_hist_short_ch1",
        "art_buy_hist_short_ch2",
        "code_buy_hist",
        "code_buy_hist_decay",
        "code_buy_recent",
        "code_buy_yesterday",
        "rebuy_rate",
        "code_rebuy_rate",
        "cust_rebuy_rate",
        "n_buy_hist_code_pcol",
        "n_buy_recent_code_pcol",
        "n_buy_hist_idxg_sec",
        "n_buy_recent_idxg_sec",
        "n_buy_hist_idxg_gram",
        "n_buy_recent_idxg_gram",
        "age_id_n_buy_hist",
        "age_id_n_buy_hist_decay",
        "age_id_n_buy_recent",
        # add
        "n_buy_yesterday",
        "hist_index_group_no_1_norm_sm",
        "hist_index_group_no_4_norm_sm",
        "hist_index_group_no_3_norm_sm",
        "hist_index_group_no_26_norm_sm",
        "hist_index_group_no_2_norm_sm",
        "recent_index_group_no_1_norm_sm",
        "recent_index_group_no_4_norm_sm",
        "recent_index_group_no_3_norm_sm",
        "recent_index_group_no_26_norm_sm",
        "recent_index_group_no_2_norm_sm",
        "hist_perceived_colour_value_id_4_norm_sm",
        "hist_perceived_colour_value_id_3_norm_sm",
        "hist_perceived_colour_value_id_1_norm_sm",
        "hist_perceived_colour_value_id_2_norm_sm",
        "hist_perceived_colour_value_id_5_norm_sm",
        "hist_perceived_colour_value_id_7_norm_sm",
        "hist_perceived_colour_value_id_6_norm_sm",
        "hist_perceived_colour_value_id_-1_norm_sm",
        "recent_perceived_colour_value_id_4_norm_sm",
        "recent_perceived_colour_value_id_3_norm_sm",
        "recent_perceived_colour_value_id_1_norm_sm",
        "recent_perceived_colour_value_id_2_norm_sm",
        "recent_perceived_colour_value_id_5_norm_sm",
        "recent_perceived_colour_value_id_7_norm_sm",
        "recent_perceived_colour_value_id_6_norm_sm",
        "recent_perceived_colour_value_id_-1_norm_sm",
        "postal_code_n_buy_hist",
        "postal_code_n_buy_hist_short",
        "postal_code_n_buy_hist_mid",
        "postal_code_n_buy_recent",
        "postal_code_n_buy_yesterday",
        "postal_code_n_buy_hist_ch1",
        "postal_code_n_buy_recent_ch1",
        "postal_code_n_buy_yesterday_ch1",
    ]
    for col in cnt_columns:
        if not col in df.columns: continue
        df[col] = df[col].fillna(0)

    days_columns = [
        "days_after_buy",
        "days_after_buy_all",
        "days_after_buy_prod",
        "days_after_buy_ptype",
        "days_after_buy_graph",
        "days_after_buy_col",
        "days_after_buy_dep",
        "days_after_buy_idx",
        "days_after_buy_idxg",
        "days_after_buy_sec",
        "days_after_buy_garm",
        "days_after_buy_code_pcol",
        "days_after_buy_idxg_sec",
        "days_after_buy_idxg_gram",
        "art_days_after_buy",
        "art_days_from_oldest_buy",
        "art_days_from_mode_buy",
        "days_from_oldest_buy_all",
    ]
    for col in days_columns:
        if not col in df.columns: continue
        df[col] = df[col].fillna(10 + len_hist)

    ch_columns = [
        "rate_sales_channel_hist",
        "rate_sales_channel_recent",
        "art_rate_sales_channel_hist",
        "art_rate_sales_channel_recent",
    ]
    for col in ch_columns:
        if not col in df.columns: continue
        df[col] = df[col].fillna(1.5)

    rank_columns = [i for i in df.columns if "_rank" in i]
    for col in rank_columns:
        if not col in df.columns: continue
        df[col] = df[col].fillna(100000)

    fill_mean_columns = [
        "art_price_hist_mean",
        "art_price_hist_median",
        "art_price_hist_max",
        "art_price_hist_min",
        "art_age_hist_mean",
        "art_age_hist_median",
        "art_age_hist_max",
        "art_age_hist_min",
    ]
    for col in fill_mean_columns:
        if not col in df.columns: continue
        mn = df[col]
        df[col] = df[col].fillna(mn)
    return df

def stacking_add_feat(df, feat):
    df = reduce_mem_usage(df)
    if dev == "gpu":
        df = cudf.from_pandas(df)

    # merge aid
    for col in [
        "art_buy_hist",
        "art_buy_recent",
        "art_buy_yesterday",
        "art_buy_hist_decay",
        # "art_days_after_buy",
        # "art_price_hist_agg",
        # "art_age_hist_agg",
        # "rebuy_rate",
        # "n_buy_recent_all",
        # "days_after_buy_all"
    ]:
        if dev == "gpu":
            df = df.merge(
                feat[col], how="left", left_on=["article_id"], right_index=True
            )
        else:
            df = fast_left_join(df, feat[col], on="article_id")

#     # merge cid
#     for col in [
#         "n_buy_hist_all",
#         "n_buy_hist_all_decay"
        
#     ]:
#         if dev == "gpu":
#             df = df.merge(
#                 feat[col], how="left", left_on=["customer_id"], right_index=True
#             )
#         else:
#             df = fast_left_join(df, feat[col], on="customer_id")

#     # merge cid * aid
#     for col in [
#         "n_buy_hist",
#         "n_buy_recent",
#         "n_buy_yesterday",
#         "n_buy_hist_decay",
#     ]:
#         df = df.merge(
#             feat[col],
#             how="left",
#             left_on=["customer_id", "article_id"],
#             right_index=True,
#         )

    df = custom_fillna(df)
    if dev == "gpu":
        df = df.to_pandas()
    return df

In [17]:
%%time

feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 16),
    de=datetime.datetime(2020, 9, 22),
    dsr=datetime.datetime(2020, 9, 9),
    der=datetime.datetime(2020, 9, 15),
    dsh=datetime.datetime(2019, 9, 15),
    deh=datetime.datetime(2020, 9, 15),
)

preds_pv = stacking_add_feat(preds_pv, feat)

CPU times: user 14.5 s, sys: 4.11 s, total: 18.6 s
Wall time: 18.5 s


In [18]:
# add target

preds_pv = preds_pv.merge(df_trans_week1, how="left", on=['customer_id', 'article_id'])
preds_pv["target"] = preds_pv["target"].fillna(0).astype(int)

In [19]:
from sklearn.model_selection import GroupKFold
# from sklearn.model_selection import StratifiedGroupKFold

FOLD_NUM = 10
RANDOM_STATE = 46
CAT_PARAMS = {
    "depth": 5,
    "learning_rate": 0.1,
    "boosting_type": "Plain",
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "reg_lambda": 0.001,
    "iterations": 10000,
    "od_type": "Iter",
    "od_wait": 30,
    "metric_period": 20,
    "random_seed": RANDOM_STATE,
    "task_type": "CPU",
    # "task_type": "GPU",
    # "gpu_ram_part": 0.95,
    # "devices": "1",
    "verbose": True,
    "loss_function": "YetiRank",
    "eval_metric": "MAP:top=12",
}

models = []
scores = []
oof = np.zeros(len(preds_pv))
y = preds_pv['target'].values
groups = preds_pv['customer_id'].values
kfold = GroupKFold(n_splits=FOLD_NUM)
# kfold = StratifiedGroupKFold(n_splits=FOLD_NUM, shuffle=True, random_state=RANDOM_STATE)
for fold, (trn, val) in enumerate(kfold.split(preds_pv, y, groups)):
    
    df_train = preds_pv.iloc[trn]
    df_val = preds_pv.iloc[val]
    
    df_train = df_train.sort_values("customer_id")
    X_train = df_train.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_train = df_train["target"]
    idx = df_train.groupby("customer_id")["customer_id"].count().values
    group_id_train = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    df_val = df_val.sort_values("customer_id")
    X_val = df_val.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_val = df_val["target"]
    idx = df_val.groupby("customer_id")["customer_id"].count().values
    group_id_val = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    cat_features_index = []
    train_pool = Pool(
        data=X_train,
        label=y_train,
        group_id=group_id_train,
        cat_features=cat_features_index,
    )
    val_pool = Pool(
        data=X_val,
        label=y_val,
        group_id=group_id_val,
        cat_features=cat_features_index,
    )

    model = CatBoostRanker(**CAT_PARAMS)
    model.fit(train_pool, eval_set=val_pool)
    gbdt_score = model.best_score_["validation"]["MAP:top=12"]
    
    # not sorted sample
    X_val = preds_pv.iloc[val].drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    preds = model.predict(X_val)
    oof[val] = preds
    scores.append(gbdt_score)
    models.append(model)



0:	learn: 0.0404374	test: 0.0425329	best: 0.0425329 (0)	total: 795ms	remaining: 2h 12m 25s
20:	learn: 0.0613830	test: 0.0642962	best: 0.0644249 (17)	total: 16.8s	remaining: 2h 13m 25s
40:	learn: 0.0626182	test: 0.0653436	best: 0.0653436 (40)	total: 32.4s	remaining: 2h 11m 10s
60:	learn: 0.0630924	test: 0.0660563	best: 0.0660563 (60)	total: 46.7s	remaining: 2h 6m 54s
80:	learn: 0.0637630	test: 0.0661366	best: 0.0663456 (71)	total: 1m 2s	remaining: 2h 7m 43s
100:	learn: 0.0643158	test: 0.0660375	best: 0.0663456 (71)	total: 1m 18s	remaining: 2h 7m 32s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06634562904
bestIteration = 71

Shrink model to first 72 iterations.




0:	learn: 0.0386890	test: 0.0361267	best: 0.0361267 (0)	total: 735ms	remaining: 2h 2m 24s
20:	learn: 0.0625440	test: 0.0588162	best: 0.0588162 (20)	total: 16.5s	remaining: 2h 10m 39s
40:	learn: 0.0633842	test: 0.0592757	best: 0.0593717 (31)	total: 32.3s	remaining: 2h 10m 35s
60:	learn: 0.0640184	test: 0.0596958	best: 0.0598161 (56)	total: 48.1s	remaining: 2h 10m 31s
80:	learn: 0.0646445	test: 0.0601601	best: 0.0602610 (77)	total: 1m 3s	remaining: 2h 10m 19s
100:	learn: 0.0652932	test: 0.0603702	best: 0.0605334 (98)	total: 1m 19s	remaining: 2h 9m 51s
120:	learn: 0.0657536	test: 0.0606072	best: 0.0607698 (115)	total: 1m 33s	remaining: 2h 7m 10s
140:	learn: 0.0661946	test: 0.0603474	best: 0.0607698 (115)	total: 1m 50s	remaining: 2h 9m 9s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.060769819
bestIteration = 115

Shrink model to first 116 iterations.




0:	learn: 0.0407950	test: 0.0379232	best: 0.0379232 (0)	total: 1.14s	remaining: 3h 10m 30s
20:	learn: 0.0622889	test: 0.0564213	best: 0.0564789 (13)	total: 16.9s	remaining: 2h 14m 12s
40:	learn: 0.0638667	test: 0.0574236	best: 0.0574537 (30)	total: 32.9s	remaining: 2h 13m 3s
60:	learn: 0.0643482	test: 0.0574494	best: 0.0575288 (55)	total: 48.9s	remaining: 2h 12m 45s
80:	learn: 0.0648792	test: 0.0575197	best: 0.0579180 (67)	total: 1m 3s	remaining: 2h 9m 11s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.0579179898
bestIteration = 67

Shrink model to first 68 iterations.




0:	learn: 0.0400467	test: 0.0421905	best: 0.0421905 (0)	total: 756ms	remaining: 2h 5m 58s
20:	learn: 0.0614401	test: 0.0666492	best: 0.0666853 (19)	total: 15s	remaining: 1h 59m
40:	learn: 0.0625109	test: 0.0671647	best: 0.0673152 (33)	total: 29s	remaining: 1h 57m 29s
60:	learn: 0.0632299	test: 0.0683460	best: 0.0683460 (60)	total: 43s	remaining: 1h 56m 53s
80:	learn: 0.0636241	test: 0.0686583	best: 0.0686583 (80)	total: 57.1s	remaining: 1h 56m 27s
100:	learn: 0.0640671	test: 0.0687497	best: 0.0688042 (88)	total: 1m 11s	remaining: 1h 56m 18s
120:	learn: 0.0646072	test: 0.0691767	best: 0.0692663 (115)	total: 1m 25s	remaining: 1h 56m 57s
140:	learn: 0.0650973	test: 0.0687778	best: 0.0692663 (115)	total: 1m 39s	remaining: 1h 56m 23s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06926631066
bestIteration = 115

Shrink model to first 116 iterations.




0:	learn: 0.0396524	test: 0.0389662	best: 0.0389662 (0)	total: 805ms	remaining: 2h 14m 5s
20:	learn: 0.0618191	test: 0.0606824	best: 0.0606824 (20)	total: 15.2s	remaining: 2h 38s
40:	learn: 0.0628900	test: 0.0620673	best: 0.0620689 (39)	total: 29s	remaining: 1h 57m 28s
60:	learn: 0.0637360	test: 0.0623939	best: 0.0624882 (55)	total: 42.8s	remaining: 1h 56m 14s
80:	learn: 0.0643744	test: 0.0624163	best: 0.0629169 (63)	total: 56.6s	remaining: 1h 55m 28s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06291686399
bestIteration = 63

Shrink model to first 64 iterations.




0:	learn: 0.0384787	test: 0.0366011	best: 0.0366011 (0)	total: 736ms	remaining: 2h 2m 37s
20:	learn: 0.0617238	test: 0.0604773	best: 0.0607184 (15)	total: 14.6s	remaining: 1h 55m 20s
40:	learn: 0.0630843	test: 0.0614948	best: 0.0616968 (36)	total: 28.3s	remaining: 1h 54m 43s
60:	learn: 0.0638522	test: 0.0618748	best: 0.0619054 (59)	total: 42.2s	remaining: 1h 54m 32s
80:	learn: 0.0645806	test: 0.0619169	best: 0.0622307 (74)	total: 56.1s	remaining: 1h 54m 33s
100:	learn: 0.0651124	test: 0.0616511	best: 0.0622307 (74)	total: 1m 11s	remaining: 1h 56m 50s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06223072504
bestIteration = 74

Shrink model to first 75 iterations.




0:	learn: 0.0379190	test: 0.0401887	best: 0.0401887 (0)	total: 1.06s	remaining: 2h 57m 23s
20:	learn: 0.0617792	test: 0.0626022	best: 0.0627148 (19)	total: 16.3s	remaining: 2h 9m 19s
40:	learn: 0.0629149	test: 0.0637935	best: 0.0637935 (40)	total: 31.9s	remaining: 2h 9m 15s
60:	learn: 0.0636837	test: 0.0640842	best: 0.0642615 (57)	total: 46.9s	remaining: 2h 7m 27s
80:	learn: 0.0642455	test: 0.0642877	best: 0.0644142 (69)	total: 1m 1s	remaining: 2h 6m 6s
100:	learn: 0.0648059	test: 0.0637085	best: 0.0644661 (82)	total: 1m 17s	remaining: 2h 6m 39s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.0644661185
bestIteration = 82

Shrink model to first 83 iterations.




0:	learn: 0.0380500	test: 0.0360574	best: 0.0360574 (0)	total: 745ms	remaining: 2h 4m 10s
20:	learn: 0.0623178	test: 0.0589675	best: 0.0589675 (20)	total: 15s	remaining: 1h 58m 47s
40:	learn: 0.0632001	test: 0.0600324	best: 0.0600324 (40)	total: 30.7s	remaining: 2h 4m 7s
60:	learn: 0.0640079	test: 0.0604197	best: 0.0604806 (54)	total: 46.3s	remaining: 2h 5m 42s
80:	learn: 0.0645496	test: 0.0610583	best: 0.0610878 (79)	total: 1m	remaining: 2h 3m 10s
100:	learn: 0.0651061	test: 0.0609516	best: 0.0612296 (83)	total: 1m 15s	remaining: 2h 4m 8s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06122959798
bestIteration = 83

Shrink model to first 84 iterations.




0:	learn: 0.0383599	test: 0.0406888	best: 0.0406888 (0)	total: 749ms	remaining: 2h 4m 46s
20:	learn: 0.0616559	test: 0.0625641	best: 0.0625641 (20)	total: 15s	remaining: 1h 58m 51s
40:	learn: 0.0628824	test: 0.0630389	best: 0.0632162 (39)	total: 30.7s	remaining: 2h 4m 17s
60:	learn: 0.0636042	test: 0.0637074	best: 0.0637248 (59)	total: 46.4s	remaining: 2h 6m 7s
80:	learn: 0.0641124	test: 0.0640623	best: 0.0640623 (80)	total: 1m	remaining: 2h 3m 22s
100:	learn: 0.0644198	test: 0.0637440	best: 0.0642226 (88)	total: 1m 16s	remaining: 2h 5m 40s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06422255735
bestIteration = 88

Shrink model to first 89 iterations.




0:	learn: 0.0397581	test: 0.0390505	best: 0.0390505 (0)	total: 722ms	remaining: 2h 20s
20:	learn: 0.0619419	test: 0.0601601	best: 0.0603358 (19)	total: 16.6s	remaining: 2h 11m 22s
40:	learn: 0.0629862	test: 0.0607132	best: 0.0608282 (33)	total: 30.6s	remaining: 2h 3m 46s
60:	learn: 0.0637291	test: 0.0612842	best: 0.0613127 (59)	total: 46.2s	remaining: 2h 5m 30s
80:	learn: 0.0644602	test: 0.0609847	best: 0.0614475 (68)	total: 1m 1s	remaining: 2h 6m 22s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06144750635
bestIteration = 68

Shrink model to first 69 iterations.


In [20]:
# make sub cv

preds_cv_val = preds_pv[["customer_id", "article_id"]].copy()
preds_cv_val["oof"] = oof

preds_cv_val = preds_cv_val.sort_values(["customer_id", "oof"], ascending=False)
preds_cv_val = preds_cv_val.groupby("customer_id").head(12)
preds_cv_val = preds_cv_val.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv_val = preds_cv_val.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv_val["gts"], preds_cv_val["oof"])

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV: {cv:.6f}")

gbdt_cv: 0.06308
CV: 0.039142


In [26]:
imp = models[-1].get_feature_importance(val_pool)
df_imp = pd.DataFrame({
    'imp': abs(imp), # absでOK?
    'name': X_val.columns
}).sort_values(by=['imp'], ascending=False)
df_imp

Unnamed: 0,imp,name
28,0.0027,preds_sum_norm
1,0.001646,pred_iwata_exp05
54,0.001329,art_buy_recent
3,0.001191,pred_moro_sub1
31,0.001054,product_code
55,0.000967,art_buy_yesterday
22,0.000677,preds_min
19,0.000602,pred_norm_nari_exp_lgbm_007
6,0.000493,pred_nari_exp_lgbm_007_fix
34,0.000481,graphical_appearance_no


## Inference

In [27]:
# read sub pred topN

# topN = 24
sub_preds = []
for exp_name in tqdm(exps.keys()):
    sub = pd.read_pickle(exps[exp_name]["sub_pred_top30"])
    sub_topN = sub.groupby("customer_id").head(topN).copy()
    # add pred rank
    sub_topN["pred_rank"] = sub_topN.groupby("customer_id")["pred"].rank(ascending=False, method="dense")
    # add pred norm, 時間掛かる...保存する
    sub_topN["pred_norm"] = sub_topN.groupby('customer_id')["pred"].transform(lambda x: (x - x.mean()) / x.std())
    sub_topN.loc[:, "model"] = exp_name
    sub_preds.append(sub_topN)

sub_preds = pd.concat(sub_preds)

  0%|          | 0/7 [00:00<?, ?it/s]

In [28]:
feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 23),
    de=datetime.datetime(2020, 9, 29),
    dsr=datetime.datetime(2020, 9, 16),
    der=datetime.datetime(2020, 9, 22),
    dsh=datetime.datetime(2019, 9, 22),
    deh=datetime.datetime(2020, 9, 22),
)

In [29]:
customer_ids = sub_preds["customer_id"].unique()

In [30]:
size_block = 30000
list_slice = list(range(0, len(customer_ids), size_block))
if list_slice[-1] != len(customer_ids):
    list_slice.append(len(customer_ids))

In [31]:
sub_all = []
for batch_idx in tqdm(range(len(list_slice) - 1)):
    customer_ids_batch = customer_ids[list_slice[batch_idx] : list_slice[batch_idx + 1]]
    
    preds = sub_preds[sub_preds.customer_id.isin(customer_ids_batch)]
    
    # to pv
    preds_pv = preds.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
    preds_pv = preds_pv.reset_index()
    preds_pv_columns = ['_'.join(col).strip() for col in preds_pv.columns.values]
    preds_pv.columns = preds_pv_columns
    preds_pv = preds_pv.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})

    preds_pv_rank = preds.pivot_table(values=['pred_rank'], index=['customer_id', 'article_id'], columns=['model'])
    preds_pv_rank = preds_pv_rank.reset_index()
    preds_pv_rank_columns = ['_'.join(col).strip() for col in preds_pv_rank.columns.values]
    preds_pv_rank.columns = preds_pv_rank_columns
    preds_pv_rank = preds_pv_rank.drop(columns=["customer_id_","article_id_"])

    preds_pv_norm = preds.pivot_table(values=['pred_norm'], index=['customer_id', 'article_id'], columns=['model'])
    preds_pv_norm = preds_pv_norm.reset_index()
    preds_pv_norm_columns = ['_'.join(col).strip() for col in preds_pv_norm.columns.values]
    preds_pv_norm.columns = preds_pv_norm_columns
    preds_pv_norm = preds_pv_norm.drop(columns=["customer_id_","article_id_"])

    preds_pv = pd.concat([
        preds_pv, preds_pv_rank, preds_pv_norm
    ], axis=1)

    # agg score

    pred_names = [f"pred_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max"] = preds_pv[pred_names].max(axis=1)
    preds_pv["preds_cnt"] = preds_pv[pred_names].count(axis=1)

    pred_names = [f"pred_rank_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum_rank"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min_rank"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max_rank"] = preds_pv[pred_names].max(axis=1)


    pred_names = [f"pred_norm_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum_norm"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min_norm"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max_norm"] = preds_pv[pred_names].max(axis=1)
    
    # add art attrs

    preds_pv = fast_left_join(
        preds_pv,
        df_art[
            [
                "product_code",
                "product_type_no",
                "product_group_name",  #
                "graphical_appearance_no",
                "colour_group_code",
                "perceived_colour_value_id",  #
                "perceived_colour_master_id",  #
                "department_no",
                "index_code",
                "index_group_no",
                "section_no",
                "garment_group_no",
            ]
        ],
        on="article_id",
    )

    preds_pv = fast_left_join(
        preds_pv,
        df_cust[
            [
                "age",
                "age_id",
                "age_id_1",
                "age_id_2",
                "FN",
                "Active",
                "club_member_status",
                "fashion_news_frequency",
                "postal_code",
                "postal_code_ce",
            ]
        ],
        on="customer_id",
    )

    # add feat

    preds_pv = stacking_add_feat(preds_pv, feat)
    
    # predict (5fold)
    preds_tmp = np.zeros(len(preds_pv))
    for i in range(len(models)):
        preds_tmp += models[i].predict(preds_pv.drop(
            [
                "customer_id",
                "article_id",
            ],
            axis=1,
        ))
    preds_tmp /= len(models)
    
    sub_batch = preds_pv[["customer_id", "article_id"]].copy()
    sub_batch["pred"] = preds_tmp

    sub_batch = sub_batch.sort_values(["customer_id", "pred"], ascending=False)
    sub_batch = sub_batch.groupby("customer_id").head(12)
    sub_batch = sub_batch.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"prediction"})
    sub_batch["prediction"] = sub_batch["prediction"].apply(lambda x:' '.join(x))
    
    sub_all.append(sub_batch)

  0%|          | 0/46 [00:00<?, ?it/s]

In [32]:
sub_all = pd.concat(sub_all)
sub_all = sub_all.sort_values("customer_id")
sub_all = sub_all.reset_index(drop=True)

In [33]:
sub_all.to_csv(f"../sub/stacking-v4_{cv:.5f}.csv", index=False)

In [34]:
sub_all

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601006 0568601043 0568601044 0568601007 0568601023 0568601030 0673677002 0678942001 05795410...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0918522001 0902528006 0924243002 0448509014 0909059002 0788575004 0918292001 0915529003 07147900...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0794321011 0794321008 0866731001 0924243002 0918292001 0924243001 0915529005 09185220...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0866731001 0730683050 0852584001 0928206001 0740519002 0751471001 0850917001 0673677002 09182920...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0730683050 0896152002 0791587001 0730683062 0791587015 0791587021 0866731001 0918292001 08525840...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,0557599022 0791587001 0791587010 0804992014 0791587015 0822344001 0866731001 0918292001 08049920...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,0762846027 0762846031 0762846006 0762846026 0706016001 0762846008 0762846029 0706016003 07060160...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264,0762846027 0762846006 0762846031 0762846026 0762846029 0706016002 0762846008 0673677002 08509170...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38b2236865d949d4df6a,0714790020 0448509014 0714790028 0821395005 0714790024 0874110016 0914441004 0893432002 08557060...
