In [11]:
import cudf
from cuml import ForestInference
from cuml.preprocessing.TargetEncoder import TargetEncoder
from catboost import CatBoostClassifier, CatBoostRanker, Pool

import numpy as np
import pandas as pd
import random
import gc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import datetime
import itertools
import os
from contextlib import redirect_stdout
from tqdm.notebook import tqdm

# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopyWarning and deprecation
# notices raised by the cells below.
warnings.filterwarnings("ignore")

# Raise pandas display limits so wide / long frames are not truncated when shown.
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 300)


def apk(actual, predicted, k=10):
    """Average precision at k for a single query.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked sequence of predicted items, best first.
        k: Number of leading predictions that are scored.

    Returns:
        The sum of precision values at each rank where a new relevant item
        appears, divided by ``min(len(actual), k)``; ``0.0`` when ``actual``
        is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0
    for position, item in enumerate(top_k, start=1):
        # A repeated prediction is only credited the first time it appears.
        is_new_hit = item in actual and item not in top_k[: position - 1]
        if is_new_hit:
            hits += 1.0
            precision_sum += hits / position

    return precision_sum / min(len(actual), k) if actual else 0.0


def mapk(actual, predicted, k=10):
    """Mean average precision at k over paired ``actual`` / ``predicted`` entries.

    ``actual`` and ``predicted`` are iterated in lockstep with ``zip``, so the
    shorter of the two determines how many queries are scored.
    """
    per_query_scores = [
        apk(relevant, ranked, k) for relevant, ranked in zip(actual, predicted)
    ]
    return np.mean(per_query_scores)


def apk_list(actual, predicted, k=10):
    """Return the per-query ``apk`` scores as a list (no averaging)."""
    scores = []
    for relevant, ranked in zip(actual, predicted):
        scores.append(apk(relevant, ranked, k))
    return scores


# https://www.kaggle.com/tkm2261/fast-pandas-left-join-357x-faster-than-pd-merge
# Extended with a multi-key option: `on` may be a single column or a list of columns.
def fast_left_join(df, join_df, on):
    """Left-join ``join_df`` onto ``df`` through an index lookup instead of ``merge``.

    Args:
        df: Left frame; must contain the column(s) named in ``on``.
        join_df: Right frame whose index (a MultiIndex when ``on`` is a list)
            holds the join key(s).
        on: Key column name, or a list of key column names.

    Returns:
        A new frame with a fresh 0..n-1 index: the columns of ``df`` followed by
        the columns of ``join_df``, with missing values for unmatched keys.
    """
    if isinstance(on, list):
        lookup_keys = pd.MultiIndex.from_tuples(
            tuple(map(tuple, df[on].values)), names=on
        )
    else:
        lookup_keys = df[on].values

    left = df.reset_index(drop=True)
    # reindex() pulls one right-hand row per left-hand key, in left-hand order.
    right = join_df.reindex(lookup_keys).reset_index(drop=True)
    return pd.concat([left, right], axis=1)


def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Signed-integer columns are cast to the smallest of int8 / int16 / int32 that
    holds their value range; integers that need more than 32 bits are left
    untouched. Every other numeric column is cast to float32, unless a finite
    value lies outside the float32 range, in which case the column is left
    untouched. Non-numeric columns (object, string, datetime, ...) are skipped.

    Args:
        df: Frame to shrink. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    i8, i16, i32 = (np.iinfo(t) for t in (np.int8, np.int16, np.int32))
    f32 = np.finfo(np.float32)

    for col in df.columns:
        try:
            col_type = df[col].dtype
            if not pd.api.types.is_numeric_dtype(col_type):
                continue

            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == "int":
                if c_min > i8.min and c_max < i8.max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > i16.min and c_max < i16.max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= i32.min and c_max <= i32.max:
                    df[col] = df[col].astype(np.int32)
                # Otherwise the values need 64 bits: casting to int32 would
                # silently wrap them around, so the column is left as it is.
            else:
                # float16 is deliberately never used; float32 is the target width.
                too_small = np.isfinite(c_min) and c_min < f32.min
                too_large = np.isfinite(c_max) and c_max > f32.max
                if not (too_small or too_large):
                    df[col] = df[col].astype(np.float32)
                # Otherwise a float32 cast would turn finite values into +/-inf.
        except ValueError:
            # Best effort: a column that cannot be converted keeps its dtype.
            continue

    return df

In [None]:
# Column -> dtype maps handed to pd.read_csv for each input table.

# transactions_train.csv
tran_dtypes = dict(
    t_dat="str",
    customer_id="str",
    article_id="int",
    product_code="int",
    price="float",
    sales_channel_id="int",
)

# articles.csv
art_dtypes = dict(
    article_id="int",
    product_code="int",
    product_type_no="int",
    product_group_name="str",
    graphical_appearance_no="int",
    colour_group_code="int",
    perceived_colour_value_id="int",
    perceived_colour_master_id="int",
    department_no="int",
    index_code="str",
    index_group_no="int",
    section_no="int",
    garment_group_no="int",
)

# customers.csv
cust_dtypes = dict(customer_id="str")

In [7]:
path = "../input/h-and-m-personalized-fashion-recommendations/"

# Transactions: parse the purchase date, keep one row per
# (customer, article, day), and turn article_id into a 10-character,
# zero-padded string.
df_trans = pd.read_csv(path + "transactions_train.csv", dtype=tran_dtypes)
df_trans["t_dat"] = pd.to_datetime(df_trans["t_dat"], format="%Y-%m-%d")
df_trans = df_trans.drop_duplicates(["customer_id", "article_id", "t_dat"])
df_trans["article_id"] = df_trans["article_id"].astype(str).str.zfill(10)

# Positive labels: every (customer, article) row dated after 2020-09-15.
# The rows are selected with .loc and the label is added with assign(), so the
# new column is written to an independent frame rather than to a slice of
# df_trans (the original chained assignment only worked with warnings silenced).
df_trans_week1 = df_trans.loc[
    df_trans["t_dat"] > datetime.datetime(2020, 9, 15),
    ["customer_id", "article_id"],
].assign(target=1)

In [8]:
# Articles: same 10-character zero-padded article_id as the transactions.
df_art = pd.read_csv(path + "articles.csv", dtype=art_dtypes)
df_art["article_id"] = df_art["article_id"].astype(str).str.zfill(10)

# Integer-encode the two string-typed categorical columns, one fresh encoder each.
for categorical_col in ("index_code", "product_group_name"):
    le = LabelEncoder()
    le.fit(df_art[categorical_col].unique())
    df_art[categorical_col] = le.transform(df_art[categorical_col])

df_cust = pd.read_csv(path + "customers.csv", dtype=cust_dtypes)

# age: fill missing values with the mean age, then truncate to whole years
df_cust["age"] = df_cust["age"].fillna(df_cust["age"].mean()).astype(int)

# age_id: attach the externally prepared age-group table, joined on integer age
# (presumably the source of the "age_id" column selected further down — confirm)
customer_age_gorup = pd.read_csv("../save/customer_age_gorup.csv")
df_cust = df_cust.merge(customer_age_gorup, how="left", on=["age"])

# age_id_1 / age_id_2: 10-year and 5-year buckets of age, with 61+ capped at 61
capped_age = df_cust["age"].clip(upper=61)
df_cust["age_id_1"] = capped_age // 10
df_cust["age_id_2"] = capped_age // 5

# Binary flags. Missing FN / Active become 0.
df_cust[["FN", "Active"]] = df_cust[["FN", "Active"]].fillna(0)
# 1 only for the exact value "ACTIVE"; everything else (including missing) is 0.
df_cust["club_member_status"] = (
    df_cust["club_member_status"].eq("ACTIVE").astype("int64")
)
# 0 only for the exact value "NONE".
# NOTE(review): missing values and any other spelling map to 1 — confirm intended.
df_cust["fashion_news_frequency"] = (
    df_cust["fashion_news_frequency"].ne("NONE").astype("int64")
)

# postal_code_ce: count encoding (number of customers sharing the postal code)
postal_code_freq = df_cust["postal_code"].value_counts()
df_cust["postal_code_ce"] = df_cust["postal_code"].map(postal_code_freq)

# postal_code: codes with 10 or fewer customers are pooled into id 0;
# the remaining codes get ids 1..N in value_counts order
postal_code_cnt = postal_code_freq.reset_index()
postal_code_cnt.columns = ["postal_code", "cnt"]

is_rare = postal_code_cnt["cnt"] <= 10
code_map = dict.fromkeys(postal_code_cnt.loc[is_rare, "postal_code"].values, 0)
code_map.update(
    (code, code_id)
    for code_id, code in enumerate(
        postal_code_cnt.loc[~is_rare, "postal_code"].values, start=1
    )
)

df_cust["postal_code"] = df_cust["postal_code"].map(code_map)

# Key both lookup tables by their id so fast_left_join can reindex into them;
# set_index moves the id column into the (named) index.
df_art = df_art.set_index("article_id")

df_cust = df_cust.set_index("customer_id")

In [9]:
# Article attributes attached to every transaction (df_art is indexed by article_id).
art_attr_cols = [
    "product_code",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
]
# Customer attributes attached to every transaction (df_cust is indexed by customer_id).
cust_attr_cols = [
    "age",
    "age_id",
    "age_id_1",
    "age_id_2",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "postal_code",
    "postal_code_ce",
]

# NOTE: re-running this cell appends the attribute columns a second time.
df_trans = fast_left_join(df_trans, df_art[art_attr_cols], on="article_id")
df_trans = fast_left_join(df_trans, df_cust[cust_attr_cols], on="customer_id")

In [10]:
# Preview the transactions after the article and customer attributes were joined.
df_trans.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,age,age_id,age_id_1,age_id_2,FN,Active,club_member_status,fashion_news_frequency,postal_code,postal_code_ce
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,663713001,0.050831,2,663713,283,16,1010016,9,4,5,1338,1,1,61,1017,24,4,2,4,0.0,0.0,1,0,3248,16
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,541518023,0.030492,2,541518,306,16,1010016,51,1,4,1334,1,1,61,1017,24,4,2,4,0.0,0.0,1,0,3248,16
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,505221004,0.015237,2,505221,252,7,1010010,52,2,4,5963,3,2,58,1003,32,7,3,6,1.0,1.0,1,1,0,5
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687003,0.016932,2,685687,252,7,1010010,52,7,4,3090,0,1,15,1023,32,7,3,6,1.0,1.0,1,1,0,5
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2,685687004,0.016932,2,685687,252,7,1010010,93,4,19,3090,0,1,15,1023,32,7,3,6,1.0,1.0,1,1,0,5


In [4]:
# Cached validation frame; its article_id column is exposed as "gts" from here on.
df_agg_val_1 = pd.read_pickle(
    "../exp_iwata/cache/v2/df_agg_val_1_2020-09-23.pkl"
).rename(columns={"article_id": "gts"})

In [13]:
# Preview the validation frame (customer_id plus the renamed "gts" column).
df_agg_val_1.head()

Unnamed: 0,customer_id,gts
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,[0624486001]
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55,[0827487003]
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9,"[0757926001, 0788575004, 0640021019]"
3,000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565,[0874110016]
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e85010e95f2fc10bf9437a4,"[0903762001, 0879189005, 0158340001, 0867966009, 0915529003, 0932798002, 0915529005, 0486639003,..."


## Ensemble

## Read OOF

In [14]:
def read_oof(exp_name):
    """Load the out-of-fold (OOF) predictions of one experiment.

    The file is located through the module-level ``exps`` registry
    (``exps[exp_name]["oof"]``). Each group of experiments is stored in its own
    format, so the branch taken decides the reader and the clean-up applied
    (column renames, zero-padding of ``article_id``, dropping helper columns,
    sorting by ``customer_id`` and ``pred`` in descending order).

    Args:
        exp_name: Experiment name; must be handled by one of the branches below
            and be present in ``exps``.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If no branch handles ``exp_name`` (previously this fell
            through to ``return oof`` and failed with an UnboundLocalError).
        KeyError: If ``exp_name`` is handled here but missing from ``exps``.
    """
    sort_keys = ["customer_id", "pred"]

    if exp_name in [
        "myaun_cat_v4-3",
        "cat_v4-3_decay_popular_sample",
        "cat_v7-2_random_weighted_sample",
        "cat_v4-3_short",
        "cat_v4-3-long-tail",
        "cat_v12",
        "cat_v4-3-bag",
    ]:
        # NOTE(review): no sort here — downstream per-customer head(topN)
        # assumes these files are already ordered best-first; confirm.
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name in ["nari_exp_lgbm_007", "nari_exp_lgbm_020", "nari_exp_lgbm_023"]:
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof": "pred"})
        # Rows with a prediction of exactly -1.0 are discarded.
        oof = oof[~(oof.pred == -1.0)]
        oof = oof.sort_values(sort_keys, ascending=False)
    elif exp_name == "nari_exp_lgbm_007_fix":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof": "pred"})
        oof = oof.drop(columns=["oof_rank"])
        oof = oof.sort_values(sort_keys, ascending=False)
    elif exp_name in ["iwata_exp05", "iwata_exp13"]:
        # NOTE(review): no sort here either — same ordering assumption as above.
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "minguin_v28":
        # Different format (space-separated article list per row); usable for blending.
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["LGBMRanker"] = oof["LGBMRanker"].str.split(" ")
    elif exp_name in ["minguin_v34", "minguin_v45"]:
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof = oof.rename(columns={"preds": "pred"})
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
        oof = oof.sort_values(sort_keys, ascending=False)
    elif exp_name in ["moro_sub1", "moro_sub2_nn", "moro_sub3"]:
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof = oof.drop(columns=["true"])
        oof = oof.sort_values(sort_keys, ascending=False)
    elif exp_name in ["stacking_nb_5"]:
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof = oof.sort_values(sort_keys, ascending=False)
    else:
        raise ValueError(f"read_oof: no loader defined for experiment {exp_name!r}")
    return oof

# Registry of the base-model experiments that feed the ensemble, keyed by
# experiment name. read_oof() loads the file under "oof"; the other paths
# ("sub_pred", "sub_pred_top30", "sub") are not read in the cells shown here.
# "lb" / "cv" look like leaderboard and cross-validation scores — TODO confirm.
# CVs are not consistent, as each creator calculates them a little differently.
exps = {
    "myaun_cat_v4-3": {
        "lb": 0.0307,
        "cv": 0.03539,
        "oof":"../exp_iwata/cat_v4-3/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_0.03539.csv",
    },
    "cat_v4-3_decay_popular_sample": {
        "lb": 0.0306,
        "cv": 0.03542,
        "oof":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval.pkl",
        "sub_pred_top30": "../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_decay_popular_sample_0.03542.csv",
    },
    # NOTE(review): "cv" says 0.03506 but the "sub" file name says 0.03505 — confirm.
    "cat_v4-3-bag": {
        "lb": None,
        "cv": 0.03506,
        "oof":"../exp_iwata/cat_v4-3-bag/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3-bag/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3-bag/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3-bag_0.03505.csv",
    },
    "nari_exp_lgbm_007": {
        "lb":0.0302,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366_top30.pickle",
        "sub":"../sub/exp_lgbm_007_20220425033232_0.0366.csv",
    },
    # NOTE(review): "cv" says 0.0366 but every file name of this entry says 0.0374 — confirm.
    "nari_exp_lgbm_007_fix": {
        "lb":0.0298,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220505112349_0.0374_top30.pickle",
        "sub": "../sub/sub_exp_lgbm_007_20220505112349_0.0374.csv",
    },
    "nari_exp_lgbm_020": {
        "lb":0.0298,
        "cv":0.0380,
        "oof":"../input/nari/exp_lgbm_020_20220507064057_0.0380/oof.feather",
        "sub_pred":"../input/nari/exp_lgbm_020_20220507064057_0.0380/sub.feather",
        "sub_pred_top30": "../input/nari/exp_lgbm_020_20220507064057_0.0380/sub_top30.pickle",
        "sub": None,
    },
    "nari_exp_lgbm_023": {
        "lb":None,
        "cv":0.0388,
        "oof":"../input/nari/exp_lgbm_023/oof.feather",
        "sub_pred":"../input/nari/exp_lgbm_023/sub.feather",
        "sub_pred_top30": "../input/nari/exp_lgbm_023/sub_top30.pickle",
        "sub": None,
    },
    # NOTE(review): "cv" 0.3559 is roughly 10x every other cv in this registry —
    # possibly a typo for 0.03559; confirm before using it for weighting.
    "iwata_exp05": {
        "lb":0.0303,
        "cv":0.3559,
        "oof":"../input/iwata/oof_top200/oof_exp05_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp05_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp05_2020-09-23_top30.pickle",
        "sub":"../sub/submission_exp05.csv",
    },
    "iwata_exp13": {
        "lb":0.0297,
        "cv":0.03625,
        "oof":"../input/iwata/oof_top200/oof_exp13_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp13_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp13_2020-09-23_top30.pickle",
        "sub": None,
    },
    "minguin_v34": {
        "lb":0.0293,
        "cv":0.0314,
        "oof": "../input/minguin/sub2/oof_single_preds_h_m_single_v34_0314.csv",
        "sub_pred": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314.csv",
        "sub_pred_top30": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314_top30.pickle",
        "sub": "../sub/submission_single_v34.csv",
    },
    # NOTE(review): "sub_pred_top30" below is the same v34 path as in the
    # "minguin_v34" entry — possibly a copy-paste slip; confirm.
    "minguin_v45": {
        "lb":None,
        "cv":0.0323,
        "oof": "../input/minguin/oof_single_v45_preds.csv",
        "sub_pred": "../input/minguin/sub_single_v45_preds.csv",
        "sub_pred_top30": "../input/minguin/sub2/sub_single_preds_h_m_single_v34_0314_top30.pickle",
        "sub": "../input/minguin/sub_single_v45.csv",
    },
    # NOTE(review): the two "moro_*" entries have no "sub_pred" key, unlike every
    # other entry; exps[name]["sub_pred"] raises KeyError for them.
    "moro_sub1": {
        "lb":0.0301,
        "cv":0.03249,
        "oof":"../input/moro/sub1_fix/train_oof_train123_all.pickle",
        "sub":"../input/moro/sub1/submission_lgb_train123_0.03249.csv",
        "sub_pred_top30": "../input/moro/sub1_fix2/sub_pred_top30_train123_0.03249.pickle",
    },
    "moro_sub3": {
        "lb":0.0294,
        "cv":0.03353,
        "oof":"../input/moro/sub3/oof_lgb2_train_0.03353.pickle",
        "sub":None,
        "sub_pred_top30": "../input/moro/sub3/sub_lgb2_pred_top30_0.03353.pickle",
    },
}

In [15]:
# Load every registered experiment's OOF frame.
oofs = {}
# Per experiment: number of (customer_id, article_id) pairs that occur more than
# once. The count used to be computed and thrown away; it is kept here because
# the later pivot silently averages duplicated pairs.
oof_duplicate_counts = {}
for exp_name in tqdm(exps.keys()):
    oof = read_oof(exp_name=exp_name)
    pair_sizes = oof.groupby(["customer_id", "article_id"]).size()
    duplicate_num = (pair_sizes > 1).sum()
    oof_duplicate_counts[exp_name] = duplicate_num
    oofs[exp_name] = oof

  0%|          | 0/13 [00:00<?, ?it/s]

## stacking exp

In [16]:
# Number of leading candidates kept per customer and model.
topN = 24

pred_frames = []
for exp_name in tqdm(exps.keys()):
    # head() keeps the first topN rows of each customer in stored order, so this
    # relies on every OOF frame already being ordered best-first per customer.
    oof_topN = oofs[exp_name].groupby("customer_id").head(topN).copy()

    pred_by_customer = oof_topN.groupby("customer_id")["pred"]

    # add pred rank (1 = highest score within the customer, ties share a rank)
    oof_topN["pred_rank"] = pred_by_customer.rank(ascending=False, method="dense")

    # add pred norm: per-customer z-score, computed with the built-in grouped
    # mean/std instead of a Python lambda per group. Customers with a single
    # candidate, or with identical scores, get NaN.
    oof_topN["pred_norm"] = (
        oof_topN["pred"] - pred_by_customer.transform("mean")
    ) / pred_by_customer.transform("std")

    oof_topN["model"] = exp_name

    pred_frames.append(oof_topN)

# Long format: one row per (model, customer, candidate article).
preds = pd.concat(pred_frames)

  0%|          | 0/13 [00:00<?, ?it/s]

In [18]:
# Preview the long-format stacking frame (one row per model / customer / article).
preds.head(50)

Unnamed: 0,customer_id,article_id,pred,pred_rank,pred_norm,model
13796674,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,881919001,5.395974,1.0,3.300488,myaun_cat_v4-3
13796613,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,748269009,4.72859,2.0,1.484222,myaun_cat_v4-3
13796738,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,910601003,4.545564,3.0,0.986124,myaun_cat_v4-3
13796676,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,881919003,4.512596,4.0,0.896403,myaun_cat_v4-3
13796675,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,881919002,4.50832,5.0,0.884766,myaun_cat_v4-3
13796645,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,863646005,4.330892,6.0,0.401901,myaun_cat_v4-3
13796661,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,873276003,4.298442,7.0,0.313589,myaun_cat_v4-3
13796629,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,803757023,4.280724,8.0,0.26537,myaun_cat_v4-3
13796641,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,863646001,4.280371,9.0,0.264409,myaun_cat_v4-3
13796713,fffef3b6b73545df065b521e19f64bf6fe93bfd450ab20e02ce5d1e58a8f700b,905365002,4.24773,10.0,0.175577,myaun_cat_v4-3


In [19]:
pivot_keys = ["customer_id", "article_id"]


def _pivot_by_model(value_col):
    """Return ``preds`` pivoted to one ``<value_col>_<model>`` column per model.

    The result is indexed by (customer_id, article_id); duplicated pairs within
    a model are averaged by ``pivot_table``.
    """
    wide = preds.pivot_table(values=[value_col], index=pivot_keys, columns=["model"])
    wide.columns = ["_".join(col).strip() for col in wide.columns.values]
    return wide


# pivot_table drops a (customer, article) pair whose value is NaN for every
# model, which can happen for pred_norm (single-candidate or constant-score
# customers). The three pivots may therefore differ in length, so they are
# aligned on the key index here instead of being glued together by row position.
preds_pv = _pivot_by_model("pred")
preds_pv_rank = _pivot_by_model("pred_rank").reindex(preds_pv.index)
preds_pv_norm = _pivot_by_model("pred_norm").reindex(preds_pv.index)

preds_pv = pd.concat([preds_pv, preds_pv_rank, preds_pv_norm], axis=1).reset_index()

In [20]:
# Preview the wide frame: one row per (customer, article), one column per model and score type.
preds_pv.head()

Unnamed: 0,customer_id,article_id,pred_cat_v4-3-bag,pred_cat_v4-3_decay_popular_sample,pred_iwata_exp05,pred_iwata_exp13,pred_minguin_v34,pred_minguin_v45,pred_moro_sub1,pred_moro_sub3,pred_myaun_cat_v4-3,pred_nari_exp_lgbm_007,pred_nari_exp_lgbm_007_fix,pred_nari_exp_lgbm_020,pred_nari_exp_lgbm_023,pred_rank_cat_v4-3-bag,pred_rank_cat_v4-3_decay_popular_sample,pred_rank_iwata_exp05,pred_rank_iwata_exp13,pred_rank_minguin_v34,pred_rank_minguin_v45,pred_rank_moro_sub1,pred_rank_moro_sub3,pred_rank_myaun_cat_v4-3,pred_rank_nari_exp_lgbm_007,pred_rank_nari_exp_lgbm_007_fix,pred_rank_nari_exp_lgbm_020,pred_rank_nari_exp_lgbm_023,pred_norm_cat_v4-3-bag,pred_norm_cat_v4-3_decay_popular_sample,pred_norm_iwata_exp05,pred_norm_iwata_exp13,pred_norm_minguin_v34,pred_norm_minguin_v45,pred_norm_moro_sub1,pred_norm_moro_sub3,pred_norm_myaun_cat_v4-3,pred_norm_nari_exp_lgbm_007,pred_norm_nari_exp_lgbm_007_fix,pred_norm_nari_exp_lgbm_020,pred_norm_nari_exp_lgbm_023
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,111586001,,,,0.657795,,-0.479323,,,,,,,,,,,19.0,,3.0,,,,,,,,,,,-0.778055,,1.529569,,,,,,,
1,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,111593001,,,,0.672578,,,,,,,,,,,,,17.0,,,,,,,,,,,,,-0.535561,,,,,,,,,
2,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,156231001,,1.615412,0.793522,0.745965,,,,,,,,,,,24.0,6.0,5.0,,,,,,,,,,,-1.191958,0.525743,0.668245,,,,,,,,,
3,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,158340001,2.236995,2.131669,0.839775,0.807334,,-0.528458,,,2.133757,,,,,6.0,6.0,4.0,3.0,,5.0,,,6.0,,,,,0.454618,0.462913,1.347989,1.674924,,1.251714,,,0.489196,,,,
4,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,372860001,1.993111,1.92288,0.764054,0.732153,,,,,1.892704,-0.128967,-0.030585,0.031185,-0.123052,16.0,15.0,9.0,6.0,,,,,16.0,8.0,5.0,1.0,18.0,-0.398759,-0.206364,0.001871,0.441675,,,,,-0.285793,0.427665,0.836985,1.882791,-0.763097


In [21]:
# agg score
#
# Row-wise aggregates across models for each score type. Models that did not
# propose the pair are NaN and are skipped by sum / min / max / count.
model_names = list(exps.keys())
score_families = [
    ("pred_", ""),  # raw score
    ("pred_rank_", "_rank"),  # per-customer dense rank
    ("pred_norm_", "_norm"),  # per-customer z-score
]

for column_prefix, output_suffix in score_families:
    pred_names = [f"{column_prefix}{exp_name}" for exp_name in model_names]
    per_model = preds_pv[pred_names]

    preds_pv[f"preds_sum{output_suffix}"] = per_model.sum(axis=1)
    preds_pv[f"preds_min{output_suffix}"] = per_model.min(axis=1)
    preds_pv[f"preds_max{output_suffix}"] = per_model.max(axis=1)
    if output_suffix == "":
        # Number of models that proposed the pair; only computed once, on the raw scores.
        preds_pv["preds_cnt"] = per_model.count(axis=1)

In [23]:
# Preview the wide frame again, now including the preds_* aggregate columns.
preds_pv.head(10)

Unnamed: 0,customer_id,article_id,pred_cat_v4-3-bag,pred_cat_v4-3_decay_popular_sample,pred_iwata_exp05,pred_iwata_exp13,pred_minguin_v34,pred_minguin_v45,pred_moro_sub1,pred_moro_sub3,pred_myaun_cat_v4-3,pred_nari_exp_lgbm_007,pred_nari_exp_lgbm_007_fix,pred_nari_exp_lgbm_020,pred_nari_exp_lgbm_023,pred_rank_cat_v4-3-bag,pred_rank_cat_v4-3_decay_popular_sample,pred_rank_iwata_exp05,pred_rank_iwata_exp13,pred_rank_minguin_v34,pred_rank_minguin_v45,pred_rank_moro_sub1,pred_rank_moro_sub3,pred_rank_myaun_cat_v4-3,pred_rank_nari_exp_lgbm_007,pred_rank_nari_exp_lgbm_007_fix,pred_rank_nari_exp_lgbm_020,pred_rank_nari_exp_lgbm_023,pred_norm_cat_v4-3-bag,pred_norm_cat_v4-3_decay_popular_sample,pred_norm_iwata_exp05,pred_norm_iwata_exp13,pred_norm_minguin_v34,pred_norm_minguin_v45,pred_norm_moro_sub1,pred_norm_moro_sub3,pred_norm_myaun_cat_v4-3,pred_norm_nari_exp_lgbm_007,pred_norm_nari_exp_lgbm_007_fix,pred_norm_nari_exp_lgbm_020,pred_norm_nari_exp_lgbm_023,preds_sum,preds_min,preds_max,preds_cnt,preds_sum_rank,preds_min_rank,preds_max_rank,preds_sum_norm,preds_min_norm,preds_max_norm
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,111586001,,,,0.657795,,-0.479323,,,,,,,,,,,19.0,,3.0,,,,,,,,,,,-0.778055,,1.529569,,,,,,,,0.178473,-0.479323,0.657795,2,22.0,3.0,19.0,0.751514,-0.778055,1.529569
1,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,111593001,,,,0.672578,,,,,,,,,,,,,17.0,,,,,,,,,,,,,-0.535561,,,,,,,,,,0.672578,0.672578,0.672578,1,17.0,17.0,17.0,-0.535561,-0.535561,-0.535561
2,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,156231001,,1.615412,0.793522,0.745965,,,,,,,,,,,24.0,6.0,5.0,,,,,,,,,,,-1.191958,0.525743,0.668245,,,,,,,,,,3.154899,0.745965,1.615412,3,35.0,5.0,24.0,0.002031,-1.191958,0.668245
3,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,158340001,2.236995,2.131669,0.839775,0.807334,,-0.528458,,,2.133757,,,,,6.0,6.0,4.0,3.0,,5.0,,,6.0,,,,,0.454618,0.462913,1.347989,1.674924,,1.251714,,,0.489196,,,,,7.621073,-0.528458,2.236995,6,30.0,3.0,6.0,5.681353,0.454618,1.674924
4,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,372860001,1.993111,1.92288,0.764054,0.732153,,,,,1.892704,-0.128967,-0.030585,0.031185,-0.123052,16.0,15.0,9.0,6.0,,,,,16.0,8.0,5.0,1.0,18.0,-0.398759,-0.206364,0.001871,0.441675,,,,,-0.285793,0.427665,0.836985,1.882791,-0.763097,7.053482,-0.128967,1.993111,9,94.0,1.0,18.0,1.936975,-0.763097,1.882791
5,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,372860002,1.768328,1.677892,0.744698,0.704284,,,-0.104046,,1.657297,-0.212044,-0.11394,-0.108223,-0.087289,22.0,21.0,11.0,10.0,,,23.0,,21.0,16.0,13.0,9.0,9.0,-1.185295,-0.991677,-0.342221,-0.015481,,,-0.84146,,-1.042629,-0.414271,-0.323405,0.377617,-0.232956,5.926957,-0.212044,1.768328,10,155.0,9.0,23.0,-5.011778,-1.185295,0.377617
6,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,436261001,2.872584,2.865778,0.908505,0.877108,-0.551301,-0.499056,0.411979,0.424794,2.874707,,,,,1.0,1.0,1.0,1.0,2.0,4.0,2.0,2.0,1.0,,,,,2.678605,2.81611,2.569829,2.819472,1.847884,1.417983,1.563098,1.18616,2.871358,,,,,10.185098,-0.551301,2.874707,9,15.0,1.0,4.0,19.770499,1.18616,2.871358
7,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,448509014,,,,,,,,,,,,-0.149352,,,,,,,,,,,,,12.0,,,,,,,,,,,,,-0.066442,,-0.149352,-0.149352,-0.149352,1,12.0,12.0,12.0,-0.066442,-0.066442,-0.066442
8,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,456163086,,,,,,,,,,-0.303326,-0.169945,-0.212715,,,,,,,,,,,20.0,24.0,18.0,,,,,,,,,,,-1.339375,-1.103057,-0.75057,,-0.685986,-0.303326,-0.169945,3,62.0,18.0,24.0,-3.193002,-1.339375,-0.75057
9,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,568601006,,,0.742441,0.679613,,,,,,-0.136375,-0.147073,,,,,13.0,14.0,,,,,,10.0,18.0,,,,,-0.38234,-0.420163,,,,,,0.352592,-0.784655,,,1.138607,-0.147073,0.742441,4,55.0,10.0,18.0,-1.234567,-0.784655,0.352592


In [66]:
# add art attrs
art_attr_cols = [
    "product_code",
    "product_type_no",
    "product_group_name",
    "graphical_appearance_no",
    "colour_group_code",
    "perceived_colour_value_id",
    "perceived_colour_master_id",
    "department_no",
    "index_code",
    "index_group_no",
    "section_no",
    "garment_group_no",
]
# customer attrs
cust_attr_cols = [
    "age",
    "age_id",
    "age_id_1",
    "age_id_2",
    "FN",
    "Active",
    "club_member_status",
    "fashion_news_frequency",
    "postal_code",
    "postal_code_ce",
]

# NOTE: re-running this cell appends the attribute columns a second time.
preds_pv = fast_left_join(preds_pv, df_art[art_attr_cols], on="article_id")
preds_pv = fast_left_join(preds_pv, df_cust[cust_attr_cols], on="customer_id")

In [68]:
def rebuy_rate_v2(df_trans_hist, name, key="article_id", target="customer_id", sm=5):
    """Smoothed share of buyers who bought the same ``key`` more than once.

    For every ``key`` value:
    ``rate = (# targets with 2+ rows) / (# distinct targets + sm)``.

    Args:
        df_trans_hist: Transaction rows containing the ``key`` and ``target`` columns.
        name: Name of the output column.
        key: Column the rate is computed for.
        target: Column identifying the buyer.
        sm: Smoothing constant added to the denominator.

    Returns:
        A single-column DataFrame (column ``name``) indexed by ``key``.
    """
    pair = [target, key]

    # Distinct buyers per key.
    buyers = (
        df_trans_hist.groupby(key)[target].nunique().rename("cnt_buy1").reset_index()
    )

    # Rows that repeat an earlier (target, key) pair, reduced to one row per
    # pair, then counted per key: the number of repeat buyers.
    repeat_pairs = df_trans_hist[df_trans_hist.duplicated(pair)].copy()
    repeat_pairs = repeat_pairs.drop_duplicates(pair)
    repeat_buyers = repeat_pairs.groupby(key).size().rename("cnt_buy2").reset_index()

    # Keys without any repeat buyer get a count of 0.
    rates = buyers.merge(repeat_buyers, how="left", on=key).fillna(0)
    rates[name] = rates["cnt_buy2"] / (rates["cnt_buy1"] + sm)

    return rates.set_index(key)[[name]]

# add hist features

# Not referenced in the cells shown here; presumably the history window length
# in days — TODO confirm.
len_hist = 366
# Read by stacking_feat_store: "gpu" converts the feature frames to cudf.
dev = "cpu"
def stacking_feat_store(df_trans, l_cust, ds, de, dsr, der, dsh, deh):
    """Build per-article purchase-count features over three date windows.

    Args:
        df_trans: Transactions with at least ``t_dat`` and ``article_id``.
        l_cust: Unused; kept for interface compatibility.
        ds: Unused; kept for interface compatibility.
        de: Unused; kept for interface compatibility.
        dsr: Start of the "recent" window (inclusive).
        der: End of the "recent" window (inclusive); also the single day used
            for the "yesterday" features.
        dsh: Start of the history window (inclusive).
        deh: End of the history window (inclusive).

    Returns:
        Dict mapping feature name to a single-column frame indexed by
        ``article_id``: ``art_buy_hist``, ``art_buy_recent``,
        ``art_buy_yesterday`` (row counts) and ``art_buy_hist_decay``
        (counts weighted by ``1 / (1 + weeks_before_latest_history_row)``).
        Each frame is downcast with ``reduce_mem_usage`` and, when the
        module-level ``dev`` is ``"gpu"``, converted to cudf.
    """
    feat = {}

    # Window labels in the trailing comments are the original author's; the
    # actual spans depend on the dates passed in.
    df_trans_yesterday = df_trans.query("(t_dat == @der)")  # 1day
    df_trans_recent = df_trans.query("(t_dat >= @dsr) and (t_dat <= @der)")  # 1week
    df_trans_hist = df_trans.query("(t_dat >= @dsh) and (t_dat <= @deh)")  # 1year

    # art

    # make decay count: weight of a purchase halves, thirds, ... with every full
    # week between it and the most recent history row. Kept as standalone
    # Series so nothing is written onto the query() slice.
    t_delay = (df_trans_hist["t_dat"].max() - df_trans_hist["t_dat"]).dt.days
    decay_count = 1 / (1 + t_delay // 7)

    # agg art
    feat["art_buy_hist"] = df_trans_hist.groupby(["article_id"])["t_dat"].agg(
        art_buy_hist="count"
    )
    feat["art_buy_recent"] = df_trans_recent.groupby(["article_id"])["t_dat"].agg(
        art_buy_recent="count"
    )
    feat["art_buy_yesterday"] = df_trans_yesterday.groupby(["article_id"])[
        "t_dat"
    ].agg(art_buy_yesterday="count")
    feat["art_buy_hist_decay"] = decay_count.groupby(df_trans_hist["article_id"]).agg(
        art_buy_hist_decay="sum"
    )

    for k in feat.keys():
        feat[k] = reduce_mem_usage(feat[k])
        # The GPU conversion used to sit outside this loop, so only the last
        # feature frame was converted.
        if dev == "gpu":
            feat[k] = cudf.from_pandas(feat[k])
    return feat

def custom_fillna(df):
    """Fill missing feature values with a per-family default, in place.

    Only columns that are present in ``df`` are touched:

    * count / rate style columns      -> 0
    * "days since ..." columns        -> ``10 + len_hist`` (module-level constant)
    * sales-channel rate columns      -> 1.5
    * any column containing "_rank"   -> 100000
    * article price / age statistics  -> mean of the column

    Args:
        df: Feature frame; it is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    cnt_columns = [
        "n_buy_hist",
        "n_buy_hist_decay",
        "n_buy_hist_short",
        "n_buy_hist_mid",
        "n_buy_recent",
        "n_buy_hist_all",
        "n_buy_hist_all_decay",
        "n_buy_hist_short_all",
        "n_buy_hist_mid_all",
        "n_buy_recent_all",
        "n_buy_hist_prod",
        "n_buy_recent_prod",
        "n_buy_hist_ptype",
        "n_buy_recent_ptype",
        "n_buy_hist_graph",
        "n_buy_recent_graph",
        "n_buy_hist_col",
        "n_buy_recent_col",
        "n_buy_hist_dep",
        "n_buy_hist_short_dep",
        "n_buy_hist_mid_dep",
        "n_buy_recent_dep",
        "n_buy_hist_idx",
        "n_buy_recent_idx",
        "n_buy_hist_idxg",
        "n_buy_recent_idxg",
        "n_buy_hist_sec",
        "n_buy_recent_sec",
        "n_buy_hist_short_sec",
        "n_buy_hist_mid_sec",
        "n_buy_hist_garm",
        "n_buy_recent_garm",
        "art_buy_yesterday",
        "art_buy_recent",
        "art_buy_hist",
        "art_buy_hist_decay",
        "art_buy_hist_short",
        "art_buy_hist_mid",
        "art_buy_hist_ch1",
        "art_buy_hist_ch2",
        "art_buy_hist_ch1_decay",
        "art_buy_hist_ch2_decay",
        "art_buy_hist_short_ch1",
        "art_buy_hist_short_ch2",
        "code_buy_hist",
        "code_buy_hist_decay",
        "code_buy_recent",
        "code_buy_yesterday",
        "rebuy_rate",
        "code_rebuy_rate",
        "cust_rebuy_rate",
        "n_buy_hist_code_pcol",
        "n_buy_recent_code_pcol",
        "n_buy_hist_idxg_sec",
        "n_buy_recent_idxg_sec",
        "n_buy_hist_idxg_gram",
        "n_buy_recent_idxg_gram",
        "age_id_n_buy_hist",
        "age_id_n_buy_hist_decay",
        "age_id_n_buy_recent",
        # add
        "n_buy_yesterday",
        "hist_index_group_no_1_norm_sm",
        "hist_index_group_no_4_norm_sm",
        "hist_index_group_no_3_norm_sm",
        "hist_index_group_no_26_norm_sm",
        "hist_index_group_no_2_norm_sm",
        "recent_index_group_no_1_norm_sm",
        "recent_index_group_no_4_norm_sm",
        "recent_index_group_no_3_norm_sm",
        "recent_index_group_no_26_norm_sm",
        "recent_index_group_no_2_norm_sm",
        "hist_perceived_colour_value_id_4_norm_sm",
        "hist_perceived_colour_value_id_3_norm_sm",
        "hist_perceived_colour_value_id_1_norm_sm",
        "hist_perceived_colour_value_id_2_norm_sm",
        "hist_perceived_colour_value_id_5_norm_sm",
        "hist_perceived_colour_value_id_7_norm_sm",
        "hist_perceived_colour_value_id_6_norm_sm",
        "hist_perceived_colour_value_id_-1_norm_sm",
        "recent_perceived_colour_value_id_4_norm_sm",
        "recent_perceived_colour_value_id_3_norm_sm",
        "recent_perceived_colour_value_id_1_norm_sm",
        "recent_perceived_colour_value_id_2_norm_sm",
        "recent_perceived_colour_value_id_5_norm_sm",
        "recent_perceived_colour_value_id_7_norm_sm",
        "recent_perceived_colour_value_id_6_norm_sm",
        "recent_perceived_colour_value_id_-1_norm_sm",
        "postal_code_n_buy_hist",
        "postal_code_n_buy_hist_short",
        "postal_code_n_buy_hist_mid",
        "postal_code_n_buy_recent",
        "postal_code_n_buy_yesterday",
        "postal_code_n_buy_hist_ch1",
        "postal_code_n_buy_recent_ch1",
        "postal_code_n_buy_yesterday_ch1",
    ]
    # A missing count means "never observed" -> 0.
    for col in cnt_columns:
        if col not in df.columns:
            continue
        df[col] = df[col].fillna(0)

    days_columns = [
        "days_after_buy",
        "days_after_buy_all",
        "days_after_buy_prod",
        "days_after_buy_ptype",
        "days_after_buy_graph",
        "days_after_buy_col",
        "days_after_buy_dep",
        "days_after_buy_idx",
        "days_after_buy_idxg",
        "days_after_buy_sec",
        "days_after_buy_garm",
        "days_after_buy_code_pcol",
        "days_after_buy_idxg_sec",
        "days_after_buy_idxg_gram",
        "art_days_after_buy",
        "art_days_from_oldest_buy",
        "art_days_from_mode_buy",
        "days_from_oldest_buy_all",
    ]
    # Missing day gaps are filled with a constant derived from len_hist.
    for col in days_columns:
        if col not in df.columns:
            continue
        df[col] = df[col].fillna(10 + len_hist)

    ch_columns = [
        "rate_sales_channel_hist",
        "rate_sales_channel_recent",
        "art_rate_sales_channel_hist",
        "art_rate_sales_channel_recent",
    ]
    for col in ch_columns:
        if col not in df.columns:
            continue
        df[col] = df[col].fillna(1.5)

    # Every rank-like column gets a large sentinel for missing values.
    rank_columns = [i for i in df.columns if "_rank" in i]
    for col in rank_columns:
        df[col] = df[col].fillna(100000)

    fill_mean_columns = [
        "art_price_hist_mean",
        "art_price_hist_median",
        "art_price_hist_max",
        "art_price_hist_min",
        "art_age_hist_mean",
        "art_age_hist_median",
        "art_age_hist_max",
        "art_age_hist_min",
    ]
    for col in fill_mean_columns:
        if col not in df.columns:
            continue
        # Fill with the column mean. The previous code passed the column
        # itself to fillna, which left every NaN in place.
        mn = df[col].mean()
        df[col] = df[col].fillna(mn)
    return df

def stacking_add_feat(df, feat):
    """Attach the stored article features to a candidate frame.

    Args:
        df: Candidate frame with an ``article_id`` column.
        feat: Mapping of feature name -> frame indexed by ``article_id``;
            it must contain the four ``art_buy_*`` entries joined below.

    Returns:
        pandas DataFrame with the feature columns left-joined on
        ``article_id`` and missing values filled by ``custom_fillna``.
    """
    on_gpu = dev == "gpu"

    df = reduce_mem_usage(df)
    if on_gpu:
        df = cudf.from_pandas(df)

    # merge aid
    art_feature_names = (
        "art_buy_hist",
        "art_buy_recent",
        "art_buy_yesterday",
        "art_buy_hist_decay",
    )
    for feat_name in art_feature_names:
        art_feat = feat[feat_name]
        if not on_gpu:
            df = fast_left_join(df, art_feat, on="article_id")
            continue
        df = df.merge(art_feat, how="left", left_on=["article_id"], right_index=True)

    df = custom_fillna(df)
    return df.to_pandas() if on_gpu else df

In [69]:
%%time

# Build the article-level count features for the validation setup and attach
# them to the validation candidates.
feat = stacking_feat_store(
    df_trans,
    # l_cust / ds / de are accepted but not read by stacking_feat_store.
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 16),  # presumably the label week start -- TODO confirm
    de=datetime.datetime(2020, 9, 22),  # presumably the label week end -- TODO confirm
    dsr=datetime.datetime(2020, 9, 9),  # "recent" window start (inclusive)
    der=datetime.datetime(2020, 9, 15),  # "recent" window end; also the "yesterday" day
    dsh=datetime.datetime(2019, 9, 15),  # "history" window start (inclusive)
    deh=datetime.datetime(2020, 9, 15),  # "history" window end (inclusive)
)

preds_pv = stacking_add_feat(preds_pv, feat)

CPU times: user 18.6 s, sys: 6.41 s, total: 25 s
Wall time: 24.9 s


In [71]:
# add target

# Drop a stale "target" column first: without this, re-running the cell makes
# the merge emit target_x / target_y and the fillna below fails with KeyError.
preds_pv = preds_pv.drop(columns=["target"], errors="ignore").merge(
    df_trans_week1, how="left", on=["customer_id", "article_id"]
)
# Candidates with no matching row in df_trans_week1 come back as NaN -> label 0.
preds_pv["target"] = preds_pv["target"].fillna(0).astype(int)

In [93]:
# Remove rows that are identical in every column. The index is not reset;
# the training loop addresses rows by position (iloc), so that is fine.
preds_pv = preds_pv.drop_duplicates()

In [97]:
# Candidate recall: positive candidate rows relative to all rows of df_trans_week1.
in_target_sum = preds_pv["target"].sum()
recall_rate = in_target_sum / len(df_trans_week1)

# User coverage: share of candidate customers with at least one positive row.
hits_per_user = preds_pv.groupby("customer_id")["target"].sum()
in_target_user_sum = hits_per_user.gt(0).sum()
user_num = preds_pv["customer_id"].nunique()
user_cover_rate = in_target_user_sum / user_num

print(f"recall: {recall_rate} | {in_target_sum} / {len(df_trans_week1)}")
print(f"user_cover_rate: {user_cover_rate} | {in_target_user_sum} / {user_num}")

recall: 0.15956769622495295 | 34593 / 216792
user_cover_rate: 0.34844311724457844 | 24037 / 68984


In [104]:
from sklearn.model_selection import GroupKFold
# from sklearn.model_selection import StratifiedGroupKFold

# Number of GroupKFold splits; one CatBoostRanker is fitted per split.
FOLD_NUM = 10
RANDOM_STATE = 46
# Hyper-parameters passed unchanged to every CatBoostRanker.
CAT_PARAMS = {
    "depth": 5,
    "learning_rate": 0.1,
    "boosting_type": "Plain",
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "reg_lambda": 0.001,
    # Upper bound only; the overfitting detector below ends training earlier.
    "iterations": 10000,
    "od_type": "Iter",
    # Stop after 30 iterations without improvement of the eval metric.
    "od_wait": 30,
    # Metrics are evaluated / logged every 20 iterations.
    "metric_period": 20,
    "random_seed": RANDOM_STATE,
    "task_type": "CPU",
    # "task_type": "GPU",
    # "gpu_ram_part": 0.95,
    # "devices": "1",
    "verbose": True,
    "loss_function": "YetiRank",
    # The training loop reads the best score back under this exact key.
    "eval_metric": "MAP:top=12",
}

# Identifier / label columns; every other column of preds_pv is a model feature.
NON_FEATURE_COLS = ["customer_id", "article_id", "target"]


def _make_rank_inputs(df_sorted):
    """Split a customer-sorted candidate frame into ranker inputs.

    Args:
        df_sorted: Candidate rows already sorted by ``customer_id`` so that
            each customer's rows are contiguous.

    Returns:
        (X, y, group_id): feature frame, ``target`` labels, and a list with
        one integer group id per row (0, 1, 2, ... in customer order).
    """
    X = df_sorted.drop(NON_FEATURE_COLS, axis=1)
    y = df_sorted["target"]
    # groupby returns the per-customer row counts in sorted customer order,
    # which is the row order of df_sorted; repeating each group number by its
    # count labels all rows without a per-row Python loop.
    basket_sizes = df_sorted.groupby("customer_id")["customer_id"].count().values
    group_id = np.repeat(np.arange(len(basket_sizes)), basket_sizes).tolist()
    return X, y, group_id


models = []
scores = []
oof = np.zeros(len(preds_pv))
y = preds_pv["target"].values
groups = preds_pv["customer_id"].values
kfold = GroupKFold(n_splits=FOLD_NUM)
cat_features_index = []
for fold, (trn, val) in enumerate(kfold.split(preds_pv, y, groups)):
    # Ranking pools need each customer's candidates to be contiguous.
    df_train = preds_pv.iloc[trn].sort_values("customer_id")
    df_val = preds_pv.iloc[val].sort_values("customer_id")

    X_train, y_train, group_id_train = _make_rank_inputs(df_train)
    X_val, y_val, group_id_val = _make_rank_inputs(df_val)

    train_pool = Pool(
        data=X_train,
        label=y_train,
        group_id=group_id_train,
        cat_features=cat_features_index,
    )
    val_pool = Pool(
        data=X_val,
        label=y_val,
        group_id=group_id_val,
        cat_features=cat_features_index,
    )

    model = CatBoostRanker(**CAT_PARAMS)
    model.fit(train_pool, eval_set=val_pool)
    gbdt_score = model.best_score_["validation"]["MAP:top=12"]

    # Predict on the validation rows in their ORIGINAL (unsorted) order so the
    # scores line up with the positional indices in `val`.
    X_val = preds_pv.iloc[val].drop(NON_FEATURE_COLS, axis=1)
    preds = model.predict(X_val)
    oof[val] = preds
    scores.append(gbdt_score)
    models.append(model)



0:	learn: 0.0410067	test: 0.0419911	best: 0.0419911 (0)	total: 907ms	remaining: 2h 31m 13s
20:	learn: 0.0620349	test: 0.0606189	best: 0.0606189 (20)	total: 17.5s	remaining: 2h 18m 31s
40:	learn: 0.0632302	test: 0.0616330	best: 0.0616330 (40)	total: 33.9s	remaining: 2h 17m 14s
60:	learn: 0.0637786	test: 0.0627354	best: 0.0627422 (59)	total: 50.1s	remaining: 2h 16m 11s
80:	learn: 0.0642913	test: 0.0626608	best: 0.0629474 (76)	total: 1m 6s	remaining: 2h 15m 26s
100:	learn: 0.0647752	test: 0.0626312	best: 0.0629474 (76)	total: 1m 22s	remaining: 2h 14m 53s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06294735007
bestIteration = 76

Shrink model to first 77 iterations.




0:	learn: 0.0355655	test: 0.0329585	best: 0.0329585 (0)	total: 888ms	remaining: 2h 28m 1s
20:	learn: 0.0623092	test: 0.0586025	best: 0.0586025 (20)	total: 17.5s	remaining: 2h 18m 57s
40:	learn: 0.0633667	test: 0.0591011	best: 0.0591598 (35)	total: 33.9s	remaining: 2h 17m 16s
60:	learn: 0.0640615	test: 0.0594890	best: 0.0597118 (57)	total: 50.1s	remaining: 2h 15m 58s
80:	learn: 0.0645370	test: 0.0593591	best: 0.0597118 (57)	total: 1m 6s	remaining: 2h 15m 26s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.05971176454
bestIteration = 57

Shrink model to first 58 iterations.




0:	learn: 0.0433469	test: 0.0429370	best: 0.0429370 (0)	total: 907ms	remaining: 2h 31m 4s
20:	learn: 0.0619943	test: 0.0619695	best: 0.0620013 (16)	total: 17.6s	remaining: 2h 19m 11s
40:	learn: 0.0628285	test: 0.0632759	best: 0.0633406 (36)	total: 33.8s	remaining: 2h 16m 44s
60:	learn: 0.0635919	test: 0.0636102	best: 0.0639698 (57)	total: 50.2s	remaining: 2h 16m 16s
80:	learn: 0.0641711	test: 0.0639485	best: 0.0640976 (77)	total: 1m 6s	remaining: 2h 15m 46s
100:	learn: 0.0648310	test: 0.0640604	best: 0.0640976 (77)	total: 1m 22s	remaining: 2h 15m 24s
120:	learn: 0.0652919	test: 0.0644194	best: 0.0644924 (108)	total: 1m 39s	remaining: 2h 14m 55s
140:	learn: 0.0656743	test: 0.0646684	best: 0.0647715 (137)	total: 1m 55s	remaining: 2h 14m 39s
160:	learn: 0.0663134	test: 0.0647593	best: 0.0648711 (143)	total: 2m 11s	remaining: 2h 14m 18s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06487113765
bestIteration = 143

Shrink model to first 144 iterations.




0:	learn: 0.0351619	test: 0.0371634	best: 0.0371634 (0)	total: 862ms	remaining: 2h 23m 41s
20:	learn: 0.0614695	test: 0.0649544	best: 0.0650749 (19)	total: 17.5s	remaining: 2h 18m 58s
40:	learn: 0.0626193	test: 0.0655797	best: 0.0655797 (40)	total: 33.9s	remaining: 2h 17m 22s
60:	learn: 0.0634080	test: 0.0658575	best: 0.0658855 (55)	total: 50.8s	remaining: 2h 18m 3s
80:	learn: 0.0639463	test: 0.0657942	best: 0.0661063 (72)	total: 1m 8s	remaining: 2h 18m 47s
100:	learn: 0.0644143	test: 0.0659975	best: 0.0661133 (97)	total: 1m 24s	remaining: 2h 18m 40s
120:	learn: 0.0648643	test: 0.0663763	best: 0.0665195 (118)	total: 1m 42s	remaining: 2h 19m 13s
140:	learn: 0.0655178	test: 0.0663252	best: 0.0666005 (127)	total: 1m 59s	remaining: 2h 19m 27s
160:	learn: 0.0659868	test: 0.0666522	best: 0.0668768 (153)	total: 2m 16s	remaining: 2h 18m 41s
180:	learn: 0.0665563	test: 0.0666908	best: 0.0669123 (169)	total: 2m 32s	remaining: 2h 17m 51s
Stopped by overfitting detector  (30 iterations wait)

best



0:	learn: 0.0355023	test: 0.0376606	best: 0.0376606 (0)	total: 892ms	remaining: 2h 28m 42s
20:	learn: 0.0614933	test: 0.0650796	best: 0.0650796 (20)	total: 17.6s	remaining: 2h 19m 34s
40:	learn: 0.0625300	test: 0.0665077	best: 0.0665077 (40)	total: 33.9s	remaining: 2h 17m 10s
60:	learn: 0.0633953	test: 0.0666065	best: 0.0666065 (60)	total: 50.2s	remaining: 2h 16m 13s
80:	learn: 0.0637820	test: 0.0671247	best: 0.0671343 (79)	total: 1m 7s	remaining: 2h 17m 7s
100:	learn: 0.0642364	test: 0.0674581	best: 0.0674581 (100)	total: 1m 24s	remaining: 2h 17m 54s
120:	learn: 0.0647159	test: 0.0674495	best: 0.0676276 (106)	total: 1m 41s	remaining: 2h 18m 26s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06762756336
bestIteration = 106

Shrink model to first 107 iterations.




0:	learn: 0.0428138	test: 0.0408158	best: 0.0408158 (0)	total: 963ms	remaining: 2h 40m 30s
20:	learn: 0.0623574	test: 0.0593729	best: 0.0593729 (20)	total: 18.5s	remaining: 2h 26m 13s
40:	learn: 0.0633501	test: 0.0597389	best: 0.0600547 (29)	total: 35.7s	remaining: 2h 24m 33s
60:	learn: 0.0640087	test: 0.0602015	best: 0.0602015 (60)	total: 52.9s	remaining: 2h 23m 31s
80:	learn: 0.0647270	test: 0.0602087	best: 0.0604324 (62)	total: 1m 9s	remaining: 2h 22m 44s
100:	learn: 0.0652751	test: 0.0603508	best: 0.0604686 (88)	total: 1m 27s	remaining: 2h 22m 17s
120:	learn: 0.0656914	test: 0.0606103	best: 0.0606879 (118)	total: 1m 44s	remaining: 2h 21m 44s
140:	learn: 0.0660739	test: 0.0608251	best: 0.0609131 (125)	total: 2m 1s	remaining: 2h 21m 11s
160:	learn: 0.0665978	test: 0.0606263	best: 0.0609200 (145)	total: 2m 18s	remaining: 2h 20m 48s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.0609200294
bestIteration = 145

Shrink model to first 146 iterations.




0:	learn: 0.0426949	test: 0.0430064	best: 0.0430064 (0)	total: 968ms	remaining: 2h 41m 21s
20:	learn: 0.0616739	test: 0.0621087	best: 0.0621087 (20)	total: 18.4s	remaining: 2h 25m 39s
40:	learn: 0.0628990	test: 0.0625894	best: 0.0629657 (35)	total: 35.4s	remaining: 2h 23m 27s
60:	learn: 0.0634779	test: 0.0627211	best: 0.0629657 (35)	total: 52.4s	remaining: 2h 22m 22s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06296570219
bestIteration = 35

Shrink model to first 36 iterations.




0:	learn: 0.0341238	test: 0.0325043	best: 0.0325043 (0)	total: 912ms	remaining: 2h 31m 59s
20:	learn: 0.0622246	test: 0.0589543	best: 0.0590330 (19)	total: 18.5s	remaining: 2h 26m 9s
40:	learn: 0.0635265	test: 0.0589980	best: 0.0592217 (31)	total: 35.6s	remaining: 2h 24m 13s
60:	learn: 0.0640481	test: 0.0591955	best: 0.0595899 (53)	total: 52.7s	remaining: 2h 23m 5s
80:	learn: 0.0648303	test: 0.0588941	best: 0.0595899 (53)	total: 1m 9s	remaining: 2h 22m 51s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.05958986989
bestIteration = 53

Shrink model to first 54 iterations.




0:	learn: 0.0450129	test: 0.0413012	best: 0.0413012 (0)	total: 981ms	remaining: 2h 43m 30s
20:	learn: 0.0623917	test: 0.0592824	best: 0.0593887 (18)	total: 18.7s	remaining: 2h 28m 5s
40:	learn: 0.0632533	test: 0.0599833	best: 0.0600591 (34)	total: 36s	remaining: 2h 25m 38s
60:	learn: 0.0638814	test: 0.0604336	best: 0.0608665 (57)	total: 53.2s	remaining: 2h 24m 21s
80:	learn: 0.0643731	test: 0.0611561	best: 0.0611561 (80)	total: 1m 10s	remaining: 2h 23m 3s
100:	learn: 0.0650130	test: 0.0613686	best: 0.0614531 (97)	total: 1m 27s	remaining: 2h 22m 25s
120:	learn: 0.0654887	test: 0.0611866	best: 0.0614531 (97)	total: 1m 44s	remaining: 2h 22m 15s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06145308596
bestIteration = 97

Shrink model to first 98 iterations.




0:	learn: 0.0384561	test: 0.0398231	best: 0.0398231 (0)	total: 953ms	remaining: 2h 38m 49s
20:	learn: 0.0621018	test: 0.0624178	best: 0.0624178 (20)	total: 18.4s	remaining: 2h 25m 39s
40:	learn: 0.0632463	test: 0.0624890	best: 0.0627403 (24)	total: 35.8s	remaining: 2h 24m 48s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06274025278
bestIteration = 24

Shrink model to first 25 iterations.


In [106]:
# 10fold

# Top-12 out-of-fold articles per customer, joined with the ground truth lists.
preds_cv_val = (
    preds_pv[["customer_id", "article_id"]]
    .assign(oof=oof)
    .sort_values(["customer_id", "oof"], ascending=False)
    .groupby("customer_id")
    .head(12)
    .groupby("customer_id")["article_id"]
    .apply(list)
    .reset_index()
    .rename(columns={"article_id": "oof"})
    .merge(df_agg_val_1, on="customer_id", how="left")
    .rename(columns={"article_id": "gts"})
)

# Mean of the per-fold CatBoost validation scores vs. MAP@k on the OOF ranking.
gbdt_cv = np.mean(scores)
cv10 = mapk(preds_cv_val["gts"], preds_cv_val["oof"], k=10)
cv12 = mapk(preds_cv_val["gts"], preds_cv_val["oof"], k=12)

print(f"gbdt_cv: {gbdt_cv:.5f}")
print(f"CV@10: {cv10:.6f}")
print(f"CV: {cv12:.6f}")

gbdt_cv: 0.06297
CV@10: 0.040034
CV: 0.040701


In [107]:
# save oof

stacking_name = "stacking_nb_6+risk"
share_dir = "../share/oof_and_subs"
exp_dir = f"{share_dir}/{stacking_name}"
# makedirs also creates share_dir as an intermediate directory.
os.makedirs(exp_dir, exist_ok=True)

# Keep the 30 best out-of-fold candidates per customer.
preds_cv_val = (
    preds_pv[["customer_id", "article_id"]]
    .assign(pred=oof)
    .sort_values(["customer_id", "pred"], ascending=False)
    .groupby("customer_id")
    .head(30)
)
preds_cv_val.to_pickle(f"{exp_dir}/oof.pickle")

## Inference

In [108]:
%%time

# read sub pred topN

topN = 24
sub_preds = []
for exp_name in tqdm(exps.keys()):
    
    if os.path.exists(exps[exp_name]["sub_pred_top30"] + f"_with_feat_top{topN}"):
        print(exp_name, "skip preprocess...")
        sub_topN = pd.read_pickle(exps[exp_name]["sub_pred_top30"] + f"_with_feat_top{topN}")
    else:
        sub = pd.read_pickle(exps[exp_name]["sub_pred_top30"])
        sub_topN = sub.groupby("customer_id").head(topN).copy()
        # add pred rank
        sub_topN["pred_rank"] = sub_topN.groupby("customer_id")["pred"].rank(ascending=False, method="dense")
        # add pred norm, 時間掛かる...保存する
        sub_topN["pred_norm"] = sub_topN.groupby('customer_id')["pred"].transform(lambda x: (x - x.mean()) / x.std())
        sub_topN.to_pickle(exps[exp_name]["sub_pred_top30"] + f"_with_feat_top{topN}")
    
    sub_topN.loc[:, "model"] = exp_name
    sub_preds.append(sub_topN)

sub_preds = pd.concat(sub_preds)

  0%|          | 0/13 [00:00<?, ?it/s]

myaun_cat_v4-3 skip preprocess...
cat_v4-3_decay_popular_sample skip preprocess...
cat_v4-3-bag skip preprocess...
nari_exp_lgbm_007 skip preprocess...
nari_exp_lgbm_007_fix skip preprocess...
nari_exp_lgbm_020 skip preprocess...
nari_exp_lgbm_023 skip preprocess...
iwata_exp05 skip preprocess...
iwata_exp13 skip preprocess...
minguin_v34 skip preprocess...
minguin_v45 skip preprocess...
moro_sub1 skip preprocess...
moro_sub3 skip preprocess...
CPU times: user 48.7 s, sys: 22.5 s, total: 1min 11s
Wall time: 1min 11s


In [109]:
# reduce_mem_usage is defined earlier in the notebook; presumably it downcasts
# column dtypes to shrink the concatenated prediction frame -- TODO confirm.
sub_preds = reduce_mem_usage(sub_preds)

In [110]:
# Rebuild the article features for the submission setup: every date below is
# 7 days later than in the validation call.
feat = stacking_feat_store(
    df_trans,
    # NOTE(review): preds_pv still holds the validation candidates here. This is
    # harmless because stacking_feat_store does not read l_cust.
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 23),  # not read by stacking_feat_store
    de=datetime.datetime(2020, 9, 29),  # not read by stacking_feat_store
    dsr=datetime.datetime(2020, 9, 16),  # "recent" window start (inclusive)
    der=datetime.datetime(2020, 9, 22),  # "recent" window end; also the "yesterday" day
    dsh=datetime.datetime(2019, 9, 22),  # "history" window start (inclusive)
    deh=datetime.datetime(2020, 9, 22),  # "history" window end (inclusive)
)

In [111]:
# Distinct customers that have at least one prediction row in sub_preds.
customer_ids = sub_preds["customer_id"].unique()

In [112]:
# Number of customers scored per inference batch.
size_block = 30000
# Batch boundaries: every block start plus the final end offset.
# range() never yields len(customer_ids) itself, so the end offset is always
# appended. Building the list this way also works for an empty customer list
# ([0] -> zero batches), where indexing list_slice[-1] used to raise IndexError.
list_slice = [*range(0, len(customer_ids), size_block), len(customer_ids)]

In [114]:
def _pivot_model_scores(preds, value_col):
    """Pivot one per-model score column into wide ``<value_col>_<model>`` columns.

    The result stays indexed by (customer_id, article_id) so several pivots
    can be joined on those keys rather than by row position.

    Args:
        preds: Long frame with ``customer_id``, ``article_id``, ``model`` and
            ``value_col`` columns.
        value_col: Name of the score column to spread across models.

    Returns:
        Wide frame indexed by (customer_id, article_id).
    """
    wide = preds.pivot_table(
        values=[value_col], index=["customer_id", "article_id"], columns=["model"]
    )
    wide.columns = ["_".join(col).strip() for col in wide.columns.values]
    return wide


sub_all = []
sub_preds_top30_all = []
for batch_idx in tqdm(range(len(list_slice) - 1)):
    customer_ids_batch = customer_ids[list_slice[batch_idx] : list_slice[batch_idx + 1]]

    preds = sub_preds[sub_preds.customer_id.isin(customer_ids_batch)]

    # to pv
    # pivot_table drops index rows whose values are all NaN, so the three
    # pivots can differ in length (e.g. a pair whose pred_norm is NaN for every
    # model). Concatenating on the (customer_id, article_id) index keeps the
    # rows aligned; concatenating after reset_index would shift them silently.
    preds_pv = pd.concat(
        [
            _pivot_model_scores(preds, value_col)
            for value_col in ("pred", "pred_rank", "pred_norm")
        ],
        axis=1,
    ).reset_index()

    # agg score

    pred_names = [f"pred_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max"] = preds_pv[pred_names].max(axis=1)
    preds_pv["preds_cnt"] = preds_pv[pred_names].count(axis=1)

    pred_names = [f"pred_rank_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum_rank"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min_rank"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max_rank"] = preds_pv[pred_names].max(axis=1)

    pred_names = [f"pred_norm_{exp_name}" for exp_name in exps.keys()]
    preds_pv["preds_sum_norm"] = preds_pv[pred_names].sum(axis=1)
    preds_pv["preds_min_norm"] = preds_pv[pred_names].min(axis=1)
    preds_pv["preds_max_norm"] = preds_pv[pred_names].max(axis=1)

    # add art attrs

    preds_pv = fast_left_join(
        preds_pv,
        df_art[
            [
                "product_code",
                "product_type_no",
                "product_group_name",  #
                "graphical_appearance_no",
                "colour_group_code",
                "perceived_colour_value_id",  #
                "perceived_colour_master_id",  #
                "department_no",
                "index_code",
                "index_group_no",
                "section_no",
                "garment_group_no",
            ]
        ],
        on="article_id",
    )

    preds_pv = fast_left_join(
        preds_pv,
        df_cust[
            [
                "age",
                "age_id",
                "age_id_1",
                "age_id_2",
                "FN",
                "Active",
                "club_member_status",
                "fashion_news_frequency",
                "postal_code",
                "postal_code_ce",
            ]
        ],
        on="customer_id",
    )

    # add feat

    preds_pv = stacking_add_feat(preds_pv, feat)

    # predict: average the scores of all fold models
    # The feature frame is identical for every model, so build it once.
    X_batch = preds_pv.drop(["customer_id", "article_id"], axis=1)
    preds_tmp = np.zeros(len(preds_pv))
    for fold_model in models:
        preds_tmp += fold_model.predict(X_batch)
    preds_tmp /= len(models)

    # Rank once per batch: best score first, ties broken by recent popularity.
    ranked = preds_pv[["customer_id", "article_id", "art_buy_recent"]].copy()
    ranked["pred"] = preds_tmp
    ranked = ranked.sort_values(
        ["customer_id", "pred", "art_buy_recent"], ascending=False
    )

    # Top-30 rows per customer are kept with their scores for later reuse.
    top30 = ranked.groupby("customer_id").head(30)
    sub_preds_top30_all.append(top30)

    # The submission uses the first 12 of those, space-joined per customer.
    sub_batch = (
        top30.groupby("customer_id")
        .head(12)
        .groupby("customer_id")["article_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"article_id": "prediction"})
    )
    sub_batch["prediction"] = sub_batch["prediction"].apply(lambda x: " ".join(x))
    sub_all.append(sub_batch)

  0%|          | 0/46 [00:00<?, ?it/s]

In [115]:
# Stack the per-batch submissions into one frame ordered by customer.
sub_all = pd.concat(sub_all).sort_values("customer_id").reset_index(drop=True)

In [117]:
# Create the submission directory if it is missing so that writing the CSV
# does not fail on a fresh checkout. The CV score is embedded in the file name.
sub_dir = "../sub"
os.makedirs(sub_dir, exist_ok=True)
sub_all.to_csv(f"{sub_dir}/{stacking_name}_{cv12:.5f}.csv", index=False)

In [118]:
# Display the final submission frame (one space-joined prediction string per customer).
sub_all

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601006 0568601043 0568601044 0568601007 0568601030 0568601023 0918890002 0673677002 09242430...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0918522001 0714790020 0924243002 0448509014 0351484002 0918292001 0915529003 0673677002 08635830...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0794321011 0794321008 0924243001 0924243002 0918522001 0866731001 0915529005 09182920...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0805000001 0804992014 0751471001 0740519002 0751471043 0852584001 0730683050 0673677002 08050000...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0730683050 0896152002 0791587001 0730683062 0791587015 0791587021 0924243002 0896152001 08667310...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,0557599022 0791587001 0822344001 0791587015 0720125039 0804992014 0740922009 0791587010 08049920...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,0762846027 0762846026 0762846006 0762846031 0706016001 0448509014 0762846008 0762846029 07060160...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264,0762846027 0762846006 0762846031 0762846026 0762846029 0762846008 0706016002 0673677002 08840810...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38b2236865d949d4df6a,0714790020 0448509014 0821395005 0874110016 0866731001 0914441004 0714790028 0714790024 05730850...


In [119]:
# save sub preds

# NOTE: this rebinds sub_preds from the per-model input predictions to the
# stacked top-30 output, so the batch inference cell cannot be re-run afterwards
# without reloading the inputs.
sub_preds = (
    pd.concat(sub_preds_top30_all)
    .sort_values(["customer_id", "pred"], ascending=False)
    .reset_index(drop=True)
)

sub_preds.to_pickle(f"{exp_dir}/sub_pred_top30.pickle")

In [121]:
# Display the stacked top-30 predictions that were just pickled.
sub_preds

Unnamed: 0,customer_id,article_id,art_buy_recent,pred
0,ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241,0806050001,18.0,1.788807
1,ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241,0924243002,545.0,0.913257
2,ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241,0930380001,297.0,0.787084
3,ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241,0924243001,775.0,0.637009
4,ffffd9ac14e89946416d80e791d064701994755c3ab686a1eaf3458c36f52241,0751471043,315.0,0.586333
...,...,...,...,...
41159385,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0898692006,289.0,-0.057666
41159386,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0865929003,222.0,-0.088662
41159387,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0839332001,97.0,-0.093574
41159388,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0907188001,179.0,-0.098454
