In [1]:
import cudf
from cuml import ForestInference
from cuml.preprocessing.TargetEncoder import TargetEncoder
from catboost import CatBoostClassifier, CatBoostRanker, Pool

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import gc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import datetime
import itertools
import os
from contextlib import redirect_stdout
from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

#カラム内の文字数。デフォルトは50だった
pd.set_option("display.max_colwidth", 100)

#行数
pd.set_option("display.max_rows", 500)

#列数
pd.set_option("display.max_columns", 300)

def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

def apk_list(actual, predicted, k=10):
    return [apk(a,p,k) for a,p in zip(actual, predicted)]

# https://www.kaggle.com/tkm2261/fast-pandas-left-join-357x-faster-than-pd-merge
# add mulple index option
def fast_left_join(df, join_df, on):
    if isinstance(on, list):
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(pd.MultiIndex.from_tuples(tuple(map(tuple, df[on].values)), names=on)).reset_index(drop=True)
            ],
            axis=1,
        )        
    else:
        return pd.concat(
            [
                df.reset_index(drop=True),
                join_df.reindex(df[on].values).reset_index(drop=True),
            ],
            axis=1,
        )

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for i, col in enumerate(df.columns):
        try:
            col_type = df[col].dtype

            if col_type != object:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        df[col] = df[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        df[col] = df[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        df[col] = df[col].astype(np.int32)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float32)
        except ValueError:
            continue

    end_mem = df.memory_usage().sum() / 1024 ** 2
    # print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [2]:
def calc_mapk(preds, df_agg_val_1):
    tmp = preds.groupby("customer_id").head(12)
    tmp = tmp.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"preds"})
    tmp = tmp.merge(df_agg_val_1, how="left", on="customer_id").rename(columns={"article_id":"gts"})
    mapk_val = mapk(tmp["gts"], tmp["preds"])
    print(f"mapk:{mapk_val:.5f}")

In [3]:
tran_dtypes = {
    "t_dat": "str",
    "customer_id": "str",
    "article_id": "int",
    "product_code": "int",
    "price": "float",
    "sales_channel_id": "int",
}
art_dtypes = {
    "article_id": "int",
    "product_code": "int",
    "product_type_no": "int",
    "product_group_name": "str",
    "graphical_appearance_no": "int",
    "colour_group_code": "int",
    "perceived_colour_value_id": "int",
    "perceived_colour_master_id": "int",
    "department_no": "int",
    "index_code": "str",
    "index_group_no": "int",
    "section_no": "int",
    "garment_group_no": "int",
}
cust_dtypes = {"customer_id": "str"}

## Ensemble

In [4]:
df_agg_val_1 = pd.read_pickle("../exp_iwata/cache/v1/df_agg_val_1_2020-09-23.pkl")

In [5]:
df_agg_val_1 = df_agg_val_1.rename(columns={"article_id": "gts"})

# Read OOF

# Stacking

In [6]:
path = "../input/h-and-m-personalized-fashion-recommendations/"

df_trans = pd.read_csv(path + "transactions_train.csv", dtype=tran_dtypes)
df_trans["t_dat"] = pd.to_datetime(df_trans["t_dat"], format="%Y-%m-%d")
df_trans = df_trans.drop_duplicates(["customer_id", "article_id", "t_dat"])
df_trans["article_id"] = df_trans["article_id"].astype(str).str.zfill(10)

df_trans_week1 = df_trans[df_trans.t_dat > datetime.datetime(2020, 9, 15)]
df_trans_week1["target"] = 1
df_trans_week1 = df_trans_week1[['customer_id', 'article_id', "target"]]

In [7]:
df_art = pd.read_csv(path + "articles.csv", dtype=art_dtypes)
df_art["article_id"] = df_art["article_id"].astype(str).str.zfill(10)
le = LabelEncoder()
le.fit(df_art["index_code"].unique())
df_art["index_code"] = le.transform(df_art["index_code"])
le = LabelEncoder()
le.fit(df_art["product_group_name"].unique())
df_art["product_group_name"] = le.transform(df_art["product_group_name"])

df_cust = pd.read_csv(path + "customers.csv", dtype=cust_dtypes)
df_cust["age"] = df_cust["age"].fillna(df_cust["age"].mean())

# age_id
df_cust["age"] = df_cust["age"].astype(int)
customer_age_gorup = pd.read_csv("../save/customer_age_gorup.csv")
df_cust = df_cust.merge(customer_age_gorup, how="left", on=["age"])
df_cust["age_2"] = df_cust["age"]
df_cust.loc[df_cust["age"] >= 61, "age_2"] = 61
df_cust["age_id_1"] = df_cust["age_2"] // 10
df_cust["age_id_2"] = df_cust["age_2"] // 5
df_cust = df_cust.drop(columns=["age_2"])

df_cust[["FN", "Active"]] = df_cust[["FN", "Active"]].fillna(0)
df_cust["club_member_status"] = df_cust["club_member_status"].apply(
    lambda x: 1 if x == "ACTIVE" else 0
)
df_cust["fashion_news_frequency"] = df_cust["fashion_news_frequency"].apply(
    lambda x: 0 if x == "NONE" else 1
)

# postal_code_ce
df_cust["postal_code_ce"] = df_cust["postal_code"].map(
    df_cust["postal_code"].value_counts()
)

# postal_code, 10以下をまとめる
postal_code_cnt = df_cust["postal_code"].value_counts().reset_index()
postal_code_cnt.columns = ["postal_code", "cnt"]

code_map = {
    i: 0 for i in postal_code_cnt[postal_code_cnt.cnt <= 10]["postal_code"].values
}
for i, code in enumerate(
    postal_code_cnt[postal_code_cnt.cnt > 10]["postal_code"].values
):
    code_map[code] = i + 1

df_cust["postal_code"] = df_cust["postal_code"].map(code_map)

df_art.index = df_art.article_id
df_art.index.name = "article_id"
df_art = df_art.drop(columns=["article_id"])

df_cust.index = df_cust.customer_id
df_cust.index.name = "customer_id"
df_cust = df_cust.drop(columns=["customer_id"])

## Read OOF

In [8]:
def read_oof(exp_name):
    if exp_name in ["myaun_cat_v4-3", "cat_v4-3_decay_popular_sample"]:
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    elif exp_name == "nari_exp_lgbm_007":
        oof = pd.read_feather(exps[exp_name]["oof"])
        oof = oof.rename(columns={"oof":"pred"})
        oof = oof[~(oof.pred==-1.0)]
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    elif exp_name == "iwata_exp05":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["article_id"] = oof["article_id"].astype(str).str.zfill(10)
    # フォーマットが違う, ブレンドには使える
    elif exp_name == "minguin_v28":
        oof = pd.read_csv(exps[exp_name]["oof"])
        oof["LGBMRanker"] = oof["LGBMRanker"].str.split(" ")
    elif exp_name == "moro_sub1":
        oof = pd.read_pickle(exps[exp_name]["oof"])
        oof = oof.drop(columns=["true"])
        oof = oof.sort_values(["customer_id", "pred"], ascending=False)
    return oof

exps = {
    "myaun_cat_v4-3": {
        "lb":0.0307,
        "cv":0.03539,
        "oof":"../exp_iwata/cat_v4-3/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_0.03539.csv",
    },
    "cat_v4-3_decay_popular_sample": {
        "lb":0.0,
        "cv":0.03542,
        "oof":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval.pkl",
        "sub_pred_top30": "../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_decay_popular_sample_0.03542.csv",
    },
    "nari_exp_lgbm_007": {
        "lb":0.0302,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366_top30.pickle",
        "sub":"../sub/exp_lgbm_007_20220425033232_0.0366.csv",
    },
    "iwata_exp05": {
        "lb":0.0303,
        "cv":0.3559,
        "oof":"../input/iwata/oof_top200/oof_exp05_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp05_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp05_2020-09-23_top30.pickle",
        "sub":"../sub/submission_exp05.csv",
    },
    # "minguin_v28": {
    #     "lb":0.0294,
    #     "cv":0.03079,
    #     "oof":"../input/minguin/sub1/oof_h_m_v28_rerun_0294.csv",
    #     "sub":"../input/minguin/sub1/submission_h_m_v28_rerun_0294.csv",
    # },
    # "moro_sub1": {
    #     "lb":0.0301,
    #     "cv":0.03249,
    #     "oof":"../input/moro/sub1_fix/train_oof_train123_all.pickle",
    #     "sub":"../input/moro/sub1/submission_lgb_train123_0.03249.csv",
    # },
}

In [9]:
oofs = {
    exp_name: read_oof(exp_name=exp_name) for exp_name in exps.keys()
}

## stacking exp

In [10]:
topN = 15
preds = []
for exp_name in exps.keys():
    oof = oofs[exp_name]
    oof_topN = oof.groupby("customer_id").head(topN).copy()
    oof_topN.loc[:, "model"] = exp_name
    preds.append(oof_topN)

preds = pd.concat(preds)

In [11]:
preds_pv = preds.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
preds_pv = preds_pv.reset_index()
preds_pv.columns = ['_'.join(col).strip() for col in preds_pv.columns.values]
preds_pv = preds_pv.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})

In [12]:
# add art attrs

preds_pv = fast_left_join(
    preds_pv,
    df_art[
        [
            "product_code",
            "product_type_no",
            "product_group_name",  #
            "graphical_appearance_no",
            "colour_group_code",
            "perceived_colour_value_id",  #
            "perceived_colour_master_id",  #
            "department_no",
            "index_code",
            "index_group_no",
            "section_no",
            "garment_group_no",
        ]
    ],
    on="article_id",
)

preds_pv = fast_left_join(
    preds_pv,
    df_cust[
        [
            "age",
            "age_id",
            "FN",
            "Active",
            "club_member_status",
            "fashion_news_frequency",
            "postal_code",
            "postal_code_ce",
        ]
    ],
    on="customer_id",
)

In [13]:
%%time

# add hist features

len_hist = 366
dev = "cpu"
def stacking_feat_store(df_trans, l_cust, ds, de, dsr, der, dsh, deh):
    feat = {}
    
    df_trans_yesterday = df_trans.query("(t_dat == @der)")  # 1day
    df_trans_recent = df_trans.query("(t_dat >= @dsr) and (t_dat <= @der)")  # 1week
    df_trans_hist = df_trans.query("(t_dat >= @dsh) and (t_dat <= @deh)")  # 1year
    
    # make decay count
    df_trans_hist["t_delay"] = (
        df_trans_hist["t_dat"].max() - df_trans_hist["t_dat"]
    ).dt.days
    df_trans_hist["decay_count"] = 1 / (1 + df_trans_hist["t_delay"] // 7)
    
    # agg art
    feat["art_buy_hist"] = df_trans_hist.groupby(["article_id"])["t_dat"].agg(
        art_buy_hist="count"
    )
    feat["art_buy_recent"] = df_trans_recent.groupby(["article_id"])["t_dat"].agg(
        art_buy_recent="count"
    )
    feat["art_buy_yesterday"] = df_trans_yesterday.groupby(["article_id"])[
        "t_dat"
    ].agg(art_buy_yesterday="count")
    feat["art_buy_hist_decay"] = df_trans_hist.groupby(["article_id"])["decay_count"].agg(
        art_buy_hist_decay="sum"
    )

    for k in feat.keys():
        feat[k] = reduce_mem_usage(feat[k])
    if dev == "gpu":
        feat[k] = cudf.from_pandas(feat[k])
    return feat

def stacking_add_feat(df, feat):
    df = reduce_mem_usage(df)
    if dev == "gpu":
        df = cudf.from_pandas(df)

    # merge aid
    for col in [
        "art_buy_hist",
        "art_buy_recent",
        "art_buy_yesterday",
        "art_buy_hist_decay",
    ]:
        if dev == "gpu":
            df = df.merge(
                feat[col], how="left", left_on=["article_id"], right_index=True
            )
        else:
            df = fast_left_join(df, feat[col], on="article_id")
    
    if dev == "gpu":
        df = df.to_pandas()
    return df

feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 16),
    de=datetime.datetime(2020, 9, 22),
    dsr=datetime.datetime(2020, 9, 9),
    der=datetime.datetime(2020, 9, 15),
    dsh=datetime.datetime(2019, 9, 15),
    deh=datetime.datetime(2020, 9, 15),
)

preds_pv = stacking_add_feat(preds_pv, feat)

CPU times: user 6.03 s, sys: 769 ms, total: 6.8 s
Wall time: 6.68 s


In [14]:
# add target

preds_pv = preds_pv.merge(df_trans_week1, how="left", on=['customer_id', 'article_id'])
preds_pv["target"] = preds_pv["target"].fillna(0).astype(int)

In [15]:
preds_pv.head()

Unnamed: 0,customer_id,article_id,pred_cat_v4-3_decay_popular_sample,pred_iwata_exp05,pred_myaun_cat_v4-3,pred_nari_exp_lgbm_007,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,age,age_id,FN,Active,club_member_status,fashion_news_frequency,postal_code,postal_code_ce,art_buy_hist,art_buy_recent,art_buy_yesterday,art_buy_hist_decay,target
0,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,156231001,,0.793522,,,156231,304,13,1010016,9,4,5,3608,1,1,62,1021,27,5,0.0,0.0,1,0,0,5,6028.0,123.0,8.0,436.081482,0
1,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,158340001,2.131669,0.839775,2.133757,,158340,273,6,1010016,9,4,5,3608,1,1,62,1021,27,5,0.0,0.0,1,0,0,5,6573.0,186.0,24.0,507.398865,0
2,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,372860001,1.92288,0.764054,,-0.128967,372860,302,13,1010016,9,4,5,3611,1,1,62,1021,27,5,0.0,0.0,1,0,0,5,14123.0,249.0,13.0,1199.171875,0
3,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,372860002,,0.744698,,,372860,302,13,1010016,10,3,9,3611,1,1,62,1021,27,5,0.0,0.0,1,0,0,5,11951.0,226.0,22.0,1221.164795,0
4,00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793,436261001,2.865777,0.908505,2.874707,,436261,273,6,1010016,9,4,5,3608,1,1,62,1021,27,5,0.0,0.0,1,0,0,5,3865.0,35.0,3.0,211.724411,0


In [16]:
from sklearn.model_selection import GroupKFold

FOLD_NUM = 5
RANDOM_STATE = 46
CAT_PARAMS = {
    "depth": 5,
    "learning_rate": 0.1,
    "boosting_type": "Plain",
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "reg_lambda": 0.001,
    "iterations": 10000,
    "od_type": "Iter",
    "od_wait": 30,
    "metric_period": 20,
    "random_seed": RANDOM_STATE,
    "task_type": "CPU",
    # "task_type": "GPU",
    # "gpu_ram_part": 0.95,
    # "devices": "1",
    "verbose": True,
    "loss_function": "YetiRank",
    "eval_metric": "MAP:top=12",
}

models = []
scores = []
oof = np.zeros(len(preds_pv))
y = preds_pv['target'].values
groups = preds_pv['customer_id'].values
kfold = GroupKFold(n_splits=FOLD_NUM)
for fold, (trn, val) in enumerate(kfold.split(preds_pv, y, groups)):
    
    df_train = preds_pv.iloc[trn]
    df_val = preds_pv.iloc[val]
    
    df_train = df_train.sort_values("customer_id")
    X_train = df_train.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_train = df_train["target"]
    idx = df_train.groupby("customer_id")["customer_id"].count().values
    group_id_train = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    df_val = df_val.sort_values("customer_id")
    X_val = df_val.drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    y_val = df_val["target"]
    idx = df_val.groupby("customer_id")["customer_id"].count().values
    group_id_val = [
        i for i, bascket_num in enumerate(idx) for _ in range(bascket_num)
    ]

    cat_features_index = []
    train_pool = Pool(
        data=X_train,
        label=y_train,
        group_id=group_id_train,
        cat_features=cat_features_index,
    )
    val_pool = Pool(
        data=X_val,
        label=y_val,
        group_id=group_id_val,
        cat_features=cat_features_index,
    )

    model = CatBoostRanker(**CAT_PARAMS)
    model.fit(train_pool, eval_set=val_pool)
    gbdt_score = model.best_score_["validation"]["MAP:top=12"]
    
    # not sorted sample
    X_val = preds_pv.iloc[val].drop(
        [
            "customer_id",
            "article_id",
            "target",
        ],
        axis=1,
    )
    preds = model.predict(X_val)
    oof[val] = preds
    scores.append(gbdt_score)
    models.append(model)



0:	learn: 0.0371723	test: 0.0393621	best: 0.0393621 (0)	total: 381ms	remaining: 1h 3m 26s
20:	learn: 0.0635651	test: 0.0664345	best: 0.0664345 (20)	total: 5.16s	remaining: 40m 50s
40:	learn: 0.0652268	test: 0.0671958	best: 0.0672540 (37)	total: 9.85s	remaining: 39m 53s
60:	learn: 0.0659804	test: 0.0676563	best: 0.0676563 (60)	total: 14.6s	remaining: 39m 42s
80:	learn: 0.0663866	test: 0.0679430	best: 0.0679430 (80)	total: 19.3s	remaining: 39m 21s
100:	learn: 0.0671970	test: 0.0678164	best: 0.0682240 (85)	total: 23.9s	remaining: 39m 4s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06822404074
bestIteration = 85

Shrink model to first 86 iterations.




0:	learn: 0.0377061	test: 0.0373393	best: 0.0373393 (0)	total: 269ms	remaining: 44m 49s
20:	learn: 0.0642019	test: 0.0626487	best: 0.0629576 (16)	total: 4.81s	remaining: 38m 5s
40:	learn: 0.0656921	test: 0.0641341	best: 0.0641356 (39)	total: 9.39s	remaining: 38m
60:	learn: 0.0665413	test: 0.0644006	best: 0.0644006 (60)	total: 14s	remaining: 38m 8s
80:	learn: 0.0671609	test: 0.0644839	best: 0.0646486 (66)	total: 18.6s	remaining: 37m 54s
100:	learn: 0.0676795	test: 0.0648601	best: 0.0649245 (96)	total: 25s	remaining: 40m 49s
120:	learn: 0.0683581	test: 0.0648910	best: 0.0650168 (107)	total: 33.9s	remaining: 46m 9s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06501682605
bestIteration = 107

Shrink model to first 108 iterations.




0:	learn: 0.0380290	test: 0.0383102	best: 0.0383102 (0)	total: 258ms	remaining: 43m 3s
20:	learn: 0.0643970	test: 0.0646380	best: 0.0646380 (20)	total: 4.89s	remaining: 38m 43s
40:	learn: 0.0657170	test: 0.0657093	best: 0.0657093 (40)	total: 9.44s	remaining: 38m 12s
60:	learn: 0.0664209	test: 0.0656253	best: 0.0659833 (44)	total: 14s	remaining: 37m 54s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06598325308
bestIteration = 44

Shrink model to first 45 iterations.




0:	learn: 0.0372065	test: 0.0379759	best: 0.0379759 (0)	total: 254ms	remaining: 42m 18s
20:	learn: 0.0642308	test: 0.0636657	best: 0.0636657 (20)	total: 4.87s	remaining: 38m 34s
40:	learn: 0.0660219	test: 0.0641866	best: 0.0643371 (38)	total: 9.42s	remaining: 38m 7s
60:	learn: 0.0668040	test: 0.0645513	best: 0.0646168 (58)	total: 14s	remaining: 37m 55s
80:	learn: 0.0673764	test: 0.0643960	best: 0.0646168 (58)	total: 18.5s	remaining: 37m 45s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06461682988
bestIteration = 58

Shrink model to first 59 iterations.




0:	learn: 0.0377118	test: 0.0348837	best: 0.0348837 (0)	total: 259ms	remaining: 43m 14s
20:	learn: 0.0642950	test: 0.0596436	best: 0.0597151 (19)	total: 4.84s	remaining: 38m 21s
40:	learn: 0.0661121	test: 0.0616789	best: 0.0616789 (40)	total: 9.41s	remaining: 38m 5s
60:	learn: 0.0669863	test: 0.0619871	best: 0.0621860 (59)	total: 13.9s	remaining: 37m 51s
80:	learn: 0.0677475	test: 0.0627075	best: 0.0627910 (79)	total: 18.5s	remaining: 37m 48s
100:	learn: 0.0684462	test: 0.0627927	best: 0.0628066 (97)	total: 23.1s	remaining: 37m 41s
120:	learn: 0.0689833	test: 0.0629176	best: 0.0630523 (116)	total: 27.7s	remaining: 37m 38s
140:	learn: 0.0695110	test: 0.0634485	best: 0.0634485 (140)	total: 32.2s	remaining: 37m 30s
160:	learn: 0.0699438	test: 0.0635611	best: 0.0635808 (159)	total: 36.7s	remaining: 37m 24s
180:	learn: 0.0705370	test: 0.0635120	best: 0.0636436 (165)	total: 41.3s	remaining: 37m 19s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.06364358165
bestIteration 

In [17]:
# add aid hist feat (only 3 + hist_decay)


preds_cv = preds_pv[["customer_id", "article_id"]].copy()
preds_cv["oof"] = oof

preds_cv = preds_cv.sort_values(["customer_id", "oof"], ascending=False)
preds_cv = preds_cv.groupby("customer_id").head(12)
preds_cv = preds_cv.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"oof"})
preds_cv = preds_cv.merge(df_agg_val_1, on="customer_id", how="left").rename(columns={"article_id":"gts"})

gbdt_cv = np.mean(scores)
cv = mapk(preds_cv["gts"], preds_cv["oof"])

print(f"gbdt_cv: {gbdt_cv}")
print(f"CV: {cv}")

gbdt_cv: 0.06549690628096158
CV: 0.03822190930938974


In [32]:
exps = {
    "myaun_cat_v4-3": {
        "lb":0.0307,
        "cv":0.03539,
        "oof":"../exp_iwata/cat_v4-3/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3/df_ans_eval.pkl",
        "sub_pred_top30":"../exp_iwata/cat_v4-3/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_0.03539.csv",
    },
    "cat_v4-3_decay_popular_sample": {
        "lb":0.0,
        "cv":0.03542,
        "oof":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_mid1_-1.pkl",
        "sub_pred":"../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval.pkl",
        "sub_pred_top30": "../exp_iwata/cat_v4-3_decay_popular_sample/df_ans_eval_top30.pkl",
        "sub":"../sub/cat_v4-3_decay_popular_sample_0.03542.csv",
    },
    "nari_exp_lgbm_007": {
        "lb":0.0302,
        "cv":0.0366,
        "oof":"../input/nari/oof_top200/oof_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred":"../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366.feather",
        "sub_pred_top30": "../input/nari/sub_top200/sub_exp_lgbm_007_20220428230759_0.0366_top30.pickle",
        "sub":"../sub/exp_lgbm_007_20220425033232_0.0366.csv",
    },
    "iwata_exp05": {
        "lb":0.0303,
        "cv":0.3559,
        "oof":"../input/iwata/oof_top200/oof_exp05_2020-09-16.csv",
        "sub_pred":"../input/iwata/sub_top200/oof_submission_exp05_2020-09-23.csv",
        "sub_pred_top30": "../input/iwata/sub_top200/oof_submission_exp05_2020-09-23_top30.pickle",
        "sub":"../sub/submission_exp05.csv",
    },
    # "minguin_v28": {
    #     "lb":0.0294,
    #     "cv":0.03079,
    #     "oof":"../input/minguin/sub1/oof_h_m_v28_rerun_0294.csv",
    #     "sub":"../input/minguin/sub1/submission_h_m_v28_rerun_0294.csv",
    # },
    # "moro_sub1": {
    #     "lb":0.0301,
    #     "cv":0.03249,
    #     "oof":"../input/moro/sub1_fix/train_oof_train123_all.pickle",
    #     "sub":"../input/moro/sub1/submission_lgb_train123_0.03249.csv",
    # },
}

## Inference

In [33]:
# read sub pred topN

topN = 15
sub_preds = []
for exp_name in tqdm(exps.keys()):
    sub = pd.read_pickle(exps[exp_name]["sub_pred_top30"])
    sub_topN = sub.groupby("customer_id").head(topN).copy()
    sub_topN.loc[:, "model"] = exp_name
    sub_preds.append(sub_topN)

sub_preds = pd.concat(sub_preds)

  0%|          | 0/4 [00:00<?, ?it/s]

In [42]:
feat = stacking_feat_store(
    df_trans,
    l_cust=preds_pv["customer_id"].unique(),
    ds=datetime.datetime(2020, 9, 23),
    de=datetime.datetime(2020, 9, 29),
    dsr=datetime.datetime(2020, 9, 16),
    der=datetime.datetime(2020, 9, 22),
    dsh=datetime.datetime(2019, 9, 22),
    deh=datetime.datetime(2020, 9, 22),
)

In [34]:
customer_ids = sub_preds["customer_id"].unique()

In [68]:
size_block = 10000
list_slice = list(range(0, len(customer_ids), size_block))
if list_slice[-1] != len(customer_ids):
    list_slice.append(len(customer_ids))

In [69]:
sub_all = []
for batch_idx in tqdm(range(len(list_slice) - 1)):
    customer_ids_batch = customer_ids[list_slice[batch_idx] : list_slice[batch_idx + 1]]
    
    sub_preds_batch = sub_preds[sub_preds.customer_id.isin(customer_ids_batch)]
    
    # to pv
    sub_preds_batch = sub_preds_batch.pivot_table(values=['pred'], index=['customer_id', 'article_id'], columns=['model'])
    sub_preds_batch = sub_preds_batch.reset_index()
    sub_preds_batch.columns = ['_'.join(col).strip() for col in sub_preds_batch.columns.values]
    sub_preds_batch = sub_preds_batch.rename(columns={"customer_id_":"customer_id", "article_id_":"article_id"})
    
    # add art attrs
    sub_preds_batch = fast_left_join(
        sub_preds_batch,
        df_art[
            [
                "product_code",
                "product_type_no",
                "product_group_name",  #
                "graphical_appearance_no",
                "colour_group_code",
                "perceived_colour_value_id",  #
                "perceived_colour_master_id",  #
                "department_no",
                "index_code",
                "index_group_no",
                "section_no",
                "garment_group_no",
            ]
        ],
        on="article_id",
    )

    sub_preds_batch = fast_left_join(
        sub_preds_batch,
        df_cust[
            [
                "age",
                "age_id",
                "FN",
                "Active",
                "club_member_status",
                "fashion_news_frequency",
                "postal_code",
                "postal_code_ce",
            ]
        ],
        on="customer_id",
    )
    
    # add feat
    sub_preds_batch = stacking_add_feat(sub_preds_batch, feat)
    
    # predict (5fold)
    preds_tmp = np.zeros(len(sub_preds_batch))
    for i in range(len(models)):
        preds_tmp += models[i].predict(sub_preds_batch.drop(
            [
                "customer_id",
                "article_id",
            ],
            axis=1,
        ))
    preds_tmp /= len(models)
    
    sub_batch = sub_preds_batch[["customer_id", "article_id"]].copy()
    sub_batch["pred"] = preds_tmp

    sub_batch = sub_batch.sort_values(["customer_id", "pred"], ascending=False)
    sub_batch = sub_batch.groupby("customer_id").head(12)
    sub_batch = sub_batch.groupby("customer_id")["article_id"].apply(list).reset_index().rename(columns={"article_id":"prediction"})
    sub_batch["prediction"] = sub_batch["prediction"].apply(lambda x:' '.join(x))
    
    sub_all.append(sub_batch)

  0%|          | 0/138 [00:00<?, ?it/s]

In [83]:
sub_all = pd.concat(sub_all)
sub_all = sub_all.sort_values("customer_id")
sub_all = sub_all.reset_index(drop=True)

In [86]:
sub_all.to_csv(f"../sub/stacking-v1_{cv:.5f}.csv", index=False)

In [87]:
sub_all

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657,0568601006 0568601043 0568601044 0673677002 0579541001 0678942001 0568601023 0779781015 05686010...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa,0918522001 0924243002 0915529003 0448509014 0902528006 0909059002 0788575004 0714790020 09292750...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318,0794321007 0794321011 0866731001 0794321008 0924243002 0918292001 0918522001 0915529005 09155290...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e,0924243002 0852584001 0866731001 0730683050 0791587001 0751471043 0804992014 0740519002 09182920...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a,0730683050 0791587001 0896152002 0791587015 0730683062 0866731001 0924243002 0791587021 08525840...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e4747568cac33e8c541831,0557599022 0791587001 0866731001 0791587015 0791587010 0822344001 0804992014 0804992013 09182920...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab53481233731b5c4f8b7,0762846027 0762846031 0706016001 0706016003 0762846006 0762846008 0448509014 0706016062 07060160...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1778d0116cffd259264,0762846027 0762846006 0762846031 0762846026 0762846029 0762846008 0706016002 0850917001 06736770...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38b2236865d949d4df6a,0714790020 0448509014 0874110016 0855706009 0866731001 0714790028 0821395005 0914441004 07147900...
