In [None]:
import pickle
import os
import gc
import math
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import StratifiedGroupKFold
from catboost import CatBoost
from catboost import Pool
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) — confirm that is intended.
warnings.filterwarnings("ignore")

# Environment flag for CUDA; presumably set so GPU errors are reported synchronously — TODO confirm it is still needed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Show the CPU count reported by the OS.
print(os.cpu_count())

In [None]:
# Global settings
# NOTE(review): both paths are empty placeholders and must be filled in before running;
# os.makedirs("") fails with FileNotFoundError.

ROOT = ""  # competition working directory
OUTPUT_DIR = ""
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Memory reduction (numeric columns only)
def reduce_mem_usage_for_numeric(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype name contains ``"int"`` are cast to the first of
    int8 / int16 / int32 / int64 whose range contains the column's min and max
    (bounds inclusive). Columns whose dtype name contains ``"float"`` are cast
    to float32 when their range fits in float32, otherwise to float64; values
    are therefore rounded to float32 precision. float16 is deliberately not
    used. All other columns are left untouched.

    The memory usage before and after, and the relative saving, are printed.

    Args:
        df: pandas DataFrame. It is modified in place.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        col_type = str(df[col].dtype)
        is_int = "int" in col_type
        if not is_int and "float" not in col_type:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if is_int:
            # Inclusive bounds: a column spanning exactly [-128, 127] still fits int8.
            # If no candidate fits (e.g. min/max are NaN for an empty column) the
            # column keeps its dtype.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(int_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            bounds = np.finfo(np.float32)
            # NaN or infinite min/max make the comparison False, which selects float64.
            fits_float32 = bounds.min <= c_min and c_max <= bounds.max
            df[col] = df[col].astype(np.float32 if fits_float32 else np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio for a frame that reports zero bytes.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decreased))

    return df

In [None]:
# Shows the CV score in the ideal case (the recall the candidate set itself allows).
def calc_candidates_recall(candidates, label, type):
    """Print and return the recall of ``candidates`` against the ground truth.

    For every row of ``label`` whose ``type`` column equals ``type``, the
    number of ground-truth aids present among that session's candidate aids is
    counted. Both the hit count and the ground-truth count are capped at 20
    per row. The recall is the sum of hits divided by the sum of capped
    ground-truth counts.

    Args:
        candidates: DataFrame with ``session`` and ``aid`` columns.
        label: DataFrame with ``session``, ``type`` and ``ground_truth``
            columns, where ``ground_truth`` holds a collection of aids.
        type: value of ``label["type"]`` to evaluate. The name shadows the
            builtin inside this function; it is kept for caller compatibility.

    Returns:
        The recall as a float, or NaN when there is no ground truth for ``type``.
    """
    pred = candidates.groupby("session").aid.apply(set).reset_index()
    gt = label[label["type"] == type]
    gt_pred = gt.merge(pred, on="session", how="left")

    hits = 0
    gt_count = 0
    # Plain iteration instead of DataFrame.apply(axis=1): cheaper per row and
    # well-defined when gt_pred is empty.
    for truth, aids in zip(gt_pred["ground_truth"], gt_pred["aid"]):
        # Sessions without any candidate (possible after negative sampling)
        # come out of the left merge as NaN; treat them as an empty set.
        if not isinstance(aids, set):
            aids = set()
        hits += min(len(set(truth) & aids), 20)
        gt_count += min(len(truth), 20)

    recall = hits / gt_count if gt_count else float("nan")
    print(recall)
    return recall

# data


In [None]:
# Validation labels and the trimmed validation sessions.
# NOTE(review): valid_session is not referenced anywhere else in the visible notebook — confirm it is needed.
# NOTE(review): read_pickle runs arbitrary code from the file; only load trusted pickles.
valid_label = pd.read_pickle(f"{ROOT}/data/input/valid_label_1week.pkl")
valid_session = pd.read_pickle(f"{ROOT}/data/input/valid_trimed_session_1week.pkl")

# make_candidate.ipynb で作成した candidates を load & 目的変数生成


In [None]:
# Cart/order candidates read from OUTPUT_DIR (presumably written by make_candidate.ipynb — verify).
valid_candidates = pd.read_pickle(f"{OUTPUT_DIR}/valid_cart_order_candidates.pkl")

In [None]:
# Ground truth for orders, one row per (session, aid) pair.
gt = valid_label[valid_label["type"] == "orders"]
gt = gt.explode("ground_truth").reset_index(drop=True)
# A repeated (session, aid) pair on the right side of the left merge below would
# duplicate the matching candidate row, so keep each pair once.
gt = gt.drop_duplicates(subset=["session", "ground_truth"])

# The candidates are labelled in row chunks to limit peak memory.
CHUNKS = 4
chunk_len = math.ceil(len(valid_candidates) / CHUNKS)

chunk_candidates = []
for chunk in range(CHUNKS):
    print("chunk", chunk)
    target_candidates = valid_candidates.iloc[
        chunk * chunk_len : (chunk + 1) * chunk_len
    ]

    target_candidates = target_candidates.merge(
        gt, left_on=["session", "aid"], right_on=["session", "ground_truth"], how="left"
    )

    # target = 1 when the candidate matched a ground-truth row, else 0.
    target_candidates["target"] = (
        target_candidates["ground_truth"].notna().astype(np.int64)
    )

    target_candidates = target_candidates.drop(["type", "ground_truth"], axis=1)

    # Negative sampling: keep every positive and a fixed-seed 10% of the negatives.
    positives = target_candidates.loc[target_candidates["target"] == 1]
    negatives = target_candidates.loc[target_candidates["target"] == 0].sample(
        frac=0.1, random_state=42
    )
    target_candidates = pd.concat([positives, negatives], axis=0, ignore_index=True)

    chunk_candidates.append(target_candidates)

# Release the full candidate table before building the sampled one.
del valid_candidates, target_candidates
gc.collect()

valid_candidates = pd.concat(chunk_candidates, ignore_index=True)
del chunk_candidates
gc.collect()

In [None]:
# Class balance after negative sampling.
valid_candidates["target"].value_counts()

In [None]:
# Shuffle the row order with a fixed seed, then renumber the index.
valid_candidates = valid_candidates.sample(frac=1, random_state=42).reset_index(
    drop=True
)

# 特徴量結合


In [None]:
# Feature tables, grouped by the key they are joined on further down.

# Joined on aid
item_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseItemFeatures.pkl"
)
item_count_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/ItemCountFeatures.pkl"
)
popularity_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/PopularityFeatures.pkl"
)
# type count, valid only
item_count_features2 = pd.read_pickle(
    f"{ROOT}/data/output/features/train/ItemCountFeatures2.pkl"
)
# base, valid only
item_features2 = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseItemFeatures2.pkl"
)

# Joined on session
user_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseUserFeatures.pkl"
)

# Joined on (session, aid)
user_item_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseInteractiveFeatures.pkl"
)

In [None]:
# Left-join every feature table onto the candidates, in this fixed order.
feature_joins = [
    (item_features, "aid"),
    (user_features, "session"),
    (user_item_features, ["session", "aid"]),
    (item_count_features, "aid"),
    (popularity_features, "aid"),
    (item_count_features2, "aid"),
    (item_features2, "aid"),
]
for feature_table, join_key in feature_joins:
    valid_candidates = valid_candidates.merge(feature_table, on=join_key, how="left")

In [None]:
# Downcast the numeric columns to save memory.
valid_candidates = reduce_mem_usage_for_numeric(valid_candidates)

# aid sometimes ends up as object dtype, so force it back to an integer.
valid_candidates["aid"] = valid_candidates["aid"].astype(np.int32)

# catboost


In [None]:
# CatBoost training parameters (ranking objective, GPU training, fixed seed).
params = dict(
    loss_function="YetiRank",
    learning_rate=0.1,
    iterations=30000,
    use_best_model=True,
    early_stopping_rounds=1000,
    task_type="GPU",
    random_state=42,
)

In [None]:
TARGET = "target"

# Every column except the two identifiers and the label is a model feature.
FEATURES = [
    name
    for name in valid_candidates.columns
    if name not in ("session", "aid", "target")
]

# Persist the feature list so that inference uses the same columns in the same order.
with open(f"{OUTPUT_DIR}/order_features.pkl", "wb") as f:
    pickle.dump(FEATURES, f)

sgkf = StratifiedGroupKFold(n_splits=4, shuffle=True, random_state=42)

# The ranker is trained with per-session groups, so order the rows by session.
valid_candidates = valid_candidates.sort_values("session", ignore_index=True)

In [None]:
# Train one CatBoost ranker per fold; sessions never straddle train and validation.
for fold, (train_index, valid_index) in enumerate(
    sgkf.split(
        valid_candidates, valid_candidates["target"], groups=valid_candidates["session"]
    )
):
    train = valid_candidates.loc[train_index]
    valid = valid_candidates.loc[valid_index]

    # Rows of one session form one ranking group.
    train_pool = Pool(
        data=train[FEATURES],
        label=train[TARGET],
        group_id=train["session"],
        cat_features=[],
    )
    eval_pool = Pool(
        data=valid[FEATURES],
        label=valid[TARGET],
        group_id=valid["session"],
        cat_features=[],
    )

    model = CatBoost(params)
    model.fit(train_pool, eval_set=eval_pool)

    # Save the out-of-fold predictions. They go into a new frame rather than being
    # assigned onto the `valid` slice, which would trigger pandas'
    # SettingWithCopyWarning (currently hidden by the global warnings filter).
    fold_prediction = valid[["session", "aid", "target"]].assign(
        prediction=model.predict(valid[FEATURES])
    )
    fold_prediction.to_csv(
        f"{OUTPUT_DIR}/fold{fold}_order_valid_prediction.csv", index=False
    )

    # Save the model
    with open(f"{OUTPUT_DIR}/fold{fold}_order_cbt.pkl", "wb") as f:
        pickle.dump(model, f)

    # Free memory: drop this fold's frames and pools before the next fold builds its own.
    del train, valid, train_pool, eval_pool, fold_prediction
    gc.collect()

# cv


## negative sample 後の cv


In [None]:
# overall: gather the out-of-fold predictions of all four folds
fold_predictions = [
    pd.read_csv(f"{OUTPUT_DIR}/fold{fold}_order_valid_prediction.csv")
    for fold in range(4)
]

# Keep the 20 highest-scoring candidates of each session.
pred = (
    pd.concat(fold_predictions)
    .reset_index(drop=True)
    .sort_values("prediction", ascending=False)
    .groupby("session")
    .head(20)
)

In [None]:
# Recall of the per-session top 20 on the negative-sampled data.
# The loop variable is not called `type`: at notebook top level that would
# overwrite the builtin for every later cell.
for label_type in ["orders"]:
    print(f"{label_type}_candidates_recall")
    calc_candidates_recall(pred, valid_label, label_type)

## negative sample 前の cv


In [None]:
# Reload the full (un-sampled) candidate table and hold it as a polars DataFrame.
valid_candidates = pl.from_pandas(
    pd.read_pickle(f"{OUTPUT_DIR}/valid_cart_order_candidates.pkl")
)

In [None]:
# Feature tables, grouped by the key they are joined on in the inference loop.
# NOTE(review): these are the same files read earlier in the notebook; the re-read
# presumably exists so this section can be run on its own — confirm.

# Joined on aid
item_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseItemFeatures.pkl"
)
item_count_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/ItemCountFeatures.pkl"
)
popularity_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/PopularityFeatures.pkl"
)
# type count, valid only
item_count_features2 = pd.read_pickle(
    f"{ROOT}/data/output/features/train/ItemCountFeatures2.pkl"
)
# base, valid only
item_features2 = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseItemFeatures2.pkl"
)

# Joined on session
user_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseUserFeatures.pkl"
)

# Joined on (session, aid)
user_item_features = pd.read_pickle(
    f"{ROOT}/data/output/features/train/BaseInteractiveFeatures.pkl"
)

In [None]:
CHUNKS = 16  # number of pieces each fold's inference is split into

# The feature list is identical for every fold and chunk, so load it once.
# NOTE(review): pickle.load executes code from the file; only load trusted pickles.
with open(f"{OUTPUT_DIR}/order_features.pkl", "rb") as f:
    FEATURES = pickle.load(f)

# Feature tables and their join keys, applied in this order to every chunk.
feature_joins = [
    (item_features, "aid"),
    (user_features, "session"),
    (user_item_features, ["session", "aid"]),
    (item_count_features, "aid"),
    (popularity_features, "aid"),
    (item_count_features2, "aid"),
    (item_features2, "aid"),
]

all_candidates = []
for fold in range(4):
    print(fold)

    # Sessions in this fold's saved validation predictions. Only the session column
    # is read, and it is de-duplicated before being used as a filter.
    target_session = (
        pd.read_csv(
            f"{OUTPUT_DIR}/fold{fold}_order_valid_prediction.csv",
            usecols=["session"],
        )["session"]
        .unique()
        .tolist()
    )

    # Restrict the full candidate table to those sessions.
    target_candidates = valid_candidates.filter(
        pl.col("session").is_in(target_session)
    ).to_pandas()

    del target_session
    gc.collect()

    # One model per fold: load it once instead of once per chunk.
    with open(f"{OUTPUT_DIR}/fold{fold}_order_cbt.pkl", "rb") as f:
        model = pickle.load(f)

    chunk_len = math.ceil(len(target_candidates) / CHUNKS)
    for chunk in range(CHUNKS):
        print("chunk", chunk)

        # Take this chunk's rows, then attach the features to them only.
        chunk_candidates = target_candidates.iloc[
            chunk * chunk_len : (chunk + 1) * chunk_len
        ]
        if chunk_candidates.empty:
            # Fewer rows than chunks: nothing to score in this slice.
            continue

        # Join the features
        for feature_table, join_key in feature_joins:
            chunk_candidates = chunk_candidates.merge(
                feature_table, on=join_key, how="left"
            )

        # Downcast the numeric columns
        chunk_candidates = reduce_mem_usage_for_numeric(chunk_candidates)

        chunk_candidates["prediction"] = model.predict(chunk_candidates[FEATURES])

        all_candidates.append(chunk_candidates[["session", "aid", "prediction"]])

        del chunk_candidates
        gc.collect()

    del target_candidates, model
    gc.collect()

In [None]:
# Drop the candidate table and run a garbage-collection pass to release its memory.
del valid_candidates
gc.collect()

In [None]:
# Combine the per-chunk predictions. all_candidates is rebound from a list to a
# DataFrame here, so the concat is skipped when the cell is run a second time.
if isinstance(all_candidates, list):
    all_candidates = pd.concat(all_candidates, ignore_index=True)
# Keep the 20 highest-scoring predictions of each session
all_candidates = all_candidates.sort_values("prediction", ascending=False)
all_candidates_top20 = all_candidates.groupby("session").head(20)

calc_candidates_recall(all_candidates_top20, valid_label, "orders")

In [None]:
# Save the results: every scored candidate, and the per-session top 20.
all_candidates.to_pickle(f"{OUTPUT_DIR}/final_order_valid_prediction.pkl")
all_candidates_top20.to_pickle(f"{OUTPUT_DIR}/max20_final_order_valid_prediction.pkl")