In [None]:
import unicodedata
import re
import pickle
import os
import gc
import copy
import time
import joblib
from pprint import pprint
import random
from collections import defaultdict, Counter
from tqdm import tqdm
import string
from typing import List, Dict, Tuple
from datetime import datetime, timedelta, timezone
import math

import numpy as np
import pandas as pd
import polars as pl
import cudf
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold

from catboost import CatBoost, CatBoostRanker
from catboost import Pool

import warnings

# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# NOTE(review): presumably intended to make CUDA kernel launches synchronous for easier
# debugging; it is set after `cudf` has already been imported — confirm it still takes effect.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Show the number of CPU cores reported by the OS.
print(os.cpu_count())


In [None]:
# Global settings
N_SPLITS = 4  # number of per-fold models whose predictions are averaged
CHUNKS = 12  # number of chunks the candidate table is split into for inference

ROOT = ""  # competition working directory
OUTPUT_DIR = ""
# os.makedirs("") raises FileNotFoundError, so the directory is only created
# when a path has actually been configured.
if OUTPUT_DIR:
    os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Memory reduction (numeric columns only)
def reduce_mem_usage_for_numeric(df):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Integer columns (any dtype whose name contains "int") are converted to the
    narrowest of int8/int16/int32/int64 whose range contains the column's
    minimum and maximum, limits included. Float columns become float32 when
    their minimum and maximum lie strictly inside the float32 range and float64
    otherwise. Columns of any other dtype are left untouched.

    Memory usage before and after the conversion is printed.

    Args:
        df: pandas DataFrame; its columns are replaced in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    for col in df.columns:
        type_name = str(df[col].dtype)
        is_int = "int" in type_name
        if not is_int and "float" not in type_name:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if is_int:
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # If no candidate fits (e.g. min/max are NaN for an empty column, or
            # a uint64 value exceeds int64), the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately not used (the original note cites a support limitation).
            f32 = np.finfo(np.float32)
            if c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard the ratio so a frame reporting zero bytes does not raise ZeroDivisionError.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(decreased))

    return df


In [None]:
# Load the preprocessed test set. `test` is not referenced again in the visible part of this notebook.
test = pd.read_pickle(f"{ROOT}/data/input/processed_data2/test.pkl")


# click

## candidate

In [None]:
# Click candidates to score, converted to polars for the feature joins below.
# The path is built like every other OUTPUT_DIR path in this notebook (no extra leading "/"),
# so a relative OUTPUT_DIR resolves to the same directory the models and results use.
all_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_click_candidates.pkl")
all_candidates = pl.from_pandas(all_candidates)


## 特徴量

In [None]:
# Feature tables for the test data. Each pickle is read with pandas, its index is moved
# back into regular columns, and the frame is handed over to polars.
item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimBaseItemFeatures.pkl").reset_index()
)

user_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseUserFeatures.pkl").reset_index()
)

user_item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseInteractiveFeatures.pkl").reset_index()
)

item_count_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimItemCountFeatures.pkl").reset_index()
)

# PopularityFeatures is converted exactly as stored (no reset_index()).
popularity_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/PopularityFeatures.pkl")
)

item_count_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/ItemCountFeatures2.pkl").reset_index()
)

item_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseItemFeatures2.pkl").reset_index()
)


In [None]:
def cast_cols(df, columns):
    """Return ``df`` with every column named in ``columns`` cast to ``pl.Int32``.

    Args:
        df: polars DataFrame.
        columns: iterable of column names to cast; names listed more than once
            are cast once.

    Returns:
        A polars DataFrame in which the listed columns are Int32 and all other
        columns are unchanged. ``df`` itself is returned when ``columns`` is empty.
    """
    # De-duplicate while keeping order: with_columns rejects repeated output names.
    unique_columns = list(dict.fromkeys(columns))
    if not unique_columns:
        return df
    # DataFrame.with_column was deprecated and later removed from polars;
    # with_columns is the supported API and applies all casts in one pass.
    return df.with_columns([pl.col(col).cast(pl.Int32) for col in unique_columns])


# Cast the join keys of every feature table to Int32.

# Tables keyed by session (and aid for the interaction features).
user_features = cast_cols(user_features, ["session"])
user_item_features = cast_cols(user_item_features, ["session", "aid"])

# Tables keyed by aid only.
item_features = cast_cols(item_features, ["aid"])
item_features2 = cast_cols(item_features2, ["aid"])
item_count_features = cast_cols(item_count_features, ["aid"])
item_count_features2 = cast_cols(item_count_features2, ["aid"])
popularity_features = cast_cols(popularity_features, ["aid"])


## 推論

In [None]:
# Rows per inference chunk, rounded up so that CHUNKS consecutive slices of this
# length cover every candidate row.
chunk_len = math.ceil(len(all_candidates) / CHUNKS)


In [None]:
# The feature list and the per-fold models do not depend on the chunk, so they are read
# once up front instead of once per chunk (the models were loaded CHUNKS * N_SPLITS times).
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(f"{OUTPUT_DIR}/click_features.pkl", "rb") as f:
    FEATURES = pickle.load(f)

models = []
for fold in range(N_SPLITS):
    with open(f"{OUTPUT_DIR}/fold{fold}_click_cbt.pkl", "rb") as f:
        models.append(pickle.load(f))

n_candidates = len(all_candidates)
folds_prediction = np.zeros(n_candidates)
for chunk in range(CHUNKS):
    start = chunk * chunk_len
    if start >= n_candidates:
        # Rounding chunk_len up can leave the trailing chunks empty; nothing left to score.
        break
    stop = min(start + chunk_len, n_candidates)
    print("chunk", chunk)

    # Take this chunk's candidate rows and attach every feature table.
    target_candidates = all_candidates[start:stop]

    target_candidates = target_candidates.join(item_features, on="aid", how="left")
    target_candidates = target_candidates.join(user_features, on="session", how="left")

    target_candidates = target_candidates.join(user_item_features, on=["session", "aid"], how="left")

    target_candidates = target_candidates.join(item_count_features, on="aid", how="left")

    target_candidates = target_candidates.join(popularity_features, on="aid", how="left")
    target_candidates = target_candidates.join(item_count_features2, on="aid", how="left")
    target_candidates = target_candidates.join(item_features2, on="aid", how="left")

    # Each left join must keep exactly one row per candidate. Duplicated keys in a feature
    # table would add rows; fail here with a clear message rather than with a shape
    # mismatch when the predictions are accumulated.
    if len(target_candidates) != stop - start:
        raise ValueError(
            f"chunk {chunk}: joins changed the row count from {stop - start} to {len(target_candidates)}"
        )

    # Back to pandas for the model; select the model inputs once for all folds.
    target_candidates = target_candidates.to_pandas()
    model_input = target_candidates[FEATURES]

    for fold, model in enumerate(models):
        print("fold", fold)

        prediction = model.predict(model_input)

        folds_prediction[start:stop] += prediction

    # Average over the folds.
    folds_prediction[start:stop] /= N_SPLITS

    # Release this chunk's joined frames before building the next one.
    del target_candidates, model_input


In [None]:
# Drop the references to every feature table so their memory can be reclaimed
# before the next stage loads its own copies.
del item_features, user_features, user_item_features, item_count_features
del popularity_features, item_count_features2, item_features2
gc.collect()


In [None]:
# Only the key columns are needed from here on, so pick them before converting to
# pandas, then attach the fold-averaged scores as the third column.
all_candidates = all_candidates.select(["session", "aid"]).to_pandas()
all_candidates["prediction"] = folds_prediction

# The score array now lives in the frame; release the standalone copy.
del folds_prediction
gc.collect()


In [None]:
# Rank all candidates by score, best first, and keep the first 20 rows of every session.
all_candidates = all_candidates.sort_values(by="prediction", ascending=False)
all_candidates = all_candidates.groupby("session").head(n=20)

# Reshape: one row per session holding its aids as a list.
all_candidates = all_candidates.groupby("session")["aid"].apply(list).reset_index()

# Submission formatting: "<session>_clicks" as the key, aids joined by single spaces.
all_candidates["session"] = all_candidates["session"].map(lambda session: str(session) + "_clicks")
all_candidates["aid"] = all_candidates["aid"].map(lambda aids: " ".join(str(aid) for aid in aids))

# Rename the columns to the submission schema.
all_candidates = all_candidates.rename(columns={"session": "session_type", "aid": "labels"})

# Persist this stage's result, then free it.
all_candidates.to_pickle(f"{OUTPUT_DIR}/test_click_top20_candidates.pkl")

del all_candidates
gc.collect()


# cart

## candidate

In [None]:
# Cart/order candidates to score with the cart models, converted to polars for the joins below.
# The path is built like every other OUTPUT_DIR path in this notebook (no extra leading "/"),
# so a relative OUTPUT_DIR resolves to the same directory the models and results use.
all_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_cart_order_candidates.pkl")
all_candidates = pl.from_pandas(all_candidates)


## 特徴量

In [None]:
# Feature tables for the test data. Each pickle is read with pandas, its index is moved
# back into regular columns, and the frame is handed over to polars.
item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimBaseItemFeatures.pkl").reset_index()
)

user_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseUserFeatures.pkl").reset_index()
)

user_item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseInteractiveFeatures.pkl").reset_index()
)

item_count_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimItemCountFeatures.pkl").reset_index()
)

# PopularityFeatures is converted exactly as stored (no reset_index()).
popularity_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/PopularityFeatures.pkl")
)

item_count_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/ItemCountFeatures2.pkl").reset_index()
)

item_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseItemFeatures2.pkl").reset_index()
)


In [None]:
def cast_cols(df, columns):
    """Return ``df`` with every column named in ``columns`` cast to ``pl.Int32``.

    Args:
        df: polars DataFrame.
        columns: iterable of column names to cast; names listed more than once
            are cast once.

    Returns:
        A polars DataFrame in which the listed columns are Int32 and all other
        columns are unchanged. ``df`` itself is returned when ``columns`` is empty.
    """
    # De-duplicate while keeping order: with_columns rejects repeated output names.
    unique_columns = list(dict.fromkeys(columns))
    if not unique_columns:
        return df
    # DataFrame.with_column was deprecated and later removed from polars;
    # with_columns is the supported API and applies all casts in one pass.
    return df.with_columns([pl.col(col).cast(pl.Int32) for col in unique_columns])


# Cast the join keys of every feature table to Int32.

# Tables keyed by session (and aid for the interaction features).
user_features = cast_cols(user_features, ["session"])
user_item_features = cast_cols(user_item_features, ["session", "aid"])

# Tables keyed by aid only.
item_features = cast_cols(item_features, ["aid"])
item_features2 = cast_cols(item_features2, ["aid"])
item_count_features = cast_cols(item_count_features, ["aid"])
item_count_features2 = cast_cols(item_count_features2, ["aid"])
popularity_features = cast_cols(popularity_features, ["aid"])


## 推論

In [None]:
# Rows per inference chunk, rounded up so that CHUNKS consecutive slices of this
# length cover every candidate row.
chunk_len = math.ceil(len(all_candidates) / CHUNKS)


In [None]:
# The feature list and the per-fold models do not depend on the chunk, so they are read
# once up front instead of once per chunk (the models were loaded CHUNKS * N_SPLITS times).
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(f"{OUTPUT_DIR}/cart_features.pkl", "rb") as f:
    FEATURES = pickle.load(f)

models = []
for fold in range(N_SPLITS):
    with open(f"{OUTPUT_DIR}/fold{fold}_cart_cbt.pkl", "rb") as f:
        models.append(pickle.load(f))

n_candidates = len(all_candidates)
folds_prediction = np.zeros(n_candidates)
for chunk in range(CHUNKS):
    start = chunk * chunk_len
    if start >= n_candidates:
        # Rounding chunk_len up can leave the trailing chunks empty; nothing left to score.
        break
    stop = min(start + chunk_len, n_candidates)
    print("chunk", chunk)

    # Take this chunk's candidate rows and attach every feature table.
    target_candidates = all_candidates[start:stop]

    target_candidates = target_candidates.join(item_features, on="aid", how="left")
    target_candidates = target_candidates.join(user_features, on="session", how="left")

    target_candidates = target_candidates.join(user_item_features, on=["session", "aid"], how="left")

    target_candidates = target_candidates.join(item_count_features, on="aid", how="left")

    target_candidates = target_candidates.join(popularity_features, on="aid", how="left")
    target_candidates = target_candidates.join(item_count_features2, on="aid", how="left")
    target_candidates = target_candidates.join(item_features2, on="aid", how="left")

    # Each left join must keep exactly one row per candidate. Duplicated keys in a feature
    # table would add rows; fail here with a clear message rather than with a shape
    # mismatch when the predictions are accumulated.
    if len(target_candidates) != stop - start:
        raise ValueError(
            f"chunk {chunk}: joins changed the row count from {stop - start} to {len(target_candidates)}"
        )

    # Back to pandas for the model; select the model inputs once for all folds.
    target_candidates = target_candidates.to_pandas()
    model_input = target_candidates[FEATURES]

    for fold, model in enumerate(models):
        print("fold", fold)

        prediction = model.predict(model_input)

        folds_prediction[start:stop] += prediction

    # Average over the folds.
    folds_prediction[start:stop] /= N_SPLITS

    # Release this chunk's joined frames before building the next one.
    del target_candidates, model_input


In [None]:
# Drop the references to every feature table so their memory can be reclaimed
# before the next stage loads its own copies.
del item_features, user_features, user_item_features, item_count_features
del popularity_features, item_count_features2, item_features2
gc.collect()


In [None]:
# Only the key columns are needed from here on, so pick them before converting to
# pandas, then attach the fold-averaged scores as the third column.
all_candidates = all_candidates.select(["session", "aid"]).to_pandas()
all_candidates["prediction"] = folds_prediction

# The score array now lives in the frame; release the standalone copy.
del folds_prediction
gc.collect()


In [None]:
# Rank all candidates by score, best first, and keep the first 20 rows of every session.
all_candidates = all_candidates.sort_values(by="prediction", ascending=False)
all_candidates = all_candidates.groupby("session").head(n=20)

# Reshape: one row per session holding its aids as a list.
all_candidates = all_candidates.groupby("session")["aid"].apply(list).reset_index()

# Submission formatting: "<session>_carts" as the key, aids joined by single spaces.
all_candidates["session"] = all_candidates["session"].map(lambda session: str(session) + "_carts")
all_candidates["aid"] = all_candidates["aid"].map(lambda aids: " ".join(str(aid) for aid in aids))

# Rename the columns to the submission schema.
all_candidates = all_candidates.rename(columns={"session": "session_type", "aid": "labels"})

# Persist this stage's result, then free it.
all_candidates.to_pickle(f"{OUTPUT_DIR}/test_cart_top20_candidates.pkl")

del all_candidates
gc.collect()


# order

## candidate

In [None]:
# Cart/order candidates to score with the order models, converted to polars for the joins below.
# The path is built like every other OUTPUT_DIR path in this notebook (no extra leading "/"),
# so a relative OUTPUT_DIR resolves to the same directory the models and results use.
all_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_cart_order_candidates.pkl")
all_candidates = pl.from_pandas(all_candidates)


## 特徴量

In [None]:
# Feature tables for the test data. Each pickle is read with pandas, its index is moved
# back into regular columns, and the frame is handed over to polars.
item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimBaseItemFeatures.pkl").reset_index()
)

user_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseUserFeatures.pkl").reset_index()
)

user_item_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseInteractiveFeatures.pkl").reset_index()
)

item_count_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/TrimItemCountFeatures.pkl").reset_index()
)

# PopularityFeatures is converted exactly as stored (no reset_index()).
popularity_features = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/PopularityFeatures.pkl")
)

item_count_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/ItemCountFeatures2.pkl").reset_index()
)

item_features2 = pl.from_pandas(
    pd.read_pickle(f"{ROOT}/data/output/features/test/BaseItemFeatures2.pkl").reset_index()
)


In [None]:
def cast_cols(df, columns):
    """Return ``df`` with every column named in ``columns`` cast to ``pl.Int32``.

    Args:
        df: polars DataFrame.
        columns: iterable of column names to cast; names listed more than once
            are cast once.

    Returns:
        A polars DataFrame in which the listed columns are Int32 and all other
        columns are unchanged. ``df`` itself is returned when ``columns`` is empty.
    """
    # De-duplicate while keeping order: with_columns rejects repeated output names.
    unique_columns = list(dict.fromkeys(columns))
    if not unique_columns:
        return df
    # DataFrame.with_column was deprecated and later removed from polars;
    # with_columns is the supported API and applies all casts in one pass.
    return df.with_columns([pl.col(col).cast(pl.Int32) for col in unique_columns])


# Cast the join keys of every feature table to Int32.

# Tables keyed by session (and aid for the interaction features).
user_features = cast_cols(user_features, ["session"])
user_item_features = cast_cols(user_item_features, ["session", "aid"])

# Tables keyed by aid only.
item_features = cast_cols(item_features, ["aid"])
item_features2 = cast_cols(item_features2, ["aid"])
item_count_features = cast_cols(item_count_features, ["aid"])
item_count_features2 = cast_cols(item_count_features2, ["aid"])
popularity_features = cast_cols(popularity_features, ["aid"])


## 推論

In [None]:
# Rows per inference chunk, rounded up so that CHUNKS consecutive slices of this
# length cover every candidate row.
chunk_len = math.ceil(len(all_candidates) / CHUNKS)


In [None]:
# The feature list and the per-fold models do not depend on the chunk, so they are read
# once up front instead of once per chunk (the models were loaded CHUNKS * N_SPLITS times).
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by this project.
with open(f"{OUTPUT_DIR}/order_features.pkl", "rb") as f:
    FEATURES = pickle.load(f)

models = []
for fold in range(N_SPLITS):
    with open(f"{OUTPUT_DIR}/fold{fold}_order_cbt.pkl", "rb") as f:
        models.append(pickle.load(f))

n_candidates = len(all_candidates)
folds_prediction = np.zeros(n_candidates)
for chunk in range(CHUNKS):
    start = chunk * chunk_len
    if start >= n_candidates:
        # Rounding chunk_len up can leave the trailing chunks empty; nothing left to score.
        break
    stop = min(start + chunk_len, n_candidates)
    print("chunk", chunk)

    # Take this chunk's candidate rows and attach every feature table.
    target_candidates = all_candidates[start:stop]

    target_candidates = target_candidates.join(item_features, on="aid", how="left")
    target_candidates = target_candidates.join(user_features, on="session", how="left")

    target_candidates = target_candidates.join(user_item_features, on=["session", "aid"], how="left")

    target_candidates = target_candidates.join(item_count_features, on="aid", how="left")

    target_candidates = target_candidates.join(popularity_features, on="aid", how="left")
    target_candidates = target_candidates.join(item_count_features2, on="aid", how="left")
    target_candidates = target_candidates.join(item_features2, on="aid", how="left")

    # Each left join must keep exactly one row per candidate. Duplicated keys in a feature
    # table would add rows; fail here with a clear message rather than with a shape
    # mismatch when the predictions are accumulated.
    if len(target_candidates) != stop - start:
        raise ValueError(
            f"chunk {chunk}: joins changed the row count from {stop - start} to {len(target_candidates)}"
        )

    # Back to pandas for the model; select the model inputs once for all folds.
    target_candidates = target_candidates.to_pandas()
    model_input = target_candidates[FEATURES]

    for fold, model in enumerate(models):
        print("fold", fold)

        prediction = model.predict(model_input)

        folds_prediction[start:stop] += prediction

    # Average over the folds.
    folds_prediction[start:stop] /= N_SPLITS

    # Release this chunk's joined frames before building the next one.
    del target_candidates, model_input


In [None]:
# Order inference is finished; drop the references to every feature table so
# their memory can be reclaimed.
del item_features, user_features, user_item_features, item_count_features
del popularity_features, item_count_features2, item_features2
gc.collect()


In [None]:
# Only the key columns are needed from here on, so pick them before converting to
# pandas, then attach the fold-averaged scores as the third column.
all_candidates = all_candidates.select(["session", "aid"]).to_pandas()
all_candidates["prediction"] = folds_prediction

# The score array now lives in the frame; release the standalone copy.
del folds_prediction
gc.collect()


In [None]:
# Rank all candidates by score, best first, and keep the first 20 rows of every session.
all_candidates = all_candidates.sort_values(by="prediction", ascending=False)
all_candidates = all_candidates.groupby("session").head(n=20)

# Reshape: one row per session holding its aids as a list.
all_candidates = all_candidates.groupby("session")["aid"].apply(list).reset_index()

# Submission formatting: "<session>_orders" as the key, aids joined by single spaces.
all_candidates["session"] = all_candidates["session"].map(lambda session: str(session) + "_orders")
all_candidates["aid"] = all_candidates["aid"].map(lambda aids: " ".join(str(aid) for aid in aids))

# Rename the columns to the submission schema.
all_candidates = all_candidates.rename(columns={"session": "session_type", "aid": "labels"})

# Persist this stage's result, then free it.
all_candidates.to_pickle(f"{OUTPUT_DIR}/test_order_top20_candidates.pkl")

del all_candidates
gc.collect()


# まとめる

In [None]:
# Reload the three per-type top-20 tables written by the stages above.
click_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_click_top20_candidates.pkl")
cart_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_cart_top20_candidates.pkl")
order_candidates = pd.read_pickle(f"{OUTPUT_DIR}/test_order_top20_candidates.pkl")

# Stack them (clicks, then carts, then orders) under a fresh 0..n-1 index.
sub = pd.concat(
    objs=[click_candidates, cart_candidates, order_candidates],
    axis=0,
    ignore_index=True,
)


In [None]:
# Display the last rows of the combined submission table as a quick visual check.
sub.tail()


In [None]:
# Write the combined table to submission.csv in OUTPUT_DIR, without the pandas index column.
sub.to_csv(f"{OUTPUT_DIR}/submission.csv", index=False)
