In [None]:
import unicodedata
import re
import pickle
import os
import gc
import copy
import time
import joblib
from pprint import pprint
import random
from collections import defaultdict, Counter
from tqdm import tqdm
import string
from typing import List, Dict, Tuple
import datetime
from datetime import datetime, timedelta, timezone
import math

import numpy as np
import pandas as pd
import cudf
from matplotlib import pyplot as plt

import optuna

import warnings

# Suppress every warning category for the rest of the session. Note that this
# hides all library warnings as well, not only noisy ones.
warnings.filterwarnings("ignore")

# Request synchronous CUDA kernel launches (per the CUDA documentation) so that
# GPU errors are reported at the failing call; this is a debugging aid.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Show how many logical CPUs the host reports.
print(os.cpu_count())

In [None]:
ROOT = ""  # competition directory; experiment outputs are read from below it
OUTPUT_DIR = ""  # directory that receives submission.csv
# os.makedirs("") raises FileNotFoundError, so only create the directory once a
# non-empty path has actually been configured.
if OUTPUT_DIR:
    os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Shows the CV score attainable in the ideal case, i.e. the recall the
# candidate set itself allows before any ranking is applied.
def calc_candidates_recall(candidates, label, type):
    """Return the recall of ``candidates`` against the ground truth of one type.

    Args:
        candidates: frame with ``session`` and ``aid`` columns, one row per
            candidate item.
        label: frame with ``session``, ``type`` and ``ground_truth`` columns;
            ``ground_truth`` holds a sized iterable of aids per row.
        type: value of ``label["type"]`` to evaluate.

    Returns:
        Sum of per-session hits divided by the sum of per-session ground-truth
        sizes, where both quantities are capped at 20 per session.
    """
    # One set of candidate aids per session.
    candidate_sets = candidates.groupby("session")["aid"].apply(set).reset_index()

    # Keep every ground-truth row of the requested type, even when the session
    # has no candidates at all.
    joined = label.loc[label["type"] == type].merge(
        candidate_sets, how="left", on="session"
    )

    def _ensure_set(value):
        # After negative sampling a ground-truth session may have no candidate
        # aids, which the left join leaves as a missing value: use an empty set.
        return value if isinstance(value, set) else set()

    def _count_hits(row):
        matched = set(row["ground_truth"]) & row["aid"]
        return min(len(matched), 20)

    def _count_truth(row):
        return min(len(row["ground_truth"]), 20)

    joined["aid"] = joined["aid"].apply(_ensure_set)
    joined["hits"] = joined.apply(_count_hits, axis=1)
    joined["gt_count"] = joined.apply(_count_truth, axis=1)

    total_hits = joined["hits"].sum()
    total_truth = joined["gt_count"].sum()
    return total_hits / total_truth

# test ensemble

In [None]:
def cust_blend_test(row, W=[1, 1]):
    """Blend the ranked label strings of one row into a single top-20 list.

    Every label earns ``W[k] / rank`` from model ``k``, where ``rank`` is its
    1-based position in that model's whitespace-separated label string. The
    scores are summed over models and the labels are returned best first.

    Args:
        row: Mapping-like object (for example a DataFrame row) holding the
            first model's labels under ``"labels"`` and model ``k >= 1`` under
            ``"labels_<k>"``.
        W: Per-model weights. Its length decides how many columns are read.
            The default list is only read, never mutated.

    Returns:
        A list of at most 20 label strings ordered by descending score; labels
        with equal scores keep the order in which they were first seen.
    """
    scores = {}
    for model_idx, weight in enumerate(W):
        column = "labels" if model_idx == 0 else f"labels_{model_idx}"
        labels = row[column]
        # Rows assembled with a left merge carry NaN (a float) when a model has
        # no prediction for the session; treat anything that is not a string as
        # "no recommendations" instead of failing on .split().
        if not isinstance(labels, str):
            continue
        for rank, item in enumerate(labels.split(), start=1):
            scores[item] = scores.get(item, 0) + weight / rank

    # sorted() is stable, so ties keep their first-seen order.
    ranked = sorted(scores, key=lambda item: -scores[item])
    return ranked[:20]

In [None]:
subs = []
type_weight = {
    # Weights optimized by optuna. Position k of each list is the weight of the
    # experiment at position k of type_exp for the same event type.
    "clicks": [1],
    "carts": [
        0.5253236565057308,
        0.5951928633820697,
        0.801464605050116,
        0.6243351208415832,
        0.7585375306136606,
    ],
    "orders": [
        0.3850340038357991,
        0.6271924079206362,
        0.8090807792929008,
        0.8202925544168471,
        0.016455787620324642,
    ],
}
type_exp = {
    # Experiment ids whose test predictions are blended for each event type.
    "clicks": [136],
    "carts": [119, 135, 136, 141, 143],
    "orders": [119, 135, 136, 141, 143],
}
for type_ in ["clicks", "carts", "orders"]:
    print(type_)

    # cust_blend_test reads exactly len(W) label columns, so a mismatch would
    # either silently ignore an experiment or fail deep inside the row-wise
    # apply. Fail early with a clear message instead.
    if len(type_weight[type_]) != len(type_exp[type_]):
        raise ValueError(
            f"{type_}: {len(type_weight[type_])} weights configured for "
            f"{len(type_exp[type_])} experiments"
        )

    # type_[:-1] drops the trailing "s" ("clicks" -> "click") for the file name.
    target_paths = [
        f"{ROOT}/data/output/exp/{exp}/test_{type_[:-1]}_top20_candidates.pkl"
        for exp in type_exp[type_]
    ]

    sub = None
    for i, path in enumerate(target_paths):
        print(i)
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by trusted experiments.
        df = pd.read_pickle(path)
        if i == 0:
            sub = df
        else:
            # Overlapping columns of the i-th frame get the suffix "_<i>", which
            # yields the "labels_<i>" columns that cust_blend_test reads.
            sub = sub.merge(df, on=["session_type"], how="left", suffixes=("", f"_{i}"))

    # Blend row by row, then build the two-column result directly rather than
    # assigning into a column slice of the merged frame.
    blended = sub.apply(cust_blend_test, W=type_weight[type_], axis=1)
    sub = pd.DataFrame(
        {
            "session_type": sub["session_type"],
            "labels": blended.apply(" ".join),
        }
    )

    subs.append(sub)

In [None]:
# Stack the per-type frames collected in `subs` into one submission table.
sub = pd.concat(subs, ignore_index=True)
# os.path.join keeps the file in the working directory when OUTPUT_DIR is
# empty; plain "{OUTPUT_DIR}/submission.csv" formatting would point at the
# filesystem root ("/submission.csv") in that case.
sub.to_csv(os.path.join(OUTPUT_DIR, "submission.csv"), index=False)