In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# ===================================================================
#  Library
# ===================================================================
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import numpy as np
from scipy.optimize import minimize
import os
import polars as pl
import random

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    n_seeds = 3
    n_trials = 2000
    save_dir = "/content/drive/MyDrive/Colab Notebooks/signate2023/exp/"
    data_dir = "/content/drive/MyDrive/Colab Notebooks/signate2023/"
    filename = "exp00057"

In [3]:
def seed_everything(seed):
    """fix random factors"""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(CFG.seed)

def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/train.csv')
kun_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00052.csv')
kun_oof = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00052_oof_pred.csv')
yuji_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/exp050.csv')
yuji_oof = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/oof_df_exp050.csv')
df = pd.concat([kun_oof.merge(yuji_oof, on='id'), df_train['price']], axis=1)
test = kun_test.merge(yuji_test, on='id')

In [5]:
df['pred_0']

0         6881.892301
1         3740.634027
2         2954.247573
3         8430.949224
4         3972.418866
             ...     
27527    12650.381284
27528     6494.944663
27529    12231.954239
27530     6832.933932
27531     9925.656545
Name: pred_0, Length: 27532, dtype: float64

In [10]:
def evaluate(current_best_preds, k: int, best_score: float, power: float):
    """
    評価関数

    preds = current_best_preds*(1-w) + (df[models[k]]**power)*w
    において最もスコアが良くなるようにwを探す。
    """
    best_weight = 0
    for w in np.arange(-0.5, 0.5, 0.01):
        preds = current_best_preds*(1-w) + (df[models[k]]**power)*w
        score = get_score(y_true=df["price"], y_pred=preds)
        if score < best_score:
            best_score = score
            best_weight = w
    return best_score, best_weight



stores = dict() # Hill Climbingで得た重りを保存する
orders = []     # Hill Climbingで選ばれたモデルの順番を保存する
scores = dict() # Hill Climbingするときのスコアを保存する

# single modelで最もCVが良いモデルを選択する
for col in df.columns:
    if col not in ["id", "price"]:
        scores[col] = get_score(y_true=df["price"], y_pred=df[col])
selected_model = min(scores, key=scores.get)
current_best_preds = df[selected_model]
orders.append(selected_model) # 順番を保存
stores[selected_model] = 1    # 重みを保存
models = [col for col in df.columns if col not in ["id", "price"]]

# Start Hill Climbing
i = 0
print(f"[{i}] baseline {selected_model} {min(scores.values())}")

while True:

    # 前段階のベストスコア
    best_score = min(scores.values())

    # 前段階で選ばれたモデル
    selected_model = min(scores, key=scores.get)

    # 重複しないように削除
    models.remove(selected_model)


    # 残ったモデルの数が0になれば終了
    if len(models) == 0:
        break

    #
    scores, weights = dict(), dict()
    for k in range(len(models)):
        # ここでpowerの値を変えるか、最適化の範囲として追加する
        power = 1.1  # 例として1.5に設定
        score, weight = evaluate(current_best_preds, k, best_score, power)
        scores[models[k]] = score
        weights[models[k]] = weight

    i += 1
    selected_model = min(scores, key=scores.get)
    print(f"[{i}] add {selected_model}: {min(scores.values())} {weights[selected_model]}")
    best_weight = weights[selected_model]
    stores[selected_model] = best_weight
    orders.append(selected_model)
    current_best_preds = current_best_preds*(1-best_weight) + df[selected_model]*best_weight

[0] baseline pred_12 43.805041424202756
[1] add pred_20: 43.804534156372284 0.010000000000000453
[2] add kun_pred_0: 43.79921794123266 4.440892098500626e-16
[3] add kun_pred_1: 43.79921794123266 0
[4] add kun_pred_2: 43.79921794123266 0
[5] add kun_pred_3: 43.79921794123266 0
[6] add kun_pred_4: 43.79921794123266 0
[7] add kun_pred_5: 43.79921794123266 0
[8] add kun_pred_6: 43.79921794123266 0
[9] add kun_pred_7: 43.79921794123266 0
[10] add kun_pred_8: 43.79921794123266 0
[11] add kun_pred_9: 43.79921794123266 0
[12] add pred_0: 43.79921794123266 0
[13] add pred_1: 43.79921794123266 0
[14] add pred_2: 43.79921794123266 0
[15] add pred_3: 43.79921794123266 0
[16] add pred_4: 43.79921794123266 0
[17] add pred_5: 43.79921794123266 0
[18] add pred_6: 43.79921794123266 0
[19] add pred_7: 43.79921794123266 0
[20] add pred_8: 43.79921794123266 0
[21] add pred_9: 43.79921794123266 0
[22] add pred_10: 43.79921794123266 0
[23] add pred_11: 43.79921794123266 0
[24] add pred_13: 43.79921794123266

In [11]:
# pの最適値を保存する変数を追加
optimal_p = 1.5  # 例として1.5に設定。この値は最適化の結果から設定する

def get_preds_with_power(df, power=1.0):
    best_preds = 0
    for exp in orders:
        w = stores[exp]
        best_preds = best_preds * (1-w) + (df[exp]**power) * w
    return best_preds

# スコアの計算
get_score(y_true=df["price"], y_pred=get_preds_with_power(df, power=optimal_p))

43.56925636218776

In [None]:
43.56925636218776

In [None]:
# 予測結果を保存
test_preds = get_preds_with_power(test, power=optimal_p)
test['pred'] = test_preds
test[['id', 'pred']].to_csv(CFG.save_dir + f"kun_{CFG.filename}.csv", index=False, header=None)