In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.3-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.11.3 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [1]:
# ===================================================================
#  Library
# ===================================================================
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    n_seeds = 3
    n_trials = 2000
    save_dir = "/content/drive/MyDrive/Colab Notebooks/signate2023/exp/"
    data_dir = "/content/drive/MyDrive/Colab Notebooks/signate2023/"
    filename = "exp00059"

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [9]:
df_exp55 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00055_oof_pred.csv').rename(columns={'optimized_pred':'pred_1'})
df_exp56 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00056_oof_pred.csv').rename(columns={'oof_pred':'pred_2'})
df_exp55_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00055.csv', header=None).rename(columns={0:'id', 1:'pred_1'})
df_exp56_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/exp/kun_exp00056.csv', header=None).rename(columns={0:'id', 1:'pred_2'})
df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/signate2023/train.csv')

In [12]:
df = pd.concat([df_exp55.merge(df_exp56, on='id'), df_train['price']], axis=1)
test = df_exp55_test.merge(df_exp56_test, on='id')

In [13]:
df

Unnamed: 0,id,pred_1,pred_2,price
0,0,8805.219623,7895.4160,27587
1,1,3799.844508,4177.3580,4724
2,2,2959.861854,2818.1372,10931
3,3,7934.654500,8179.8460,16553
4,4,4461.927715,4321.6885,5158
...,...,...,...,...
27527,27527,12385.902685,12967.0370,32212
27528,27528,6939.817355,6854.7417,5400
27529,27529,14186.293321,15186.2150,22227
27530,27530,6658.054764,6914.8745,3054


In [27]:
test

Unnamed: 0,id,pred_1,pred_2
0,27532,9347.528622,8806.765109
1,27533,5473.589480,5556.561678
2,27534,5542.547482,5788.564407
3,27535,17815.695204,19133.043389
4,27536,4446.383701,4014.969884
...,...,...,...
27532,55064,12602.958879,14460.748909
27533,55065,9228.375920,8297.307991
27534,55066,5535.535863,5972.723292
27535,55067,4766.881828,4963.743941


In [14]:
def objective(trial):
    a = trial.suggest_float("a", 1e-8, 1, log=True)
    b = trial.suggest_float("b", 1e-8, 1, log=True)

    df[f"pred"] = df[f"pred_1"] * a +\
                  df[f"pred_2"] * b

    score = get_score(y_true=df["price"], y_pred = df[f"pred"])
    return score

optuna.logging.set_verbosity(optuna.logging.WARNING)

# シードのリストを定義
seeds = [seed for seed in range(CFG.seed, CFG.seed+CFG.n_seeds)]

best_values = []
best_params_list = []

for seed in seeds:
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective,
                   n_trials=CFG.n_trials,
                   n_jobs = -1,
                   show_progress_bar=True)

    best_value = study.best_value
    best_params = study.best_params

    best_values.append(best_value)
    best_params_list.append(best_params)

    print(f"Seed: {seed}, Best Value: {best_value}, Best Params: {best_params}")


# 最も小さい best_value を持つ Study を探索
best_index = np.argmin(best_values)
best_params_final = best_params_list[best_index]
best_value_final = best_values[best_index]

print("Final Best Value:", best_value_final)
print("Final Best Params:", best_params_final)

  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 42, Best Value: 43.50285907371161, Best Params: {'a': 0.4538484930063668, 'b': 0.54713605908276}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 43, Best Value: 43.52225168390487, Best Params: {'a': 0.45903088184783225, 'b': 0.5609930246741843}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 44, Best Value: 43.50096327542196, Best Params: {'a': 0.5611277888856756, 'b': 0.44021408514842586}
Final Best Value: 43.50096327542196
Final Best Params: {'a': 0.5611277888856756, 'b': 0.44021408514842586}


In [24]:
best_params_final["b"]

0.44021408514842586

In [16]:
test["pred"] =   test["pred_1"] * best_params_final["a"] +\
                 test["pred_2"] * best_params_final["b"]


test.to_csv(CFG.save_dir+f"{CFG.filename}_test_preds.csv", index=False)

test[["id", "pred"]].to_csv(CFG.save_dir + f"kun_{CFG.filename}.csv", index=False, header=None)
test[["id", "pred"]].head(2)

Unnamed: 0,id,pred
0,27532,9122.020113
1,27533,5517.459878


In [None]:
#二重ループで計算

In [22]:
p=1.0

wts_acc_0 = []
for w1_0 in np.arange(0,1.01,0.001):
    w2_0 = 1-w1_0
    oof_pred = w1_0*(df['pred_1']**p) + w2_0*(df['pred_2']**p)
    wts_acc_0.append( (w1_0,w2_0, get_score(df['price'], oof_pred)))
w1_0,w2_0, acc_0 = sorted(wts_acc_0, key=lambda x:x[2], reverse=False)[0]
print(w1_0,w2_0,acc_0)

0.528 0.472 43.50102152112026
