# Ensemble
---

In [1]:
# =============================
# import libraries
# =============================
import os, re, gc, copy, pickle, yaml
import sys
# import warnings
# warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import polars as pl
from tqdm import tqdm
pd.set_option("display.max_columns", 3000)
pd.set_option("display.max_rows", 3000)

# models
import lightgbm as lgb
from lightgbm import LGBMRegressor
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

from catboost import CatBoostRegressor, Pool

from sklearn.model_selection import KFold
from category_encoders import OrdinalEncoder, CountEncoder
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import mean_absolute_error

# original
sys.path.append(r"..")
import utils
from utils.data import *

In [2]:
class CFG:
    DEBUG = False # False    
    EXP = "900_Ensemble_EXP001"
    SEED = 2048
    n_trials = 10

if CFG.DEBUG:
    CFG.EXP = "900_Ensemble_DEBUG"

class PATHS:
    DATA_DIR = "../input/atmaCup#18_dataset/"

    INPUT_DIR0 = "../output/200_multimodalNN_exp019" # 0.2019
    INPUT_DIR1 = "../output/200_multimodalNN_exp018" # 0.2002
    INPUT_DIR2 = "../output/200_multimodalNN_exp017" # 0.1992
    INPUT_DIR3 = "../output/200_multimodalNN_exp013" # 0.2020
    INPUT_DIR4 = "../output/200_multimodalNN_exp010" # 0.2035

    OUTPUT_DIR = f"../output/{CFG.EXP}"


use_paths = [PATHS.INPUT_DIR0, PATHS.INPUT_DIR1, PATHS.INPUT_DIR2, PATHS.INPUT_DIR3, PATHS.INPUT_DIR4, ]
os.makedirs(PATHS.OUTPUT_DIR, exist_ok=True)

In [3]:
# seedの固定
def seed_everything(seed):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=CFG.SEED)

In [4]:
train_df = pd.read_csv(Path(PATHS.DATA_DIR)/"train_features.csv")
show_df(train_df)

(43371, 30)


Unnamed: 0,ID,vEgo,aEgo,steeringAngleDeg,steeringTorque,brake,brakePressed,gas,gasPressed,gearShifter,leftBlinker,rightBlinker,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,00066be8e20318869c38c66be466631a_320,5.701526,1.538456,-2.165777,-139.0,0.0,False,0.25,True,drive,False,False,2.82959,0.032226,0.045187,6.231999,0.065895,0.107974,9.785009,0.124972,0.203649,13.485472,0.163448,0.302818,17.574227,0.174289,0.406331,21.951269,0.199503,0.485079
1,00066be8e20318869c38c66be466631a_420,11.176292,0.279881,-11.625697,-44.0,0.0,False,0.0,False,drive,False,True,4.970268,-0.007936,0.005028,10.350489,-0.032374,-0.020701,15.770054,0.084073,0.008645,21.132415,0.391343,0.036335,26.316489,0.843124,0.065,31.383814,1.42507,0.073083
2,00066be8e20318869c38c66be466631a_520,10.472548,0.231099,-2.985105,-132.0,0.0,False,0.18,True,drive,False,False,4.815701,-0.000813,0.017577,10.153522,-0.0278,0.026165,15.446539,-0.155987,0.040397,20.61816,-0.356932,0.058765,25.677387,-0.576985,0.102859,30.460033,-0.841894,0.152889


In [5]:
pd.read_csv(Path(PATHS.INPUT_DIR0)/"oof_df.csv")

Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.744141,0.045715,0.052246,6.125000,0.091614,0.127686,9.773438,0.127075,0.216187,13.601562,0.153442,0.308350,17.625000,0.180176,0.403076,21.843750,0.200806,0.499268
1,4.941406,-0.013428,0.002136,10.382812,-0.007671,0.015038,15.757812,0.111450,0.021805,21.062500,0.410645,0.034546,26.265625,0.876953,0.044495,31.375000,1.513672,0.053894
2,4.804688,-0.037262,0.015091,10.171875,-0.114502,0.039032,15.476562,-0.230469,0.066650,20.625000,-0.381104,0.089844,25.656250,-0.560547,0.106750,30.640625,-0.754883,0.119080
3,2.816406,0.046234,0.010376,5.960938,0.161499,0.021240,9.156250,0.353516,0.031921,12.429688,0.618164,0.038208,15.765625,0.940918,0.045624,19.203125,1.315430,0.053345
4,1.587891,-0.041077,-0.002090,3.675781,-0.116821,-0.006882,6.085938,-0.236938,-0.006733,8.742188,-0.392578,-0.003193,11.617188,-0.573730,0.010620,14.640625,-0.761719,0.019379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43366,0.020462,-0.001877,-0.009254,0.034821,-0.001938,-0.020813,0.039062,0.001259,-0.030518,0.030396,0.004917,-0.041260,0.010986,0.008553,-0.046539,-0.010498,0.014427,-0.054810
43367,0.710449,0.001896,-0.007576,1.543945,-0.005447,-0.013351,2.421875,-0.009392,-0.018539,3.285156,-0.007553,-0.028412,4.082031,-0.020752,-0.025787,4.750000,-0.030640,-0.030838
43368,0.025513,0.002621,-0.008110,0.050659,0.003746,-0.015587,0.072021,0.004852,-0.023239,0.081970,0.005417,-0.032227,0.099182,-0.001230,-0.037506,0.120422,-0.006264,-0.045441
43369,0.002777,-0.001938,-0.003456,0.007202,-0.003876,-0.006546,0.016418,-0.004204,-0.010094,0.025635,-0.001625,-0.012527,0.034424,0.004051,-0.017059,0.056396,0.014252,-0.020325


In [6]:
TARGET_COLUMNS = [
    'x_0',
    'y_0',
    'z_0',
    'x_1',
    'y_1',
    'z_1',
    'x_2',
    'y_2',
    'z_2',
    'x_3',
    'y_3',
    'z_3',
    'x_4',
    'y_4',
    'z_4',
    'x_5',
    'y_5',
    'z_5'
    ]

In [7]:
def read_df(input_dir):
    sep(input_dir)
    df  = pd.read_csv(input_dir/"oof_df.csv", usecols=TARGET_COLUMNS)
    # df  = df0.sort_values(by=["Clothing ID", "Age", "Rating", "Positive Feedback Count"], ascending=True) 
    # df = df0.drop(["Clothing ID", "Age", "Rating", "Positive Feedback Count"], axis=1)
    df.reset_index(drop=True, inplace=True) 
    sub = pd.read_csv(input_dir/"sub.csv")
    show_df(df)
    show_df(sub)
    return df, sub

In [8]:
df_list, sub_list = [], []

for path in use_paths:
    df, sub = read_df(Path(path))
    df_list.append(df)
    sub_list.append(sub)   

../output/200_multimodalNN_exp019
(43371, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.744141,0.045715,0.052246,6.125,0.091614,0.127686,9.773438,0.127075,0.216187,13.601562,0.153442,0.30835,17.625,0.180176,0.403076,21.84375,0.200806,0.499268
1,4.941406,-0.013428,0.002136,10.382812,-0.007671,0.015038,15.757812,0.11145,0.021805,21.0625,0.410645,0.034546,26.265625,0.876953,0.044495,31.375,1.513672,0.053894
2,4.804688,-0.037262,0.015091,10.171875,-0.114502,0.039032,15.476562,-0.230469,0.06665,20.625,-0.381104,0.089844,25.65625,-0.560547,0.10675,30.640625,-0.754883,0.11908


(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.510938,-0.061296,-0.005244,3.188867,-0.140811,-0.012317,4.866406,-0.197461,-0.019052,6.530859,-0.208664,-0.025737,8.15625,-0.148474,-0.03071,9.714844,0.011008,-0.034967
1,0.980713,0.339893,0.005789,1.896973,0.905664,0.014883,2.644336,1.615332,0.023221,3.262109,2.430859,0.039029,3.827539,3.326172,0.056429,4.390625,4.289258,0.076122
2,1.623535,0.014553,0.002849,3.338281,0.033572,0.003493,4.953125,0.038583,0.000473,6.483594,0.020758,-0.004544,7.937891,-0.018253,-0.011262,9.311719,-0.075925,-0.02161


../output/200_multimodalNN_exp018
(43371, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.761719,0.035706,0.042206,6.121094,0.092102,0.106628,9.71875,0.123779,0.201538,13.578125,0.172485,0.298828,17.578125,0.17749,0.402588,21.84375,0.209351,0.486816
1,4.96875,0.006973,0.004658,10.421875,0.005322,0.011765,15.820312,0.098572,0.018585,21.109375,0.360107,0.030518,26.296875,0.817383,0.047791,31.4375,1.472656,0.05899
2,4.796875,-0.001727,0.010849,10.171875,-0.064697,0.028809,15.421875,-0.169678,0.056152,20.609375,-0.33374,0.076477,25.5625,-0.547852,0.099976,30.53125,-0.881348,0.108643


(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.528809,-0.032549,0.005716,3.199609,-0.069516,0.012828,4.833984,-0.088037,0.022949,6.385937,-0.08029,0.03703,7.842578,-0.030913,0.047469,9.21875,0.070865,0.066534
1,1.014111,0.359351,0.009001,2.025586,0.992773,0.022472,2.871875,1.868066,0.027857,3.650195,2.932227,0.051936,4.313867,4.163867,0.07683,4.937109,5.550781,0.100159
2,1.626953,0.025259,-0.00372,3.325781,0.04387,-0.013398,4.912891,0.053944,-0.025221,6.390234,0.044824,-0.033766,7.786328,0.023442,-0.040805,9.103125,-0.008088,-0.061823


../output/200_multimodalNN_exp017
(43371, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.667969,0.037018,0.052734,5.9375,0.052979,0.150146,9.484375,0.124512,0.206055,13.375,0.149536,0.331055,17.46875,0.231201,0.408691,21.9375,0.24231,0.541016
1,4.960938,0.033966,0.004414,10.414062,0.069275,0.016403,15.8125,0.161255,0.031097,21.09375,0.331543,0.034729,26.296875,0.637207,0.049805,31.359375,1.058594,0.042816
2,4.691406,-0.021286,0.0215,9.960938,-0.07605,0.046326,15.257812,-0.185791,0.076843,20.515625,-0.342773,0.109253,25.796875,-0.544434,0.127441,31.09375,-0.791016,0.135986


(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.516724,-0.053839,-0.010664,3.176758,-0.107006,-0.015203,4.808105,-0.156,-0.024798,6.37793,-0.172259,-0.029809,7.868652,-0.125813,-0.040474,9.303711,-0.015583,-0.042565
1,0.972839,0.36499,0.001394,1.935059,0.991089,0.009871,2.771973,1.878662,0.007523,3.519287,2.944092,0.016431,4.207275,4.224854,0.0308,4.87915,5.631348,0.049563
2,1.623413,0.01775,-0.005462,3.334717,0.036302,-0.007302,4.943359,0.042696,-0.015741,6.481445,0.036593,-0.029288,7.925781,0.038991,-0.029786,9.327148,0.028403,-0.03458


../output/200_multimodalNN_exp013
(43371, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.679688,-0.01088,0.004997,5.949219,-0.019287,0.005322,9.539062,-0.064941,0.014236,13.460938,-0.114868,0.017487,17.6875,-0.154053,0.038666,22.234375,-0.248779,0.067261
1,5.015625,-0.052307,-0.000577,10.53125,-0.150391,-0.002401,15.992188,-0.25708,-0.008812,21.390625,-0.351074,-0.017746,26.734375,-0.425049,-0.030365,32.03125,-0.493408,-0.043152
2,4.765625,-0.036591,0.001671,10.140625,-0.164917,0.006927,15.578125,-0.383301,0.009956,21.03125,-0.678711,0.013054,26.515625,-1.052734,0.030991,32.03125,-1.425781,0.029663


(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.480713,-0.038952,0.00161,3.097656,-0.087357,0.006561,4.676758,-0.106888,0.008352,6.195312,-0.093113,0.013573,7.631836,-0.040176,0.016578,8.964844,0.060794,0.024148
1,0.965088,0.331421,-0.004208,1.871338,0.935913,-0.011543,2.649414,1.732422,-0.020658,3.285645,2.720703,-0.024492,3.969727,3.86377,-0.028913,4.633789,5.137695,-0.02516
2,1.608154,0.025997,-0.002378,3.306641,0.031275,-0.007015,4.921875,0.009785,-0.015617,6.457031,-0.045033,-0.027975,7.90918,-0.120512,-0.043938,9.289062,-0.217285,-0.06419


../output/200_multimodalNN_exp010
(43371, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,2.648438,-0.003227,0.029175,5.960938,-0.015549,0.085632,9.609375,-0.054626,0.137573,13.609375,-0.106384,0.210571,18.0,-0.165894,0.296143,22.578125,-0.210815,0.372559
1,5.011719,-0.026428,0.015671,10.554688,-0.095581,0.027588,16.0,-0.118958,0.026093,21.390625,-0.059509,0.028,26.6875,0.079773,0.042145,31.890625,0.341797,0.007912
2,4.742188,-0.020874,-0.006546,10.125,-0.104065,-0.020142,15.601562,-0.237305,-0.018616,21.171875,-0.436768,-0.022415,26.765625,-0.685059,-0.032166,32.375,-0.989746,-0.036713


(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.471875,-0.042694,-0.001044,3.087109,-0.097357,-0.000404,4.666406,-0.135303,0.007832,6.177344,-0.139429,-0.004208,7.648438,-0.097717,0.002769,9.004687,-0.016714,0.00761
1,1.018359,0.353125,-0.000532,2.029297,0.970996,-0.006831,2.937891,1.826758,-0.002113,3.829687,2.886719,-0.004466,4.717187,4.103125,0.018973,5.608594,5.535156,0.00295
2,1.618555,0.027814,-0.002793,3.320703,0.045447,-0.012654,4.950781,0.048373,-0.019638,6.494531,0.060681,-0.036943,7.973438,0.042111,-0.0502,9.364062,0.018147,-0.074507


# Nealder-mead

In [9]:
from scipy.optimize import minimize
from sklearn.metrics import mean_absolute_error
import numpy as np

# 各DataFrameのRecommended_predを結合 (横方向に結合)
preds_list = [df[TARGET_COLUMNS].values for df in df_list]  # 各モデルの予測値 (リストに保存)
preds = np.stack(preds_list, axis=-1)  # (N_samples, N_columns, N_models)
target = train_df[TARGET_COLUMNS].values  # (N_samples, N_columns)

# MAEを計算する関数（最小化対象）
def objective_mae(weights):
    weights = np.array(weights).reshape(-1, 1)  # (N_models, 1)
    # 各ターゲット列に対応する重み付き平均を計算
    ensemble_pred = np.dot(preds, weights).squeeze(-1)  # (N_samples, N_columns)
    mae = mean_absolute_error(target, ensemble_pred, multioutput="raw_values").mean()
    return mae

# 初期値と制約条件
initial_weights = np.ones(preds.shape[-1]) / preds.shape[-1]  # 各モデルに均等な重みを設定
constraints = {
    'type': 'eq', 
    'fun': lambda w: 1 - sum(w)  # 重みの合計が1になるよう制約
}
bounds = [(0, 1) for _ in range(preds.shape[-1])]  # 各重みは0～1の範囲

# 最適化
result = minimize(objective_mae, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)

# 最適化結果
optimal_weights = result.x
print("Optimal Weights:", optimal_weights)
print("Minimum MAE:", result.fun)  # MAEは最小化しているため、そのまま表示

Optimal Weights: [0.65001865 0.33173041 0.00740081 0.00934847 0.00150166]
Minimum MAE: 0.026037593065093418


In [10]:
ensemble_sub_pred = np.zeros_like(sub_list[0])

for i, sub in enumerate(sub_list): 
    ensemble_sub_pred += np.dot(sub, optimal_weights[i])

# 新しいsubmissionファイルの作成
submission = pd.DataFrame(ensemble_sub_pred, columns=TARGET_COLUMNS)

# 保存する場合
output_path = Path(PATHS.OUTPUT_DIR) / "submission_nealder_mead.csv"
submission.to_csv(output_path, index=False)

# 結果の確認
show_df(submission)

(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.516567,-0.051468,-0.001578,3.191336,-0.116345,-0.003803,4.853146,-0.159915,-0.004865,6.477985,-0.164625,-0.004546,8.044402,-0.108219,-0.004356,9.539154,0.031091,-0.000736
1,0.991644,0.346474,0.006719,1.939879,0.935574,0.017084,2.72125,1.702533,0.024194,3.393825,2.604371,0.042484,3.994344,3.616904,0.062153,4.579628,4.727479,0.082843
2,1.624517,0.018255,0.000551,3.333786,0.037005,-0.002313,4.93941,0.043455,-0.008351,6.452376,0.028304,-0.014688,7.887308,-0.004863,-0.021563,9.242503,-0.053829,-0.035523


# Simple equation by Optuna

In [11]:
import optuna
import numpy as np
from sklearn.metrics import mean_absolute_error

def objective(trial, train_df, df_list):
    num_data = len(df_list)
    weight = []

    # 各データフレームに対する重みを試行する
    for i in range(num_data):  # range(num_data) でインデックスを取得
        weight.append(trial.suggest_float(f"weight{i}", 0.0, 1.0))  # 重みをリストに追加

    # 重みの合計が1になるように正規化する
    weight_sum = sum(weight)  # weight内の全ての要素を加算
    weight = [w / weight_sum for w in weight]  # リスト全体を正規化

    # 重み付けアンサンブル
    ensemble_pred = np.zeros_like(df_list[0])  # ベースとなる配列をゼロで初期化
    for i in range(num_data):
        ensemble_pred += weight[i] * df_list[i]  # 重み付き加算

    # MAEの計算
    target = train_df[TARGET_COLUMNS].values
    mae = mean_absolute_error(target, ensemble_pred, multioutput="raw_values").mean()
    
    return mae

In [12]:
# Optunaで最適化を実行
study = optuna.create_study(direction="minimize")  # MAEを最小化するので "minimize"
study.optimize(lambda trial: objective(trial, train_df, df_list), n_trials=CFG.n_trials)  # 3000回の試行を実行

# 最適化された重みを取得
optimal_weights = study.best_params
print("Optimal Weights:", optimal_weights)

[I 2024-11-24 02:50:17,152] A new study created in memory with name: no-name-6793ec21-5f46-4177-af1c-0fd5a88c906f
[I 2024-11-24 02:50:17,214] Trial 0 finished with value: 0.05468634998168141 and parameters: {'weight0': 0.9388187718262657, 'weight1': 0.5393631595371667, 'weight2': 0.23023165025521575, 'weight3': 0.35658126184704986, 'weight4': 0.25997772819257126}. Best is trial 0 with value: 0.05468634998168141.
[I 2024-11-24 02:50:17,259] Trial 1 finished with value: 0.08103975328183305 and parameters: {'weight0': 0.6424090689704218, 'weight1': 0.11593399603710042, 'weight2': 0.09220850006854564, 'weight3': 0.17755634144910892, 'weight4': 0.5890618508601653}. Best is trial 0 with value: 0.05468634998168141.
[I 2024-11-24 02:50:17,300] Trial 2 finished with value: 0.08210653340020585 and parameters: {'weight0': 0.18396929802524098, 'weight1': 0.5635391180165019, 'weight2': 0.054195518749424076, 'weight3': 0.0715250362980131, 'weight4': 0.6932976743887522}. Best is trial 0 with value: 0

Optimal Weights: {'weight0': 0.9388187718262657, 'weight1': 0.5393631595371667, 'weight2': 0.23023165025521575, 'weight3': 0.35658126184704986, 'weight4': 0.25997772819257126}


In [13]:
# 重みの合計を計算
total_weight = sum(optimal_weights.values())

# 正規化
normalized_weights = {key: value / total_weight for key, value in optimal_weights.items()}

print("Normalized Weights:", normalized_weights)
print("Sum of Normalized Weights:", sum(normalized_weights.values()))

Normalized Weights: {'weight0': 0.4037977837977935, 'weight1': 0.2319868914206029, 'weight2': 0.09902553391888179, 'weight3': 0.15337009399329, 'weight4': 0.11181969686943194}
Sum of Normalized Weights: 1.0


In [14]:
ensemble_sub_pred = np.zeros_like(sub_list[0])

for i, sub in enumerate(sub_list): 
    ensemble_sub_pred += np.dot(sub, normalized_weights[f"weight{i}"])

# 新しいsubmissionファイルの作成
submission = pd.DataFrame(ensemble_sub_pred, columns=TARGET_COLUMNS)

# 保存する場合
output_path = Path(PATHS.OUTPUT_DIR) / "submission_optuna.csv"
submission.to_csv(output_path, index=False)

# 結果の確認
show_df(submission)

(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.506653,-0.048382,-0.001718,3.164793,-0.107867,-0.002542,4.801661,-0.147129,-0.002668,6.391103,-0.149814,-0.003143,7.91779,-0.096672,-0.002544,9.364607,0.026797,0.001655
1,0.989494,0.347072,0.003859,1.941446,0.946276,0.009666,2.743365,1.741639,0.013179,3.444683,2.693421,0.02518,4.099252,3.778829,0.041346,4.739266,4.984257,0.055353
2,1.6214,0.020591,-0.000931,3.32821,0.037207,-0.004912,4.937769,0.039232,-0.01181,6.458872,0.022283,-0.02099,7.901102,-0.011845,-0.029316,9.267234,-0.061018,-0.044669


# Ridge回帰

In [15]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
# 各DataFrameのRecommended_pred列を結合して配列に変換
preds_list = [df[TARGET_COLUMNS].values for df in df_list]  # 各モデルの予測値 (リストに保存)
preds = np.hstack(preds_list)  # (N_samples, N_columns, N_models)
X = preds
y = train_df[TARGET_COLUMNS].values  # avg_ratingはどのDataFrameでも同じと仮定
print(X.shape, y.shape)

(43371, 90) (43371, 18)


In [16]:
# 学習データとテストデータに分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optunaで最適化を実行する関数
def objective(trial):
    alpha = trial.suggest_loguniform('alpha', 1e-5, 10)
    
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    mae = mean_absolute_error(y_val, y_pred)
    return mae

In [17]:
# OptunaでRidge回帰のパラメータを最適化
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=CFG.n_trials)

[I 2024-11-24 02:50:17,719] A new study created in memory with name: no-name-25cb36b4-7eeb-4476-807a-31ab9b604568
  alpha = trial.suggest_loguniform('alpha', 1e-5, 10)
[I 2024-11-24 02:50:17,778] Trial 0 finished with value: 0.026851482455750045 and parameters: {'alpha': 0.021892950796012934}. Best is trial 0 with value: 0.026851482455750045.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 10)
[I 2024-11-24 02:50:17,882] Trial 1 finished with value: 0.026612693787315764 and parameters: {'alpha': 2.672576072166833}. Best is trial 1 with value: 0.026612693787315764.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 10)
[I 2024-11-24 02:50:18,022] Trial 2 finished with value: 0.02660342650805613 and parameters: {'alpha': 3.178080207265841}. Best is trial 2 with value: 0.02660342650805613.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 10)
[I 2024-11-24 02:50:18,144] Trial 3 finished with value: 0.026872353248970585 and parameters: {'alpha': 0.000508208462168539}. Best is trial 2 with

In [18]:
# 最適化されたパラメータでRidgeモデルを学習
optimal_alpha = study.best_params['alpha']
ridge_model = Ridge(alpha=optimal_alpha)
ridge_model.fit(X, y)

In [19]:
# submissionファイルのRecommended_pred列を結合して配列に変換
preds = np.hstack(sub_list)  # (N_samples, N_columns, N_models)
X_sub = preds

# 最適化されたRidgeモデルでアンサンブル予測を作成
sub_ensemble = ridge_model.predict(X_sub)

# アンサンブル結果を新しいsubmissionファイルに保存
submission = pd.DataFrame(sub_ensemble, columns=TARGET_COLUMNS)
submission.to_csv(Path(PATHS.OUTPUT_DIR) / "submission_ridge.csv", index=False)
show_df(submission)

(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.516745,-0.078,0.000549,3.194691,-0.150091,-0.002104,4.840304,-0.186203,-0.007082,6.416299,-0.171396,-0.012037,7.928573,-0.083404,-0.013079,9.383639,0.088921,-0.01296
1,0.97679,0.285308,0.01565,1.92703,0.852884,0.032604,2.735143,1.670636,0.046973,3.474979,2.652289,0.062397,4.180607,3.76414,0.082308,4.838366,4.96492,0.103285
2,1.631463,0.02095,0.0013,3.340783,0.03813,0.000347,4.93699,0.044204,-0.002854,6.431202,0.032901,-0.009241,7.83727,0.00231,-0.018999,9.173926,-0.047218,-0.03084


# Simple_ensemble

In [23]:
# 最適化された重みでアンサンブル予測を作成
weight = [0.00, 0.30, 0.40, 0.30, 0.00]

ensemble_sub_pred = np.zeros_like(sub_list[0])

for i, sub in enumerate(sub_list): 
    ensemble_sub_pred += np.dot(sub, weight[i])

# 新しいsubmissionファイルの作成
submission = pd.DataFrame(ensemble_sub_pred, columns=TARGET_COLUMNS)

# 保存する場合
output_path = Path(PATHS.OUTPUT_DIR) / "submission_simple_ens.csv"
submission.to_csv(output_path, index=False)

# 結果の確認
show_df(submission)

(1727, 18)


Unnamed: 0,x_0,y_0,z_0,x_1,y_1,z_1,x_2,y_2,z_2,x_3,y_3,z_3,x_4,y_4,z_4,x_5,y_5,z_5
0,1.509546,-0.042986,-0.002068,3.159883,-0.089864,-0.000264,4.776465,-0.120878,-0.000529,6.325547,-0.120925,0.003257,7.789785,-0.071652,0.003024,9.176562,0.033264,0.010179
1,0.982896,0.353228,0.001996,1.943101,0.975042,0.007227,2.765176,1.831611,0.005169,3.488467,2.873516,0.014806,4.167988,4.098232,0.026695,4.82293,5.459082,0.042325
2,1.619897,0.022477,-0.004015,3.323613,0.037064,-0.009045,4.927773,0.036197,-0.018548,6.446758,0.014575,-0.030237,7.878965,-0.013525,-0.037337,9.248516,-0.056251,-0.051636


In [22]:
len(sub_list)

5