In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')

train = pd.read_csv('playground-series-s5e5/train.csv')
test = pd.read_csv('playground-series-s5e5/test.csv')

In [2]:
numerical_features = ["Age","Height","Weight","Duration","Heart_Rate","Body_Temp"]

In [3]:
import numpy as np 
import pandas as pd
import itertools 
from sklearn.preprocessing import LabelEncoder,PolynomialFeatures,StandardScaler
#積の列を作る
def add_feature_cross_terms(df,features):
    df = df.copy()
    df = df.loc[:,~df.columns.duplicated()]#df.columns.duplicated()で重複したのをtrue/falseで判定し~でビット反転でfalseのみ残る
    for i in range(len(features)):
        for j in range(i+1,len(features)):
            f1 = features[i]
            f2 = features[j]
            df[f"{f1}_x_{f2}"] = df[f1]*df[f1]
    return df
#和、差、商の列を作る
def add_intersection_features(df,features):
    df_new = df.copy()
    for f1,f2 in itertools.combinations(features,2):
        df_new[f"{f1}_plus_{f2}"] = df_new[f1]+df_new[f2]
        df_new[f"{f1}_minus_{f2}"] = df_new[f1]-df_new[f2]
        df_new[f"{f2}_minus_{f1}"] = df_new[f2]-df_new[f1]
        df_new[f"{f1}_div_{f2}"] = df_new[f1]/df_new[f2]
        df_new[f"{f2}_div_{f1}"] = df_new[f2]/df_new[f1]
    return df_new
#平均、標準偏差、最大値、最小値、中央値の列を作る
def add_statistical_features(df,features):
    df_new = df.copy()
    df_new["row_mean"] = df[features].mean(axis=1)
    df_new["row_std"] = df[features].std(axis=1)
    df_new["row_max"] = df[features].max(axis=1)
    df_new["row_min"] = df[features].min(axis=1)
    df_new["row_median"]  = df[features].median(axis=1)
    return df_new
#変換
train = add_feature_cross_terms(train,numerical_features)
test = add_feature_cross_terms(test,numerical_features)

train = add_intersection_features(train,numerical_features)
test = add_intersection_features(test,numerical_features)

train = add_statistical_features(train,numerical_features)
test = add_statistical_features(test,numerical_features)
#カテゴリデータをラベルエンコーダで変換
le = LabelEncoder()
train["Sex"] = le.fit_transform(train["Sex"])
test["Sex"] = le.transform(test["Sex"])
#モデルがカテゴリーと分かるようにする
train["Sex"] = train["Sex"].astype("category")
test["Sex"] = test["Sex"].astype("category")

#多項式で表現できるようにする
poly = PolynomialFeatures(degree=2,interaction_only = True,include_bias=False)
poly_train = poly.fit_transform(train[numerical_features])
poly_test = poly.transform(test[numerical_features])
poly_feature_names = poly.get_feature_names_out(numerical_features)

#データフレームに変換
poly_train_df = pd.DataFrame(poly_train,columns=poly_feature_names)
poly_test_df = pd.DataFrame(poly_test,columns=poly_feature_names)

#横方向（列方向）にくっつけてる
train = pd.concat([train.reset_index(drop=True),poly_train_df],axis=1)
test = pd.concat([test.reset_index(drop=True),poly_test_df],axis=1)

#X,yを作ってyはスケールを変えておく
X = train.drop(columns=['id','Calories'])
y = np.log1p(train['Calories'])
X_test = test.drop(columns=['id'])

FEATURES = X.columns.tolist()

In [4]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time

FOLDS = 4
kf = KFold(n_splits=FOLDS,shuffle=True,random_state=42)
models = {
    'CatBoostRegressor':CatBoostRegressor(
        verbose=1,#ログ
        random_state=42,
        cat_features=['Sex'],#カテゴリカル変数の指定
        early_stopping_rounds=20#検証スコアがX回連続で変化しなかったら停止
        ),
    'XGBoost':XGBRegressor(
        max_depth=10,#決定木の深さ
        col_sample_bytree=.7,#各決定木を作る時の特徴量の割合
        subsample=.9,#各木を作成する際に使用するサンプル数の割合
        n_estimators=500,#作る木の本数
        learning_rate=.02,#各ステップの学習率
        gamma=.01,#ノード分割のために必要な損失関数の最小減少量
        max_delta_step=2,#一回の重み更新で変化できる最大ステップ量
        early_stopping_rounds=50,
        eval_metric='rmse',#モデルの性能を評価する指標
        enable_categorical=True,#カテゴリ変数を直接扱えるようにする
        random_state=42
    ),
    'LightGBM':LGBMRegressor(
        n_estimators=500,#作る木の本数
        learning_rate=.02,#各ステップの学習率
        max_depth=10,#決定木の深さ
        colsample_bytree=.7,#各決定木を作る時の特徴量の割合
        subsample=.9,#各木を作成する際に使用するサンプル数の割合
        random_state=42,
        verbose=-1#ログを一切出さない
    )
}

In [6]:
#モデルごとに初期化
results = {
    name:{
        'oof':np.zeros(len(train)),#Out Of Fold予測値（訓練データ用）
        'pred':np.zeros(len(test)),#テストデータへの予測の平均
        'rmsle':[]#各FoldのRMSLEを格納
        } 
        for name in models}

#モデルごとにKFold学習を実行
for name,model in models.items():
    print(f"\n=== Training {name} ===")

    for i,(train_idx,valid_idx) in enumerate(kf.split(X,y)):
        print(f"\nFold {i+1}")
        
        #学習用データと検証用データに分割
        x_train,y_train = X.iloc[train_idx],y[train_idx]
        x_valid,y_valid = X.iloc[valid_idx],y[valid_idx]

        #重複カラムを除去
        x_train = x_train.loc[:,~x_train.columns.duplicated()]
        x_valid = x_valid.loc[:,~x_valid.columns.duplicated()]
        x_test = X_test.loc[:,~X_test.columns.duplicated()].copy()

        start = time.time()

        #モデルに応じたfit方法を選択
        if name == 'XGBoost':
            model.fit(x_train,y_train,eval_set=[(x_valid,y_valid)],verbose=1)
        elif name == 'CatBoost':#CatBoostはタプルで渡す
            model.fit(x_train,y_train,eval_set=(x_valid,y_valid))
        else:
            model.fit(x_train,y_train)
        
        #検証用データとテストデータの予測
        oof_pred = model.predict(x_valid)
        test_pred = model.predict(x_test)

        #oof予測を該当インデックスに保存
        results[name]['oof'][valid_idx] =oof_pred

        #テスト予測はFOLD数で割って平均（アンサンブル用）
        results[name]['pred'] += test_pred/FOLDS

        #RMSLE計算(log誤差)を格納
        rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid),np.expm1(oof_pred)))
        results[name]['rmsle'].append(rmsle)

        #各Foldのログ出力
        print(f"Fold {i+1} RMSLE: {rmsle:.4f}")
        print(f"Training time: {time.time()-start:.1f} sec")

#モデルごとのRMSLE平均・標準偏差を出力
print("\n=== Model Comparison ===")
for name in models:
    mean_rmsle = np.mean(results[name]['rmsle'])
    std_rmsle = np.std(results[name]['rmsle'])
    print(f"{name} - Mean RMSLE: {mean_rmsle:.4f} ± {std_rmsle:.4f}")        


=== Training CatBoostRegressor ===

Fold 1
Learning rate set to 0.111352
0:	learn: 0.8627118	total: 112ms	remaining: 1m 52s
1:	learn: 0.7733736	total: 150ms	remaining: 1m 14s
2:	learn: 0.6938896	total: 187ms	remaining: 1m 2s
3:	learn: 0.6232190	total: 224ms	remaining: 55.7s
4:	learn: 0.5603128	total: 260ms	remaining: 51.7s
5:	learn: 0.5040283	total: 291ms	remaining: 48.3s
6:	learn: 0.4544974	total: 322ms	remaining: 45.6s
7:	learn: 0.4096903	total: 355ms	remaining: 44.1s
8:	learn: 0.3700868	total: 395ms	remaining: 43.5s
9:	learn: 0.3354074	total: 432ms	remaining: 42.8s
10:	learn: 0.3035157	total: 470ms	remaining: 42.3s
11:	learn: 0.2751375	total: 506ms	remaining: 41.6s
12:	learn: 0.2501790	total: 542ms	remaining: 41.1s
13:	learn: 0.2280906	total: 576ms	remaining: 40.6s
14:	learn: 0.2082323	total: 612ms	remaining: 40.2s
15:	learn: 0.1905951	total: 647ms	remaining: 39.8s
16:	learn: 0.1752108	total: 680ms	remaining: 39.3s
17:	learn: 0.1615690	total: 710ms	remaining: 38.7s
18:	learn: 0.149

In [7]:
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_log_error

#oof予測・テスト予測・正解データを対数変換から戻す
oof_preds = {name: np.expm1(results[name]['oof']) for name in results}
test_preds = {name: np.expm1(results[name]['pred']) for name in results}
y_true = np.expm1(y)

#モデル名のリストを抽出
model_names = list(oof_preds.keys())

#最適化の目的関数(RSMLEを最小化)
def rmsle_loss(weights):
    #各モデルの重み付き平均で予測値をブレンド
    blended = sum(w * oof_preds[name] for w, name in zip(weights, model_names))
    #RSMLEを計算
    loss = np.sqrt(mean_squared_log_error(y_true, blended))
    print(f"weights: {weights}, RMSLE: {loss:.6f}")
    return loss.item()

#初期の重みは等分
initial_weights = [1 / len(model_names)] * len(model_names)
#重みの合計を1に制約
constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
#各重みは0~1の範囲に制限
bounds = [(0, 1)] * len(model_names)

#最適化実行(SLSQP法で連続最適化)
res = minimize(
    rmsle_loss, #最小化したい関数
    initial_weights, #初期値
    method='SLSQP', #最適化手法(連続制約付き最適化)
    bounds=bounds, #各重みの範囲
    constraints=constraints) #重みの合計が１である制約

#最適化された重みを取得
best_weights = res.x

# res.fun       # 最小化されたRMSLE
# res.success   # 最適化が成功したか（True/False）
# res.message   # 結果に関するメッセージ（失敗時に特に重要）

#重みの出力
print(f"\n✅ Optimized Weights:")
print(f"CatBoost = {best_weights[0]:.4f}")
print(f"XGBoost  = {best_weights[1]:.4f}")
print(f"LightGBM = {best_weights[2]:.4f}")

#最適重みによるテストデータの予測
blended_preds = sum(w * test_preds[name] for w, name in zip(best_weights, model_names))
#予測を適切な範囲(1~314)にクリップ
blended_preds = np.clip(blended_preds, 1, 314)

submission = pd.read_csv("playground-series-s5e5/sample_submission.csv")

submission['Calories'] = blended_preds
submission.to_csv('submission.csv', index=False)

print("\nSubmission Head:")
print(submission.head())

print(f"\nPredict Mean: {blended_preds.mean():.2f}")
print(f"Predict Median: {np.median(blended_preds):.2f}")

weights: [0.33333333 0.33333333 0.33333333], RMSLE: 0.059538
weights: [0.33333335 0.33333333 0.33333333], RMSLE: 0.059538
weights: [0.33333333 0.33333335 0.33333333], RMSLE: 0.059538
weights: [0.33333333 0.33333333 0.33333335], RMSLE: 0.059538

✅ Optimized Weights:
CatBoost = 0.3333
XGBoost  = 0.3333
LightGBM = 0.3333

Submission Head:
       id    Calories
0  750000   27.332461
1  750001  107.594434
2  750002   87.242586
3  750003  125.389014
4  750004   75.814062

Predict Mean: 88.16
Predict Median: 76.40


In [8]:
print(res)

 message: Optimization terminated successfully
 success: True
  status: 0
     fun: 0.05953801030803326
       x: [ 3.333e-01  3.333e-01  3.333e-01]
     nit: 1
     jac: [ 7.554e-04  4.907e-04  1.255e-03]
    nfev: 4
    njev: 1


In [9]:
for name in oof_preds:
    print(f"{name}: mean={oof_preds[name].mean():.4f}, std={oof_preds[name].std():.4f}")

CatBoostRegressor: mean=88.2187, std=62.2950
XGBoost: mean=88.1876, std=62.2402
LightGBM: mean=88.1995, std=62.2547
