In [2]:
# 基本的なライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import os

# モデル関連
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

# モデル評価・前処理関連
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# ハイパーパラメータ最適化関連
import optuna
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
import optuna.visualization as vis

# その他のユーティリティ
import shap
import joblib

# 乱数のシードを固定
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# 警告を無視
warnings.filterwarnings("ignore")

In [3]:
# データの読み込み
all_data = pd.read_csv('../../data_processed/all_data1.csv')

train_data = all_data[0:27532]
X = train_data.drop(['price'], axis=1)
y = train_data['price']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [4]:
def catboost_params(trial):
    return {
        'depth': trial.suggest_int('cat_depth', 4, 16),
        'learning_rate': trial.suggest_loguniform('cat_learning_rate', 0.00001, 0.1),
        'iterations': trial.suggest_int('cat_iterations', 50, 2000),
        'l2_leaf_reg': trial.suggest_int('cat_l2_leaf_reg', 1, 20),
        'border_count': trial.suggest_categorical('cat_border_count', [32, 64, 128, 256, 512]),
        'thread_count': 4
    }

def lightgbm_params(trial):
    return {
        'num_leaves': trial.suggest_int('lgb_num_leaves', 10, 50),
        'max_depth': trial.suggest_int('lgb_max_depth', 2, 7),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.001, 0.1, log=True),
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 100),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 20, 100),
        'subsample': trial.suggest_float('lgb_subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('lgb_colsample_bytree', 0.4, 1.0, step=0.1),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0.0, 0.5, step=0.1),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0.0, 0.5, step=0.1),
        'force_col_wise': True,  
        'verbose': -1  # 出力を無効にする
    }

def xgboost_params(trial):
    return {
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 9),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0.0, 0.5),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0.0, 0.5),
        'verbosity': 0,  # 出力を無効にする
        'objective': 'reg:squarederror',
        'booster': trial.suggest_categorical('xgb_booster', ['gbtree', 'gblinear', 'dart']),
        'tree_method': 'hist',
        'seed': 42
    }

# def tabnet_params(trial):
#     return {
#         'n_d': trial.suggest_int('tab_n_d', 8, 64),
#         'n_a': trial.suggest_int('tab_n_a', 8, 64),
#         'n_steps': trial.suggest_int('tab_n_steps', 3, 10),
#         'gamma': trial.suggest_float('tab_gamma', 1.0, 2.0),
#         'n_independent': trial.suggest_int('tab_n_independent', 1, 5),
#         'n_shared': trial.suggest_int('tab_n_shared', 1, 5),
#         'lambda_sparse': trial.suggest_float('tab_lambda_sparse', 1e-5, 1e-1, log=True),
#         'optimizer_fn': torch.optim.Adam,
#         'scheduler_params': {"step_size":10, "gamma":0.9},
#         'scheduler_fn': torch.optim.lr_scheduler.StepLR,
#         'verbose': 0
#     }

def randomforest_params(trial):
    return  {
            'n_estimators': trial.suggest_int('rdf_n_estimators', 2, 150),
            'max_depth': trial.suggest_int('rdf_max_depth', 1, 32, log=True),
            'min_samples_split': trial.suggest_float('rdf_min_samples_split', 0.1, 1),
            'min_samples_leaf': trial.suggest_float('rdf_min_samples_leaf', 0.1, 0.5),                
            'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('rdf_bootstrap', [True, False]),
            'random_state': 42
    }
    
def gbm_params(trial):
    return {
        'n_estimators': trial.suggest_int('gbm_n_estimators', 50, 1000),
        'max_depth': trial.suggest_int('gbm_max_depth', 3, 9),
        'learning_rate': trial.suggest_loguniform('gbm_learning_rate', 0.001, 0.1),
        'min_samples_split': trial.suggest_int('gbm_min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('gbm_min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('gbm_subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('gbm_max_features', ['auto', 'sqrt', 'log2']),
        'random_state': 42
    }
    
def ridge_params(trial):
    return {
        'alpha': trial.suggest_float('ridge_alpha', 0.001, 10.0, log=True),
        'fit_intercept': trial.suggest_categorical('ridge_fit_intercept', [True, False]),
        'normalize': trial.suggest_categorical('ridge_normalize', [True, False]),
        'random_state': 42
    }
    
def lasso_params(trial):
    return {
        'alpha': trial.suggest_float('lasso_alpha', 0.001, 10.0, log=True),
        'fit_intercept': trial.suggest_categorical('lasso_fit_intercept', [True, False]),
        'normalize': trial.suggest_categorical('lasso_normalize', [True, False]),
        'random_state': 42
    }

    
def objective(trial):
    # 交差検証の設定
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    mape_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # CatBoost
        cat_model = CatBoostRegressor(**catboost_params(trial), random_seed=42, verbose=0)
        cat_model.fit(X_train, y_train, 
                      eval_set=[(X_val, y_val)], 
                      early_stopping_rounds=10, 
                      cat_features=[])
        
        # LightGBM
        lgb_model = LGBMRegressor(**lightgbm_params(trial), random_state=42)
        lgb_model.fit(X_train, y_train, eval_metric='mape', 
                      eval_set=[(X_val, y_val)],
                      callbacks=[LightGBMPruningCallback(trial, 'l2')]) # 出力を無効にする
        
        # XGBoost
        dval = xgb.DMatrix(X_val, label=y_val)
        xgb_model = xgb.train(xgboost_params(trial), dval, evals=[(dval, 'eval')], 
                              early_stopping_rounds=10, 
                              callbacks=[XGBoostPruningCallback(trial, 'eval-rmse')], verbose_eval=False)
        
        # # tannet
        # tab_model = TabNetRegressor(**tabnet_params(trial))
        # tab_model.fit(X_train.values, y_train.values.reshape(-1, 1), 
        #               eval_set=[(X_val.values, y_val.values.reshape(-1, 1))])
        
        # RandomForest
        rdf_model = RandomForestRegressor(**randomforest_params(trial))
        rdf_model.fit(X_train, y_train)
        
        # GBM
        gbm_model = GradientBoostingRegressor(**gbm_params(trial))
        gbm_model.fit(X_train, y_train)
        
        # Ridge Regression
        ridge_model = Ridge(**ridge_params(trial))
        ridge_model.fit(X_train, y_train)
        
        # Lasso Regression
        lasso_model = Lasso(**lasso_params(trial))
        lasso_model.fit(X_train, y_train)
        
        # 各モデルの予測を取得
        cat_preds = cat_model.predict(X_val)
        lgb_preds = lgb_model.predict(X_val)
        dval = xgb.DMatrix(X_val)
        xgb_preds = xgb_model.predict(dval)
        # tab_preds = tab_model.predict(X_val.values)
        gbm_preds = gbm_model.predict(X_val)
        rdf_preds = rdf_model.predict(X_val)
        ridge_preds = ridge_model.predict(X_val)
        lasso_preds = lasso_model.predict(X_val)
        
        # 6つのモデルの予測の重みのハイパーパラメータを設定
        cat_weight = trial.suggest_float('cat_weight', 0, 0.5)
        lgb_weight = trial.suggest_float('lgb_weight', 0, 1 - cat_weight)
        xgb_weight = trial.suggest_float('xgb_weight', 0, 1 - cat_weight - lgb_weight)
        rdf_weight = trial.suggest_float('rdf_weight', 0, 1 - cat_weight - lgb_weight - xgb_weight)
        ridge_weight = trial.suggest_float('ridge_weight', 0, 1 - cat_weight - lgb_weight - xgb_weight - rdf_weight)
        lasso_weight = trial.suggest_float('lasso_weight', 0, 1 - cat_weight - lgb_weight - xgb_weight - rdf_weight - ridge_weight )
        gbm_weight = trial.suggest_float('gbm_weight', 0, 1 - cat_weight - lgb_weight - xgb_weight - rdf_weight - ridge_weight - lasso_weight)

        # 6つのモデルの予測を加重平均して最終的な予測を得る
        final_preds = (cat_preds * cat_weight + lgb_preds * lgb_weight + xgb_preds * xgb_weight + rdf_preds * rdf_weight + ridge_preds * ridge_weight + lasso_preds * lasso_weight + gbm_preds * gbm_weight) / (cat_weight + lgb_weight + xgb_weight + rdf_weight + ridge_weight + lasso_weight + gbm_weight)

    
        mape = mean_absolute_percentage_error(y_val, final_preds)
        mape_scores.append(mape)
    
    return sum(mape_scores) / n_splits

In [6]:
# Optunaのスタディを作成
study = optuna.create_study(direction='minimize', 
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=30), 
                            sampler=optuna.samplers.TPESampler(seed=SEED))

# 初期のトライアルを追加
study.enqueue_trial({'cat_depth': 4, 'cat_learning_rate': 0.0030494571483717183, 'cat_iterations': 1241, 'cat_l2_leaf_reg': 18, 'cat_border_count': 128, 'lgb_num_leaves': 39, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.001543892983209765, 'lgb_n_estimators': 87, 'lgb_min_child_samples': 71, 'lgb_subsample': 0.8, 'lgb_colsample_bytree': 0.4, 'lgb_reg_alpha': 0.1, 'lgb_reg_lambda': 0.4, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.014602575561312671, 'xgb_n_estimators': 941, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.5113888436769296, 'xgb_colsample_bytree': 0.770853966308576, 'xgb_reg_alpha': 0.3197429291354004, 'xgb_reg_lambda': 0.13183424517556913, 'xgb_booster': 'gbtree', 'rdf_n_estimators': 26, 'rdf_max_depth': 3, 'rdf_min_samples_split': 0.21320458702870007, 'rdf_min_samples_leaf': 0.3485339409136515, 'max_features': 'auto', 'rdf_bootstrap': True, 'gbm_n_estimators': 618, 'gbm_max_depth': 6, 'gbm_learning_rate': 0.013105330317351494, 'gbm_min_samples_split': 4, 'gbm_min_samples_leaf': 3, 'gbm_subsample': 0.7884121303425939, 'gbm_max_features': 'log2', 'ridge_alpha': 0.07250219633049332, 'ridge_fit_intercept': True, 'ridge_normalize': False, 'lasso_alpha': 3.019733595109704, 'lasso_fit_intercept': True, 'lasso_normalize': True, 'cat_weight': 0.4967316834305898, 'lgb_weight': 0.025093775479547248, 'xgb_weight': 0.32711741508350634, 'rdf_weight': 0.12708918323428908, 'ridge_weight': 0.01612093140957428, 'lasso_weight': 0.005720489423377935, 'gbm_weight': 0.0011549902928533762})

# 最適化を実行  
study.optimize(objective, 
               n_trials=500, 
               timeout=None, 
               n_jobs=4, 
               gc_after_trial=False, 
               show_progress_bar=True)  # 合計のトライアル数とタイムアウトを調整

[I 2023-08-19 13:57:14,364] A new study created in memory with name: no-name-0e413594-271a-4d16-8811-020973490755


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2023-08-19 13:58:16,034] Trial 2 finished with value: 0.686991672850312 and parameters: {'cat_depth': 4, 'cat_learning_rate': 0.00595722417689089, 'cat_iterations': 743, 'cat_l2_leaf_reg': 2, 'cat_border_count': 256, 'lgb_num_leaves': 47, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.007452027174286386, 'lgb_n_estimators': 83, 'lgb_min_child_samples': 77, 'lgb_subsample': 0.5, 'lgb_colsample_bytree': 0.7000000000000001, 'lgb_reg_alpha': 0.5, 'lgb_reg_lambda': 0.30000000000000004, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.07755817907135486, 'xgb_n_estimators': 313, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.6153677035780876, 'xgb_colsample_bytree': 0.8280815021084594, 'xgb_reg_alpha': 0.3307294910194956, 'xgb_reg_lambda': 0.49663918187468525, 'xgb_booster': 'dart', 'rdf_n_estimators': 103, 'rdf_max_depth': 21, 'rdf_min_samples_split': 0.455827205911816, 'rdf_min_samples_leaf': 0.15236574669452524, 'max_features': 'log2', 'rdf_bootstrap': True, 'gbm_n_estimators': 311, 'gbm_max_depth

ValueError: The `low` value must be smaller than or equal to the `high` value (low=0, high=-0.15892913389151941).

In [9]:
print('  Value: ', study.best_trial.value)
print('Best trial:', study.best_trial.params)

print('cat_weight=', study.best_trial.params['cat_weight'])
print('lgb_weight=', study.best_trial.params['lgb_weight'])
print('xgb_weight=', study.best_trial.params['xgb_weight'])
print('rdf_weight=', study.best_trial.params['rdf_weight'])
print('ridge_weight=', study.best_trial.params['ridge_weight'])
print('lasso_weight=', 1 - study.best_trial.params['cat_weight'] - study.best_trial.params['lgb_weight'] - study.best_trial.params['xgb_weight'] - study.best_trial.params['rdf_weight'] - study.best_trial.params['ridge_weight'])

  Value:  0.474496592104912
Best trial: {'cat_depth': 7, 'cat_learning_rate': 0.011274355582039503, 'cat_iterations': 1226, 'cat_l2_leaf_reg': 11, 'cat_border_count': 32, 'lgb_num_leaves': 24, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.06286827443155239, 'lgb_n_estimators': 99, 'lgb_min_child_samples': 75, 'lgb_subsample': 0.8, 'lgb_colsample_bytree': 0.6000000000000001, 'lgb_reg_alpha': 0.0, 'lgb_reg_lambda': 0.4, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.0013235141041058147, 'xgb_n_estimators': 525, 'xgb_min_child_weight': 4, 'xgb_subsample': 0.5271654583372686, 'xgb_colsample_bytree': 0.9996640851311875, 'xgb_reg_alpha': 0.013522549859538102, 'xgb_reg_lambda': 0.022319512859360957, 'xgb_booster': 'gbtree', 'rdf_n_estimators': 44, 'rdf_max_depth': 30, 'rdf_min_samples_split': 0.9718041652771578, 'rdf_min_samples_leaf': 0.49375371323353406, 'max_features': 'auto', 'rdf_bootstrap': False, 'ridge_alpha': 0.0547728225240236, 'ridge_fit_intercept': True, 'ridge_normalize': False, 'lasso_

In [5]:
path = 'weighted_ave(all_data1)'

# 最適なトライアルのハイパーパラメータを取得
best_params = study.best_trial.params

# モデルを保存するディレクトリのパスを設定
directory_path = f'../../model/{path}/'
# 指定したディレクトリが存在しない場合、ディレクトリを作成
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    
# CatBoostモデルのハイパーパラメータを設定して保存
cat_params = {
    'depth': best_params['cat_depth'],
    'learning_rate': best_params['cat_learning_rate'],
    'iterations': best_params['cat_iterations'],
    'l2_leaf_reg': best_params['cat_l2_leaf_reg'],
    'border_count': best_params['cat_border_count'],
    'thread_count': 4,
    'verbose': False
}
cat_best_model = CatBoostRegressor(**cat_params)
cat_best_model.fit(X, y, verbose=False)
# モデルを指定したパスに保存
cat_best_model.save_model(f'../../model/{path}/catboost.cbm')

# LightGBMモデルのハイパーパラメータを設定して保存
lgb_params = {
    'num_leaves': best_params['lgb_num_leaves'],
    'max_depth': best_params['lgb_max_depth'],
    'learning_rate': best_params['lgb_learning_rate'],
    'n_estimators': best_params['lgb_n_estimators'],
    'min_child_samples': best_params['lgb_min_child_samples'],
    'subsample': best_params['lgb_subsample'],
    'colsample_bytree': best_params['lgb_colsample_bytree'],
    'reg_alpha': best_params['lgb_reg_alpha'],
    'reg_lambda': best_params['lgb_reg_lambda'],
    'force_col_wise': True,
    'verbose': -1
}
lgb_best_model = LGBMRegressor(**lgb_params)
lgb_best_model.fit(X, y)
# モデルを指定したパスに保存
lgb_best_model.booster_.save_model(f'../../model/{path}/lightGBM.bin')

# XGBoostモデルのハイパーパラメータを設定して保存
xgb_params = {
    'max_depth': best_params['xgb_max_depth'],
    'learning_rate': best_params['xgb_learning_rate'],
    'n_estimators': best_params['xgb_n_estimators'],
    'min_child_weight': best_params['xgb_min_child_weight'],
    'subsample': best_params['xgb_subsample'],
    'colsample_bytree': best_params['xgb_colsample_bytree'],
    'reg_alpha': best_params['xgb_reg_alpha'],
    'reg_lambda': best_params['xgb_reg_lambda'],
    'verbosity': 0,
    'objective': 'reg:squarederror',
    'booster': best_params['xgb_booster'],
    'tree_method': 'hist',
    'seed': 42
}
dtrain = xgb.DMatrix(X, label=y)
xgb_best_model = xgb.train(xgb_params, dtrain, verbose_eval=False)
# モデルを指定したパスに保存
xgb_best_model.save_model(f'../../model/{path}/XGBoost.xgb')


SyntaxError: f-string: invalid syntax (2218431985.py, line 7)

: 