In [1]:
# 基本的なライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import os

# モデル関連
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_absolute_percentage_error
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_absolute_percentage_error

# モデル評価・前処理関連
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# ハイパーパラメータ最適化関連
import optuna
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
import optuna.visualization as vis

# その他のユーティリティ
import shap
import joblib

# 乱数のシードを固定
def torch_fix_seed(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = 'true'
    os.environ['TF_CUDNN_DETERMINISTIC'] = 'true'
    
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True
    torch.backends.cudnn.benchmark = False
    
torch_fix_seed()

# 警告を無視
warnings.filterwarnings("ignore")

In [2]:
# データの読み込み
all_data = pd.read_csv('../../data_processed/all_data.csv')

train_data = all_data[0:27532]
X = train_data.drop(['price'], axis=1)
y = train_data['price']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [3]:
def catboost_params(trial):
    return {
        'depth': trial.suggest_int('cat_depth', 4, 16),
        'learning_rate': trial.suggest_loguniform('cat_learning_rate', 0.00001, 0.1),
        'iterations': trial.suggest_int('cat_iterations', 50, 2000),
        'l2_leaf_reg': trial.suggest_int('cat_l2_leaf_reg', 1, 20),
        'border_count': trial.suggest_categorical('cat_border_count', [32, 64, 128, 256, 512]),
        'thread_count': 4
    }

def lightgbm_params(trial):
    return {
        'num_leaves': trial.suggest_int('lgb_num_leaves', 10, 50),
        'max_depth': trial.suggest_int('lgb_max_depth', 2, 7),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.001, 0.1, log=True),
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 100),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 20, 100),
        'subsample': trial.suggest_float('lgb_subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('lgb_colsample_bytree', 0.4, 1.0, step=0.1),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0.0, 0.5, step=0.1),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0.0, 0.5, step=0.1),
        'force_col_wise': True,  
        'verbose': -1  # 出力を無効にする
    }

def xgboost_params(trial):
    return {
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 9),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0.0, 0.5),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0.0, 0.5),
        'verbosity': 0,  # 出力を無効にする
        'objective': 'reg:squarederror',
        'booster': trial.suggest_categorical('xgb_booster', ['gbtree', 'gblinear', 'dart']),
        'tree_method': 'hist',
        'seed': 42
    }

# def tabnet_params(trial):
#     return {
#         'n_d': trial.suggest_int('tab_n_d', 8, 64),
#         'n_a': trial.suggest_int('tab_n_a', 8, 64),
#         'n_steps': trial.suggest_int('tab_n_steps', 3, 10),
#         'gamma': trial.suggest_float('tab_gamma', 1.0, 2.0),
#         'n_independent': trial.suggest_int('tab_n_independent', 1, 5),
#         'n_shared': trial.suggest_int('tab_n_shared', 1, 5),
#         'lambda_sparse': trial.suggest_float('tab_lambda_sparse', 1e-5, 1e-1, log=True),
#         'optimizer_fn': torch.optim.Adam,
#         'scheduler_params': {"step_size":10, "gamma":0.9},
#         'scheduler_fn': torch.optim.lr_scheduler.StepLR,
#         'verbose': 0
#     }

def randomforest_params(trial):
    return  {
            'n_estimators': trial.suggest_int('rdf_n_estimators', 2, 150),
            'max_depth': trial.suggest_int('rdf_max_depth', 1, 32, log=True),
            'min_samples_split': trial.suggest_float('rdf_min_samples_split', 0.1, 1),
            'min_samples_leaf': trial.suggest_float('rdf_min_samples_leaf', 0.1, 0.5),                
            'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('rdf_bootstrap', [True, False]),
            'random_state': 42
    }
    
def objective(trial):
    # 交差検証の設定
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    mape_scores = []

    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # CatBoost
        cat_model = CatBoostRegressor(**catboost_params(trial), random_seed=42, verbose=0)
        cat_model.fit(X_train, y_train, 
                      eval_set=[(X_val, y_val)], 
                      early_stopping_rounds=10, 
                      cat_features=[])
        
        # LightGBM
        lgb_model = LGBMRegressor(**lightgbm_params(trial), random_state=42)
        lgb_model.fit(X_train, y_train, eval_metric='mape', 
                      eval_set=[(X_val, y_val)],
                      callbacks=[LightGBMPruningCallback(trial, 'l2')]) # 出力を無効にする
        
        # XGBoost
        dval = xgb.DMatrix(X_val, label=y_val)
        xgb_model = xgb.train(xgboost_params(trial), dval, evals=[(dval, 'eval')], 
                              early_stopping_rounds=10, 
                              callbacks=[XGBoostPruningCallback(trial, 'eval-rmse')], verbose_eval=False)
        
        # # tannet
        # tab_model = TabNetRegressor(**tabnet_params(trial))
        # tab_model.fit(X_train.values, y_train.values.reshape(-1, 1), 
        #               eval_set=[(X_val.values, y_val.values.reshape(-1, 1))])
        
        # RandomForest
        rdf_model = RandomForestRegressor(**randomforest_params(trial))
        rdf_model.fit(X_train, y_train)
        
        # 各モデルの予測を取得
        cat_preds = cat_model.predict(X_val)
        lgb_preds = lgb_model.predict(X_val)
        dval = xgb.DMatrix(X_val)
        xgb_preds = xgb_model.predict(dval)
        # tab_preds = tab_model.predict(X_val.values)
        rdf_preds = rdf_model.predict(X_val)
        
        # 5つのモデルの予測を平均して最終的な予測を得る
        final_preds = (cat_preds + lgb_preds + xgb_preds  + rdf_preds) / 4 #+ tab_preds
       
        mape = mean_absolute_percentage_error(y_val, final_preds)
        mape_scores.append(mape)
    
    return sum(mape_scores) / n_splits

In [5]:
# optunaのスタディを作成
study = optuna.create_study(direction="minimize")

# 初期のトライアルを追加
study.enqueue_trial({'cat_depth': 8, 'cat_learning_rate': 0.00651224680897653, 'cat_iterations': 1821, 'cat_l2_leaf_reg': 5, 'cat_border_count': 256, 'lgb_num_leaves': 50, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.091099381427228, 'lgb_n_estimators': 89, 'lgb_min_child_samples': 54, 'lgb_subsample': 0.5, 'lgb_colsample_bytree': 0.6000000000000001, 'lgb_reg_alpha': 0.5, 'lgb_reg_lambda': 0.30000000000000004, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.001031754501018457, 'xgb_n_estimators': 235, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.9871415109340169, 'xgb_colsample_bytree': 0.6444006263081316, 'xgb_reg_alpha': 0.11089498111190996, 'xgb_reg_lambda': 0.30379454797790195, 'xgb_booster': 'dart', 'rdf_n_estimators': 6, 'rdf_max_depth': 11, 'rdf_min_samples_split': 0.7610018174059954, 'rdf_min_samples_leaf': 0.10024596126396466, 'max_features': 'auto', 'rdf_bootstrap': False})

# 最適化を実行  
study.optimize(objective, 
               n_trials=200, 
               timeout=60*60*5, 
               n_jobs=1, 
               gc_after_trial=False, 
               show_progress_bar=True)  # 合計のトライアル数とタイムアウトを調整

print('Value: ', study.best_trial.value)
print('Best trial:', study.best_trial.params)

[I 2023-08-18 15:08:35,885] A new study created in memory with name: no-name-7fcaf4c3-4351-4726-a019-ee517527161c


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-08-18 15:09:03,689] Trial 0 finished with value: 0.5393012455634046 and parameters: {'cat_depth': 8, 'cat_learning_rate': 0.00651224680897653, 'cat_iterations': 1821, 'cat_l2_leaf_reg': 5, 'cat_border_count': 256, 'lgb_num_leaves': 50, 'lgb_max_depth': 6, 'lgb_learning_rate': 0.091099381427228, 'lgb_n_estimators': 89, 'lgb_min_child_samples': 54, 'lgb_subsample': 0.5, 'lgb_colsample_bytree': 0.6000000000000001, 'lgb_reg_alpha': 0.5, 'lgb_reg_lambda': 0.30000000000000004, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.001031754501018457, 'xgb_n_estimators': 235, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.9871415109340169, 'xgb_colsample_bytree': 0.6444006263081316, 'xgb_reg_alpha': 0.11089498111190996, 'xgb_reg_lambda': 0.30379454797790195, 'xgb_booster': 'dart', 'rdf_n_estimators': 6, 'rdf_max_depth': 11, 'rdf_min_samples_split': 0.7610018174059954, 'rdf_min_samples_leaf': 0.10024596126396466, 'max_features': 'auto', 'rdf_bootstrap': False}. Best is trial 0 with value: 0.5393012

: 

In [6]:
# Optunaのスタディを作成
study = optuna.create_study(direction='minimize', 
                            pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), 
                            sampler=optuna.samplers.TPESampler(seed=42))

# 初期のトライアルを追加
study.enqueue_trial({'cat_depth': 10, 'cat_learning_rate': 0.003270535459538743, 'cat_iterations': 1597, 'cat_l2_leaf_reg': 9, 'cat_border_count': 128, 'lgb_num_leaves': 48, 'lgb_max_depth': 5, 'lgb_learning_rate': 0.07011111779424622, 'lgb_n_estimators': 70, 'lgb_min_child_samples': 28, 'lgb_subsample': 0.5, 'lgb_colsample_bytree': 0.8, 'lgb_reg_alpha': 0.4, 'lgb_reg_lambda': 0.5, 'xgb_max_depth': 3, 'xgb_learning_rate': 0.0010770550063251198, 'xgb_n_estimators': 382, 'xgb_min_child_weight': 7, 'xgb_subsample': 0.7897564443091667, 'xgb_colsample_bytree': 0.7831846751932928, 'xgb_reg_alpha': 0.3547068459255278, 'xgb_reg_lambda': 0.4902728190181931, 'xgb_booster': 'dart', 'tab_n_d': 35, 'tab_n_a': 9, 'tab_n_steps': 8, 'tab_gamma': 1.4286132018231517, 'tab_n_independent': 1, 'tab_n_shared': 2, 'tab_lambda_sparse': 0.01624194472948726, 'tab_mask_type': 'tab_sparsemax'})

# 最適化を実行  
study.optimize(objective, 
               n_trials=100, 
               timeout=60*60*5, 
               n_jobs=5, 
               gc_after_trial=False, 
               show_progress_bar=True)  # 合計のトライアル数とタイムアウトを調整

print('  Value: ', study.best_trial.value)
print('Best trial:', study.best_trial.params)

[I 2023-08-18 05:53:09,134] A new study created in memory with name: no-name-d9589bd9-61aa-4d84-9388-ec00ea246a8a


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2023-08-18 05:53:46,255] Trial 3 finished with value: 0.8800996835735749 and parameters: {'cat_depth': 7, 'cat_learning_rate': 0.00022268670782859057, 'cat_iterations': 54, 'cat_l2_leaf_reg': 17, 'cat_border_count': 128, 'lgb_num_leaves': 33, 'lgb_max_depth': 2, 'lgb_learning_rate': 0.0028751486358065824, 'lgb_n_estimators': 66, 'lgb_min_child_samples': 66, 'lgb_subsample': 1.0, 'lgb_colsample_bytree': 0.4, 'lgb_reg_alpha': 0.30000000000000004, 'lgb_reg_lambda': 0.1, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.0961147202989928, 'xgb_n_estimators': 751, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.8945547305123578, 'xgb_colsample_bytree': 0.7865698117461932, 'xgb_reg_alpha': 0.10178173790712203, 'xgb_reg_lambda': 0.07089053261746464, 'xgb_booster': 'dart', 'rdf_n_estimators': 145, 'rdf_max_depth': 1, 'rdf_min_samples_split': 0.7711621421809062, 'rdf_min_samples_leaf': 0.4007885675588351, 'max_features': 'auto', 'rdf_bootstrap': False}. Best is trial 3 with value: 0.8800996835735749.


KeyboardInterrupt: 

[I 2023-08-18 06:01:58,864] Trial 4 finished with value: 0.6044813671219804 and parameters: {'cat_depth': 15, 'cat_learning_rate': 0.006235287096638865, 'cat_iterations': 357, 'cat_l2_leaf_reg': 12, 'cat_border_count': 256, 'lgb_num_leaves': 48, 'lgb_max_depth': 3, 'lgb_learning_rate': 0.03418072523334249, 'lgb_n_estimators': 68, 'lgb_min_child_samples': 72, 'lgb_subsample': 0.9, 'lgb_colsample_bytree': 0.4, 'lgb_reg_alpha': 0.5, 'lgb_reg_lambda': 0.0, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.0014212682667410286, 'xgb_n_estimators': 928, 'xgb_min_child_weight': 9, 'xgb_subsample': 0.5188817440636713, 'xgb_colsample_bytree': 0.8868382490778153, 'xgb_reg_alpha': 0.2759174617337231, 'xgb_reg_lambda': 0.10979927766468761, 'xgb_booster': 'gbtree', 'rdf_n_estimators': 4, 'rdf_max_depth': 6, 'rdf_min_samples_split': 0.3671727646426943, 'rdf_min_samples_leaf': 0.2352770083901374, 'max_features': 'auto', 'rdf_bootstrap': False}. Best is trial 0 with value: 0.556925847591695.


In [None]:
path = 'ave1'

# 最適なトライアルのハイパーパラメータを取得
best_params = study.best_trial.params

# モデルを保存するディレクトリのパスを設定
directory_path = f'../../model/{path}/'
# 指定したディレクトリが存在しない場合、ディレクトリを作成
if not os.path.exists(directory_path):
    os.makedirs(directory_path)
    
# CatBoostモデルのハイパーパラメータを設定して保存
cat_params = {
    'depth': best_params['cat_depth'],
    'learning_rate': best_params['cat_learning_rate'],
    'iterations': best_params['cat_iterations'],
    'l2_leaf_reg': best_params['cat_l2_leaf_reg'],
    'border_count': best_params['cat_border_count'],
    'thread_count': 4,
    'verbose': False
}
cat_best_model = CatBoostRegressor(**cat_params)
cat_best_model.fit(X, y, verbose=False)
# モデルを指定したパスに保存
cat_best_model.save_model(f'../../model/{path}/catboost.cbm')

# LightGBMモデルのハイパーパラメータを設定して保存
lgb_params = {
    'num_leaves': best_params['lgb_num_leaves'],
    'max_depth': best_params['lgb_max_depth'],
    'learning_rate': best_params['lgb_learning_rate'],
    'n_estimators': best_params['lgb_n_estimators'],
    'min_child_samples': best_params['lgb_min_child_samples'],
    'subsample': best_params['lgb_subsample'],
    'colsample_bytree': best_params['lgb_colsample_bytree'],
    'reg_alpha': best_params['lgb_reg_alpha'],
    'reg_lambda': best_params['lgb_reg_lambda'],
    'force_col_wise': True,
    'verbose': -1
}
lgb_best_model = LGBMRegressor(**lgb_params)
lgb_best_model.fit(X, y)
# モデルを指定したパスに保存
lgb_best_model.booster_.save_model(f'../../model/{path}/lightGBM.bin')

# XGBoostモデルのハイパーパラメータを設定して保存
xgb_params = {
    'max_depth': best_params['xgb_max_depth'],
    'learning_rate': best_params['xgb_learning_rate'],
    'n_estimators': best_params['xgb_n_estimators'],
    'min_child_weight': best_params['xgb_min_child_weight'],
    'subsample': best_params['xgb_subsample'],
    'colsample_bytree': best_params['xgb_colsample_bytree'],
    'reg_alpha': best_params['xgb_reg_alpha'],
    'reg_lambda': best_params['xgb_reg_lambda'],
    'verbosity': 0,
    'objective': 'reg:squarederror',
    'booster': best_params['xgb_booster'],
    'tree_method': 'hist',
    'seed': 42
}
dtrain = xgb.DMatrix(X, label=y)
xgb_best_model = xgb.train(xgb_params, dtrain, verbose_eval=False)
# モデルを指定したパスに保存
xgb_best_model.save_model(f'../../model/{path}/XGBoost.xgb')

# TabNetモデルのハイパーパラメータを設定して保存
tab_params = {
    'n_d': best_params['tab_n_d'],
    'n_a': best_params['tab_n_a'],
    'n_steps': best_params['tab_n_steps'],
    'gamma': best_params['tab_gamma'],
    'n_independent': best_params['tab_n_independent'],
    'n_shared': best_params['tab_n_shared'],
    'lambda_sparse': best_params['tab_lambda_sparse'],
    'optimizer_fn': torch.optim.Adam,
    'mask_type': best_params['tab_mask_type'],
    'scheduler_params': {"step_size":10, "gamma":0.9},
    'scheduler_fn': torch.optim.lr_scheduler.StepLR,
    'verbose': 0
}
tab_best_model = TabNetRegressor(**tab_params)
tab_best_model.fit(X.values, y.values.reshape(-1, 1), patience=10, batch_size=256, virtual_batch_size=128, max_epochs=1000, num_workers=0, drop_last=False)
# モデルを指定したパスに保存
torch.save(f'../model/{path}/tabnet')
