In [2]:
# 基本的なライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import random
import joblib

from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from sklearn.metrics import mean_absolute_percentage_error

    
# モデル関連
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

# モデル評価・前処理関連
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# ハイパーパラメータ最適化関連
import optuna
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
import optuna.visualization as vis

# その他のユーティリティ
import shap
import joblib

# Fix the seed of every random number generator used in this notebook
# (Python's `random`, NumPy's legacy global RNG, and PyTorch).
SEED = 42

for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    seed_fn(SEED)

# Seed all CUDA devices as well when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
# Load the preprocessed dataset.
all_data = pd.read_csv('../data_processed/all_data1.csv')

# Keep only the first 27532 rows for training.
# NOTE(review): presumably the remaining rows are the unlabeled test set — confirm.
train_data = all_data.iloc[:27532]

# Separate the target column ('price') from the features.
y = train_data['price']
X = train_data.drop(columns=['price'])

# Hold out 10% of the training rows as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [4]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAPE of a random forest.

    Hyperparameters are sampled once per trial, then a fresh
    ``RandomForestRegressor`` is fitted on each training fold of the
    notebook-level ``X`` / ``y`` and scored on the matching validation fold.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample hyperparameters.

    Returns:
        float: Mean of the per-fold mean absolute percentage errors
        (lower is better).

    Raises:
        ValueError: If the sampled ``model_type`` is not ``'random_forest'``.
    """
    # Only RandomForest is searched; the categorical is kept so the model
    # type is recorded together with the other trial parameters.
    model_type = trial.suggest_categorical('model_type', ['random_forest'])
    if model_type != 'random_forest':
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    # Sample the hyperparameters once per trial instead of once per fold:
    # repeated suggest calls with the same name return the same value, so
    # doing it inside the fold loop was redundant work.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 2, 150),
        'max_depth': trial.suggest_int('max_depth', 1, 32, log=True),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': SEED
    }

    # 'auto' stays in the search space so enqueued/previous trials remain
    # valid, but scikit-learn deprecated it in 1.1 and removed it in 1.3.
    # For RandomForestRegressor it meant "use all features", i.e. 1.0.
    if params['max_features'] == 'auto':
        params['max_features'] = 1.0

    # Cross-validation setup.
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    mape_scores = []

    # Fold-local names avoid shadowing the notebook-level hold-out split.
    for train_idx, val_idx in kf.split(X, y):
        X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
        y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

        model = RandomForestRegressor(**params)
        model.fit(X_fold_train, y_fold_train)

        preds = model.predict(X_fold_val)
        mape_scores.append(mean_absolute_percentage_error(y_fold_val, preds))

    # Return the mean cross-validated MAPE.
    return float(np.mean(mape_scores))

In [5]:
# Create the Optuna study (minimise the objective value).
# NOTE(review): the objective defined in this notebook never calls
# trial.report(), so the MedianPruner appears to have nothing to act on — confirm.
tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(
    direction='minimize',
    pruner=median_pruner,
    sampler=tpe_sampler,
)

# Queue a known configuration so it is evaluated as one of the trials.
warm_start_params = {
    'model_type': 'random_forest',
    'n_estimators': 62,
    'max_depth': 27,
    'min_samples_split': 0.21284146256400435,
    'min_samples_leaf': 0.10495836843669494,
    'max_features': 'auto',
    'bootstrap': False,
}
study.enqueue_trial(warm_start_params)

# Run the optimisation: stop after 200 trials or one hour, whichever
# comes first, using 4 parallel workers.
# NOTE(review): with n_jobs > 1 the trial order is not deterministic, so the
# seeded sampler alone may not make the search reproducible — confirm.
study.optimize(
    objective,
    n_trials=200,
    timeout=60 * 60,
    n_jobs=4,
    gc_after_trial=False,
    show_progress_bar=True,
)

print('Value: ', study.best_trial.value)
print('Best trial:', study.best_trial.params)

[I 2023-08-16 04:13:51,226] A new study created in memory with name: no-name-c217dc81-b548-4472-be85-db6f9ad290ed


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-08-16 04:13:51,584] Trial 3 finished with value: 1.1425984790102883 and parameters: {'model_type': 'random_forest', 'n_estimators': 6, 'max_depth': 1, 'min_samples_split': 0.6772800570023035, 'min_samples_leaf': 0.14672034704504938, 'max_features': 'log2', 'bootstrap': True}. Best is trial 3 with value: 1.1425984790102883.
[I 2023-08-16 04:13:52,884] Trial 4 finished with value: 1.057983556730407 and parameters: {'model_type': 'random_forest', 'n_estimators': 48, 'max_depth': 3, 'min_samples_split': 0.9804627422258493, 'min_samples_leaf': 0.44091353393913224, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 4 with value: 1.057983556730407.
[I 2023-08-16 04:13:53,254] Trial 2 finished with value: 1.1428800036864821 and parameters: {'model_type': 'random_forest', 'n_estimators': 147, 'max_depth': 4, 'min_samples_split': 0.6684469240323986, 'min_samples_leaf': 0.14347418315141258, 'max_features': 'log2', 'bootstrap': True}. Best is trial 4 with value: 1.057983556730407.


In [7]:
# Refit the best configuration found by the study on all training rows
# and persist the fitted model.

# Work on a copy so the parameter dict returned by the study is not mutated.
best_params = dict(study.best_trial.params)

# The model type is bookkeeping for the search, not an estimator argument.
model_type = best_params.pop('model_type')

model_name = model_type

if model_type == 'random_forest':
    rf_params = dict(best_params)

    # 'auto' was removed from scikit-learn in 1.3; for RandomForestRegressor
    # it meant "use all features", which is max_features=1.0.
    if rf_params.get('max_features') == 'auto':
        rf_params['max_features'] = 1.0

    # The objective trains with a fixed random_state that is not a sampled
    # parameter, so it is missing from the trial params. Set it explicitly so
    # the saved model is reproducible and matches the evaluated configuration.
    rf_params['random_state'] = SEED

    best_model = RandomForestRegressor(**rf_params)
    best_model.fit(X.values, y.values)

    # Save the fitted random forest.
    joblib.dump(best_model, f'../model/{model_name}.pkl')
else:
    # Fail loudly instead of silently skipping the fit and save.
    raise ValueError(f"Unsupported model_type: {model_type!r}")
