In [1]:
# 基本的なライブラリ
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# モデル関連
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

# モデル評価・前処理関連
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, cross_val_score, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

# ハイパーパラメータ最適化関連
import optuna
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
import optuna.visualization as vis

# その他のユーティリティ
import shap
import joblib

# 乱数のシードを固定
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# 警告を無視
warnings.filterwarnings("ignore")

In [2]:
# データの読み込み
all_data = pd.read_csv('../data_processed/all_data.csv')

train_data = all_data[0:27532]
X = train_data.drop(['price'], axis=1)
y = train_data['price']

# データの分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

In [9]:
def objective(trial):
    # モデルの選択
    model_type = trial.suggest_categorical('model_type', ['catboost', 'lightgbm','xgboost'])
    
    # 交差検証の設定
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    mape_scores = []
    
    for train_idx, val_idx in kf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        if model_type == 'catboost':
            params = {
                'depth': trial.suggest_int('depth', 4, 16),
                'learning_rate': trial.suggest_loguniform('learning_rate', 0.00001, 0.1),
                'iterations': trial.suggest_int('iterations', 50, 2000),
                'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 20),
                'border_count': trial.suggest_categorical('border_count', [32, 64, 128, 256, 512]),
                'thread_count': 4
            }
            model = CatBoostRegressor(**params, random_seed=42, verbose=0)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, 
                      cat_features=['region', 'manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state'])
        
        elif model_type == 'lightgbm':
            params = {
                'num_leaves': trial.suggest_int('num_leaves', 10, 50),
                'max_depth': trial.suggest_int('max_depth', -1, 5),
                'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
                'n_estimators': trial.suggest_int('n_estimators', 50, 100),
                'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0, step=0.1),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5, step=0.1),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5, step=0.1),
                'force_col_wise': True,  
                'verbose': -1
            }
            model = LGBMRegressor(**params, random_state=42)
            model.fit(X_train, y_train, eval_metric='mape', eval_set=[(X_val, y_val)],
                      callbacks=[LightGBMPruningCallback(trial, 'l2')])
        
        else:  # xgboost
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 9),
                'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
                'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.5),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5),
                'verbosity': 0,
                'objective': 'reg:squarederror',
                'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
                'tree_method': 'hist',
                'seed': 42
            }
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dval = xgb.DMatrix(X_val, label=y_val)
            model = xgb.train(params, dtrain, evals=[(dval, 'eval')], early_stopping_rounds=10, 
                      callbacks=[XGBoostPruningCallback(trial, 'eval-rmse')], verbose_eval=False)
        
        if model_type == 'xgboost':
            preds = model.predict(dval)
        else:
            preds = model.predict(X_val)
        
        mape = mean_absolute_percentage_error(y_val, preds)
        mape_scores.append(mape)
    
    # 交差検証のMAPEの平均を返す
    return sum(mape_scores) / n_splits


In [12]:
# Optunaのスタディを作成
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), sampler=optuna.samplers.TPESampler(seed=SEED))

# 初期のトライアルを追加
study.enqueue_trial({'model_type': 'xgboost', 'max_depth': 8, 'learning_rate': 0.0825864670684569, 'n_estimators': 211, 'min_child_weight': 7, 'subsample': 0.7142500766353812, 'colsample_bytree': 0.9906135485711739, 'reg_alpha': 0.43156230610011287, 'reg_lambda': 0.085552341124279, 'booster': 'dart'})

# 最適化を実行
study.optimize(objective, n_trials=100, timeout=9000)  # 合計のトライアル数とタイムアウトを調整

print('Best trial:', study.best_trial.params)

[I 2023-08-12 15:32:56,636] A new study created in memory with name: no-name-a640599e-3c83-480c-a174-f671e564f69a
[I 2023-08-12 15:32:56,966] Trial 0 finished with value: 0.46960209714306494 and parameters: {'model_type': 'xgboost', 'max_depth': 8, 'learning_rate': 0.0825864670684569, 'n_estimators': 211, 'min_child_weight': 7, 'subsample': 0.7142500766353812, 'colsample_bytree': 0.9906135485711739, 'reg_alpha': 0.43156230610011287, 'reg_lambda': 0.085552341124279, 'booster': 'dart'}. Best is trial 0 with value: 0.46960209714306494.
[I 2023-08-12 15:32:57,459] Trial 1 finished with value: 1.097742299675781 and parameters: {'model_type': 'lightgbm', 'num_leaves': 34, 'max_depth': 0, 'learning_rate': 0.002051110418843397, 'n_estimators': 52, 'min_child_samples': 90, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.0, 'reg_lambda': 0.5}. Best is trial 0 with value: 0.46960209714306494.
[I 2023-08-12 15:35:29,040] Trial 2 finished with value: 1.0682381898086646 and parameters: {'m

In [12]:
# 最適なモデルの取得
best_params = study.best_trial.params
model_type = best_params['model_type']
del best_params['model_type']  # モデルタイプをパラメータから削除

model_name = "model2"

if model_type == 'catboost':
    best_model = CatBoostRegressor(**best_params, random_seed=42, verbose=0)
    best_model.fit(X, y, cat_features=['region', 'manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 'transmission', 'drive', 'size', 'type', 'paint_color', 'state'])
    # モデルを保存
    best_model.save_model(f'../model/{model_name}.cbm')
    
elif model_type == 'lightgbm':
    best_model = LGBMRegressor(**best_params, random_state=42)
    best_model.fit(X, y)
    # LightGBMのモデルを保存
    best_model.booster_.save_model(f'../model/{model_name}.bin')

elif model_type == 'xgboost':
    dtrain = xgb.DMatrix(X, label=y)
    best_model = xgb.train(best_params, dtrain)
    # XGBoostのモデルを保存
    best_model.save_model(f'../model/{model_name}.xgb')
