Bayesian Optimization

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

def convert_time_to_minutes(time_str):
    """Convert a duration string such as ``"2h 30m"`` to whole minutes.

    The hour part may be fractional (``"1.03h m"``) and is rounded to the
    nearest minute after multiplying by 60. Either part may be missing:
    ``"3h"`` has no minutes and ``"45m"`` has no hours.

    Args:
        time_str: Duration text using ``h`` and ``m`` as unit markers.

    Returns:
        The total duration in minutes as an ``int``.

    Raises:
        ValueError: If the string carries no numeric value at all, or if
            either part is not a valid number.
    """
    hours_part, marker, minutes_part = time_str.partition('h')
    if not marker:
        # No hour marker: the whole string is the minutes part (e.g. "45m").
        hours_part, minutes_part = '', time_str

    hours_part = hours_part.strip()
    minutes_part = minutes_part.replace('m', '').strip()
    if not hours_part and not minutes_part:
        # Refuse to turn an empty/unit-only string into a silent 0.
        raise ValueError(f"cannot parse duration: {time_str!r}")

    hours = float(hours_part) if hours_part else 0.0
    minutes = int(minutes_part) if minutes_part else 0
    return int(round(hours * 60)) + minutes

def time_of_day(hour):
    """Return the coarse part-of-day label for an hour value.

    Hours in [5, 12) are 'Morning', [12, 17) 'Afternoon', [17, 21)
    'Evening'; every other value is 'Night'.
    """
    # Half-open (start, stop, label) windows, checked in order.
    windows = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in windows:
        if start <= hour < stop:
            return label
    return 'Night'
    
# Day that 'days_since' is counted from.
reference_date = pd.to_datetime('2022-02-11')

cat_predictors = ['airline', 'from', 'to', 'class', 'dep_time_period', 'arr_time_period', 'day_of_week']
num_predictors = ['time_taken', 'days_since', 'stop_num', 'hour_dep', 'hour_arr']


def _add_features(frame):
    """Parse the raw columns of ``frame`` and add the derived features.

    The frame is modified in place and also returned. Train and test go
    through this single function so both always get identical treatment.
    """
    frame['date'] = pd.to_datetime(frame['date'])
    frame['num_code'] = frame['num_code'].astype(str)
    frame['time_taken'] = frame['time_taken'].apply(convert_time_to_minutes)
    frame['dep_time'] = pd.to_datetime(frame['dep_time'], format='%H:%M')
    frame['arr_time'] = pd.to_datetime(frame['arr_time'], format='%H:%M')
    frame['dep_time_period'] = frame['dep_time'].dt.hour.apply(time_of_day)
    frame['arr_time_period'] = frame['arr_time'].dt.hour.apply(time_of_day)
    # Text before the first '-' in 'stop' is mapped to a stop count;
    # any other prefix becomes NaN.
    frame['stop_num'] = frame['stop'].str.split('-').str[0].map({
        'non': 0,
        '1': 1,
        '2+': 2,
    })
    frame['days_since'] = (frame['date'] - reference_date).dt.days
    frame['day_of_week'] = frame['date'].dt.day_name()
    frame['hour_dep'] = frame['dep_time'].dt.hour
    frame['hour_arr'] = frame['arr_time'].dt.hour
    return frame


def _encode_categoricals(frame, levels):
    """One-hot encode ``cat_predictors`` of ``frame`` against fixed levels.

    Casting every column to a categorical with the given levels makes
    ``get_dummies`` emit the same columns, and drop the same baseline level,
    no matter which values happen to occur in ``frame``. Values that are not
    in ``levels`` become missing and are encoded as all zeros.
    """
    typed = frame[cat_predictors].copy()
    for column in cat_predictors:
        typed[column] = pd.Categorical(typed[column], categories=levels[column])
    return pd.get_dummies(typed, drop_first=True)


train_data = _add_features(pd.read_csv('./gist-mldl24f-hw3/train.csv'))
test_data = _add_features(pd.read_csv('./gist-mldl24f-hw3/test.csv'))

# The levels come from the training split only. Encoding each split on its
# own with drop_first=True can drop a different baseline level in the test
# split, after which reindexing silently mislabels those test rows.
category_levels = {
    column: pd.Categorical(train_data[column]).categories
    for column in cat_predictors
}

X_cat_train = _encode_categoricals(train_data, category_levels)
X_num_train = train_data[num_predictors]
X_train = pd.concat([X_num_train, X_cat_train], axis=1)
y_train = train_data['price']

X_cat_test = _encode_categoricals(test_data, category_levels)
X_num_test = test_data[num_predictors]
X_test = pd.concat([X_num_test, X_cat_test], axis=1)
# Kept as a guard so the test matrix always has the training column layout.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


In [None]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
import pandas as pd

# Hold out 20% of the training data as a validation set for the search.
X_train_split, X_valid, y_train_split, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Objective function for the Optuna study.
def objective(trial):
    """Train an XGBoost regressor with trial-suggested settings and score it.

    The model is fitted on the training split and evaluated on the
    validation split; the validation MAE is returned for minimization.
    """
    search_space = dict(
        # Ranges are centred on the earlier grid-search result.
        n_estimators=trial.suggest_int('n_estimators', 250, 350),
        max_depth=trial.suggest_int('max_depth', 12, 18),
        learning_rate=trial.suggest_float('learning_rate', 0.08, 0.12),
        subsample=trial.suggest_float('subsample', 0.9, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.9, 1.0),
        # Regularization terms are tuned in addition.
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 10.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 10.0),
    )

    regressor = XGBRegressor(random_state=42, **search_space)
    regressor.fit(X_train_split, y_train_split)

    # Validation MAE is the quantity the study minimizes.
    return mean_absolute_error(y_valid, regressor.predict(X_valid))

# Run the Optuna search. The sampler is seeded like every other random
# component in this script (seed 42); without it each run proposes different
# trials, so the selected parameters and the submission are not reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # 50 trials

# Report the best hyperparameters found.
best_params = study.best_params
print("Best Parameters from Optuna:", best_params)

# Refit on the full training set (train + validation) with the best parameters.
best_model = XGBRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)

# Predict on the test set.
y_test_pred = best_model.predict(X_test)

# Save the predictions.
result = pd.DataFrame({'id': test_data['id'], 'price': y_test_pred})
result.to_csv('optuna_pred.csv', index=False)
print("optuna_pred.csv 파일이 생성되었습니다.")
