## XGBoost + hyperparameter tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

def convert_time_to_minutes(time_str: str) -> int:
    """Convert a duration string such as ``'2h 30m'`` into whole minutes.

    The part before ``'h'`` is read as a (possibly fractional) number of
    hours and the part after it as an integer number of minutes. A missing
    minutes value (``'2h'`` or ``'2h m'``) counts as zero, and a string with
    no ``'h'`` marker at all (``'45m'``) is treated as minutes only.

    Args:
        time_str: Duration text in ``'<hours>h <minutes>m'`` form.

    Returns:
        Total duration in minutes; fractional hours are rounded to the
        nearest minute before the minutes part is added.

    Raises:
        ValueError: If the hours or minutes part is not numeric.
    """
    hours_part, marker, minutes_part = time_str.partition('h')
    if not marker:
        # No hour marker: the whole string is the minutes component.
        hours_part, minutes_part = '0', time_str

    hours = float(hours_part.strip())
    minutes_text = minutes_part.replace('m', '').strip()
    minutes = int(minutes_text) if minutes_text else 0
    return int(round(hours * 60)) + minutes

def time_of_day(hour):
    """Map an hour of the day to a coarse period label.

    Bands are half-open: 5-11 is 'Morning', 12-16 is 'Afternoon',
    17-20 is 'Evening'. Any value outside those bands is 'Night'.
    """
    bands = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    # Nothing matched: late evening, early morning, or a non-comparable value.
    return 'Night'
    
# Anchor date: 'days_since' is the number of days between this date and each flight's date.
reference_date = pd.to_datetime('2022-02-11')


def _load_flights(csv_path):
    """Read a flights CSV and add the engineered feature columns.

    The same steps are applied to the train and the test file so the two
    frames cannot drift apart.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        DataFrame with 'date', 'dep_time' and 'arr_time' parsed to datetimes,
        'num_code' cast to str, 'time_taken' converted to minutes, plus the
        new columns 'dep_time_period', 'arr_time_period', 'stop_num',
        'days_since', 'day_of_week', 'hour_dep' and 'hour_arr'.
    """
    frame = pd.read_csv(csv_path)

    frame['date'] = pd.to_datetime(frame['date'])
    frame['num_code'] = frame['num_code'].astype(str)
    frame['time_taken'] = frame['time_taken'].apply(convert_time_to_minutes)
    frame['dep_time'] = pd.to_datetime(frame['dep_time'], format='%H:%M')
    frame['arr_time'] = pd.to_datetime(frame['arr_time'], format='%H:%M')
    frame['dep_time_period'] = frame['dep_time'].dt.hour.apply(time_of_day)
    frame['arr_time_period'] = frame['arr_time'].dt.hour.apply(time_of_day)
    # Only the token before the first '-' in 'stop' is used; tokens that are
    # not in the mapping become NaN.
    frame['stop_num'] = frame['stop'].str.split('-').str[0].map({
        'non': 0,
        '1': 1,
        '2+': 2
    })
    # 'date' is already a datetime column here, so no second parse is needed.
    frame['days_since'] = (frame['date'] - reference_date).dt.days
    frame['day_of_week'] = frame['date'].dt.day_name()
    frame['hour_dep'] = frame['dep_time'].dt.hour
    frame['hour_arr'] = frame['arr_time'].dt.hour
    return frame


train_data = _load_flights('./gist-mldl24f-hw3/train.csv')
test_data = _load_flights('./gist-mldl24f-hw3/test.csv')
# Columns fed to the model: categorical ones are one-hot encoded, numeric ones are used as-is.
cat_predictors = ['airline', 'from', 'to', 'class', 'dep_time_period', 'arr_time_period', 'day_of_week']
num_predictors = ['time_taken', 'days_since','stop_num', 'hour_dep', 'hour_arr']

# Freeze the category levels seen in training. Encoding train and test
# independently with drop_first=True lets each frame drop its *own* first
# level; if the test split lacks the level dropped from train, a different
# level is dropped and later zero-filled, silently mis-encoding those rows as
# the baseline category.
train_cats = train_data[cat_predictors].astype('category')
cat_dtypes = train_cats.dtypes.to_dict()

X_cat_train = pd.get_dummies(train_cats, drop_first=True)
X_num_train = train_data[num_predictors]
X_train = pd.concat([X_num_train, X_cat_train], axis=1)
y_train = train_data['price']

# Casting to the training dtypes yields one dummy column per training level
# (even if absent from test) and drops the same first level as in training.
# Test values never seen in training become NaN and encode as all zeros.
X_cat_test = pd.get_dummies(test_data[cat_predictors].astype(cat_dtypes), drop_first=True)
X_num_test = test_data[num_predictors]
X_test = pd.concat([X_num_test, X_cat_test], axis=1)
# Kept as a guard so X_test always has exactly the training columns, in training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Candidate hyperparameters; the grid search tries every combination
# (3 * 3 * 3 * 2 * 2 = 108 settings).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Exhaustive 5-fold cross-validated search, scored by negative mean absolute
# error (higher is better, i.e. lower MAE wins).
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=0),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Model refit on the full training set with the best-scoring parameters.
best_model = grid_search.best_estimator_

# Predict prices for the test rows with the tuned model.
y_pred_best = best_model.predict(X_test)

# Two-column output (id, price), written without the DataFrame index.
result = test_data[['id']].assign(price=y_pred_best)
result.to_csv('7_pred.csv', index=False)
# The message below is Korean for "7_pred.csv file has been created."
print("7_pred.csv 파일이 생성되었습니다.")

7_pred.csv 파일이 생성되었습니다.


In [None]:
# Show the hyperparameter combination with the best cross-validated score from the grid search.
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 300, 'subsample': 1.0}
