In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
import optuna
import gc
from datetime import datetime
from pathlib import Path
# Report library versions; trailing comments record the versions seen in the saved run output.
print("pandas version:", pd.__version__)  # 2.2.3
print("numpy version:", np.__version__)   # 1.26.4
print("optuna version:", optuna.__version__)   # 4.4.0
print("xgboost version:", xgboost.__version__)   # 2.0.3

In [None]:
# Evaluation metric (SMAPE), custom objective, and target-scaling helpers
def SMAPE(true, pred):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes mean(|true - pred| / (|true| + |pred|)) * 200.

    Args:
        true: Array-like of observed values (NumPy array, pandas Series, list, scalar).
        pred: Array-like of predictions, broadcastable against ``true``.

    Returns:
        The SMAPE as a float. Elements where both ``true`` and ``pred`` are
        exactly zero contribute zero error instead of 0/0 = NaN, so a single
        perfectly predicted zero no longer turns the whole score into NaN.
    """
    # Work on plain float arrays so a pandas index can never influence alignment.
    true_arr = np.asarray(true, dtype=float)
    pred_arr = np.asarray(pred, dtype=float)

    abs_err = np.abs(true_arr - pred_arr)
    scale = np.abs(true_arr) + np.abs(pred_arr)

    # scale == 0 implies true == pred == 0, i.e. a perfect prediction: count it as 0.
    zero_scale = scale == 0
    ratio = np.where(zero_scale, 0.0, abs_err / np.where(zero_scale, 1.0, scale))
    return np.mean(ratio) * 200

def weighted_mse(alpha=3):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns ``(grad, hess)``.
    Where the prediction is below the label (positive residual) the gradient
    and hessian are multiplied by ``alpha``; elsewhere they are those of
    plain squared error.

    Args:
        alpha: Multiplier applied to the under-prediction side.

    Returns:
        A function ``(label, pred) -> (grad, hess)``.
    """
    def weighted_mse_fixed(label, pred):
        err = np.asarray((label - pred).astype("float"))
        # Per-element second derivative: 2*alpha when under-predicting, 2 otherwise.
        hess = np.where(err > 0, 2 * alpha, 2.0)
        # The first derivative of the weighted squared error is -hess * err.
        grad = -hess * err
        return grad, hess

    return weighted_mse_fixed

def inv_boxcox(y, lam, boxcox_offset):
    """Invert a Box-Cox transform and remove the offset added before it.

    Args:
        y: Transformed value(s).
        lam: Box-Cox lambda used for the forward transform.
        boxcox_offset: Constant subtracted from the restored values.

    Returns:
        ``exp(y) - boxcox_offset`` when ``lam`` is 0, otherwise
        ``(y * lam + 1) ** (1 / lam) - boxcox_offset``.
    """
    if lam != 0:
        restored = np.power(y * lam + 1, 1 / lam)
    else:
        # lam == 0 is the logarithmic case of Box-Cox.
        restored = np.exp(y)
    return restored - boxcox_offset

In [None]:
# Load data: preprocessed train/test tables and the sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/preprocess-2025-electricity-consumption/prep_train.csv',
        '/kaggle/input/preprocess-2025-electricity-consumption/prep_test.csv',
        '/kaggle/input/preprocess-2025-electricity-consumption/sample_submission.csv',
    )
)

In [None]:
# Outlier removal
# Hand-picked `num_date_time` keys to drop from the training data, grouped under
# an integer key (the key matches the numeric prefix of each listed value).
out_date_dict = {
    5: ['5_20240804 07', '5_20240804 08'],
    8: ['8_20240721 08'],
    12: ['12_20240721 08', '12_20240721 09', '12_20240721 10', '12_20240721 11'],
    30: ['30_20240713 20', '30_20240725 00'],
    40: ['40_20240714 00'],
    41: ['41_20240622 01', '41_20240622 04', '41_20240717 14', '41_20240717 15'],
    42: ['42_20240717 14'],
    43: ['43_20240610 17', '43_20240610 18', '43_20240812 16', '43_20240812 17'],
    44: ['44_20240630 00', '44_20240630 02', '44_20240606 13', '44_20240606 14'],
    52: ['52_20240810 00', '52_20240810 02'],
    53: ['53_20240615 08', '53_20240615 11'],
    67: ['67_20240610 17', '67_20240610 18', '67_20240812 16', '67_20240812 17'],
    68: ['68_20240628 23', '68_20240629 01'],
    70: ['70_20240605 09', '70_20240603 11', '70_20240603 12'],
    72: ['72_20240721 11'],
    76: ['76_20240603 13', '76_20240620 12', '76_20240620 16'],
    79: ['79_20240819 04', '79_20240819 03', '79_20240819 05'],
    80: ['80_20240720 10', '80_20240720 11', '80_20240720 12', '80_20240706 10', '80_20240706 13', '80_20240706 14'],
    81: ['81_20240717 14'],
    90: ['90_20240605 18'],
    92: ['92_20240717 18', '92_20240717 19', '92_20240717 21'],
    94: ['94_20240727 09', '94_20240727 12'],
    97: ['97_20240605 05'],
}

# Flatten the per-key lists into one list of row keys to drop.
all_remove_values = [
    date_time
    for date_times in out_date_dict.values()
    for date_time in date_times
]

# Take an explicit copy: later cells assign columns on `train_df`, and writing to
# a boolean-mask selection would otherwise raise SettingWithCopyWarning (which
# the global warnings filter in this notebook would hide).
train_df = train_df[~train_df['num_date_time'].isin(all_remove_values)].copy()
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Submission shape: {sub_df.shape}")

In [None]:
# Target scaling (Box-Cox).
# The offset is added to the target before the transform is fitted; `lam` is the
# fitted lambda and is needed later to invert predictions.
# NOTE: this overwrites the target column in place, so re-running the cell
# transforms an already-transformed target.
target_col = '전력소비량(kWh)'
boxcox_offset = 1
shifted_target = train_df[target_col] + boxcox_offset
boxcox_values, lam = stats.boxcox(shifted_target)
train_df[target_col] = boxcox_values


# Categorical conversion
categorical_cols = [
    '건물번호','건물유형', 'is_holiday', 'is_building_holiday','peak_flag','rain_flag',
    'discomfort_category','heat_wave_flag','tropical_night_flag','cluster_id', 'is_weekday', 'business_hours'
]

# Train and test must share ONE category list per column. Casting each frame
# independently lets pandas assign different integer codes to the same value
# whenever the two frames hold different value sets (e.g. a flag that never
# fires in the test period). The model consumes those codes, so mismatched
# codes would silently mis-encode the test rows.
for col in categorical_cols:
    in_train = col in train_df.columns
    in_test = col in test_df.columns

    if in_train and in_test:
        # Infer categories from the stacked values exactly as astype('category') would.
        combined = pd.concat([train_df[col], test_df[col]], ignore_index=True)
        shared_dtype = pd.CategoricalDtype(
            categories=combined.astype('category').cat.categories
        )
        train_df[col] = train_df[col].astype(shared_dtype)
        test_df[col] = test_df[col].astype(shared_dtype)
    elif in_train:
        train_df[col] = train_df[col].astype('category')
    elif in_test:
        test_df[col] = test_df[col].astype('category')

In [None]:
# Feature selection: every training column except the excluded ones and the target.
exclude_cols = [
    '일시', 'num_date_time', 'is_weekday', 'time_period', 'business_hours', 'rain_flag', 'max_days',
    'high_temp_flag', 'wind_chill_effect', 'temp_change_hour', 'temp_change_day',
    '기온_C_diff1', '기온_C_diff3', '기온_C_diff24', '기온_C_lag1', '기온_C_lag3', '기온_C_lag24',
    '풍속_ms_diff1', '풍속_ms_diff3', '풍속_ms_diff24', '풍속_ms_lag1', '풍속_ms_lag3', '풍속_ms_lag24',
    '습도_pct_diff1', '습도_pct_diff3', '습도_pct_diff24', '습도_pct_lag1', '습도_pct_lag3', '습도_pct_lag24',
    '기온_C_mean3', '기온_C_median3', '기온_C_range3', '기온_C_median_mean_diff3',
    '기온_C_mean24', '기온_C_median24', '기온_C_range24', '기온_C_median_mean_diff24',
    '풍속_ms_mean3', '풍속_ms_median3', '풍속_ms_range3', '풍속_ms_median_mean_diff3',
    '풍속_ms_mean24', '풍속_ms_median24', '풍속_ms_range24', '풍속_ms_median_mean_diff24',
    '습도_pct_mean3', '습도_pct_median3', '습도_pct_range3', '습도_pct_median_mean_diff3',
    '습도_pct_mean24', '습도_pct_median24', '습도_pct_range24', '습도_pct_median_mean_diff24',
]

# Build the lookup once; column order of train_df is preserved in feature_cols.
not_features = set(exclude_cols) | {target_col}
feature_cols = [name for name in train_df.columns if name not in not_features]
print(f"Selected features: {feature_cols}")
print(f"Number of features: {len(feature_cols)}")

In [None]:
# K-fold setup

# Parse timestamps so they can be compared against the validation cut-off.
train_df['일시'] = pd.to_datetime(train_df['일시'])
split_date = pd.Timestamp('2024-08-18')  # last week (rows on/after this date form the validation window)

# One fold per building type: the shuffled types are split into as many chunks
# as there are types, so every chunk holds exactly one type.
rng = np.random.default_rng(42)
unique_types = np.array(train_df['건물유형'].unique())
n_splits = len(unique_types)
rng.shuffle(unique_types)
type_folds = np.array_split(unique_types, n_splits)

X_test = test_df[feature_cols]

# Create the output directory
out_dir = Path('./cv_outputs')
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# optuna 정의
def cv_objective(trial):
    """Optuna objective: mean validation SMAPE across the building-type folds.

    For each fold, rows whose building type belongs to the fold AND whose
    timestamp is on/after ``split_date`` form the validation set; rows of other
    building types, or dated before ``split_date``, form the training set.
    An XGBoost regressor with the sampled hyperparameters is fitted with early
    stopping on the validation set, and SMAPE is computed on the target column
    as stored in ``train_df`` (no inverse transform is applied here).

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Mean fold SMAPE, or ``inf`` if no fold had any validation rows.
    """
    param = {
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'gamma': trial.suggest_float('gamma', 1e-3, 10.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.9, 1.0),
        'subsample': trial.suggest_categorical('subsample', [0.7, 0.8, 0.9, 1.0]),
        'max_depth': trial.suggest_categorical('max_depth', [11, 12, 13, 14, 15, 16, 17, 18, 19]),
        'min_child_weight': trial.suggest_int('min_child_weight', 20, 100),
        'eta': trial.suggest_float('eta', 0.01, 0.3),
    }

    building_type = train_df['건물유형']
    timestamp = train_df['일시']
    fold_scores = []

    for fold_idx in range(n_splits):
        in_fold_types = building_type.isin(set(type_folds[fold_idx]))
        valid_mask = in_fold_types & (timestamp >= split_date)

        # A fold without validation rows cannot be scored.
        if valid_mask.sum() == 0:
            continue

        train_mask = (~in_fold_types) | (timestamp < split_date)

        X_tr, y_tr = train_df.loc[train_mask, feature_cols], train_df.loc[train_mask, target_col]
        X_va, y_va = train_df.loc[valid_mask, feature_cols], train_df.loc[valid_mask, target_col]

        model = XGBRegressor(
            # Fixed settings
            n_estimators=10000,
            objective=weighted_mse(3),
            random_state=42,
            device="cuda",
            tree_method="hist",
            enable_categorical=True,
            early_stopping_rounds=50,
            # Sampled settings
            learning_rate=param['eta'],
            max_depth=param['max_depth'],
            subsample=param['subsample'],
            colsample_bytree=param['colsample_bytree'],
            min_child_weight=param['min_child_weight'],
            reg_alpha=param['reg_alpha'],
            reg_lambda=param['reg_lambda'],
            gamma=param['gamma'],
        )
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=0)

        fold_scores.append(SMAPE(y_va, model.predict(X_va)))

        # Drop the fitted model before the next fold is trained.
        del model
        gc.collect()

    if not fold_scores:
        return float('inf')
    return np.mean(fold_scores)

In [None]:
# Optuna hyperparameter optimization
print(f"\n=== Starting Global Optuna Hyperparameter Optimization ===")

# NOTE(review): cv_objective never calls trial.report()/should_prune(), so this
# pruner appears to have no effect on the search — confirm whether pruning was intended.
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=10, n_warmup_steps=30)
study = optuna.create_study(direction='minimize', pruner=median_pruner)

# No trial cap: the search is bounded only by the 11-hour timeout.
search_options = dict(
    n_trials=None,
    timeout=11 * 3600,
    n_jobs=8,
    gc_after_trial=True,
    show_progress_bar=True,
)
study.optimize(cv_objective, **search_options)

print("=== Global Optuna Results ===")
print(f"Best CV SMAPE: {study.best_value:.4f}")
print("Best parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Save the Optuna trial history, best (lowest) objective value first.
df_trials = (
    study.trials_dataframe()
    .sort_values('value', ascending=True)
    .reset_index(drop=True)
)
optuna_csv_path = './optuna_trials_global_cv.csv'
df_trials.to_csv(optuna_csv_path, index=False)
print(f"Optuna trials saved to: {optuna_csv_path}")


# Save the top-3 hyperparameter sets
param_columns = ['reg_lambda', 'gamma', 'reg_alpha', 'colsample_bytree', 'subsample', 'max_depth', 'min_child_weight', 'eta']

# df_trials is sorted by objective value. Keep only trials with a finite score
# so that failed trials (NaN) or trials that returned inf can never be selected
# when fewer than three trials succeeded.
top3_trials = df_trials[np.isfinite(df_trials['value'])].head(3).copy()

# Each hyperparameter is looked up by its exact column name: `params_<name>`
# first, `user_attrs_<name>` as a fallback. Substring matching is deliberately
# avoided, since it can silently pick an unrelated column.
param_source_cols = {}
for param_name in param_columns:
    for candidate in (f'params_{param_name}', f'user_attrs_{param_name}'):
        if candidate in df_trials.columns:
            param_source_cols[param_name] = candidate
            break
    else:
        # Fail here with a clear message rather than later with a bare KeyError
        # when the training cell reads the missing parameter.
        raise KeyError(f"No trial column found for hyperparameter '{param_name}'")

# One record per selected trial: rank, CV score, then the hyperparameters.
best_3_params_data = []
for rank, (_, row) in enumerate(top3_trials.iterrows(), start=1):
    param_dict = {'rank': rank, 'cv_score': row['value']}
    for param_name, col_name in param_source_cols.items():
        param_dict[param_name] = row[col_name]
    best_3_params_data.append(param_dict)

# Save the best-3 parameter table as CSV
best_3_params_df = pd.DataFrame(best_3_params_data)
best_3_params_path = './best_3_optuna_params.csv'
best_3_params_df.to_csv(best_3_params_path, index=False)
print(f"Best 3 parameters saved to: {best_3_params_path}")
print("\n=== Top 3 Parameter Sets ===")
print(best_3_params_df)

In [None]:
# Retrain with each of the top-3 hyperparameter sets: for every set, run the same
# building-type folds used during the search, record validation metrics, and
# average the per-fold test predictions into one prediction vector per set.
print(f"\n=== Training with each of the top 3 parameter sets ===")

all_param_results = []  # collects per-set summary dicts AND per-fold dicts (told apart later by the 'fold' key)
all_test_predictions = []  # collects one fold-averaged test prediction vector per parameter set
# The bare string literal below is a no-op expression statement; it records three
# parameter sets (NOTE(review): presumably from an earlier search run — confirm).
"""
# best 1 
{
    'colsample_bytree': 0.9506169035060151,
    'eta': 0.030648887064687796,
    'gamma': 0.09894819199058635,
    'max_depth': 14,
    'min_child_weight': 83,
    'reg_alpha': 1.4459133024977084,
    'reg_lambda': 5.55055188250579,
    'subsample': 1
}

# best 2
{
    'colsample_bytree': 0.951206460894219,
    'eta': 0.0320356578612075,
    'gamma': 0.051206765719055,
    'max_depth': 14,
    'min_child_weight': 81,
    'reg_alpha': 1.47674401806305,
    'reg_lambda': 5.42854453302006,
    'subsample': 1
}

# best 3
{
    'colsample_bytree': 0.948943295392428,
    'eta': 0.0329996779509758,
    'gamma': 0.314764846013268,
    'max_depth': 14,
    'min_child_weight': 80,
    'reg_alpha': 0.946560191231365,
    'reg_lambda': 5.2825285099683,
    'subsample': 0.7
}"""

for param_idx, param_row in best_3_params_df.iterrows():
    print(f"\n--- Training with Parameter Set {param_idx + 1} (Rank {int(param_row['rank'])}) ---")

    # Rebuild the hyperparameter dict from the table row; integer-valued
    # parameters are cast back to int explicitly.
    current_params = {
        'reg_lambda': param_row['reg_lambda'],
        'gamma': param_row['gamma'],
        'reg_alpha': param_row['reg_alpha'],
        'colsample_bytree': param_row['colsample_bytree'],
        'subsample': param_row['subsample'],
        'max_depth': int(param_row['max_depth']),
        'min_child_weight': int(param_row['min_child_weight']),
        'eta': param_row['eta'],
    }

    param_fold_metrics = []    # (smape, inv_smape, mae, rmse) per fold
    param_test_preds = []      # inverse-transformed test predictions per fold
    param_fold_summaries = []  # per-fold detail dicts for the CSV export
    
    # K-fold training
    for fold_idx in range(n_splits):
        # Validation = rows of this fold's building types on/after split_date;
        # training = rows of other building types, or dated before split_date.
        valid_types = set(type_folds[fold_idx])
        valid_mask = train_df['건물유형'].isin(valid_types) & (train_df['일시'] >= split_date)
        train_mask = (~train_df['건물유형'].isin(valid_types)) | (train_df['일시'] < split_date)
        
        # Skip folds that have no validation rows.
        if valid_mask.sum() == 0:
            continue
            
        X_tr = train_df.loc[train_mask, feature_cols]
        y_tr = train_df.loc[train_mask, target_col]
        X_va = train_df.loc[valid_mask, feature_cols]
        y_va = train_df.loc[valid_mask, target_col]
        
        # Train a model with the current parameter set.
        # Note: early_stopping_rounds is 100 here, whereas cv_objective uses 50.
        model = XGBRegressor(
            n_estimators=10000,
            learning_rate=current_params['eta'],
            max_depth=current_params['max_depth'],
            subsample=current_params['subsample'],
            colsample_bytree=current_params['colsample_bytree'],
            min_child_weight=current_params['min_child_weight'],
            reg_alpha=current_params['reg_alpha'],
            reg_lambda=current_params['reg_lambda'],
            gamma=current_params['gamma'],
            objective=weighted_mse(3),
            random_state=42,
            device="cuda",
            tree_method="hist",
            enable_categorical=True,
            early_stopping_rounds=100,
        )
        
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=100 if param_idx == 0 else 0  # verbose logging only for the first parameter set
        )
        
        # Validation metrics. `smape`, `mae` and `rmse` are computed on the target
        # as stored in train_df; `inv_smape` is computed after inverting the
        # Box-Cox transform on both the labels and the predictions.
        va_pred = model.predict(X_va)
        smape = SMAPE(y_va, va_pred)
        inv_smape = SMAPE(inv_boxcox(y_va, lam, boxcox_offset), inv_boxcox(va_pred, lam, boxcox_offset))
        mae = mean_absolute_error(y_va, va_pred)
        rmse = np.sqrt(mean_squared_error(y_va, va_pred))
        
        param_fold_metrics.append((smape, inv_smape, mae, rmse))
        
        # Store the per-fold summary
        param_fold_summaries.append({
            'param_rank': int(param_row['rank']),
            'fold': fold_idx + 1,
            'valid_types': ','.join(map(str, sorted(list(valid_types)))),
            'valid_samples': int(len(X_va)),
            'train_samples': int(len(X_tr)),
            'smape': float(smape),
            'inv_smape': float(inv_smape),
            'mae': float(mae),
            'rmse': float(rmse),
        })
        
        print(f"  [Fold {fold_idx+1}] SMAPE: {smape:.4f} | inv_SMAPE: {inv_smape:.4f} | MAE: {mae:.4f} | RMSE: {rmse:.4f}")
        
        # Test prediction, mapped back through the inverse Box-Cox transform
        te_pred = model.predict(X_test)
        te_pred_inv = inv_boxcox(te_pred, lam, boxcox_offset)
        param_test_preds.append(te_pred_inv)
        
        # Drop the fitted model before the next fold is trained.
        del model
        gc.collect()
    
    # Overall performance summary for the current parameter set
    if param_fold_metrics:
        # Columns: 0 = smape, 1 = inv_smape, 2 = mae, 3 = rmse
        metrics_arr = np.array(param_fold_metrics)
        avg_smape = metrics_arr[:, 0].mean()
        std_smape = metrics_arr[:, 0].std()
        avg_inv_smape = metrics_arr[:, 1].mean()
        std_inv_smape = metrics_arr[:, 1].std()
        avg_mae = metrics_arr[:, 2].mean()
        avg_rmse = metrics_arr[:, 3].mean()
        
        print(f"  Parameter Set {param_idx + 1} Summary:")
        print(f"    SMAPE: {avg_smape:.4f} ± {std_smape:.4f}")
        print(f"    inv_SMAPE: {avg_inv_smape:.4f} ± {std_inv_smape:.4f}")
        print(f"    MAE: {avg_mae:.4f}")
        print(f"    RMSE: {avg_rmse:.4f}")
        
        # Append the per-set summary to the overall results
        param_result = {
            'param_rank': int(param_row['rank']),
            'cv_score_optuna': param_row['cv_score'],
            'actual_cv_smape_mean': avg_smape,
            'actual_cv_smape_std': std_smape,
            'actual_cv_inv_smape_mean': avg_inv_smape,
            'actual_cv_inv_smape_std': std_inv_smape,
            'actual_cv_mae_mean': avg_mae,
            'actual_cv_rmse_mean': avg_rmse,
            **current_params
        }
        all_param_results.append(param_result)
        
        # Append the per-fold summaries to the same list (they carry a 'fold' key)
        all_param_results.extend(param_fold_summaries)
    
    # Test-prediction ensemble for the current parameter set: mean across folds
    if param_test_preds:
        test_pred_ensemble = np.mean(np.column_stack(param_test_preds), axis=1)
        all_test_predictions.append({
            'param_rank': int(param_row['rank']),
            'predictions': test_pred_ensemble
        })

In [None]:
# Save results
print(f"\n=== Saving Results ===")


def _build_submission(predictions):
    """Return a submission frame: test predictions aligned to the sample-submission row order."""
    prediction_frame = pd.DataFrame({
        'num_date_time': test_df['num_date_time'],
        'answer': predictions
    })
    # Left-merge onto the sample submission keys so its row order is preserved.
    return sub_df[['num_date_time']].merge(prediction_frame, on='num_date_time', how='left')


# Per-parameter-set performance summary (entries WITHOUT a 'fold' key)
param_performance_df = pd.DataFrame([r for r in all_param_results if 'fold' not in r])
param_performance_path = './top3_params_performance_summary.csv'
param_performance_df.to_csv(param_performance_path, index=False)
print(f"Parameter performance summary saved to: {param_performance_path}")

# Per-parameter-set, per-fold details (entries WITH a 'fold' key)
param_fold_details_df = pd.DataFrame([r for r in all_param_results if 'fold' in r])
param_fold_details_path = './top3_params_fold_details.csv'
param_fold_details_df.to_csv(param_fold_details_path, index=False)
print(f"Parameter fold details saved to: {param_fold_details_path}")

# One submission file per parameter set
for pred_data in all_test_predictions:
    rank = pred_data['param_rank']
    submission_param = _build_submission(pred_data['predictions'])

    submission_path = f'./submission_param_rank_{rank}.csv'
    submission_param.to_csv(submission_path, index=False)
    print(f"Submission for parameter rank {rank} saved to: {submission_path}")

# Ensemble of the parameter sets: element-wise mean of their predictions
# (only meaningful when at least two sets produced predictions).
if len(all_test_predictions) >= 2:
    ensemble_predictions = np.mean([pred['predictions'] for pred in all_test_predictions], axis=0)
    submission_ensemble = _build_submission(ensemble_predictions)

    ensemble_path = './submission_top3_ensemble.csv'
    submission_ensemble.to_csv(ensemble_path, index=False)
    print(f"Top 3 ensemble submission saved to: {ensemble_path}")

print(f"\n=== Final Summary ===")
print("Top 3 Parameter Performance:")
summary_columns = ['param_rank', 'cv_score_optuna', 'actual_cv_smape_mean', 'actual_cv_inv_smape_mean']
if param_performance_df.empty:
    # Without any results the frame has no columns; selecting them would raise KeyError.
    print("No parameter set produced any results.")
else:
    print(param_performance_df[summary_columns].to_string(index=False))

print("\n=== All files saved successfully! ===")

pandas version: 2.2.3
numpy version: 1.26.4
optuna version: 4.4.0
xgboost version: 2.0.3
Train shape: (203943, 97)
Test shape: (16800, 96)
Submission shape: (16800, 2)


[I 2025-08-27 08:14:11,580] A new study created in memory with name: no-name-136f9e30-e319-45a3-989f-545903af403d


Selected features: ['건물번호', '기온(°C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', '연면적구간', '냉방면적구간', 'month', 'week_of_year', 'dow', 'day', 'hour', 'is_holiday', 'is_building_holiday', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'peak_flag', 'cooling_ratio', 'temp_cool_ratio', 'heat_index', 'CDD', 'discomfort_index', 'discomfort_category', 'heat_wave_flag', 'tropical_night_flag', 'feels_like_temp', 'wet_bulb', 'dow_hour_mean', 'hour_mean', 'hour_std', 'cluster_id']
Number of features: 43

=== Starting Global Optuna Hyperparameter Optimization ===


   0%|          | 00:00/05:00

[I 2025-08-27 08:16:08,739] Trial 0 finished with value: 1.3460565254022039 and parameters: {'reg_lambda': 6.057287516569718, 'gamma': 2.26245997086395, 'reg_alpha': 2.370207024975265, 'colsample_bytree': 0.9423327236118363, 'subsample': 0.8, 'max_depth': 19, 'min_child_weight': 42, 'eta': 0.08228552323536995}. Best is trial 0 with value: 1.3460565254022039.
[I 2025-08-27 08:17:35,516] Trial 1 finished with value: 1.4182775077272982 and parameters: {'reg_lambda': 8.66653670098045, 'gamma': 8.695611345090008, 'reg_alpha': 9.232872661211694, 'colsample_bytree': 0.9234838724401777, 'subsample': 0.7, 'max_depth': 19, 'min_child_weight': 40, 'eta': 0.18539628234244288}. Best is trial 0 with value: 1.3460565254022039.
[I 2025-08-27 08:19:02,232] Trial 2 finished with value: 1.3816527385104487 and parameters: {'reg_lambda': 7.449041961632654, 'gamma': 6.2190883691291, 'reg_alpha': 3.4360132655427007, 'colsample_bytree': 0.9966843914081787, 'subsample': 0.9, 'max_depth': 12, 'min_child_weight'