In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.2-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.2 colorlog-6.9.0 optuna-4.4.0


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna
from tqdm.auto import tqdm
import warnings
import holidays
import os

warnings.filterwarnings('ignore')

# The GPU-specific installation for Google Colab has been removed.
# This script will now use the standard CPU version of LightGBM.

# When running inside Google Colab, mount Google Drive and use a project
# folder on it as the base path, creating the folder if it is missing.
# Outside Colab the google.colab import fails and the current working
# directory is used instead. PATH is the prefix for the input CSVs, the
# saved model/feature pickles and the submission file.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    PATH = "/content/drive/MyDrive/열수요예측/KT/"
    if not os.path.exists(PATH):
        os.makedirs(PATH)
except ImportError:
    PATH = "./"


# Single seed shared by NumPy, the K-fold fallback, the Optuna sampler and
# LightGBM's random_state.
SEED = 42
np.random.seed(SEED)


def load_and_preprocess(df):
    """Normalise column names and basic dtypes of a raw heat-demand frame.

    Works on a copy of ``df``. The steps are:

    * drop a stray ``'Unnamed: 0'`` column if present;
    * lower-case every column name and strip the ``train_heat.`` /
      ``test_heat.`` prefixes;
    * make sure a ``'tm'`` column exists, renaming the first column whose
      name contains ``'tm'`` when there is no exact match;
    * parse ``'tm'`` to datetimes (free-form parsing for object columns,
      ``%Y%m%d%H`` otherwise);
    * cast ``'branch_id'`` to a categorical when the column exists.

    Raises:
        KeyError: if no column name contains ``'tm'``.
    """
    frame = df.copy()
    if 'Unnamed: 0' in frame.columns:
        frame = frame.drop(columns=['Unnamed: 0'])

    # Lower-casing happens before the prefix strip, so the prefixes are
    # matched in their lower-case form.
    frame.columns = [
        name.lower().replace('train_heat.', '').replace('test_heat.', '')
        for name in frame.columns
    ]

    if 'tm' not in frame.columns:
        candidates = [name for name in frame.columns if 'tm' in name]
        if not candidates:
            print(f"현재 데이터프레임의 컬럼 목록: {frame.columns.tolist()}")
            raise KeyError("데이터에서 'tm' 또는 'tm'을 포함하는 시간 관련 컬럼을 찾을 수 없습니다.")
        print(f"'{candidates[0]}' 컬럼을 'tm' 컬럼으로 변경합니다.")
        frame = frame.rename(columns={candidates[0]: 'tm'})

    # Object columns are parsed without an explicit format; anything else
    # (e.g. integers such as 2021010100) uses the compact hourly format.
    parse_kwargs = {} if frame['tm'].dtype == 'object' else {'format': '%Y%m%d%H'}
    frame['tm'] = pd.to_datetime(frame['tm'], **parse_kwargs)

    if 'branch_id' in frame.columns:
        frame['branch_id'] = frame['branch_id'].astype('category')

    return frame

def create_missing_flags_and_replace(df):
    """Add ``<col>_missing`` indicators and turn sentinel values into NaN.

    For each known measurement column present in ``df``, rows equal to the
    ``-99`` sentinel get a 1 in a new ``<col>_missing`` column and the value
    itself becomes NaN. ``wd`` uses ``-9.9`` as its sentinel and is converted
    to NaN without an indicator column. The input frame is not modified.
    """
    out = df.copy()
    sentinel_columns = ('ta', 'ws', 'rn_day', 'rn_hr1', 'hm', 'si', 'ta_chi', 'heat_demand')

    for name in sentinel_columns:
        if name not in out.columns:
            continue
        # Build the indicator before the sentinel is overwritten.
        is_sentinel = out[name] == -99
        out[f'{name}_missing'] = is_sentinel.astype(int)
        out[name] = out[name].replace(-99, np.nan)

    if 'wd' in out.columns:
        out['wd'] = out['wd'].replace(-9.9, np.nan)

    return out

def apply_interpolation(df):
    """Fill gaps in the measurement columns per branch along time.

    Returns a copy sorted by (``branch_id``, ``tm``) with a fresh 0..n-1
    index. Note that the row order therefore differs from the input. Each
    measurement column present (including ``heat_demand`` when it exists) is
    linearly interpolated within its branch in both directions. Values that
    are still NaN afterwards (e.g. a branch with no observations at all for
    a column) are replaced by 0, with a warning printed.
    """
    frame = df.copy()
    print("지사별 시계열 보간 적용 중...")

    candidates = ['ta', 'ws', 'rn_day', 'rn_hr1', 'hm', 'si', 'ta_chi', 'wd', 'heat_demand']
    targets = [name for name in candidates if name in frame.columns]

    # Interpolation follows row order, so rows must be chronological
    # within each branch.
    frame = frame.sort_values(by=['branch_id', 'tm']).reset_index(drop=True)

    def _fill_gaps(series):
        return series.interpolate(method='linear', limit_direction='both')

    for name in targets:
        frame[name] = frame.groupby('branch_id')[name].transform(_fill_gaps)

    leftover = frame[targets].isnull().sum().sum()
    if leftover > 0:
        print(f"경고: 보간 후에도 {leftover}개의 결측치가 남아있습니다. 0으로 채웁니다.")
        zero_fill = [name for name in targets if name != 'branch_id']
        frame[zero_fill] = frame[zero_fill].fillna(0)

    return frame
def add_heating_season(df):
    """Return a copy of ``df`` with ``month`` and ``heating_season`` columns.

    ``month`` is taken from the ``tm`` datetime column. ``heating_season``
    is 1 for October through April and 0 for every other month.
    """
    out = df.copy()
    out['month'] = out['tm'].dt.month
    in_heating_months = out['month'].isin([10, 11, 12, 1, 2, 3, 4])
    out['heating_season'] = in_heating_months.astype('int64')
    return out

def create_advanced_features(df):
    """Add calendar, weather, lag/rolling and holiday features to a copy of ``df``.

    The function reads the ``tm``, ``month``, ``ta``, ``ws`` and ``branch_id``
    columns. ``month`` is not created here, so it has to exist already
    (``add_heating_season`` adds it). Lag, rolling and diff features are
    computed per ``branch_id`` in the current row order, so rows are
    presumably expected to be sorted by time within each branch, as
    ``apply_interpolation`` leaves them (verify against the caller).

    NaNs introduced by the lag/diff features are filled per branch with a
    forward fill followed by a backward fill, and anything still missing in
    non-categorical columns is set to 0. ``tm`` and ``heat_demand`` are left
    out of that filling.

    Returns:
        The enriched copy; the input frame is not modified.
    """
    df = df.copy()


    # Calendar components of the timestamp.
    df['year'] = df['tm'].dt.year
    df['day'] = df['tm'].dt.day
    df['hour'] = df['tm'].dt.hour
    df['dayofweek'] = df['tm'].dt.dayofweek
    df['dayofyear'] = df['tm'].dt.dayofyear

    # Cyclical (sin/cos) encoding so the last and first value of each cycle are adjacent.
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
    df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)

    # Derived weather variables.
    df['HDD18'] = np.maximum(0, 18 - df['ta']) # heating degrees below an 18-degree base

    # Apparent (wind-chill) temperature, winter formula.
    # NOTE(review): ws * 3.6 presumably converts m/s to km/h -- confirm the unit of 'ws'.
    def calculate_apparent_temp(ta, ws):
        return 13.12 + 0.6215 * ta - 11.37 * (ws * 3.6)**0.16 + 0.3965 * ta * (ws * 3.6)**0.16
    df['apparent_temp'] = calculate_apparent_temp(df['ta'], df['ws'])

    # Lags and moving averages of temperature, computed per branch.
    # NOTE(review): the "'ta' in df.columns" guards below cannot be False here,
    # because df['ta'] was already indexed unconditionally above.
    for lag in [3, 6, 24]:
        if 'ta' in df.columns:
            df[f'ta_lag_{lag}h'] = df.groupby('branch_id')['ta'].shift(lag)
    for window in [6, 12, 24]:
        if 'ta' in df.columns:
            df[f'ta_ma_{window}h'] = df.groupby('branch_id')['ta'].transform(lambda x: x.rolling(window, min_periods=1).mean())
    if 'ta' in df.columns:
        df['ta_diff_3h'] = df.groupby('branch_id')['ta'].diff(3)
        df['ta_diff_6h'] = df.groupby('branch_id')['ta'].diff(6)

    # Public holidays in South Korea (1 when the date is a holiday).
    kr_holidays = holidays.KR()
    df['is_holiday'] = df['tm'].dt.date.apply(lambda x: x in kr_holidays).astype(int)

    # Fill again: the lag/diff features introduce NaNs at the start of each branch.
    # The excluded columns are built dynamically because heat_demand may be absent.
    cols_to_drop = ['tm']
    if 'heat_demand' in df.columns:
        cols_to_drop.append('heat_demand')

    # Every column except the excluded ones takes part in the NaN handling.
    feature_cols_for_fillna = [col for col in df.columns if col not in cols_to_drop]

    # Per branch: forward fill, then backward fill for leading NaNs.
    # NOTE(review): the selection includes the grouping key 'branch_id' itself.
    df[feature_cols_for_fillna] = df.groupby('branch_id')[feature_cols_for_fillna].transform(lambda x: x.ffill().bfill())

    # Apply fillna(0) to everything except categorical columns (i.e. branch_id).
    cols_for_global_fillna = [col for col in df.columns if col not in cols_to_drop and df[col].dtype != 'category']
    df[cols_for_global_fillna] = df[cols_for_global_fillna].fillna(0)

    return df

def create_weather_outlier_flags(train_df, test_df):
    """Flag extreme weather per branch and heating season.

    Thresholds are learned from ``train_df`` only: for every branch and
    every ``heating_season`` value with more than 10 rows, the 10th
    percentile of ``ta`` and the 90th percentiles of ``ws`` and ``rn_day``
    are stored. Both frames then receive three 0/1 columns:

    * ``cold_extreme`` -- ``ta`` below the branch/season 10th percentile
    * ``strong_wind``  -- ``ws`` above the branch/season 90th percentile
    * ``heavy_rain``   -- ``rn_day`` above the branch/season 90th percentile

    Rows whose branch/season has no stored thresholds keep 0 in all three.

    Returns:
        ``(train_result, test_result)`` -- flagged copies of the inputs.
    """
    print("시즌별 기상 이상치 플래그 생성 중...")

    season_values = (0, 1)

    # (flag column, source column, threshold key, comparison against the threshold)
    flag_rules = (
        ('cold_extreme', 'ta', 'ta_q10', lambda values, limit: values < limit),
        ('strong_wind', 'ws', 'ws_q90', lambda values, limit: values > limit),
        ('heavy_rain', 'rn_day', 'rn_day_q90', lambda values, limit: values > limit),
    )

    # Learn the thresholds from the training frame.
    limits_by_branch = {}
    for branch_key in train_df['branch_id'].unique():
        rows_of_branch = train_df[train_df['branch_id'] == branch_key]
        per_season = {}
        for season_flag in season_values:
            rows = rows_of_branch[rows_of_branch['heating_season'] == season_flag]
            if len(rows) > 10:
                per_season[season_flag] = {
                    'ta_q10': rows['ta'].quantile(0.10),
                    'ws_q90': rows['ws'].quantile(0.90),
                    'rn_day_q90': rows['rn_day'].quantile(0.90),
                }
        limits_by_branch[branch_key] = per_season

    def _flag_frame(frame):
        """Return a copy of ``frame`` with the three flag columns filled in."""
        flagged = frame.copy()
        for flag_name, _source, _key, _compare in flag_rules:
            flagged[flag_name] = 0

        for branch_key in flagged['branch_id'].unique():
            if branch_key not in limits_by_branch:
                continue
            in_branch = flagged['branch_id'] == branch_key
            for season_flag in season_values:
                if season_flag not in limits_by_branch[branch_key]:
                    continue
                limits = limits_by_branch[branch_key][season_flag]
                rows = in_branch & (flagged['heating_season'] == season_flag)
                for flag_name, source_col, limit_key, is_extreme in flag_rules:
                    if source_col in flagged.columns:
                        flagged.loc[rows, flag_name] = is_extreme(
                            flagged.loc[rows, source_col], limits[limit_key]
                        ).astype(int)
        return flagged

    return _flag_frame(train_df), _flag_frame(test_df)

def get_feature_names(df):
    """Return the model feature columns and the categorical subset.

    Args:
        df: Frame whose columns are candidate features.

    Returns:
        ``(features, categorical_features)``. ``features`` keeps the column
        order of ``df`` and leaves out the timestamp, the target, ``year``
        and the target-derived ``heat_demand_missing`` flag.
        ``categorical_features`` lists the known categorical columns that
        are present in ``df``.
    """
    # 'heat_demand_missing' marks rows whose target was a sentinel and got
    # interpolated. It is computed from the target itself and is unknown at
    # prediction time, so using it as a feature would leak the target.
    excluded = {'tm', 'heat_demand', 'heat_demand_missing', 'year'}
    features = [col for col in df.columns if col not in excluded]

    categorical_candidates = [
        'branch_id', 'month', 'hour', 'dayofweek', 'heating_season', 'is_holiday',
        'cold_extreme', 'strong_wind', 'heavy_rain'
    ]
    categorical_features = [f for f in categorical_candidates if f in df.columns]

    return features, categorical_features

# ---------------------------------------------------
# 2. 모델 훈련 및 최적화
# ---------------------------------------------------

def _kfold_label_splits(df, n_splits=3):
    """Shuffled K-fold splits of ``df`` expressed as index labels."""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    labels = df.index.to_numpy()
    # KFold.split yields positions; callers select rows with .loc, so the
    # positions are translated to the frame's index labels.
    return [
        (labels[train_pos].tolist(), labels[val_pos].tolist())
        for train_pos, val_pos in kf.split(df)
    ]


def create_year_based_cv_splits(df):
    """Build leave-one-year-out CV splits over the last (up to) three years.

    Each of the most recent three values of ``df['year']`` is used once as
    the validation year, with all other years as training data. A year only
    yields a fold when both sides are non-empty. If no year-based fold can
    be built (e.g. the frame covers a single year), a shuffled 3-fold split
    is used instead.

    Args:
        df: Frame with a ``year`` column.

    Returns:
        A list of ``(train_labels, val_labels)`` tuples holding index
        LABELS of ``df`` (suitable for ``.loc``). The list is empty when
        ``df`` has fewer than three rows and no year-based fold exists.
    """
    cv_splits = []
    years = sorted(df['year'].unique())

    for val_year in years[-3:]:
        is_val = df['year'] == val_year
        train_indices = df.index[~is_val].tolist()
        val_indices = df.index[is_val].tolist()

        if len(train_indices) > 0 and len(val_indices) > 0:
            cv_splits.append((train_indices, val_indices))
            print(f"   Fold: Val Year={val_year}, Train size={len(train_indices)}, Val size={len(val_indices)}")

    if cv_splits:
        return cv_splits

    # A 3-fold split needs at least three rows; report "no splits" instead
    # of letting KFold raise.
    if len(df) < 3:
        print("경고: 데이터가 3행 미만이라 CV 분할을 생성할 수 없습니다.")
        return []

    print("연도 기반 CV를 생성할 데이터가 부족하여 일반 KFold(3-Fold)를 사용합니다.")
    return _kfold_label_splits(df)

def get_monotone_constraints(feature_names):
    """Build a LightGBM ``monotone_constraints`` list for ``feature_names``.

    Args:
        feature_names: Feature names in model column order.

    Returns:
        A list with one entry per feature: ``-1`` for temperature-like
        features (name contains ``'ta'`` or ``'apparent_temp'``; demand
        falls as temperature rises), ``1`` for ``'HDD18'`` and wind-speed
        features (name contains ``'ws'``), and ``0`` (unconstrained)
        otherwise. ``*_missing`` indicator columns are always ``0``.
    """
    constraints = []
    for feature in feature_names:
        if feature.endswith('_missing'):
            # 'ta_missing' / 'ws_missing' are 0/1 data-quality flags, not
            # physical quantities; the substring rules below must not
            # impose a direction on them.
            constraints.append(0)
        elif 'ta' in feature or 'apparent_temp' in feature:
            constraints.append(-1)  # higher (apparent) temperature -> lower heat demand
        elif feature == 'HDD18':
            constraints.append(1)   # more heating degrees -> higher heat demand
        elif 'ws' in feature:
            constraints.append(1)   # stronger wind -> higher heat demand
        else:
            constraints.append(0)
    return constraints

def optimize_and_train(df_group, group_name):
    """Tune LightGBM with Optuna on one season group and fit the final model (CPU).

    Hyperparameters are searched over 41 Optuna trials. Each trial is scored
    by the mean validation RMSE over the splits from
    ``create_year_based_cv_splits``, with early stopping on each validation
    fold. The final model is then fitted on the whole group, using the best
    trial's hyperparameters and the mean early-stopped tree count recorded
    for that trial.

    Side effects: writes ``lgb_model_<group>_cpu.pkl``,
    ``features_<group>.pkl`` and ``categorical_features_<group>.pkl`` under
    ``PATH``.

    Args:
        df_group: Preprocessed training rows of one group. Must contain
            ``tm``, ``heat_demand``, ``year`` and ``branch_id``.
        group_name: Label used in log messages and output file names.

    Returns:
        The fitted ``LGBMRegressor``, or ``None`` when training is skipped
        (missing target/``branch_id`` column or no usable CV split).
    """
    print(f"\n===== {group_name.upper()} 그룹 모델 훈련 시작 (CPU 사용) =====")

    if 'heat_demand' not in df_group.columns:
        print(f"경고: {group_name} 그룹 데이터에 'heat_demand' 컬럼이 없습니다. 모델 훈련을 건너뜁니다.")
        return None

    X = df_group.drop(columns=['tm', 'heat_demand'])
    y = df_group['heat_demand']
    features, categorical_features = get_feature_names(X)
    X = X[features]

    if 'branch_id' not in X.columns:
        print(f"오류: {group_name} 그룹 데이터에 'branch_id' 컬럼이 없습니다. 모델 훈련 중단.")
        return None

    cv_splits = create_year_based_cv_splits(df_group)

    if not cv_splits:
        print(f"경고: {group_name} 그룹에 유효한 CV 분할이 없습니다. 모델 훈련을 건너뜁니다.")
        return None

    # Upper bound on trees during the search; early stopping picks the real count.
    max_estimators = 10000

    # Settings shared by every trial and by the final model, so the final
    # fit uses exactly the configuration that was tuned.
    fixed_params = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        # LightGBM ignores 'subsample' unless bagging is enabled through a
        # non-zero 'subsample_freq'; without it the tuned value has no effect.
        'subsample_freq': 1,
        'random_state': SEED,
        'n_jobs': -1,  # use every CPU core
        'verbose': -1,
    }

    def objective(trial):
        """Return the mean validation RMSE of one hyperparameter sample."""
        params = {
            **fixed_params,
            'n_estimators': max_estimators,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 31, 256),
            'max_depth': trial.suggest_int('max_depth', 7, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        }

        cv_rmses = []
        best_iterations = []
        for train_idx, val_idx in cv_splits:
            X_train, X_val = X.loc[train_idx], X.loc[val_idx]
            y_train, y_val = y.loc[train_idx], y.loc[val_idx]

            model = lgb.LGBMRegressor(**params)
            model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='rmse',
                      callbacks=[lgb.early_stopping(100, verbose=False)],
                      categorical_feature=categorical_features)
            preds = model.predict(X_val)
            cv_rmses.append(np.sqrt(mean_squared_error(y_val, preds)))
            best_iterations.append(model.best_iteration_ or max_estimators)

        # Remember how many trees early stopping selected so the final fit
        # can reuse that number without a held-out set.
        trial.set_user_attr('best_iteration', int(np.mean(best_iterations)))
        return np.mean(cv_rmses)

    # NOTE(review): the objective never calls trial.report(), so the
    # MedianPruner configured here does not prune any trial.
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=SEED),
                                pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30))
    study.optimize(objective, n_trials=41, show_progress_bar=True)
    print(f"   {group_name} 그룹 최적 RMSE: {study.best_value:.4f}")
    print(f"   최적 하이퍼파라미터: {study.best_params}")

    # Final model: best hyperparameters plus the tree count found by CV.
    # Early stopping against the training data itself (as an eval_set of
    # (X, y) would do) gives no protection against overfitting, so the
    # number of trees is fixed up front instead.
    final_n_estimators = max(1, int(study.best_trial.user_attrs.get('best_iteration', max_estimators)))
    best_params = {
        **fixed_params,
        **study.best_params,
        'n_estimators': final_n_estimators,
    }

    final_model = lgb.LGBMRegressor(**best_params)
    final_model.fit(X, y,
                    categorical_feature=categorical_features,
                    feature_name=features)

    # Persist the model together with the feature lists needed at prediction time.
    joblib.dump(final_model, f'{PATH}lgb_model_{group_name}_cpu.pkl')
    joblib.dump(features, f'{PATH}features_{group_name}.pkl')
    joblib.dump(categorical_features, f'{PATH}categorical_features_{group_name}.pkl')

    return final_model


# ---------------------------------------------------
# 3. 메인 실행 로직
# ---------------------------------------------------

if __name__ == "__main__":
    # 1. Load the raw data and run the full preprocessing pipeline.
    print("1. 데이터 로드 및 전처리 시작...")
    train_df = pd.read_csv(PATH + "train_heat.csv")
    # The untouched test frame is kept: the submission reuses its original
    # TM / branch_ID values and row order.
    raw_test_df = pd.read_csv(PATH + "test_heat.csv")

    print(f"Test DataFrame 초기 컬럼: {raw_test_df.columns.tolist()}")

    train_df = load_and_preprocess(train_df)
    test_df = load_and_preprocess(raw_test_df)

    # apply_interpolation() re-sorts rows by (branch_id, tm) and resets the
    # index, so remember each test row's position in the CSV. It is used
    # at the end to put the predictions back into file order.
    test_df['_row_id'] = np.arange(len(test_df))

    train_df = create_missing_flags_and_replace(train_df)

    # Same sentinel handling for the test frame, deliberately leaving out
    # 'heat_demand' so no target-derived flag is created for test rows.
    temp_missing_cols_test = [col for col in ['ta', 'ws', 'rn_day', 'rn_hr1', 'hm', 'si', 'ta_chi'] if col in test_df.columns]
    for col in temp_missing_cols_test:
        missing_mask = (test_df[col] == -99)
        test_df[f'{col}_missing'] = missing_mask.astype(int)
        test_df[col] = test_df[col].replace(-99, np.nan)
    if 'wd' in test_df.columns:
        test_df['wd'] = test_df['wd'].replace(-9.9, np.nan)

    train_df = apply_interpolation(train_df)
    test_df = apply_interpolation(test_df)

    train_df = add_heating_season(train_df)
    test_df = add_heating_season(test_df)

    train_df, test_df = create_weather_outlier_flags(train_df, test_df)

    train_df = create_advanced_features(train_df)
    test_df = create_advanced_features(test_df)

    print("전처리 완료!")

    # 2. Split both frames by heating season; each season gets its own model.
    train_groups = {name: data for name, data in train_df.groupby('heating_season')}
    test_groups = {name: data for name, data in test_df.groupby('heating_season')}

    # 3. Train one model per season group.
    print("\n2. 시즌별 모델 훈련 시작...")
    models = {}
    for group_id, df_group in train_groups.items():
        group_name = 'heating' if group_id == 1 else 'non_heating'
        if len(df_group) > 0:
            trained_model = optimize_and_train(df_group, group_name)
            if trained_model is not None:
                models[group_name] = trained_model

    # 4. Predict each test group with its model and merge the results.
    print("\n3. 테스트 데이터 예측 및 제출 파일 생성...")
    final_predictions = pd.Series(np.zeros(len(test_df)), index=test_df.index)

    for group_id, df_group in test_groups.items():
        group_name = 'heating' if group_id == 1 else 'non_heating'
        if len(df_group) > 0 and group_name in models:
            print(f"   {group_name} 그룹 예측 중...")
            model = models[group_name]

            try:
                train_features = joblib.load(f'{PATH}features_{group_name}.pkl')
            except FileNotFoundError:
                print(f"경고: {group_name} 그룹의 피처 파일이 없습니다. 예측을 건너뜁니다.")
                continue

            # reindex() silently creates any training feature the test frame
            # lacks (filled with 0), so report those columns explicitly.
            missing_features = [f for f in train_features if f not in df_group.columns]
            if missing_features:
                print(f"경고: {group_name} 그룹 테스트 데이터에 없는 피처를 0으로 채웁니다: {missing_features}")

            X_test_group = df_group.reindex(columns=train_features, fill_value=0)

            preds = model.predict(X_test_group)
            final_predictions.loc[df_group.index] = preds
        else:
            print(f"   {group_name} 그룹에 대한 모델이 없거나 데이터가 없어 예측을 건너뜁니다.")

    # Heat demand cannot be negative; the submission uses one decimal place.
    final_predictions = np.maximum(final_predictions, 0)
    final_predictions = np.round(final_predictions, 1)

    submission = raw_test_df[['TM', 'branch_ID']].copy()
    submission.columns = ['tm', 'branch_id']

    # final_predictions follows test_df's (sorted) row order; scatter the
    # values back to the CSV row each one belongs to.
    row_ids = test_df['_row_id'].astype(int).to_numpy()
    csv_order_predictions = np.empty(len(submission), dtype=float)
    csv_order_predictions[row_ids] = final_predictions.to_numpy()
    submission['heat_demand'] = csv_order_predictions

    output_filename = "submission_lgbm_cpu_upgraded.csv"
    submission.to_csv(f"{PATH}{output_filename}", index=False)

    print(f"\n🎉 예측 완료! 제출 파일 '{output_filename}'이(가) '{PATH}' 경로에 저장되었습니다.")

Mounted at /content/drive
1. 데이터 로드 및 전처리 시작...
Test DataFrame 초기 컬럼: ['TM', 'branch_ID', 'TA', 'WD', 'WS', 'RN_DAY', 'RN_HR1', 'HM', 'SI', 'ta_chi', 'heat_demand']
지사별 시계열 보간 적용 중...
지사별 시계열 보간 적용 중...
경고: 보간 후에도 166915개의 결측치가 남아있습니다. 0으로 채웁니다.
시즌별 기상 이상치 플래그 생성 중...
전처리 완료!

2. 시즌별 모델 훈련 시작...

===== NON_HEATING 그룹 모델 훈련 시작 (CPU 사용) =====
   Fold: Val Year=2021, Train size=139536, Val size=69768


[I 2025-06-25 16:28:26,423] A new study created in memory with name: no-name-29586679-08f7-4fa4-af0f-3e413e9b0ce8


   Fold: Val Year=2022, Train size=139536, Val size=69768
   Fold: Val Year=2023, Train size=139536, Val size=69768


  0%|          | 0/41 [00:00<?, ?it/s]

[I 2025-06-25 16:40:44,251] Trial 0 finished with value: 9.477276950175998 and parameters: {'learning_rate': 0.023688639503640783, 'num_leaves': 245, 'max_depth': 13, 'min_child_samples': 68, 'subsample': 0.7468055921327309, 'colsample_bytree': 0.7467983561008608, 'reg_alpha': 0.0017073967431528124, 'reg_lambda': 2.9154431891537547}. Best is trial 0 with value: 9.477276950175998.
[I 2025-06-25 16:44:48,448] Trial 1 finished with value: 9.455694595753966 and parameters: {'learning_rate': 0.039913058785616795, 'num_leaves': 191, 'max_depth': 7, 'min_child_samples': 98, 'subsample': 0.9497327922401265, 'colsample_bytree': 0.7637017332034828, 'reg_alpha': 0.005337032762603957, 'reg_lambda': 0.00541524411940254}. Best is trial 1 with value: 9.455694595753966.
[I 2025-06-25 16:54:31,088] Trial 2 finished with value: 9.425165735740231 and parameters: {'learning_rate': 0.02014847788415866, 'num_leaves': 149, 'max_depth': 10, 'min_child_samples': 43, 'subsample': 0.8835558684167139, 'colsample_

[I 2025-06-25 20:11:27,504] A new study created in memory with name: no-name-2b6a6935-56ec-49f1-a568-8c81253ea380


   Fold: Val Year=2021, Train size=193344, Val size=96653
   Fold: Val Year=2022, Train size=193325, Val size=96672
   Fold: Val Year=2023, Train size=193325, Val size=96672


  0%|          | 0/41 [00:00<?, ?it/s]

[I 2025-06-25 20:19:46,051] Trial 0 finished with value: 22.222188440479798 and parameters: {'learning_rate': 0.023688639503640783, 'num_leaves': 245, 'max_depth': 13, 'min_child_samples': 68, 'subsample': 0.7468055921327309, 'colsample_bytree': 0.7467983561008608, 'reg_alpha': 0.0017073967431528124, 'reg_lambda': 2.9154431891537547}. Best is trial 0 with value: 22.222188440479798.
[I 2025-06-25 20:24:30,445] Trial 1 finished with value: 22.3583039904058 and parameters: {'learning_rate': 0.039913058785616795, 'num_leaves': 191, 'max_depth': 7, 'min_child_samples': 98, 'subsample': 0.9497327922401265, 'colsample_bytree': 0.7637017332034828, 'reg_alpha': 0.005337032762603957, 'reg_lambda': 0.00541524411940254}. Best is trial 0 with value: 22.222188440479798.
[I 2025-06-25 20:33:32,953] Trial 2 finished with value: 22.243135641896913 and parameters: {'learning_rate': 0.02014847788415866, 'num_leaves': 149, 'max_depth': 10, 'min_child_samples': 43, 'subsample': 0.8835558684167139, 'colsamp