## Import 

In [1]:
import os 
import numpy as np 
import pandas as pd 
from datetime import datetime
from typing import List, Tuple, Optional
import warnings 
warnings.filterwarnings('ignore')

WEEK_H = 168      # 1주일 = 168시간
EPS    = 1e-3     # 0 나눔 방지

## Preprocess

In [2]:
# ── 컬럼명 매핑(dict)을 파일 내부에 정의 ──
TRAIN_COL_RENAMES = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
}
TEST_COL_RENAMES = TRAIN_COL_RENAMES.copy()

BUILDING_INFO_RENAMES = {
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
}
TYPE_TRANSLATION = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '학교': 'School',
    '백화점': 'Department Store',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '호텔': 'Hotel',
    'IDC(전화국)': 'IDC'
}

def load_raw(data_dir: str = "../data/raw"):
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test  = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    info  = pd.read_csv(os.path.join(data_dir, 'building_info.csv'))
    return train, test, info

def rename_columns(df: pd.DataFrame, mapping: dict):
    df = df.rename(columns=mapping)
    if 'num_date_time' in df.columns:
        df = df.drop('num_date_time', axis=1)
    return df

def preprocess_building_info(info: pd.DataFrame) -> pd.DataFrame:
    info = info.rename(columns=BUILDING_INFO_RENAMES)
    info['building_type'] = info['building_type'].replace(TYPE_TRANSLATION)
    return info

def merge_datasets(train: pd.DataFrame, test: pd.DataFrame, info: pd.DataFrame):
    train = train.merge(info, on='building_number', how='left')
    test  = test.merge(info, on='building_number', how='left')
    return train, test

def save_processed(df: pd.DataFrame, name: str, out_dir: str = "../data/processed"):
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f"{name}.pkl")
    df.to_pickle(path)
    print(f"Saved processed data to: {path}")

def load_and_process(data_dir: str = "../data/raw") -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    1) raw CSV 로드
    2) 컬럼명 리네임
    3) building_info 전처리 & 병합
    4) processed/train.pkl, processed/test.pkl 저장
    5) train_df, test_df 반환
    """
    train, test, info = load_raw(data_dir)
    train = rename_columns(train, TRAIN_COL_RENAMES)
    test  = rename_columns(test, TEST_COL_RENAMES)
    info  = preprocess_building_info(info)
    train, test = merge_datasets(train, test, info)

    save_processed(train, "train")
    save_processed(test,  "test")
    return train, test


In [3]:
def create_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """
    date_time 컬럼을 datetime 타입으로 변환하고
    시간, 일, 월, 요일, 주말 여부, 연중 일(day_of_year) 피처 및
    하루를 4분할하는 시간대(time_of_day) 피처 추가
    """
    df['date_time'] = pd.to_datetime(df['date_time'], format='%Y%m%d %H')
    df['hour'] = df['date_time'].dt.hour
    df['day'] = df['date_time'].dt.day
    df['month'] = df['date_time'].dt.month
    df['day_of_week'] = df['date_time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['day_of_year'] = df['date_time'].dt.dayofyear
    conditions = [
        (df['hour'] >= 0) & (df['hour'] < 6),
        (df['hour'] >= 6) & (df['hour'] < 12),
        (df['hour'] >= 12) & (df['hour'] < 18),
        (df['hour'] >= 18) & (df['hour'] < 24)
    ]
    choices = [0, 1, 3, 2]  # 0: 새벽, 1: 오전, 2: 오후, 3: 저녁
    df['time_of_day'] = np.select(conditions, choices, default=np.nan)
    # -------------------
    
    return df

KR_HOLIDAYS_2024 = {"2024-06-06", "2024-08-15"}
def _ensure_dt(df):
    if not np.issubdtype(df["date_time"].dtype, np.datetime64):
        df["date_time"] = pd.to_datetime(df["date_time"])
    return df

def _nth_weekday_in_month(series_dt, weekday_target):
    # 월 내 해당 요일의 n번째 (1=첫째, 2=둘째, ...)
    first_of_month = series_dt.values.astype("datetime64[M]").astype("datetime64[ns]")
    first_weekday = pd.to_datetime(first_of_month).weekday
    weekday = series_dt.dt.weekday.values
    day = series_dt.dt.day.values
    first_occ_day = 1 + ((weekday_target - first_weekday) % 7)
    nth = ((day - first_occ_day) // 7) + 1
    nth = np.where(day >= first_occ_day, nth, 0)
    return nth

def add_holiday(df: pd.DataFrame, kr_holidays: set[str] = None) -> pd.DataFrame:
    df = df.copy()
    _ensure_dt(df)
    if kr_holidays is None:
        kr_holidays = KR_HOLIDAYS_2024

    # 기본 파생 (is_weekend은 '계산만' 하고 이후 어떤 건물에도 수정하지 않음)
    df["weekday"] = df["date_time"].dt.weekday          # 0=Mon..6=Sun
    df["date"]    = df["date_time"].dt.date
    df["is_weekend"] = (df["weekday"] >= 5).astype(int) # 그대로 유지
    df["holiday"] = 0

    # 공휴일 여부는 컬럼으로 저장하지 않고, 로컬 불리언으로만 사용
    is_kr = df["date"].astype(str).isin(kr_holidays).values

    bt = df["building_type"]

    # ── Apartment: 항상 영업
    mm = bt == "Apartment"
    df.loc[mm, "holiday"] = 0

    # ── Hospital: 주말 or 공휴일 휴식
    mm = bt == "Hospital"
    if mm.any():
        df.loc[mm, "holiday"] = (df.loc[mm, "is_weekend"].values | is_kr[mm]).astype(int)

    # ── Public: 기본 주말 or 공휴일 휴식, 단 33/92는 항상 영업
    mm = bt == "Public"
    if mm.any():
        df.loc[mm, "holiday"] = (df.loc[mm, "is_weekend"].values | is_kr[mm]).astype(int)
        mm_always_open = df["building_number"].isin([33, 92])
        df.loc[mm_always_open, "holiday"] = 0

    # ── Hotel: 항상 영업
    mm = bt == "Hotel"
    df.loc[mm, "holiday"] = 0

    # ── School: 주말 or 공휴일 휴식
    mm = bt == "School"
    if mm.any():
        df.loc[mm, "holiday"] = (df.loc[mm, "is_weekend"].values | is_kr[mm]).astype(int)

    # ── IDC(전화국): 개별 규칙
    mm_idc = bt == "IDC"
    if mm_idc.any():
        # 36,43,52: 주말 or 공휴일
        ids = [36, 43, 52]
        mmx = df["building_number"].isin(ids)
        df.loc[mmx, "holiday"] = (df.loc[mmx, "is_weekend"].values | is_kr[mmx]).astype(int)
        # 64: 주말만
        mmx = df["building_number"].eq(64)
        df.loc[mmx, "holiday"] = df.loc[mmx, "is_weekend"].astype(int)
        # 67: 주말 + 8/15
        mmx = df["building_number"].eq(67)
        if mmx.any():
            df.loc[mmx, "holiday"] = df.loc[mmx, "is_weekend"].astype(int)
            df.loc[mmx & (df["date"].astype(str) == "2024-08-15"), "holiday"] = 1
        # 30,35,57: 휴일 없음 → holiday=0 유지

    # ── Commercial: 개별 규칙
    mm = bt == "Commercial"
    if mm.any():
        # 2: 주말만
        mmx = df["building_number"].eq(2)
        df.loc[mmx, "holiday"] = df.loc[mmx, "is_weekend"].astype(int)
        # 6,16,20,51,86: 주말 or 공휴일
        ids = [6, 16, 20, 51, 86]
        mmx = df["building_number"].isin(ids)
        df.loc[mmx, "holiday"] = (df.loc[mmx, "is_weekend"].values | is_kr[mmx]).astype(int)
        # 41,56,76,99: 휴일 없음 → holiday=0 유지

    # ── Other Buildings: 개별 규칙
    # 26: 월/화
    mmx = df["building_number"].eq(26)
    df.loc[mmx, "holiday"] = df.loc[mmx, "weekday"].isin([0, 1]).astype(int)
    # 82: 월
    mmx = df["building_number"].eq(82)
    df.loc[mmx, "holiday"] = df.loc[mmx, "weekday"].eq(0).astype(int)
    # 47,69: 주말 or 공휴일
    mmx = df["building_number"].isin([47, 69])
    df.loc[mmx, "holiday"] = (df.loc[mmx, "is_weekend"].values | is_kr[mmx]).astype(int)
    # 58,61,78: 주말에도 영업 → holiday=0 유지 (is_weekend는 건드리지 않음)
    # 97: 토요일만
    mmx = df["building_number"].eq(97)
    df.loc[mmx, "holiday"] = df.loc[mmx, "weekday"].eq(5).astype(int)

    # ── Department Store: 공휴일에도 영업. 개별 규칙만 휴일 처리.
    mm = bt == "Department Store"
    if mm.any():
        df.loc[mm, "holiday"] = 0

        nth_sun = _nth_weekday_in_month(df["date_time"], 6)  # Sun
        nth_mon = _nth_weekday_in_month(df["date_time"], 0)  # Mon

        def mark_nth_weekday(building, weekday, nth_set):
            if weekday == 6:
                nth = nth_sun
            elif weekday == 0:
                nth = nth_mon
            else:
                nth = _nth_weekday_in_month(df["date_time"], weekday)
            sel = df["building_number"].eq(building) & df["weekday"].eq(weekday) & pd.Series(nth).isin(list(nth_set)).values
            df.loc[sel, "holiday"] = 1

        # 매주/격주/특정일
        df.loc[df["building_number"].eq(18) & df["weekday"].eq(6), "holiday"] = 1  # 18: 매주 일요일

        special = {
            19: ["2024-06-10", "2024-07-08", "2024-08-19"],
            45: ["2024-06-10", "2024-07-08", "2024-08-19"],
            54: ["2024-06-17", "2024-07-01", "2024-08-19"],
            74: ["2024-06-17", "2024-07-01"],
            79: ["2024-06-17", "2024-07-01", "2024-08-19"],
            95: ["2024-07-08", "2024-08-05"],
            29: ["2024-06-10", "2024-07-10", "2024-08-10"],
        }
        for b, dates in special.items():
            sel = df["building_number"].eq(b) & df["date"].astype(str).isin(dates)
            df.loc[sel, "holiday"] = 1

        # 격주 규칙
        mark_nth_weekday(27, 6, {2, 4})  # 27: 2·4번째 일요일
        mark_nth_weekday(29, 6, {4})     # 29: 4번째 일요일
        mark_nth_weekday(32, 0, {2, 4})  # 32: 2·4번째 월요일
        for b in [40, 59, 63]:           # 2·4번째 일요일
            mark_nth_weekday(b, 6, {2, 4})

        # 34,73,88: 휴일 없음 → holiday=0 유지

    # 안전 재확인: IDC 67의 8/15
    df.loc[(df["building_number"].eq(67)) & (df["date"].astype(str) == "2024-08-15"), "holiday"] = 1

    return df

def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    _ensure_dt(df)

    rules_lt = [
        # (building_number, threshold)
        # Apartment
        (25, 0, "eq"), (70, 200, "lt"),
        # Hospital
        (44, 800, "lt"), (90, 800, "lt"), (42, 2000, "lt"), (17, 1000, "lt"),
        # Public
        (68, 600, "lt"), (72, 600, "lt"), (80, 600, "lt"), (92, 200, "lt"),
        # Hotel
        (98, 500, "lt"),
        # Other
        (97, 500, "lt"), (78, 400, "lt"), (26, 300, "lt"), (7, 2000, "lt"),
        # Commercial
        (76, 2000, "lt"), (41, 2200, "lt"), (20, 1600, "lt"),
        # School
        (5, 2000, "lt"), (8, 250, "lt"), (12, 3500, "lt"),
        # IDC
        (67, 7333, "lt"), (81, 800, "lt"), (52, 2000, "lt"), (43, 6000, "lt"), (30, 8000, "lt"),
    ]

    # 값 기반 제거
    mask_ok = pd.Series(True, index=df.index)
    pc = df["power_consumption"]
    bnum = df["building_number"]

    for bn, th, op in rules_lt:
        if op == "lt":
            mask_ok &= ~((bnum.eq(bn)) & (pc < th))
        elif op == "eq":
            mask_ok &= ~((bnum.eq(bn)) & (pc == th))

    # 기간 기반 제거
    # Hotel 10: 2024-07-05 ~ 2024-08-22
    mask_ok &= ~(
        (bnum.eq(10)) &
        (df["date_time"].between(pd.Timestamp("2024-07-05"), pd.Timestamp("2024-08-22")))
    )
    # IDC 57: 2024-06-07 이전
    mask_ok &= ~(
        (bnum.eq(57)) & (df["date_time"] < pd.Timestamp("2024-06-07"))
    )
    # Research 94: 2024-07-27 09:00 ~ 2024-08-04 23:00
    mask_ok &= ~(
        (bnum.eq(94)) &
        (df["date_time"].between(pd.Timestamp("2024-07-27 09:00"), pd.Timestamp("2024-08-04 23:00")))
    )

    return df.loc[mask_ok].reset_index(drop=True)

def add_summer_cycle_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    여름 기간(6/1~9/14)을 주기로 하는 sin/cos 특성을 생성합니다.
    """
    df_copy = df.copy()
    start_date = datetime.strptime("2024-06-01 00:00:00", "%Y-%m-%d %H:%M:%S")
    end_date = datetime.strptime("2024-09-14 00:00:00", "%Y-%m-%d %H:%M:%S")
    period_seconds = (end_date - start_date).total_seconds()
    
    def summer_cos(date):
        return np.cos(2 * np.pi * (date - start_date).total_seconds() / period_seconds)
    
    def summer_sin(date):
        return np.sin(2 * np.pi * (date - start_date).total_seconds() / period_seconds)
        
    df_copy['summer_cos'] = df_copy['date_time'].apply(summer_cos)
    df_copy['summer_sin'] = df_copy['date_time'].apply(summer_sin)
    
    return df_copy

def add_squared_features(
    df: pd.DataFrame, 
    target_cols: List[str] = ['temperature', 'humidity']
) -> pd.DataFrame:
    """
    지정된 컬럼에 대해 제곱(squared) 특성을 생성합니다.
    변수가 타겟에 미치는 비선형 관계를 모델이 학습하는 데 도움을 줍니다.

    Args:
        df (pd.DataFrame): 특성을 추가할 데이터프레임
        target_cols (List[str]): 제곱할 대상 컬럼 리스트

    Returns:
        pd.DataFrame: 제곱 특성이 추가된 데이터프레임
    """
    df_copy = df.copy()
    for col in target_cols:
        df_copy[f'{col}_squared'] = df_copy[col] ** 2
    return df_copy


def create_cyclic_features(df):
    """
    사이클릭 피처 추가하는 함수 (create_datetime 이후에 사용)
    """
    # hour: 0–23
    df['sin_hour'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['cos_hour'] = np.cos(2 * np.pi * df['hour'] / 24)
    
    # day_of_week: 0–6
    df['sin_dow'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['cos_dow'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
    
    # month: 1–12 → 0–11로 변환
    df['month0'] = df['month'] - 1
    df['sin_month'] = np.sin(2 * np.pi * df['month0'] / 12)
    df['cos_month'] = np.cos(2 * np.pi * df['month0'] / 12)
    df.drop(columns=['month0'], inplace=True)
    
    # (선택) 날짜 전체 주기: day_of_year 1–365 or 366
    df['sin_doy'] = np.sin(2 * np.pi * (df['day_of_year'] - 1) / 365)
    df['cos_doy'] = np.cos(2 * np.pi * (df['day_of_year'] - 1) / 365)
    
    # 원본 컬럼 제거
    #df.drop(columns=['hour', 'day_of_week', 'month', 'day_of_year'], inplace=True)
    return df

def CDH(xs: np.ndarray) -> np.ndarray:
    """
    Cooling Degree Hours 계산: 기준 26°C 대비 초과 온도의 누적
    """
    cumsum = np.cumsum(xs - 26)
    return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))


def add_cdh_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    건물별 온도 데이터를 이용해 CDH 피처 추가
    """
    cdhs = []
    for b in df['building_number'].unique():
        temps = df.loc[df['building_number'] == b, 'temperature'].values
        cdhs.append(CDH(temps))
    df['CDH'] = np.concatenate(cdhs)
    return df

def add_cdd_feature(df: pd.DataFrame,
                    base_temp: float = 18.0,
                    window:    int   = 24
                   ) -> pd.DataFrame:
    """
    Cooling Degree Days (CDD) 추가
      - base_temp (°C) 보다 높을 때만 (T - base_temp) 합산
      - window 시간 롤링 합산 (min_periods=1)
    """
    # 1) per-hour 초과분 계산
    df['excess'] = (df['temperature'] - base_temp).clip(lower=0)

    # 2) building_number 그룹별 rolling sum
    df['CDD'] = (df
                 .groupby('building_number')['excess']
                 .transform(lambda s: s.rolling(window, min_periods=1).sum())
                )

    # 3) 중간 컬럼 정리
    df.drop(columns=['excess'], inplace=True)
    return df
def add_thi_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Temperature-Humidity Index (THI) 추가
    """
    df['THI'] = (9/5 * df['temperature'] 
                 - 0.55 * (1 - df['humidity']/100) 
                 * (9/5 * df['temperature'] - 26) 
                 + 32)
    return df
def add_wct_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Wind Chill Temperature (WCT) 추가
    """
    v16 = df['windspeed'] ** 0.16
    df['WCT'] = (13.12 
                 + 0.6125 * df['temperature'] 
                 - 11.37 * v16 
                 + 0.3965 * v16 * df['temperature'])
    return df

def add_temp_features(data):
    avg_temp = (
        pd.pivot_table(
            data[data['hour'] % 3 == 0],
            values='temperature',
            index=['building_number', 'day', 'month'],
            aggfunc='mean'
        )
        .reset_index()
        .rename(columns={'temperature': 'avg_temp'})
    )
    data = pd.merge(data, avg_temp, on=['building_number', 'day', 'month'], how='left')

    max_temp = (
        pd.pivot_table(
            data,
            values='temperature',
            index=['building_number', 'day', 'month'],
            aggfunc='max'
        )
        .reset_index()
        .rename(columns={'temperature': 'max_temp'})
    )
    data = pd.merge(data, max_temp, on=['building_number', 'day', 'month'], how='left')

    min_temp = (
        pd.pivot_table(
            data,
            values='temperature',
            index=['building_number', 'day', 'month'],
            aggfunc='min'
        )
        .reset_index()
        .rename(columns={'temperature': 'min_temp'})
    )
    data = pd.merge(data, min_temp, on=['building_number', 'day', 'month'], how='left')

    data['temp_diff'] = data['max_temp'] - data['min_temp']

    return data


def _prep(df, time_col, group_col):
    """정렬 헬퍼"""
    return df.sort_values([group_col, time_col])



def add_weekly_slope(df: pd.DataFrame,
                     time_col: str = 'date_time',
                     group_col: str = 'building_number',
                     power_col: str = 'power_consumption',
                     lookback: int = 6) -> pd.DataFrame:
    """
    1주일 전 최근 lookback 시간의 선형회귀 기울기(β) 피처 추가
    """
    df = _prep(df, time_col, group_col)

    def _beta(x: pd.Series) -> float:
        if x.isna().any(): return np.nan
        idx = np.arange(len(x))
        num = idx.dot(x) * len(x) - idx.sum() * x.sum()
        den = len(x) * (idx**2).sum() - idx.sum()**2
        return num / den if den else 0.0

    pw_seq = df.groupby(group_col)[power_col].shift(WEEK_H)
    col = f'power_week_slope{lookback}h'
    df[col] = pw_seq.groupby(df[group_col]).transform(
        lambda s: s.rolling(lookback).apply(_beta, raw=False)
    ).fillna(0)
    return df

def mean_std_power(df: pd.DataFrame) -> pd.DataFrame:
    is_train = df['power_consumption'].notna()
    
    # date, hour, day_of_week 컬럼 준비
    if 'date' not in df.columns:
        df['date'] = pd.to_datetime(df['date_time']).dt.date
    if 'hour' not in df.columns:
        df['hour'] = pd.to_datetime(df['date_time']).dt.hour
    if 'day_of_week' not in df.columns:
        df['day_of_week'] = pd.to_datetime(df['date_time']).dt.weekday

    # 1) 기존에 만들어둔 holiday 플래그 사용
    #    (0/1로 되어 있다고 가정)
    df['holiday'] = df['holiday'].fillna(0).astype(int)

    # 2) (선택) 학습 데이터에만 power 조정이 필요하면 ratio 적용
    base_ratio = np.array([0.985] + [0.98]*2 + [0.995]*2 + [0.99]*2)
    ratio_all = base_ratio - 0.005
    df.loc[is_train, 'power_consumption'] = df.loc[is_train].apply(
        lambda r: r['power_consumption'] * ratio_all[int(r['day_of_week'])],
        axis=1
    )

    train_df = df[is_train]

    # 3-A) 요일·시간별 평균
    dow_hour = (
        train_df
        .groupby(['building_number', 'hour', 'day_of_week'])['power_consumption']
        .mean()
        .reset_index(name='dow_hour_mean')
    )
    df = df.merge(dow_hour, on=['building_number', 'hour', 'day_of_week'], how='left')

    # 3-B) holiday_mean & holiday_std (holiday 기준)
    hol_mean = (
        train_df
        .groupby(['building_number', 'hour', 'holiday'])['power_consumption']
        .mean()
        .reset_index(name='holiday_mean')
    )
    hol_std = (
        train_df
        .groupby(['building_number', 'hour', 'holiday'])['power_consumption']
        .std()
        .reset_index(name='holiday_std')
    )
    df = df.merge(hol_mean, on=['building_number', 'hour', 'holiday'], how='left')
    df = df.merge(hol_std,  on=['building_number', 'hour', 'holiday'], how='left')

    # 3-C) 시간(hour)별 평균·표준편차
    hr_mean = (
        train_df
        .groupby(['building_number', 'hour'])['power_consumption']
        .mean()
        .reset_index(name='hour_mean')
    )
    hr_std = (
        train_df
        .groupby(['building_number', 'hour'])['power_consumption']
        .std()
        .reset_index(name='hour_std')
    )
    df = df.merge(hr_mean, on=['building_number', 'hour'], how='left')
    df = df.merge(hr_std,  on=['building_number', 'hour'], how='left')

    return df

In [4]:
DROP_COLS = ['sunshine','solar_radiation','date_time', 'solar_power_capacity','ess_capacity', 'pcs_capacity',
             'hour', 'day_of_week', 'month', 'day_of_year',]
CAT_COLS = ['building_type', 'building_number']

In [5]:
train, test = load_and_process("./data")
train, test = create_datetime(train), create_datetime(test)
combined_df = pd.concat([train, test], ignore_index=True)
combined_df = add_holiday(combined_df) 
combined_df = remove_outliers(combined_df)
combined_df = add_squared_features(combined_df)
combined_df = add_summer_cycle_features(combined_df)
combined_df = create_cyclic_features(combined_df)
comgined_df = add_cdh_feature(combined_df)
combined_df = add_cdd_feature(combined_df)
combined_df = add_thi_feature(combined_df)
combined_df = add_wct_feature(combined_df)
combined_df = add_temp_features(combined_df)
combined_df = mean_std_power(combined_df)
combined_df = add_weekly_slope(combined_df)
split_date = pd.to_datetime('2024-08-25 00:00:00')
val_date   = split_date - pd.Timedelta(days=7)
train = combined_df[combined_df['date_time'] < split_date].copy()
test  = combined_df[combined_df['date_time'] >= split_date].copy()

for c in CAT_COLS:
    train[c] = train[c].astype('category')
    test[c]  = test[c].astype('category')
# 3. 피처/타겟 분리
train_cols_drop = ['date_time','date'] + DROP_COLS
test_cols_drop = ['date_time','power_consumption','date'] + DROP_COLS
train    = train.drop(columns=train_cols_drop, errors='ignore')
test  = test.drop(columns=test_cols_drop, errors='ignore')


Saved processed data to: ../data/processed/train.pkl
Saved processed data to: ../data/processed/test.pkl


In [6]:
train.columns

Index(['building_number', 'temperature', 'rainfall', 'windspeed', 'humidity',
       'power_consumption', 'building_type', 'total_area', 'cooling_area',
       'day', 'is_weekend', 'time_of_day', 'weekday', 'holiday',
       'temperature_squared', 'humidity_squared', 'summer_cos', 'summer_sin',
       'sin_hour', 'cos_hour', 'sin_dow', 'cos_dow', 'sin_month', 'cos_month',
       'sin_doy', 'cos_doy', 'CDH', 'CDD', 'THI', 'WCT', 'avg_temp',
       'max_temp', 'min_temp', 'temp_diff', 'dow_hour_mean', 'holiday_mean',
       'holiday_std', 'hour_mean', 'hour_std', 'power_week_slope6h'],
      dtype='object')

## Train & Infer

In [6]:
from autogluon.core.metrics import make_scorer
from autogluon.tabular import TabularPredictor, TabularDataset

In [7]:
predictor  = TabularPredictor(label="power_consumption", eval_metric='smape', problem_type="regression").fit(train_data=train,
presets=["best_quality"],num_stack_levels=1, num_bag_folds=5, num_bag_sets=3, num_gpus=1,refit_full=True)

No path specified. Models will be saved in: "AutogluonModels/ag-20250827_122935"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.3
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #29~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jun 26 14:16:59 UTC 2
CPU Count:          32
Memory Avail:       47.65 GB / 62.72 GB (76.0%)
Disk Space Avail:   249.62 GB / 937.82 GB (26.6%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=3
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdou

[36m(_ray_fit pid=3452105)[0m [1000]	valid_set's l2: 30837.3	valid_set's symmetric_mean_absolute_percentage_error: -0.0254997
[36m(_ray_fit pid=3452111)[0m [1000]	valid_set's l2: 31040	valid_set's symmetric_mean_absolute_percentage_error: -0.0262569[32m [repeated 12x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=3452105)[0m [2000]	valid_set's l2: 25750.1	valid_set's symmetric_mean_absolute_percentage_error: -0.0232019[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=3452112)[0m [2000]	valid_set's l2: 25859.6	valid_set's symmetric_mean_absolute_percentage_error: -0.0234018[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=3452111)[0m [2000]	valid_set's l2: 25702.4	valid_set's symmetric_mean_absolute_percentage_error: -0.0238695[32m [repeated 5x across cluster]

[36m(_dystack pid=3449684)[0m 	-0.0175	 = Validation score   (-symmetric_mean_absolute_percentage_error)
[36m(_dystack pid=3449684)[0m 	217.8s	 = Training   runtime
[36m(_dystack pid=3449684)[0m 	1804.39s	 = Validation runtime
[36m(_dystack pid=3449684)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 263.94s of the 563.29s of remaining time.
[36m(_dystack pid=3449684)[0m 	Fitting 15 child models (S1F1 - S3F5) | Fitting with ParallelLocalFoldFittingStrategy (15 workers, per: cpus=2, gpus=0, memory=0.66%)


[36m(_ray_fit pid=3453323)[0m [1000]	valid_set's l2: 26388.7	valid_set's symmetric_mean_absolute_percentage_error: -0.024148
[36m(_ray_fit pid=3453326)[0m [1000]	valid_set's l2: 27357.9	valid_set's symmetric_mean_absolute_percentage_error: -0.0241296
[36m(_ray_fit pid=3453335)[0m [1000]	valid_set's l2: 26119.5	valid_set's symmetric_mean_absolute_percentage_error: -0.0238702[32m [repeated 13x across cluster][0m
[36m(_ray_fit pid=3453323)[0m [2000]	valid_set's l2: 22329.5	valid_set's symmetric_mean_absolute_percentage_error: -0.0216892
[36m(_ray_fit pid=3453331)[0m [2000]	valid_set's l2: 22496.3	valid_set's symmetric_mean_absolute_percentage_error: -0.0211712
[36m(_ray_fit pid=3453330)[0m [2000]	valid_set's l2: 21704.5	valid_set's symmetric_mean_absolute_percentage_error: -0.0213918[32m [repeated 12x across cluster][0m
[36m(_ray_fit pid=3453323)[0m [3000]	valid_set's l2: 20477.5	valid_set's symmetric_mean_absolute_percentage_error: -0.0204942[32m [repeated 2x across cl

[36m(_dystack pid=3449684)[0m 	-0.0167	 = Validation score   (-symmetric_mean_absolute_percentage_error)
[36m(_dystack pid=3449684)[0m 	196.7s	 = Training   runtime
[36m(_dystack pid=3449684)[0m 	1560.32s	 = Validation runtime
[36m(_dystack pid=3449684)[0m Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.00s of the 264.84s of remaining time.
[36m(_dystack pid=3449684)[0m 	Ensemble Weights: {'LightGBM_BAG_L1': 0.667, 'LightGBMXT_BAG_L1': 0.333}
[36m(_dystack pid=3449684)[0m 	-0.0164	 = Validation score   (-symmetric_mean_absolute_percentage_error)
[36m(_dystack pid=3449684)[0m 	0.09s	 = Training   runtime
[36m(_dystack pid=3449684)[0m 	0.0s	 = Validation runtime
[36m(_dystack pid=3449684)[0m Fitting 106 L2 models, fit_strategy="sequential" ...
[36m(_dystack pid=3449684)[0m Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 264.73s of the 264.71s of remaining time.
[36m(_dystack pid=3449684)[0m 	Fitting 15 child models (S1F1 - S3F5) | F

[36m(_ray_fit pid=3454554)[0m [1000]	valid_set's l2: 20066.4	valid_set's symmetric_mean_absolute_percentage_error: -0.0193441[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=3454551)[0m [1000]	valid_set's l2: 19097	valid_set's symmetric_mean_absolute_percentage_error: -0.0193577[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=3454554)[0m [2000]	valid_set's l2: 18442	valid_set's symmetric_mean_absolute_percentage_error: -0.0179204[32m [repeated 7x across cluster][0m
[36m(_ray_fit pid=3454555)[0m [2000]	valid_set's l2: 19297.9	valid_set's symmetric_mean_absolute_percentage_error: -0.0180451[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=3454547)[0m [2000]	valid_set's l2: 18330.5	valid_set's symmetric_mean_absolute_percentage_error: -0.0182415[32m [repeated 8x across cluster][0m
[36m(_ray_fit pid=3454556)[0m [3000]	valid_set's l2: 17705.6	valid_set's symmetric_mean_absolute_percentage_error: -0.017107[32m [repeated 4x across cluster][0m
[36m(_ra

[36m(_dystack pid=3449684)[0m 	-0.0158	 = Validation score   (-symmetric_mean_absolute_percentage_error)
[36m(_dystack pid=3449684)[0m 	204.62s	 = Training   runtime
[36m(_dystack pid=3449684)[0m 	1916.37s	 = Validation runtime
[36m(_dystack pid=3449684)[0m Fitting model: WeightedEnsemble_L3 ... Training model for up to 360.00s of the -60.76s of remaining time.
[36m(_dystack pid=3449684)[0m 	Ensemble Weights: {'LightGBMXT_BAG_L2': 0.727, 'LightGBM_BAG_L1': 0.273}
[36m(_dystack pid=3449684)[0m 	-0.0156	 = Validation score   (-symmetric_mean_absolute_percentage_error)
[36m(_dystack pid=3449684)[0m 	0.1s	 = Training   runtime
[36m(_dystack pid=3449684)[0m 	0.0s	 = Validation runtime
[36m(_dystack pid=3449684)[0m AutoGluon training complete, total runtime = 959.02s ... Best model: WeightedEnsemble_L3 | Estimated inference throughput: 6.8 rows/s (35861 batch size)
[36m(_dystack pid=3449684)[0m Automatically performing refit_full as a post-fit operation (due to `.fit(...,

In [8]:
y_pred = predictor.predict(test)

In [10]:
submission_df = pd.read_csv('./data/sample_submission.csv')
submission_df['answer'] = y_pred.values
submission_df.to_csv('autogluon.csv', index=False)