In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# =========================
# 1) 데이터 로드 & 일별 집계
# =========================
CSV_PATH = "SUWON_S_DATA_TABLE_GENDER_SUM.csv"

df = pd.read_csv(CSV_PATH)
df["TA_YMD"] = pd.to_datetime(df["TA_YMD"], format="%Y%m%d")

daily = df.groupby("TA_YMD", as_index=False).agg(
    AMT_sum=("AMT", "sum"),
    TEMP_mean=("TEMP", "mean"),
    RAIN_sum=("RAIN", "sum"),
    DAY_mode=("DAY", lambda x: int(pd.Series(x).mode()[0])),
)
daily["month"] = daily["TA_YMD"].dt.month
daily = daily.sort_values("TA_YMD").reset_index(drop=True)

# ✅ 공정 비교: 시간순 split (과거 80% / 미래 20%)
split = int(len(daily) * 0.8)
train_df = daily.iloc[:split].copy()
test_df  = daily.iloc[split:].copy()

X_train = train_df[["DAY_mode", "TEMP_mean", "RAIN_sum", "month"]]
y_train = train_df["AMT_sum"].values
X_test  = test_df[["DAY_mode", "TEMP_mean", "RAIN_sum", "month"]]
y_test  = test_df["AMT_sum"].values

# =========================
# 2) 머신러닝: RandomForest
# =========================
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["DAY_mode"]),
    ("num", "passthrough", ["TEMP_mean", "RAIN_sum", "month"]),
])

rf = Pipeline([
    ("prep", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=600,
        max_depth=12,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ))
])

rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2  = r2_score(y_test, rf_pred)

# =========================
# 3) 딥러닝: TabTransformer-style (PyTorch)
# =========================
class TabDataset(Dataset):
    def __init__(self, x_cat, x_num, y):
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.x_num = torch.tensor(x_num, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.x_cat[idx], self.x_num[idx], self.y[idx]

class SimpleTabTransformer(nn.Module):
    def __init__(self, emb_dim=16, nhead=4, nlayers=2, ff=128, dropout=0.1):
        super().__init__()
        self.day_emb = nn.Embedding(7, emb_dim)
        self.mon_emb = nn.Embedding(12, emb_dim)
        enc = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=nhead, dim_feedforward=ff, dropout=dropout, batch_first=True
        )
        self.tr = nn.TransformerEncoder(enc, num_layers=nlayers)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim + 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x_cat, x_num):
        tokens = torch.stack([self.day_emb(x_cat[:,0]), self.mon_emb(x_cat[:,1])], dim=1)  # (B,2,emb)
        z = self.tr(tokens).mean(dim=1)
        return self.mlp(torch.cat([z, x_num], dim=1))

def encode(df_):
    day = (df_["DAY_mode"].astype(int).clip(1,7) - 1).values
    mon = (df_["month"].astype(int).clip(1,12) - 1).values
    x_cat = np.vstack([day, mon]).T
    x_num = df_[["TEMP_mean", "RAIN_sum"]].values.astype(np.float32)
    y = df_["AMT_sum"].values.astype(np.float32)
    return x_cat, x_num, y

xcat_tr, xnum_tr, y_tr = encode(train_df)
xcat_te, xnum_te, y_te = encode(test_df)

train_loader = DataLoader(TabDataset(xcat_tr, xnum_tr, y_tr), batch_size=64, shuffle=True)
test_loader  = DataLoader(TabDataset(xcat_te, xnum_te, y_te), batch_size=64, shuffle=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
deep = SimpleTabTransformer().to(device)

opt = torch.optim.AdamW(deep.parameters(), lr=1e-3)
loss_fn = nn.SmoothL1Loss()  # Huber

# 학습(너무 길면 30~50 epochs 권장)
EPOCHS = 50
for _ in range(EPOCHS):
    deep.train()
    for xc, xn, yy in train_loader:
        xc, xn, yy = xc.to(device), xn.to(device), yy.to(device)
        pred = deep(xc, xn)
        loss = loss_fn(pred, yy)
        opt.zero_grad()
        loss.backward()
        opt.step()

# 예측
deep.eval()
preds=[]
with torch.no_grad():
    for xc, xn, _ in test_loader:
        xc, xn = xc.to(device), xn.to(device)
        preds.append(deep(xc, xn).cpu().numpy().ravel())
deep_pred = np.concatenate(preds)

deep_mae = mean_absolute_error(y_test, deep_pred)
deep_r2  = r2_score(y_test, deep_pred)

# =========================
# 4) 수치 출력
# =========================
print("=== 정확도 비교 (일 매출 AMT_sum) ===")
print(f"RandomForest  MAE={rf_mae:,.0f}  R2={rf_r2:.3f}")
print(f"DeepLearning  MAE={deep_mae:,.0f}  R2={deep_r2:.3f}")

# =========================
# 5) 이미지 1: MAE/R2 막대 그래프
# =========================
models = ["RandomForest", "DeepLearning"]
mae_vals = [rf_mae, deep_mae]
r2_vals  = [rf_r2, deep_r2]

plt.figure()
plt.bar(models, mae_vals)
plt.title("MAE Comparison (lower is better)")
plt.ylabel("MAE")
plt.savefig("accuracy_bar_mae.png", dpi=200, bbox_inches="tight")
plt.close()

plt.figure()
plt.bar(models, r2_vals)
plt.title("R2 Comparison (higher is better)")
plt.ylabel("R2")
plt.savefig("accuracy_bar_r2.png", dpi=200, bbox_inches="tight")
plt.close()

# =========================
# 6) 이미지 2: 실제 vs 예측 산점도
# =========================
plt.figure()
plt.scatter(y_test, rf_pred, label="RandomForest")
plt.scatter(y_test, deep_pred, label="DeepLearning")
plt.title("Actual vs Predicted (Test)")
plt.xlabel("Actual AMT_sum")
plt.ylabel("Predicted AMT_sum")
plt.legend()
plt.savefig("pred_scatter.png", dpi=200, bbox_inches="tight")
plt.close()

# =========================
# 7) 이미지 3: 오차(Residual) 분포
# =========================
rf_res = y_test - rf_pred
deep_res = y_test - deep_pred

plt.figure()
plt.hist(rf_res, bins=30, alpha=0.7, label="RandomForest")
plt.hist(deep_res, bins=30, alpha=0.7, label="DeepLearning")
plt.title("Residual Distribution (Test)")
plt.xlabel("Residual (Actual - Predicted)")
plt.ylabel("Count")
plt.legend()
plt.savefig("residual_hist.png", dpi=200, bbox_inches="tight")
plt.close()

print("\nSaved images:")
print("- accuracy_bar_mae.png")
print("- accuracy_bar_r2.png")
print("- pred_scatter.png")
print("- residual_hist.png")

# 해석 가이드 (딱 이것만 보면 됨)
# MAE가 낮을수록 좋음 (평균적으로 덜 틀림)
# R²가 높을수록 좋음 (패턴 설명력 큼)
# pred_scatter.png에서 점들이 대각선(Actual=Pred) 주변에 몰릴수록 좋음
# residual_hist.png에서 오차 분포가 0 주변에 좁게 몰릴수록 좋음

=== 정확도 비교 (일 매출 AMT_sum) ===
RandomForest  MAE=17,858,319  R2=0.724
DeepLearning  MAE=176,169,782  R2=-10.910

Saved images:
- accuracy_bar_mae.png
- accuracy_bar_r2.png
- pred_scatter.png
- residual_hist.png


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# =========================
# 1) 데이터 로드 & 일별 집계
# =========================
CSV_PATH = "SUWON_S_DATA_TABLE_GENDER_SUM.csv"

df = pd.read_csv(CSV_PATH)
df["TA_YMD"] = pd.to_datetime(df["TA_YMD"], format="%Y%m%d")

daily = df.groupby("TA_YMD", as_index=False).agg(
    AMT_sum=("AMT", "sum"),
    TEMP_mean=("TEMP", "mean"),
    RAIN_sum=("RAIN", "sum"),
    DAY_mode=("DAY", lambda x: int(pd.Series(x).mode()[0])),
)
daily["month"] = daily["TA_YMD"].dt.month
daily = daily.sort_values("TA_YMD").reset_index(drop=True)

# ✅ 공정 비교: 시간순 split (과거 80% / 미래 20%)
split = int(len(daily) * 0.8)
train_df = daily.iloc[:split].copy()
test_df  = daily.iloc[split:].copy()

X_train = train_df[["DAY_mode", "TEMP_mean", "RAIN_sum", "month"]]
y_train = train_df["AMT_sum"].values
X_test  = test_df[["DAY_mode", "TEMP_mean", "RAIN_sum", "month"]]
y_test  = test_df["AMT_sum"].values

# =========================
# 2) RandomForest
# =========================
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["DAY_mode"]),
    ("num", "passthrough", ["TEMP_mean", "RAIN_sum", "month"]),
])

rf = Pipeline([
    ("prep", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=600,
        max_depth=12,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    ))
])
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# =========================
# 3) XGBoost
# =========================
xgb = Pipeline([
    ("prep", preprocess),
    ("xgb", XGBRegressor(
        n_estimators=800,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42
    ))
])
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

# =========================
# 4) DeepLearning: TabTransformer-style (PyTorch)
# =========================
class TabDataset(Dataset):
    def __init__(self, x_cat, x_num, y):
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.x_num = torch.tensor(x_num, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.x_cat[idx], self.x_num[idx], self.y[idx]

class SimpleTabTransformer(nn.Module):
    def __init__(self, emb_dim=16, nhead=4, nlayers=2, ff=128, dropout=0.1):
        super().__init__()
        self.day_emb = nn.Embedding(7, emb_dim)
        self.mon_emb = nn.Embedding(12, emb_dim)
        enc = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=nhead, dim_feedforward=ff, dropout=dropout, batch_first=True
        )
        self.tr = nn.TransformerEncoder(enc, num_layers=nlayers)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim + 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x_cat, x_num):
        tokens = torch.stack([self.day_emb(x_cat[:,0]), self.mon_emb(x_cat[:,1])], dim=1)  # (B,2,emb)
        z = self.tr(tokens).mean(dim=1)
        return self.mlp(torch.cat([z, x_num], dim=1))

def encode_for_deep(df_):
    # day: 1~7 -> 0~6 / month: 1~12 -> 0~11
    day = (df_["DAY_mode"].astype(int).clip(1,7) - 1).values
    mon = (df_["month"].astype(int).clip(1,12) - 1).values
    x_cat = np.vstack([day, mon]).T
    x_num = df_[["TEMP_mean", "RAIN_sum"]].values.astype(np.float32)
    y = df_["AMT_sum"].values.astype(np.float32)
    return x_cat, x_num, y

xcat_tr, xnum_tr, y_tr = encode_for_deep(train_df)
xcat_te, xnum_te, y_te = encode_for_deep(test_df)

train_loader = DataLoader(TabDataset(xcat_tr, xnum_tr, y_tr), batch_size=64, shuffle=True)
test_loader  = DataLoader(TabDataset(xcat_te, xnum_te, y_te), batch_size=64, shuffle=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
deep = SimpleTabTransformer().to(device)
opt = torch.optim.AdamW(deep.parameters(), lr=1e-3)
loss_fn = nn.SmoothL1Loss()

EPOCHS = 60
for _ in range(EPOCHS):
    deep.train()
    for xc, xn, yy in train_loader:
        xc, xn, yy = xc.to(device), xn.to(device), yy.to(device)
        pred = deep(xc, xn)
        loss = loss_fn(pred, yy)
        opt.zero_grad()
        loss.backward()
        opt.step()

deep.eval()
deep_preds = []
with torch.no_grad():
    for xc, xn, _ in test_loader:
        xc, xn = xc.to(device), xn.to(device)
        deep_preds.append(deep(xc, xn).cpu().numpy().ravel())
deep_pred = np.concatenate(deep_preds)

# =========================
# 5) 평가 지표 계산
# =========================
def metrics(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred), r2_score(y_true, y_pred)

rf_mae, rf_r2 = metrics(y_test, rf_pred)
xgb_mae, xgb_r2 = metrics(y_test, xgb_pred)
deep_mae, deep_r2 = metrics(y_test, deep_pred)

print("=== 정확도 비교 (일 매출 AMT_sum, Test) ===")
print(f"RF   : MAE={rf_mae:,.0f}  R2={rf_r2:.3f}")
print(f"XGB  : MAE={xgb_mae:,.0f}  R2={xgb_r2:.3f}")
print(f"Deep : MAE={deep_mae:,.0f}  R2={deep_r2:.3f}")

# =========================
# 6) 그래프 저장 (matplotlib, 색상 미지정)
# =========================
models = ["RF", "XGB", "Deep"]
maes = [rf_mae, xgb_mae, deep_mae]
r2s  = [rf_r2, xgb_r2, deep_r2]

# (1) MAE bar
plt.figure()
plt.bar(models, maes)
plt.title("MAE Comparison (lower is better)")
plt.ylabel("MAE")
plt.savefig("compare_mae.png", dpi=200, bbox_inches="tight")
plt.close()

# (2) R2 bar
plt.figure()
plt.bar(models, r2s)
plt.title("R2 Comparison (higher is better)")
plt.ylabel("R2")
plt.savefig("compare_r2.png", dpi=200, bbox_inches="tight")
plt.close()

# (3) Scatter: Actual vs Pred
plt.figure()
plt.scatter(y_test, rf_pred, label="RF")
plt.scatter(y_test, xgb_pred, label="XGB")
plt.scatter(y_test, deep_pred, label="Deep")
plt.title("Actual vs Predicted (Test)")
plt.xlabel("Actual AMT_sum")
plt.ylabel("Predicted AMT_sum")
plt.legend()
plt.savefig("compare_scatter.png", dpi=200, bbox_inches="tight")
plt.close()

# (4) Residual hist
rf_res = y_test - rf_pred
xgb_res = y_test - xgb_pred
deep_res = y_test - deep_pred

plt.figure()
plt.hist(rf_res, bins=30, alpha=0.7, label="RF")
plt.hist(xgb_res, bins=30, alpha=0.7, label="XGB")
plt.hist(deep_res, bins=30, alpha=0.7, label="Deep")
plt.title("Residual Distribution (Test)")
plt.xlabel("Residual (Actual - Predicted)")
plt.ylabel("Count")
plt.legend()
plt.savefig("compare_residual_hist.png", dpi=200, bbox_inches="tight")
plt.close()

# (5) Residual boxplot
plt.figure()
plt.boxplot([rf_res, xgb_res, deep_res], labels=models)
plt.title("Residual Boxplot (Test)")
plt.ylabel("Residual (Actual - Predicted)")
plt.savefig("compare_residual_box.png", dpi=200, bbox_inches="tight")
plt.close()

print("\nSaved images:")
print("- compare_mae.png")
print("- compare_r2.png")
print("- compare_scatter.png")
print("- compare_residual_hist.png")
print("- compare_residual_box.png")

# 결과를 어떻게 보면 좋나
# compare_r2.png에서 R²가 가장 높은 모델이 “패턴 설명력” 최고
# compare_mae.png에서 MAE가 가장 낮은 모델이 “평균 오차” 최소
# compare_residual_box.png에서 박스가 좁고 0에 가까울수록 안정적

=== 정확도 비교 (일 매출 AMT_sum, Test) ===
RF   : MAE=17,858,319  R2=0.724
XGB  : MAE=19,616,764  R2=0.671
Deep : MAE=175,292,836  R2=-10.804

Saved images:
- compare_mae.png
- compare_r2.png
- compare_scatter.png
- compare_residual_hist.png
- compare_residual_box.png


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# =========================
# 0) 설정
# =========================
CSV_PATH = "SUWON_S_DATA_TABLE_GENDER_SUM.csv"
MIN_DAYS_PER_HOUR = 120      # 시간대별 일수 너무 적으면 스킵
TRAIN_RATIO = 0.8
DEEP_EPOCHS = 40             # GPU 있으면 60~100 추천
DEEP_BATCH = 64
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)

# =========================
# 1) 시간대별(HOUR) 일매출 데이터셋 생성
#    - 타겟: 해당 시간대의 일매출 AMT_sum
#    - 피처: DAY(요일), month, TEMP_mean, RAIN_sum
# =========================
def build_hourly_daily(df: pd.DataFrame, hour: int) -> pd.DataFrame:
    sub = df[df["HOUR"] == hour].copy()
    sub["TA_YMD"] = pd.to_datetime(sub["TA_YMD"], format="%Y%m%d")

    daily = sub.groupby("TA_YMD", as_index=False).agg(
        AMT_sum=("AMT", "sum"),
        TEMP_mean=("TEMP", "mean"),
        RAIN_sum=("RAIN", "sum"),
        DAY_mode=("DAY", lambda x: int(pd.Series(x).mode()[0])),
    )
    daily["month"] = daily["TA_YMD"].dt.month
    return daily.sort_values("TA_YMD").reset_index(drop=True)

def time_split(df: pd.DataFrame, train_ratio=0.8):
    n = len(df)
    split = int(n * train_ratio)
    return df.iloc[:split].copy(), df.iloc[split:].copy()

# =========================
# 2) 공통 전처리 (RF/XGB)
# =========================
def make_preprocessor():
    return ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), ["DAY_mode"]),
        ("num", "passthrough", ["TEMP_mean", "RAIN_sum", "month"]),
    ])

# =========================
# 3) 딥러닝 모델 (TabTransformer-style)
# =========================
class TabDataset(Dataset):
    def __init__(self, x_cat, x_num, y):
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.x_num = torch.tensor(x_num, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx):
        return self.x_cat[idx], self.x_num[idx], self.y[idx]

class SimpleTabTransformer(nn.Module):
    def __init__(self, emb_dim=16, nhead=4, nlayers=2, ff=128, dropout=0.1):
        super().__init__()
        self.day_emb = nn.Embedding(7, emb_dim)
        self.mon_emb = nn.Embedding(12, emb_dim)
        enc = nn.TransformerEncoderLayer(
            d_model=emb_dim, nhead=nhead, dim_feedforward=ff, dropout=dropout, batch_first=True
        )
        self.tr = nn.TransformerEncoder(enc, num_layers=nlayers)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim + 2, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x_cat, x_num):
        tokens = torch.stack([self.day_emb(x_cat[:,0]), self.mon_emb(x_cat[:,1])], dim=1)
        z = self.tr(tokens).mean(dim=1)
        return self.mlp(torch.cat([z, x_num], dim=1))

def encode_for_deep(df_):
    # day: 1~7 -> 0~6 / month: 1~12 -> 0~11
    day = (df_["DAY_mode"].astype(int).clip(1,7) - 1).values
    mon = (df_["month"].astype(int).clip(1,12) - 1).values
    x_cat = np.vstack([day, mon]).T
    x_num = df_[["TEMP_mean", "RAIN_sum"]].values.astype(np.float32)
    y = df_["AMT_sum"].values.astype(np.float32)
    return x_cat, x_num, y

def train_predict_deep(train_df, test_df, epochs=40, batch=64):
    xcat_tr, xnum_tr, y_tr = encode_for_deep(train_df)
    xcat_te, xnum_te, y_te = encode_for_deep(test_df)

    tr_loader = DataLoader(TabDataset(xcat_tr, xnum_tr, y_tr), batch_size=batch, shuffle=True)
    te_loader = DataLoader(TabDataset(xcat_te, xnum_te, y_te), batch_size=batch, shuffle=False)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SimpleTabTransformer().to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    loss_fn = nn.SmoothL1Loss()

    for _ in range(epochs):
        model.train()
        for xc, xn, yy in tr_loader:
            xc, xn, yy = xc.to(device), xn.to(device), yy.to(device)
            pred = model(xc, xn)
            loss = loss_fn(pred, yy)
            opt.zero_grad()
            loss.backward()
            opt.step()

    model.eval()
    preds=[]
    with torch.no_grad():
        for xc, xn, _ in te_loader:
            xc, xn = xc.to(device), xn.to(device)
            preds.append(model(xc, xn).cpu().numpy().ravel())
    return np.concatenate(preds)

# =========================
# 4) HOUR별 학습/평가 루프
# =========================
df = pd.read_csv(CSV_PATH)

hours = sorted(df["HOUR"].unique())
rows = []

for h in hours:
    daily_h = build_hourly_daily(df, int(h))
    if len(daily_h) < MIN_DAYS_PER_HOUR:
        print(f"[skip] HOUR={h} days={len(daily_h)} (<{MIN_DAYS_PER_HOUR})")
        continue

    train_h, test_h = time_split(daily_h, TRAIN_RATIO)
    y_test = test_h["AMT_sum"].values

    X_train = train_h[["DAY_mode","TEMP_mean","RAIN_sum","month"]]
    y_train = train_h["AMT_sum"].values
    X_test  = test_h[["DAY_mode","TEMP_mean","RAIN_sum","month"]]

    pre = make_preprocessor()

    # RF
    rf = Pipeline([
        ("prep", pre),
        ("rf", RandomForestRegressor(
            n_estimators=500, max_depth=12, min_samples_leaf=2,
            random_state=SEED, n_jobs=-1
        ))
    ])
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)

    # XGB
    xgb = Pipeline([
        ("prep", pre),
        ("xgb", XGBRegressor(
            n_estimators=800, max_depth=6, learning_rate=0.05,
            subsample=0.8, colsample_bytree=0.8,
            objective="reg:squarederror",
            random_state=SEED
        ))
    ])
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)

    # Deep
    deep_pred = train_predict_deep(train_h, test_h, epochs=DEEP_EPOCHS, batch=DEEP_BATCH)

    # Metrics
    for model_name, pred in [("RF", rf_pred), ("XGB", xgb_pred), ("Deep", deep_pred)]:
        rows.append({
            "HOUR": int(h),
            "Model": model_name,
            "MAE": float(mean_absolute_error(y_test, pred)),
            "R2": float(r2_score(y_test, pred)),
        })

result = pd.DataFrame(rows).sort_values(["HOUR","Model"]).reset_index(drop=True)
result.to_csv("hourly_metrics.csv", index=False)
print("\nSaved: hourly_metrics.csv")
print(result.head(12))

# =========================
# 5) 그래프 생성
# =========================
# 피벗 (HOUR x Model)
mae_pivot = result.pivot(index="HOUR", columns="Model", values="MAE").sort_index()
r2_pivot  = result.pivot(index="HOUR", columns="Model", values="R2").sort_index()

# --- (1) MAE heatmap (imshow)
plt.figure()
plt.imshow(mae_pivot.values, aspect="auto")
plt.title("MAE Heatmap (HOUR x Model)  - lower is better")
plt.xlabel("Model")
plt.ylabel("HOUR")
plt.xticks(range(len(mae_pivot.columns)), mae_pivot.columns)
plt.yticks(range(len(mae_pivot.index)), mae_pivot.index)
plt.colorbar(label="MAE")
plt.savefig("hourly_mae_heatmap.png", dpi=200, bbox_inches="tight")
plt.close()

# --- (2) R2 heatmap
plt.figure()
plt.imshow(r2_pivot.values, aspect="auto")
plt.title("R2 Heatmap (HOUR x Model)  - higher is better")
plt.xlabel("Model")
plt.ylabel("HOUR")
plt.xticks(range(len(r2_pivot.columns)), r2_pivot.columns)
plt.yticks(range(len(r2_pivot.index)), r2_pivot.index)
plt.colorbar(label="R2")
plt.savefig("hourly_r2_heatmap.png", dpi=200, bbox_inches="tight")
plt.close()

# --- (3) MAE lines
plt.figure()
for m in mae_pivot.columns:
    plt.plot(mae_pivot.index, mae_pivot[m].values, marker="o", label=m)
plt.title("MAE by HOUR (lower is better)")
plt.xlabel("HOUR")
plt.ylabel("MAE")
plt.legend()
plt.savefig("hourly_mae_lines.png", dpi=200, bbox_inches="tight")
plt.close()

# --- (4) R2 lines
plt.figure()
for m in r2_pivot.columns:
    plt.plot(r2_pivot.index, r2_pivot[m].values, marker="o", label=m)
plt.title("R2 by HOUR (higher is better)")
plt.xlabel("HOUR")
plt.ylabel("R2")
plt.legend()
plt.savefig("hourly_r2_lines.png", dpi=200, bbox_inches="tight")
plt.close()

# --- (5) Best model per hour (MAE)
best_mae = mae_pivot.idxmin(axis=1)
plt.figure()
plt.bar(best_mae.index.astype(str), best_mae.index*0 + 1)  # 막대 높이는 의미 없음(라벨용)
plt.title("Best Model by HOUR (based on MAE)")
plt.xlabel("HOUR")
plt.ylabel("Best (MAE)")
# 막대 위에 모델명 표시
for i, (h, bm) in enumerate(best_mae.items()):
    plt.text(i, 1.02, bm, ha="center", va="bottom")
plt.ylim(0, 1.2)
plt.savefig("hourly_best_model_mae.png", dpi=200, bbox_inches="tight")
plt.close()

# --- (6) Best model per hour (R2)
best_r2 = r2_pivot.idxmax(axis=1)
plt.figure()
plt.bar(best_r2.index.astype(str), best_r2.index*0 + 1)
plt.title("Best Model by HOUR (based on R2)")
plt.xlabel("HOUR")
plt.ylabel("Best (R2)")
for i, (h, bm) in enumerate(best_r2.items()):
    plt.text(i, 1.02, bm, ha="center", va="bottom")
plt.ylim(0, 1.2)
plt.savefig("hourly_best_model_r2.png", dpi=200, bbox_inches="tight")
plt.close()

print("\nSaved images:")
print("- hourly_mae_heatmap.png")
print("- hourly_r2_heatmap.png")
print("- hourly_mae_lines.png")
print("- hourly_r2_lines.png")
print("- hourly_best_model_mae.png")
print("- hourly_best_model_r2.png")

# hourly_r2_lines.png : 시간대별로 어떤 모델이 강한지 한눈에 보임
# hourly_mae_heatmap.png : 특정 시간대에서 튀는 구간(예측 어려운 시간대) 식별
# hourly_best_model_mae.png / hourly_best_model_r2.png : 시간대별 운영 모델 선택에 바로 사용 가능


Saved: hourly_metrics.csv
    HOUR Model           MAE         R2
0      1  Deep  5.432525e+06  -1.935687
1      1    RF  1.905321e+06   0.543460
2      1   XGB  2.047588e+06   0.470522
3      2  Deep  4.389300e+06 -11.552707
4      2    RF  1.158008e+06  -0.306459
5      2   XGB  1.242515e+06  -0.474685
6      3  Deep  1.087496e+07  -8.907372
7      3    RF  1.777713e+06  -0.079037
8      3   XGB  1.986194e+06  -0.112701
9      4  Deep  2.540037e+07 -46.107940
10     4    RF  2.963726e+06  -0.068408
11     4   XGB  3.257487e+06  -0.293305

Saved images:
- hourly_mae_heatmap.png
- hourly_r2_heatmap.png
- hourly_mae_lines.png
- hourly_r2_lines.png
- hourly_best_model_mae.png
- hourly_best_model_r2.png
