<a href="https://colab.research.google.com/github/larry1121/Anatomic-Landmark-Detection/blob/main/aimer_menu_lgbm_wide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LG Aimers (DACON) – 식음업장 메뉴 수요 예측 (Phase2)
### Global LightGBM + Recursive 7-day Forecast (Wide Submission)

**사용법 요약**
1) 아래 1번 셀을 실행해 의존성을 설치합니다. (로컬 환경에서만 필요)
2) 2번 셀을 실행해 전체 파이프라인 함수를 로드합니다.
3) 3번 셀에서 `data_dir`와 `out_path`를 설정하고 실행하면, `sample_submission.csv`와 동일한 **가로(와이드)** 포맷의 `submission.csv`가 생성됩니다.

**데이터 폴더 구조** (예시)
```
data/
  train/train.csv
  test/TEST_00.csv ... TEST_09.csv
  sample_submission.csv
```

**룰 준수 사항**
- Train: train.csv만 사용
- Inference: 각 TEST_xx의 28일 입력만 사용 (샘플 간 독립)
- 예측: 재귀 방식(D+1→창 업데이트→…→D+7)
- 외부데이터 미사용, 도메인지식(요일/공휴일)만 활용
- 음수 매출은 0으로 클립 (정답에 음수 없음)


In [None]:
import sys
def pip_install(pkg):
    try:
        __import__(pkg.split('==')[0].split('>')[0].split('<')[0])
    except Exception:
        !pip install {pkg}

for p in ['pandas','numpy','lightgbm','holidays','scikit-learn']:
    try:
        __import__(p)
    except Exception:
        pip_install(p)




In [9]:
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Tuple, List, Dict
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

# ─────────────────────────────────────────────────────────────────────────────
# 공휴일/환경
# ─────────────────────────────────────────────────────────────────────────────
try:
    import holidays
    KR_HOL = holidays.KR()
except Exception:
    KR_HOL = None
    warnings.warn("holidays 패키지 미설치: 공휴일 피처 없이 진행합니다.")
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 200)
SEED = 42
np.random.seed(SEED)

# ─────────────────────────────────────────────────────────────────────────────
# 공통 유틸/피처
# ─────────────────────────────────────────────────────────────────────────────
def split_store_menu(name: str) -> Tuple[str, str]:
    if not isinstance(name, str): return ("", "")
    parts = name.split("_", 1)
    return (parts[0], parts[1]) if len(parts) == 2 else (name, "")

def is_holiday_kr(dt: pd.Timestamp) -> int:
    if KR_HOL is None: return 0
    try:
        return int(dt in KR_HOL)
    except Exception:
        return 0

def build_calendar_features(dates: pd.Series) -> pd.DataFrame:
    df = pd.DataFrame({
        "dow": dates.dt.weekday,
        "is_weekend": dates.dt.weekday.isin([5,6]).astype(int),
        "dom": dates.dt.day,
        "weekofyear": dates.dt.isocalendar().week.astype(int),
        "month": dates.dt.month,
        "doy": dates.dt.dayofyear,
        "is_month_start": dates.dt.is_month_start.astype(int),
        "is_month_end": dates.dt.is_month_end.astype(int),
    }, index=dates.index)
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7)
    df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
    df["month_cos"] = np.cos(2*np.pi*df["month"]/12)
    df["is_holiday"] = dates.apply(is_holiday_kr).astype(int) if KR_HOL is not None else 0
    return df

def make_lag_features(y: pd.Series, max_lag: int = 28) -> pd.DataFrame:
    y0 = y.clip(lower=0)
    feats = {f"lag_{k}": y0.shift(k) for k in range(1, max_lag+1)}
    feats["roll_mean_7"]  = y0.shift(1).rolling(7).mean()
    feats["roll_mean_14"] = y0.shift(1).rolling(14).mean()
    feats["roll_mean_28"] = y0.shift(1).rolling(28).mean()
    feats["roll_med_7"]   = y0.shift(1).rolling(7).median()
    feats["roll_med_28"]  = y0.shift(1).rolling(28).median()
    feats["roll_std_7"]   = y0.shift(1).rolling(7).std()
    feats["roll_std_28"]  = y0.shift(1).rolling(28).std()
    return pd.DataFrame(feats, index=y.index)

def smape_approx(y_true, y_pred, eps=1e-6):
    yt = np.asarray(y_true); yp = np.asarray(y_pred)
    mask = (yt != 0)
    yt = yt[mask]; yp = yp[mask]
    denom = (np.abs(yt) + np.abs(yp) + eps) / 2.0
    return float(np.mean(np.abs(yp - yt) / denom)) if len(yt) else np.nan

# ─────────────────────────────────────────────────────────────────────────────
# 데이터 로드 & 전역(supervised) 테이블 (Recursive용)
# ─────────────────────────────────────────────────────────────────────────────
def load_train(data_dir: Path) -> pd.DataFrame:
    fp = data_dir / "train" / "train.csv"
    df = pd.read_csv(fp)
    df.columns = [c.strip() for c in df.columns]
    df["영업일자"] = pd.to_datetime(df["영업일자"])
    df["매출수량"] = df["매출수량"].astype(float).clip(lower=0)
    sm = df["영업장명_메뉴명"].astype(str).apply(split_store_menu)
    df["업장명"] = sm.apply(lambda x: x[0])
    df["메뉴명"] = sm.apply(lambda x: x[1])
    return df

def build_supervised_recursive(df: pd.DataFrame, max_lag=28, val_days=28):
    rows = []; weights = []
    for (store, menu), g in df.groupby(["업장명", "메뉴명"], sort=False):
        g = g.sort_values("영업일자")
        full_idx = pd.date_range(g["영업일자"].min(), g["영업일자"].max(), freq="D")
        y = g.set_index("영업일자")["매출수량"].reindex(full_idx).fillna(0.0)
        cal = build_calendar_features(pd.Series(full_idx, index=full_idx))
        lags = make_lag_features(y, max_lag=max_lag)
        feat = pd.concat([cal, lags], axis=1).iloc[max_lag:]
        target = y.iloc[max_lag:]
        feat["업장명"] = store; feat["메뉴명"] = menu; feat["target_date"] = feat.index
        sw = 2.0 if store in ["담하", "미라시아"] else 1.0
        weights.extend([sw]*len(feat))
        rows.append(pd.concat([feat, target.rename("y")], axis=1))
    data = pd.concat(rows, axis=0, ignore_index=True)
    weights = pd.Series(weights, index=data.index, name="weight")
    cutoff = data["target_date"].max() - pd.Timedelta(days=val_days)
    train_mask = data["target_date"] <= cutoff
    val_mask   = data["target_date"] >  cutoff
    y_all = data["y"]
    X_all = data.drop(columns=["y", "target_date"])
    for cat in ["업장명", "메뉴명"]:
        X_all[cat] = X_all[cat].astype("category")
    return (
        X_all.loc[train_mask].reset_index(drop=True),
        y_all.loc[train_mask].reset_index(drop=True),
        X_all.loc[val_mask].reset_index(drop=True),
        y_all.loc[val_mask].reset_index(drop=True),
        weights.loc[train_mask].reset_index(drop=True)
    )

# ─────────────────────────────────────────────────────────────────────────────
# Direct-H 학습 테이블: 앵커 t의 피처 → 타깃 y[t+h]
# (훈련 시 피처는 날짜 t 기준 과거만 사용, 타깃은 t+h 로 shift)
# ─────────────────────────────────────────────────────────────────────────────
def build_supervised_direct(df: pd.DataFrame, horizon: int, max_lag=28, val_days=28):
    assert 1 <= horizon <= 7
    rows = []; weights = []
    for (store, menu), g in df.groupby(["업장명", "메뉴명"], sort=False):
        g = g.sort_values("영업일자")
        full_idx = pd.date_range(g["영업일자"].min(), g["영업일자"].max(), freq="D")
        y = g.set_index("영업일자")["매출수량"].reindex(full_idx).fillna(0.0)

        # 앵커 날짜 시퀀스
        cal = build_calendar_features(pd.Series(full_idx, index=full_idx))
        lags = make_lag_features(y, max_lag=max_lag)
        feat = pd.concat([cal, lags], axis=1)

        # 타깃은 y[t+h] → y.shift(-h)
        target = y.shift(-horizon)

        # 최대 랙 구간 이후 & 타깃 존재(t+h 범위 내)만 사용
        valid_mask = pd.Series(True, index=feat.index)
        valid_mask.iloc[:max_lag] = False           # 랙 버퍼
        valid_mask.iloc[-horizon:] = False          # 타깃 존재 구간
        feat = feat.loc[valid_mask]
        targ = target.loc[valid_mask]

        feat["업장명"] = store; feat["메뉴명"] = menu; feat["anchor_date"] = feat.index
        sw = 2.0 if store in ["담하", "미라시아"] else 1.0
        weights.extend([sw]*len(feat))
        rows.append(pd.concat([feat, targ.rename("y")], axis=1))

    data = pd.concat(rows, axis=0, ignore_index=True)
    weights = pd.Series(weights, index=data.index, name="weight")

    cutoff = data["anchor_date"].max() - pd.Timedelta(days=val_days + horizon - 1)
    train_mask = data["anchor_date"] <= cutoff
    val_mask   = data["anchor_date"] >  cutoff

    y_all = data["y"]
    X_all = data.drop(columns=["y", "anchor_date"])
    for cat in ["업장명", "메뉴명"]:
        X_all[cat] = X_all[cat].astype("category")

    return (
        X_all.loc[train_mask].reset_index(drop=True),
        y_all.loc[train_mask].reset_index(drop=True),
        X_all.loc[val_mask].reset_index(drop=True),
        y_all.loc[val_mask].reset_index(drop=True),
        weights.loc[train_mask].reset_index(drop=True)
    )

# ─────────────────────────────────────────────────────────────────────────────
# 학습기
# ─────────────────────────────────────────────────────────────────────────────
def train_lgbm(X_train, y_train, X_val, y_val, w_train):
    params = dict(
        n_estimators=5000, learning_rate=0.03, subsample=0.9, colsample_bytree=0.9,
        num_leaves=63, min_child_samples=40, random_state=SEED, objective="mae",
        reg_alpha=1e-3, reg_lambda=1e-3, verbose=-1
    )
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train, sample_weight=w_train, eval_set=[(X_val, y_val)], eval_metric="l1", callbacks=[])
    return model

def train_recursive_model(df_train: pd.DataFrame):
    X_tr, y_tr, X_va, y_va, w_tr = build_supervised_recursive(df_train, max_lag=28, val_days=28)
    m = train_lgbm(X_tr, y_tr, X_va, y_va, w_tr)
    val_pred = np.clip(m.predict(X_va), 0, None)
    print(f"[VAL][Recursive] SMAPE≈ {smape_approx(y_va.values, val_pred):.4f}")
    return m

def train_direct_models(df_train: pd.DataFrame) -> Dict[int, LGBMRegressor]:
    models = {}
    for h in range(1, 8):
        X_tr, y_tr, X_va, y_va, w_tr = build_supervised_direct(df_train, horizon=h, max_lag=28, val_days=28)
        m = train_lgbm(X_tr, y_tr, X_va, y_va, w_tr)
        val_pred = np.clip(m.predict(X_va), 0, None)
        print(f"[VAL][Direct h={h}] SMAPE≈ {smape_approx(y_va.values, val_pred):.4f}")
        models[h] = m
    return models

# ─────────────────────────────────────────────────────────────────────────────
# 추론기 (Recursive/Direct/Naive)
# ─────────────────────────────────────────────────────────────────────────────
def build_features_for_anchor(window: pd.Series, anchor_date: pd.Timestamp, store: str, menu: str) -> pd.DataFrame:
    # 앵커일(=마지막 실제일 D) 기준 피처. Direct-H에서 사용.
    cal = build_calendar_features(pd.Series([anchor_date]))
    ldict = {f"lag_{k}": [window.iloc[-k] if len(window) >= k else 0.0] for k in range(1, 29)}
    def roll(arr, w, fn):
        s = pd.Series(arr)
        return float(fn(s.iloc[-w:])) if len(arr) >= w else 0.0
    vals = window.values
    X = pd.concat([cal.reset_index(drop=True), pd.DataFrame({
        "roll_mean_7":[roll(vals,7,np.mean)], "roll_mean_14":[roll(vals,14,np.mean)], "roll_mean_28":[roll(vals,28,np.mean)],
        "roll_med_7":[roll(vals,7,np.median)], "roll_med_28":[roll(vals,28,np.median)],
        "roll_std_7":[roll(vals,7,np.std)], "roll_std_28":[roll(vals,28,np.std)]
    })], axis=1)
    X["업장명"] = pd.Series([store], dtype="category")
    X["메뉴명"] = pd.Series([menu], dtype="category")
    return X.assign(**ldict)

def predict_recursive_for_window(model: LGBMRegressor, last28: pd.Series, start_next: pd.Timestamp, store: str, menu: str) -> List[float]:
    window = last28.clip(lower=0).copy()
    preds = []
    cur = start_next
    for _ in range(7):
        X = build_features_for_anchor(window, cur - pd.Timedelta(days=1), store, menu)  # anchor를 전날로 잡아도 동일 피처
        yhat = float(model.predict(X)[0])
        yhat = max(0.0, yhat)
        preds.append(yhat)
        window = pd.concat([window, pd.Series([yhat], index=[cur])])
        cur += pd.Timedelta(days=1)
    return preds

def predict_direct_for_window(models_h: Dict[int, LGBMRegressor], last28: pd.Series, anchor_date: pd.Timestamp, store: str, menu: str) -> List[float]:
    X_anchor = build_features_for_anchor(last28, anchor_date, store, menu)
    out = []
    for h in range(1, 8):
        yhat = float(models_h[h].predict(X_anchor)[0])
        out.append(max(0.0, yhat))
    return out

def naive_t7(last28: pd.Series) -> List[float]:
    v = last28.values[-7:]
    return [float(max(0.0, x)) for x in v]

# ─────────────────────────────────────────────────────────────────────────────
# 시계열 특성 계산 & 블렌딩 룰
# ─────────────────────────────────────────────────────────────────────────────
def series_stats(last28: pd.Series) -> Dict[str, float]:
    y = last28.clip(lower=0).values.astype(float)
    s = float(np.sum(y))
    zeros = float(np.mean(y == 0.0))
    m = float(np.mean(y)) if len(y) else 0.0
    sd = float(np.std(y)) if len(y) else 0.0
    vol = (sd / m) if m > 0 else np.inf
    # t vs t-7 상관 (겹치는 21개 비교)
    if len(y) >= 28:
        a = y[7:28]; b = y[0:21]
        corr = float(np.corrcoef(a, b)[0,1]) if np.std(a)>0 and np.std(b)>0 else 0.0
    else:
        corr = 0.0
    return {"sum": s, "zero_ratio": zeros, "vol": vol, "corr7": corr, "mean": m, "std": sd}

def blend_rules(direct_7: List[float], recur_7: List[float], naive_7: List[float], stats: Dict[str, float]) -> List[float]:
    # 기본 지평별 가중
    base = []
    for h in range(1, 8):
        if h <= 2:   w_d, w_r = 0.4, 0.6
        elif h <= 5: w_d, w_r = 0.6, 0.4
        else:        w_d, w_r = 0.7, 0.3
        base.append((w_d, w_r))
    # 변동성↑ → Recursive 가중 +0.1 (최대 0.75)
    if stats["vol"] > 1.0:
        base = [(wd, min(0.75, wr+0.1)) for (wd, wr) in base]
        base = [(max(0.0, 1.0-wr), wr) for (_, wr) in base]  # 합=1 유지
    # 혼합
    blended = []
    for h in range(7):
        wd, wr = base[h]
        v = wd*direct_7[h] + wr*recur_7[h]
        blended.append(v)
    # 희소/저판매 → 나이브 0.3 섞기
    if (stats["sum"] <= 5.0) or (stats["zero_ratio"] >= 0.50):
        blended = [0.7*b + 0.3*n for b, n in zip(blended, naive_7)]
    # 음수 방지
    return [max(0.0, float(x)) for x in blended]

# ─────────────────────────────────────────────────────────────────────────────
# 제출 포맷 안전 저장/검증
# ─────────────────────────────────────────────────────────────────────────────
def write_out_submission(sample_path: Path, raw_out_df: pd.DataFrame, out_path: Path) -> None:
    sample = pd.read_csv(sample_path)
    sample.columns = [c.strip() for c in sample.columns]
    key_col = sample.columns[0]
    value_cols = list(sample.columns[1:])
    aligned = sample[[key_col]].merge(raw_out_df, on=key_col, how="left")
    for c in value_cols:
        if c in aligned.columns:
            aligned[c] = pd.to_numeric(aligned[c], errors="coerce").fillna(0.0).clip(lower=0.0)
        else:
            aligned[c] = 0.0
    final = aligned[sample.columns]
    assert list(final.columns) == list(sample.columns)
    assert len(final) == len(sample)
    assert final[key_col].equals(sample[key_col])
    assert final[value_cols].isna().sum().sum() == 0
    assert (final[value_cols].values < 0).sum() == 0
    final.to_csv(out_path, index=False, encoding="utf-8-sig")

def validate_submission(submission_path: Path, sample_path: Path) -> None:
    sub = pd.read_csv(submission_path)
    samp = pd.read_csv(sample_path)
    assert list(sub.columns) == list(samp.columns), "열 이름/순서 불일치"
    assert len(sub) == len(samp), "행 수 불일치"
    key = samp.columns[0]
    assert sub[key].equals(samp[key]), "'영업일자' 키 순서 불일치"
    vals = samp.columns[1:]
    assert sub[vals].isna().sum().sum() == 0, "값 열에 NaN 존재"
    assert (sub[vals].values < 0).sum() == 0, "값 열에 음수 존재"

# ─────────────────────────────────────────────────────────────────────────────
# 최종 빌드: Direct+Recursive+Naive 블렌딩
# ─────────────────────────────────────────────────────────────────────────────
def train_and_predict_blend(data_dir: str, out_csv: str = "submission.csv"):
    data_dir = Path(data_dir)
    assert (data_dir / "train" / "train.csv").exists(), "train/train.csv not found"
    assert (data_dir / "sample_submission.csv").exists(), "sample_submission.csv not found"

    print("[1/4] Load train")
    train_df = load_train(data_dir)

    print("[2/4] Train models: Recursive + Direct-H(1..7)")
    rec_model = train_recursive_model(train_df)
    dir_models = train_direct_models(train_df)

    print("[3/4] Inference per TEST_xx with blending rules")
    sample_path = data_dir / "sample_submission.csv"
    sample = pd.read_csv(sample_path)
    sample.columns = [c.strip() for c in sample.columns]
    menu_cols = list(sample.columns[1:])

    # (test_id, 메뉴명) -> 7개 예측 보관
    pred_cache: Dict[Tuple[str, str], List[float]] = {}

    for i in range(10):
        test_id = f"TEST_{i:02d}"
        fp = data_dir / "test" / f"{test_id}.csv"
        if not fp.exists():
            raise FileNotFoundError(f"Missing: {fp}")
        tdf = pd.read_csv(fp)
        tdf["영업일자"] = pd.to_datetime(tdf["영업일자"])



In [None]:
# 실행 예시
data_dir = './data'  # 데이터 위치로 변경
out_csv  = 'submission.csv'
train_and_predict_blend(data_dir="./data", out_csv="submission.csv")

print('노트북 준비 완료. data_dir를 지정해 실행하세요.')


[1/4] Load train
[2/4] Train models: Recursive + Direct-H(1..7)
[VAL][Recursive] SMAPE≈ 0.7953
[VAL][Direct h=1] SMAPE≈ 0.8078
[VAL][Direct h=2] SMAPE≈ 0.7984
[VAL][Direct h=3] SMAPE≈ 0.8010


In [3]:
import os
import shutil

# Define the new directory structure
new_data_dir = '/content/data'
new_train_dir = os.path.join(new_data_dir, 'train')
new_test_dir = os.path.join(new_data_dir, 'test')

# Create the new directories if they don't exist
os.makedirs(new_train_dir, exist_ok=True)
os.makedirs(new_test_dir, exist_ok=True)

# Move the files
files_to_move = {
    '/content/train.csv': new_train_dir,
    '/content/sample_submission.csv': new_data_dir,
}

for i in range(10):
    test_file = f'/content/TEST_{i:02d}.csv'
    if os.path.exists(test_file):
        files_to_move[test_file] = new_test_dir

for src, dest_dir in files_to_move.items():
    if os.path.exists(src):
        shutil.move(src, dest_dir)
        print(f"Moved {src} to {dest_dir}")
    else:
        print(f"Warning: {src} not found.")

Moved /content/train.csv to /content/data/train
Moved /content/TEST_00.csv to /content/data/test
Moved /content/TEST_01.csv to /content/data/test
Moved /content/TEST_02.csv to /content/data/test
Moved /content/TEST_03.csv to /content/data/test
Moved /content/TEST_04.csv to /content/data/test
Moved /content/TEST_05.csv to /content/data/test
Moved /content/TEST_06.csv to /content/data/test
Moved /content/TEST_07.csv to /content/data/test
Moved /content/TEST_08.csv to /content/data/test
Moved /content/TEST_09.csv to /content/data/test
