In [None]:
import os
import random
import glob
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn
from tqdm import tqdm

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import defaultdict, Counter
from sklearn.linear_model import Ridge

# === Path configuration ===
# All inputs/outputs of this cell live under /data; the keyword-signal output
# directory is created eagerly so the later to_csv call cannot fail on a
# missing folder.
BASE = Path('/data')
RAW_TRAIN = BASE / 'train/train.csv'
TRIMMED_PATH = BASE / 'train/trimmed_train.csv'
KS_OUT_DIR = Path('/data/models/keyword_signal_top1')
KS_OUT_DIR.mkdir(parents=True, exist_ok=True)
TOP1_CSV = KS_OUT_DIR / 'store_top1_token_beta.csv'  # per-store Top-1 keyword + beta is saved here

# === 1) Load data & basic cleanup ===
# The raw file is expected to have exactly three columns, which are renamed
# positionally to (date, store_menu, qty).
train = pd.read_csv(RAW_TRAIN)
train.columns = ['date', 'store_menu', 'qty']
train['date'] = pd.to_datetime(train['date'], errors='coerce')
train['qty'] = pd.to_numeric(train['qty'], errors='coerce')
train = train.dropna(subset=['date', 'qty'])  # drop only rows whose date/qty could not be parsed

# === 2) Split store / menu ===
# Split on the FIRST underscore only, so menu names may themselves contain '_'.
train[['store', 'menu']] = train['store_menu'].astype(str).str.split('_', n=1, expand=True)

# === 3) Remove pre-release (lead-in) zeros: drop rows before the first positive-sales day ===
# n_rows_before / n_groups_before0 record the pre-trim sizes; they are not
# referenced again in the visible part of this notebook.
n_rows_before = len(train)
n_groups_before0 = train['store_menu'].nunique()

# First date with qty > 0 for every store_menu.
release_date_map = (
    train.loc[train['qty'] > 0, ['store_menu', 'date']]
         .groupby('store_menu')['date']
         .min()
         .rename('release_date')
)

# Left-merge the release date and keep rows on/after it. A store_menu that never
# sold anything has a missing release_date, so the comparison is False for all
# of its rows and the whole series is dropped as well.
train = (
    train.merge(release_date_map, on='store_menu', how='left')
         .loc[lambda df: df['date'] >= df['release_date']]
         .drop(columns='release_date')
)

# Re-sort and rebuild a 0..n-1 index (later positional assignments rely on it).
train = train.sort_values(['store_menu', 'date']).reset_index(drop=True)


# === 4) Date-derived features ===
train['weekday'] = train['date'].dt.day_name()
train['month'] = train['date'].dt.month.astype(int)
train['dayofweek'] = train['date'].dt.weekday.astype(int)  # 0=Mon..6=Sun
train['is_weekend'] = (train['dayofweek'] >= 5).astype(int)  # Sat/Sun
train['is_fri'] = (train['dayofweek'] == 4).astype(int)
train['is_sat'] = (train['dayofweek'] == 5).astype(int)
train['is_sun'] = (train['dayofweek'] == 6).astype(int)

# === 5) Annual-cycle features (sine / cosine of day-of-year) ===
doy = train['date'].dt.dayofyear.astype(int)  # 1..366
train['doy'] = doy
train['doy_sin'] = np.sin(2*np.pi * doy / 365.25)
train['doy_cos'] = np.cos(2*np.pi * doy / 365.25)

# === 6) Fixed special-day flags + proximity (± days) ===
# Keys are 'MM-DD' strings, values are the tag written to `special_tag`.
SPECIAL_DATES = {
    '01-01': 'new_year',
    '12-24': 'xmas_eve',
    '12-25': 'christmas',
    '12-31': 'year_end',
}
special_md = train['date'].dt.strftime('%m-%d')
# Dates that are not special days get an empty tag, which is_special_any maps to 0.
train['special_tag'] = special_md.map(SPECIAL_DATES).fillna('')
train['is_special_any'] = (train['special_tag'] != '').astype(int)

def nearest_special_dist(series_dates: pd.Series, md_dict: dict):
    """Return, for every date, the distance in days to the special days of the same year.

    Parameters
    ----------
    series_dates : pd.Series
        Datetime-like values. Missing values (NaT) yield NaN distances.
    md_dict : dict
        Mapping whose keys are 'MM-DD' strings; only the keys are used.

    Returns
    -------
    (dist_to_next, dist_from_prev) : tuple of pd.Series (float64)
        ``dist_to_next``  - days until the nearest special day on/after the date,
        ``dist_from_prev`` - days since the nearest special day on/before the date.
        Only special days of the date's own calendar year are considered; NaN is
        returned when there is none in the requested direction (or when
        ``md_dict`` is empty). Both Series carry the index of ``series_dates`` so
        they can be assigned back to the source frame without misalignment.
    """
    dates = pd.Series(pd.to_datetime(series_dates))
    n = len(dates)
    dist_to_next = np.full(n, np.nan, dtype='float64')
    dist_from_prev = np.full(n, np.nan, dtype='float64')

    # Parse the 'MM-DD' keys once instead of once per year.
    month_days = []
    for md in md_dict:
        m, d = map(int, md.split('-'))
        month_days.append((m, d))

    valid = dates.notna().to_numpy()
    if month_days and valid.any():
        days = dates.values.astype('datetime64[D]')
        years = dates.dt.year.to_numpy()
        # One vectorised pass per calendar year instead of one Python iteration per row.
        for year in np.unique(years[valid]):
            rows = valid & (years == year)
            specials = np.array(
                [f'{int(year):04d}-{m:02d}-{d:02d}' for m, d in month_days],
                dtype='datetime64[D]',
            )
            # gaps[i, j] = special_j - date_i in whole days (>= 0 means "ahead").
            gaps = (specials[None, :] - days[rows][:, None]).astype('timedelta64[D]').astype('int64')
            ahead = np.where(gaps >= 0, gaps, np.inf).min(axis=1)
            behind = np.where(gaps <= 0, -gaps, np.inf).min(axis=1)
            dist_to_next[rows] = np.where(np.isinf(ahead), np.nan, ahead)
            dist_from_prev[rows] = np.where(np.isinf(behind), np.nan, behind)

    return (pd.Series(dist_to_next, index=dates.index),
            pd.Series(dist_from_prev, index=dates.index))

# Distances to the fixed special days; 999 stands for "no special day in that direction".
to_next, from_prev = nearest_special_dist(train['date'], SPECIAL_DATES)
for _col, _dist in (('days_to_special', to_next), ('days_from_special', from_prev)):
    train[_col] = _dist.fillna(999).astype(int)
train['min_days_to_special'] = (
    train[['days_to_special', 'days_from_special']].min(axis=1).astype(int)
)
train['is_special_window_k2'] = train['min_days_to_special'].le(2).astype(int)

# === 7) Group means (store_menu x weekday / store_menu x month) ===
for _col, _keys in (
    ('sm_dow_mean', ['store_menu', 'dayofweek']),
    ('sm_month_mean', ['store_menu', 'month']),
):
    train[_col] = train.groupby(_keys)['qty'].transform('mean').astype(float)

# === 8) Menu-name tokenisation utilities ===
PAREN_RE = re.compile(r'[\(\[\{（【](.*?)[\)\]\}）】]')
WORD_RE  = re.compile(r'[0-9A-Za-z가-힣]+')
def extract_tokens(menu, min_len=1, max_len=8):
    """Extract lowercase word tokens from a menu name.

    The result lists, in this order: the words found inside brackets, the
    final word of the name (repeated on purpose as emphasis), and then every
    word of the name. Words whose length is outside [min_len, max_len] are
    skipped. Duplicates are kept.
    """
    text = str(menu)

    def _fits(word):
        return min_len <= len(word) <= max_len

    all_words = WORD_RE.findall(text)

    # Words that appear inside any kind of bracket come first.
    collected = [
        word
        for inner in PAREN_RE.findall(text)
        for word in WORD_RE.findall(inner)
        if _fits(word)
    ]
    # Trailing word (emphasis), then the full word list.
    if all_words and _fits(all_words[-1]):
        collected.append(all_words[-1])
    collected.extend(word for word in all_words if _fits(word))

    return [tok.lower() for tok in collected]

# Tokenise every menu name once; each cell holds the list returned by extract_tokens.
train['tokens'] = train['menu'].apply(extract_tokens)

# === 9) Estimate the per-store Top-1 keyword beta (Ridge) ===
# Thresholds read by fit_top1_token_by_store:
#   MIN_STORE_ROWS - stores with fewer rows than this are skipped
#   MIN_TOKEN_DAYS - minimum occurrence count for a token to become a candidate
#   ALPHA          - regularisation strength passed to Ridge(alpha=...)
MIN_STORE_ROWS = 50
MIN_TOKEN_DAYS = 3
ALPHA = 5.0

def fit_top1_token_by_store(df: pd.DataFrame):
    """Pick, for every store, the menu token with the largest absolute Ridge coefficient.

    For each store a (date x token) demand matrix is built, where cell (d, t) is
    the total quantity sold on day d by menus whose token list contains t. The
    regression target is the store's daily total minus the sum of all token
    columns, clipped at zero. Columns are max-scaled before fitting and the
    coefficients are converted back to the original scale.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'store', 'date', 'qty' and 'tokens' (an iterable of
        tokens per row).

    Returns
    -------
    pd.DataFrame
        Columns ['store', 'token', 'beta'], one row per store that could be
        fitted. The columns are present even when no store qualifies, so the
        saved CSV always has a readable header.

    Notes
    -----
    Reads the module-level MIN_STORE_ROWS, MIN_TOKEN_DAYS and ALPHA.
    A token is a candidate only when it occurs on at least MIN_TOKEN_DAYS
    *distinct dates* (several menus sharing a token on one day count once).
    """
    rows = []
    for store, g in df.groupby('store'):
        g = g.sort_values('date')
        if len(g) < MIN_STORE_ROWS:
            continue

        # Candidate tokens: count distinct days, not rows.
        token_dates = defaultdict(set)
        for d, toks in zip(g['date'], g['tokens']):
            for t in set(toks):
                token_dates[t].add(d)
        tokens = [t for t, days in token_dates.items() if len(days) >= MIN_TOKEN_DAYS]
        if not tokens:
            continue

        # Daily store totals; the groupby result is already sorted by date.
        total = g.groupby('date')['qty'].sum()
        date_pos = {d: i for i, d in enumerate(total.index)}
        tok_pos = {t: j for j, t in enumerate(tokens)}  # O(1) membership + column lookup

        # Design matrix X (per-token daily demand), accumulated in a single pass.
        X = np.zeros((len(date_pos), len(tokens)), dtype=float)
        for d, toks, qty in zip(g['date'], g['tokens'], g['qty']):
            i = date_pos.get(d)
            if i is None:  # rows with a missing date are not part of `total`
                continue
            for t in set(toks):
                j = tok_pos.get(t)
                if j is not None:
                    X[i, j] += float(qty)

        # Target y: demand not attributed to any candidate token, floored at 0.
        y = np.maximum(total.to_numpy(dtype=float) - X.sum(axis=1), 0.0)

        if X.shape[1] == 0 or np.all(X == 0) or np.std(y) < 1e-6:
            continue

        # Scale every column by its maximum, fit Ridge, convert coefficients back.
        maxs = X.max(axis=0)
        maxs[maxs == 0] = 1.0
        Xs = X / maxs
        model = Ridge(alpha=ALPHA, fit_intercept=True, random_state=42)
        model.fit(Xs, y)
        betas = model.coef_ / (maxs + 1e-9)  # back to the original scale

        # The token with the largest |beta| is the store's Top-1 keyword.
        idx = int(np.argmax(np.abs(betas)))
        rows.append({'store': store, 'token': tokens[idx], 'beta': float(betas[idx])})
    return pd.DataFrame(rows, columns=['store', 'token', 'beta'])

# Fit the per-store Top-1 token and persist it for the later feature-building cells.
top1_df = fit_top1_token_by_store(train)
top1_df.to_csv(TOP1_CSV, index=False)
print(f"✅ Saved Top-1 tokens per store: {TOP1_CSV}")

# === 10) Save ===
# The trimmed/enriched training table is written with a UTF-8 BOM ('utf-8-sig').
# Note that the list-valued 'tokens' column is serialised as text in the CSV.
trimmed_train = train.sort_values(['store_menu', 'date']).reset_index(drop=True)
trimmed_train.to_csv(TRIMMED_PATH, index=False, encoding='utf-8-sig')
print("✅ 저장 완료:", TRIMMED_PATH)
print("행/열:", trimmed_train.shape)
print("\n추가된 주요 컬럼: ['doy','doy_sin','doy_cos','is_special_any','special_tag',"
      "'days_to_special','days_from_special','min_days_to_special','is_special_window_k2',"
      "'sm_dow_mean','sm_month_mean','tokens']")
print("\n미리보기:")
print(trimmed_train.head(5))


✅ Saved Top-1 tokens per store: /content/drive/MyDrive/lg aimers/models/keyword_signal_top1/store_top1_token_beta.csv
✅ 저장 완료: /content/drive/MyDrive/lg aimers/open/train/trimmed_train.csv
행/열: (90517, 24)

추가된 주요 컬럼: ['doy','doy_sin','doy_cos','is_special_any','special_tag','days_to_special','days_from_special','min_days_to_special','is_special_window_k2','sm_dow_mean','sm_month_mean','tokens']

미리보기:
        date          store_menu  qty       store     menu    weekday  month  \
0 2023-01-17  느티나무 셀프BBQ_1인 수저세트    8  느티나무 셀프BBQ  1인 수저세트    Tuesday      1   
1 2023-01-18  느티나무 셀프BBQ_1인 수저세트    0  느티나무 셀프BBQ  1인 수저세트  Wednesday      1   
2 2023-01-19  느티나무 셀프BBQ_1인 수저세트    0  느티나무 셀프BBQ  1인 수저세트   Thursday      1   
3 2023-01-20  느티나무 셀프BBQ_1인 수저세트    0  느티나무 셀프BBQ  1인 수저세트     Friday      1   
4 2023-01-21  느티나무 셀프BBQ_1인 수저세트    0  느티나무 셀프BBQ  1인 수저세트   Saturday      1   

   dayofweek  is_weekend  is_fri  ...   doy_cos  special_tag  is_special_any  \
0          1           0       0 

In [None]:
# 설치 필요 시
!pip install autogluon tqdm -q

In [None]:
# ===============================
# 28일 창 기반 수퍼바이즈드 학습 데이터 생성
# - store Top-1 token 28일 합
# - store 총수요 28일 합
# - zero_ratio28, mean7/28, vol14, 주말×토큰 상호작용 등 추가 피처
# ===============================
import pandas as pd
from autogluon.tabular import TabularPredictor
from pathlib import Path
import numpy as np
import glob, os, re

# === Path configuration ===
# TRAIN_CSV is the trimmed table written by the preprocessing cell;
# MODEL_DIR is created up front so later writes cannot fail on a missing folder.
BASE = Path('/data')
TRAIN_CSV = BASE / 'train/trimmed_train.csv'
TEST_DIR = BASE / 'test'
MODEL_DIR = Path('/data/models/simple_gbm_direct')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# === 특일 정의 (월-일) ===
SPECIAL_DATES = {'01-01':'new_year','12-24':'xmas_eve','12-25':'christmas','12-31':'year_end'}

# === 날짜별 특일 근접도 계산 ===
def _nearest_special_dists_for_date(dn: pd.Timestamp):
    year = int(dn.year)
    sp = [pd.Timestamp(year=year, month=int(md.split('-')[0]), day=int(md.split('-')[1])) for md in SPECIAL_DATES]
    if not sp: return 999,999,999,0
    arr = np.array(sp, dtype='datetime64[D]'); d0 = np.datetime64(dn.date())
    dists = (arr - d0).astype('timedelta64[D]').astype(int)
    ge0 = dists[dists >= 0]; le0 = dists[dists <= 0]
    to_next = ge0.min() if ge0.size else np.inf
    from_prev = -le0.max() if le0.size else np.inf
    min_abs = min(int(to_next) if np.isfinite(to_next) else 999,
                  int(from_prev) if np.isfinite(from_prev) else 999)
    is_any = 1 if (min_abs == 0) else 0
    is_win2 = 1 if (min_abs <= 2) else 0
    return (int(to_next) if np.isfinite(to_next) else 999,
            int(from_prev) if np.isfinite(from_prev) else 999,
            min_abs, is_win2)


# === 보조 함수: 0 스트릭/최근 양수 이후 경과일 ===
def _zero_streak_from_end(arr):
    cnt = 0
    for v in arr[::-1]:
        if v==0: cnt+=1
        else: break
    return cnt

def _days_since_last_nonzero(arr):
    nz = np.where(np.asarray(arr)>0)[0]
    if nz.size==0: return 999
    return (len(arr)-1)-nz.max()

# === Load the Top-1 tokens ===
TOP1_CSV = Path('/data/models/keyword_signal_top1/store_top1_token_beta.csv')

def load_top1_tokens(csv_path):
    """Load the per-store Top-1 token as a ``{store: token}`` dict of strings.

    Returns an empty dict (after printing a warning) when the file is missing.
    """
    source = Path(csv_path)
    if source.exists():
        table = pd.read_csv(csv_path)
        return {str(store): str(token)
                for store, token in zip(table['store'], table['token'])}
    print("[warn] Top-1 token file not found. Proceeding without token features.")
    return {}

# === Per-store daily quantity of menus matching the store's Top-1 token ===
def build_store_top1_daily_qty(train_df, store2token):
    """Sum, per store and day, the quantity of menus whose name contains the store's token.

    Parameters
    ----------
    train_df : pd.DataFrame
        Needs the columns 'store', 'menu', 'date' and 'qty'.
    store2token : dict
        ``{store: token}``; stores are looked up by their string form.

    Returns
    -------
    pd.DataFrame
        Columns ['store', 'date', 'top1_token_qty']. Empty (with the same
        columns) when there is no mapping or no menu matches.

    Notes
    -----
    The token is matched as a case-insensitive *literal* substring
    (``regex=False``), so a token containing regex metacharacters can neither
    raise nor match unintended menus.
    """
    out_cols = ['store', 'date', 'top1_token_qty']
    if not store2token:
        return pd.DataFrame(columns=out_cols)

    df = train_df[['store', 'menu', 'date', 'qty']].copy()
    df['date'] = pd.to_datetime(df['date'])
    df['store'] = df['store'].astype(str)
    df['menu_low'] = df['menu'].astype(str).str.lower()

    frames = []
    for store, g in df.groupby('store'):
        token = store2token.get(store)
        # 'nan' is what a missing token becomes after str() in load_top1_tokens.
        if not token or token == 'nan':
            continue
        mask = g['menu_low'].str.contains(str(token).lower(), na=False, regex=False)
        if not mask.any():
            continue
        agg = g.loc[mask].groupby('date')['qty'].sum().reset_index(name='top1_token_qty')
        agg['store'] = store
        frames.append(agg[out_cols])

    if not frames:
        return pd.DataFrame(columns=out_cols)
    return pd.concat(frames, ignore_index=True)


# === Per-store daily total demand ===
def build_store_total_daily_qty(train_df):
    """Return one row per (store, date) with the summed quantity in 'store_total_qty'."""
    daily_sum = train_df.groupby(['store', 'date'])['qty'].sum()
    return daily_sum.rename('store_total_qty').reset_index()


# --- Load the trimmed training table ---
train = pd.read_csv(TRAIN_CSV)
train['date'] = pd.to_datetime(train['date'], errors='coerce')
# Rebuild the key from its parts so it is always a plain 'store_menu' string.
train['store_menu'] = train['store'].astype(str) + '_' + train['menu'].astype(str)

# Store-level daily series (Top-1 token demand and total demand). Both are
# module-level globals that make_train_rows reads.
store2token = load_top1_tokens(TOP1_CSV)
store_top1_daily = build_store_top1_daily_qty(train, store2token).sort_values(['store','date'])
store_total_daily = build_store_total_daily_qty(train).sort_values(['store','date'])

# === Main: build window-based supervised training samples ===
def make_train_rows(g: pd.DataFrame):
    """Turn one store_menu series into supervised rows (28-row history -> 7 horizons).

    For every window start ``i`` the previous 28 rows are the history and one
    output row is produced per horizon h = 1..7, with target ``qty`` at row
    ``i + h - 1``.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single store_menu with at least the columns 'date', 'qty',
        'store_menu' and 'store'.

    Returns
    -------
    list of dict
        One dict per (window, horizon) holding the feature values and 'y'.

    Notes
    -----
    * Reads the module-level frames ``store_top1_daily`` and ``store_total_daily``.
    * The loop stops at ``len(g) - 7`` (exclusive), so the window whose targets
      are the final 7 rows of the series is not generated here.
    * NOTE(review): the calendar features use ``base_date + h`` days while the
      target is taken positionally; this assumes one row per consecutive
      calendar day - confirm against the data.
    """
    g = g.sort_values('date').reset_index(drop=True)
    q = g['qty'].values
    sm = g['store_menu'].iloc[0]
    store = g['store'].iloc[0]
    rows = []

    # Per-store maps: calendar date -> Top-1 token demand / total demand
    tok_series = store_top1_daily[store_top1_daily['store']==store]
    tok_map = dict(zip(tok_series['date'].dt.date, tok_series['top1_token_qty']))
    tot_series = store_total_daily[store_total_daily['store']==store]
    tot_map = dict(zip(tot_series['date'].dt.date, tot_series['store_total_qty']))

    # 28-day history -> 7-day target
    for i in range(28, len(g)-7):
        hist = q[i-28:i]
        # Basic statistics of the history window
        last = hist[-1]; mean7 = hist[-7:].mean(); mean28 = hist.mean()
        # trend uses +1 smoothing on both sides to avoid division by zero
        std7 = hist[-7:].std(ddof=0); trend = (last+1)/(mean7+1)
        lag7 = hist[-7]; lag14 = hist[-14]
        rm3 = hist[-3:].mean(); rm14 = hist[-14:].mean()
        rs14 = hist[-14:].std(ddof=0)
        zero_streak_28 = _zero_streak_from_end(hist)
        days_since_nz = _days_since_last_nonzero(hist)
        diff_last_mean7 = last - mean7

        # Additional features (1e-6 guards the ratios against division by zero)
        zeros_28 = (hist==0).sum()
        zero_ratio28 = float(zeros_28)/28.0
        mean7_over_mean28 = (mean7+1e-6)/(mean28+1e-6)
        vol14 = rs14/(rm14+1e-6)

        # 28-day sums of store-level token demand / total demand; days missing
        # from the maps contribute 0
        hist_dates = g['date'].iloc[i-28:i].dt.date.values
        top1_qty_hist28 = float(np.nansum([tok_map.get(d, 0.0) for d in hist_dates]))
        store_total_sum28 = float(np.nansum([tot_map.get(d, 0.0) for d in hist_dates]))

        # Per-weekday / per-month means computed from the history window only
        hist_df = g.iloc[i-28:i].copy()
        hist_df['__dow'] = hist_df['date'].dt.dayofweek   # 0~6
        hist_df['__mon'] = hist_df['date'].dt.month       # 1~12
        dow_means   = hist_df.groupby('__dow')['qty'].mean() if len(hist_df) else pd.Series(dtype=float)
        month_means = hist_df.groupby('__mon')['qty'].mean() if len(hist_df) else pd.Series(dtype=float)

        # Anchor: the last day of the history window
        base_date = pd.to_datetime(g['date'].iloc[i-1])
        # One output row per forecast horizon (7 days)
        for h in range(1, 8):
            dn = base_date + pd.Timedelta(days=h)
            dow = dn.weekday(); mon = dn.month
            doy = int(dn.dayofyear); doy_sin = np.sin(2*np.pi*doy/365.25); doy_cos = np.cos(2*np.pi*doy/365.25)
            d_next, d_prev, d_min, is_win2 = _nearest_special_dists_for_date(dn)
            is_special_any = 1 if d_min == 0 else 0

            # Fall back to the 28-day mean when the target weekday/month is
            # absent from the history window
            sm_dow_mean_next   = float(dow_means.get(dow,  mean28)) if not dow_means.empty   else float(mean28)
            sm_month_mean_next = float(month_means.get(mon, mean28)) if not month_means.empty else float(mean28)

            is_wknd = int(dow >= 5)
            rows.append({
                'store_menu': sm,
                # basic statistics
                'last': last, 'mean7': mean7, 'mean28': mean28, 'std7': std7, 'trend': trend,
                'lag7': lag7, 'lag14': lag14, 'rm3': rm3, 'rm14': rm14, 'rs14': rs14,
                'zero_streak_28': zero_streak_28, 'days_since_nz': days_since_nz, 'diff_last_mean7': diff_last_mean7,
                # calendar features of the target day
                'dow_next': dow, 'is_wknd_next': is_wknd, 'is_fri_next': int(dow==4),
                'is_sat_next': int(dow==5), 'is_sun_next': int(dow==6),
                'month_next': mon, 'h': h,
                'doy_next': doy, 'doy_sin_next': doy_sin, 'doy_cos_next': doy_cos,
                # special-day flags
                'is_special_any_next': is_special_any, 'days_to_special_next': d_next,
                'days_from_special_next': d_prev, 'min_days_to_special_next': d_min,
                'is_special_window_k2_next': is_win2,
                # anchor-based means
                'sm_dow_mean_next': sm_dow_mean_next, 'sm_month_mean_next': sm_month_mean_next,
                # additional / strengthened features
                'store_top1tok_sum28': top1_qty_hist28,
                'store_total_sum28': store_total_sum28,
                'zero_ratio28': zero_ratio28,
                'mean7_over_mean28': mean7_over_mean28,
                'vol14': vol14,
                'wknd_x_top1': is_wknd * top1_qty_hist28,
                # target
                'y': g['qty'].iloc[i + h - 1],
            })
    return rows

# === Build the training samples for every store_menu ===
rows = []
for sm, g in train.groupby('store_menu', sort=False):
    if len(g) >= 35:    # need at least 28 history rows + 7 target rows
        rows.extend(make_train_rows(g))
train_sup = pd.DataFrame(rows)


# === Feature list ===
# 'store_menu' is included as a feature; the remaining names match the keys
# produced by make_train_rows.
features = [
    'store_menu',
    'last','mean7','mean28','std7','trend',
    'lag7','lag14','rm3','rm14','rs14','zero_streak_28','days_since_nz','diff_last_mean7',
    'dow_next','is_wknd_next','is_fri_next','is_sat_next','is_sun_next',
    'month_next','h',
    'doy_next','doy_sin_next','doy_cos_next',
    'is_special_any_next','days_to_special_next','days_from_special_next',
    'min_days_to_special_next','is_special_window_k2_next',
    'sm_dow_mean_next','sm_month_mean_next',
    # additional features
    'store_top1tok_sum28','store_total_sum28','zero_ratio28','mean7_over_mean28','vol14','wknd_x_top1',
]

# === Dtype cleanup (integer vs float columns) ===
train_sup['store_menu'] = train_sup['store_menu'].astype(str)
_int_cols = ['dow_next','is_wknd_next','is_fri_next','is_sat_next','is_sun_next',
             'month_next','h','doy_next','is_special_any_next','days_to_special_next',
             'days_from_special_next','min_days_to_special_next','is_special_window_k2_next',
             'zero_streak_28','days_since_nz']
# Unparseable values become 0 before the cast to int.
for _c in _int_cols:
    if _c in train_sup.columns:
        train_sup[_c] = pd.to_numeric(train_sup[_c], errors='coerce').fillna(0).astype(int)

_float_cols = ['last','mean7','mean28','std7','trend','doy_sin_next','doy_cos_next',
               'sm_dow_mean_next','sm_month_mean_next','lag7','lag14','rm3','rm14',
               'rs14','diff_last_mean7','y',
               'store_top1tok_sum28','store_total_sum28','zero_ratio28','mean7_over_mean28','vol14','wknd_x_top1']
# Float columns keep NaN for unparseable values (no fill here).
for _c in _float_cols:
    if _c in train_sup.columns:
        train_sup[_c] = pd.to_numeric(train_sup[_c], errors='coerce')


In [None]:
# ============================================
# [PATCHED] build_last_window_val
# - 검증 특징 생성이 TEST와 1:1 일치하도록 수정
# - sm_dow_mean_next / sm_month_mean_next:
#   전역 맵 대신 "최근 28일 히스토리"에서 계산
# ============================================
import numpy as np
import pandas as pd
from pathlib import Path

# === 고정 특일(월-일) 정의 ===
SPECIAL_MMDD = {'01-01','12-24','12-25','12-31'}


# === 단일 날짜 → 특일 플래그/거리 반환 ===
def _special_flags_for_date_scalar(dt: pd.Timestamp):
    """테스트 로직과 동일한 정의로 단일 날짜에 대한 특일 플래그/거리 반환"""
    if pd.isna(dt):
        return 0, 999, 999, 999, 0
    y = int(dt.year)
    sp = [pd.Timestamp(year=y, month=int(md[:2]), day=int(md[3:])) for md in SPECIAL_MMDD]
    if not sp:
        return 0, 999, 999, 999, 0
    d0 = np.datetime64(dt.normalize().to_pydatetime().date(), 'D')
    arr = np.array([np.datetime64(s.date(), 'D') for s in sp], dtype='datetime64[D]')
    dists = (arr - d0).astype('timedelta64[D]').astype(int)

    # 다음/이전 특일까지 거리
    ge0 = dists.copy(); ge0[ge0 < 0] = 10**9
    le0 = dists.copy(); le0[le0 > 0] = -10**9
    to_next  = ge0.min()
    from_prev = -le0.max()

    to_next   = 999 if to_next  == 10**9 else int(to_next)
    from_prev = 999 if from_prev == 10**9 else int(from_prev)
    min_dist  = min(to_next, from_prev)
    is_any  = 1 if min_dist == 0 else 0
    is_win2 = 1 if min_dist <= 2 else 0
    return is_any, to_next, from_prev, min_dist, is_win2

# === 보조 함수: 마지막부터 0 스트릭 길이 ===
def _zero_streak_from_end(arr):
    cnt = 0
    for v in arr[::-1]:
        if v == 0: cnt += 1
        else: break
    return cnt

# === 보조 함수: 마지막 양수 이후 경과일 ===
def _days_since_last_nonzero(arr):
    nz = np.where(np.asarray(arr) > 0)[0]
    if nz.size == 0:
        return 999
    return (len(arr) - 1) - nz.max()

# === Main: build the validation dataset ===
def build_last_window_val(train_df: pd.DataFrame,
                          top1_csv: str = '/data/models/keyword_signal_top1/store_top1_token_beta.csv'):
    """Build a validation set from the last 35 rows of every store_menu series.

    For each store_menu with at least 35 rows:
      * rows [-35:-7] (28 rows) are the history used for all features,
        including the weekday/month anchor means;
      * rows [-7:] are the targets, one output row per horizon h = 1..7.

    Parameters
    ----------
    train_df : pd.DataFrame
        Needs 'date' plus either ('qty', 'store_menu') or their raw Korean
        aliases ('매출수량', '영업장명_메뉴명'). 'store'/'menu' are derived from
        'store_menu' when either is missing.
    top1_csv : str
        CSV with 'store' and 'token' columns. When the file does not exist the
        token-based features are 0.

    Returns
    -------
    pd.DataFrame
        One row per (store_menu, h) with the feature columns and the target 'y'.

    Notes
    -----
    Relies on the module-level helpers ``_special_flags_for_date_scalar``,
    ``_zero_streak_from_end`` and ``_days_since_last_nonzero``.
    The Top-1 token is matched as a case-insensitive literal substring.
    """
    t = train_df.copy()

    # --- Column normalisation ---
    if 'qty' not in t.columns and '매출수량' in t.columns:
        t['qty'] = t['매출수량']
    if 'store_menu' not in t.columns and '영업장명_메뉴명' in t.columns:
        t['store_menu'] = t['영업장명_메뉴명']
    if 'store' not in t.columns or 'menu' not in t.columns:
        sm_split = t['store_menu'].astype(str).str.split('_', n=1, expand=True)
        if sm_split.shape[1] == 2:
            t['store'] = sm_split[0]
            t['menu'] = sm_split[1]
        else:
            # No key contains '_': everything is the store, the menu part is empty.
            # 'menu' must still exist because it is selected below.
            parts = t['store_menu'].astype(str).str.partition('_')
            t['store'] = parts[0]
            t['menu'] = parts[2]

    t['date'] = pd.to_datetime(t['date'], errors='coerce')
    t = t.dropna(subset=['date', 'qty', 'store_menu']).copy()

    # --- Per-store Top-1 token ---
    store2token = {}
    token_file = Path(top1_csv)
    if token_file.exists():
        top1_df = pd.read_csv(token_file)
        for r in top1_df.itertuples(index=False):
            store2token[str(r.store)] = str(r.token)

    # --- (store, calendar day) -> token demand ---
    tmp = t[['store', 'menu', 'date', 'qty']].copy()
    tmp['menu_low'] = tmp['menu'].astype(str).str.lower()
    tok_map = {}
    if store2token:
        for store, g2 in tmp.groupby('store'):
            tok = store2token.get(store)
            if not tok or tok == 'nan':
                continue
            mask = g2['menu_low'].str.contains(str(tok).lower(), na=False, regex=False)
            if not mask.any():
                continue
            daily_tok = g2.loc[mask].groupby('date')['qty'].sum()
            for day, value in daily_tok.items():
                tok_map[(str(store), pd.Timestamp(day).date())] = float(value)

    # --- (store, calendar day) -> total demand ---
    daily_total = t.groupby(['store', 'date'])['qty'].sum()
    tot_map = {(str(store), pd.Timestamp(day).date()): float(value)
               for (store, day), value in daily_total.items()}

    # --- One block of 7 rows per store_menu ---
    val_rows = []
    for _, g in t.groupby('store_menu', sort=False):
        g = g.sort_values('date')
        if len(g) < 35:
            continue

        q = g['qty'].values
        hist_28 = q[-35:-7]                               # 28-row history
        future_7 = q[-7:]                                 # targets
        base_date = pd.to_datetime(g['date'].iloc[-8])    # forecast anchor (last history day)
        store = str(g['store'].iloc[-1])
        store_menu = str(g['store_menu'].iloc[-1])

        # History statistics
        last = hist_28[-1]
        mean7 = hist_28[-7:].mean()
        mean28 = hist_28.mean()
        std7 = hist_28[-7:].std(ddof=0)
        trend = (last + 1) / (mean7 + 1)

        lag7 = hist_28[-7]
        lag14 = hist_28[-14]
        rm3 = hist_28[-3:].mean()
        rm14 = hist_28[-14:].mean()
        rs14 = hist_28[-14:].std(ddof=0)

        zeros_28 = (hist_28 == 0).sum()
        zero_ratio28 = float(zeros_28) / 28.0
        vol14 = rs14 / (rm14 + 1e-6)
        mean7_over_mean28 = (mean7 + 1e-6) / (mean28 + 1e-6)

        # Horizon-independent features: computed once per series, not once per h.
        zero_streak_28 = _zero_streak_from_end(hist_28)
        days_since_nz = _days_since_last_nonzero(hist_28)
        diff_last_mean7 = last - mean7

        # 28-day sums of store-level token demand / total demand
        hist_dates28 = g['date'].iloc[-35:-7].dt.date.values
        top1_sum28 = float(np.nansum([tok_map.get((store, d), 0.0) for d in hist_dates28]))
        store_total_sum28 = float(np.nansum([tot_map.get((store, d), 0.0) for d in hist_dates28]))

        # Weekday / month anchor means taken from the history window only
        dates28 = pd.to_datetime(hist_dates28)
        dows = dates28.weekday.values
        mons = dates28.month.values
        dmap, mmap = {}, {}
        for d in range(7):
            v = hist_28[dows == d]
            if v.size:
                dmap[d] = float(v.mean())
        for m in np.unique(mons):
            v = hist_28[mons == m]
            if v.size:
                mmap[int(m)] = float(v.mean())

        # Rows for h = 1..7
        for h in range(1, 8):
            dn = base_date + pd.Timedelta(days=h)
            dow = int(dn.weekday())
            mon = int(dn.month)
            doy = int(dn.dayofyear)
            doy_sin = float(np.sin(2*np.pi*doy/365.25))
            doy_cos = float(np.cos(2*np.pi*doy/365.25))

            is_any, to_sp, from_sp, min_sp, is_win2 = _special_flags_for_date_scalar(dn)
            is_wknd = int(dow >= 5)

            # Fall back to the 28-day mean when the weekday/month is absent from history.
            dow_anchor = float(dmap.get(dow, mean28))
            month_anchor = float(mmap.get(mon, mean28))

            val_rows.append({
                'store_menu': store_menu,
                'last': last, 'mean7': mean7, 'mean28': mean28, 'std7': std7, 'trend': trend,
                'lag7': lag7, 'lag14': lag14, 'rm3': rm3, 'rm14': rm14, 'rs14': rs14,
                'zero_streak_28': zero_streak_28,
                'days_since_nz': days_since_nz,
                'diff_last_mean7': diff_last_mean7,
                'dow_next': dow, 'is_wknd_next': is_wknd, 'is_fri_next': int(dow == 4),
                'is_sat_next': int(dow == 5), 'is_sun_next': int(dow == 6),
                'month_next': mon, 'h': h,
                'doy_next': doy, 'doy_sin_next': doy_sin, 'doy_cos_next': doy_cos,
                'is_special_any_next': int(is_any), 'days_to_special_next': int(to_sp),
                'days_from_special_next': int(from_sp), 'min_days_to_special_next': int(min_sp),
                'is_special_window_k2_next': int(is_win2),
                'sm_dow_mean_next': dow_anchor, 'sm_month_mean_next': month_anchor,
                'store_top1tok_sum28': top1_sum28,
                'store_total_sum28': store_total_sum28,
                'zero_ratio28': zero_ratio28,
                'mean7_over_mean28': mean7_over_mean28,
                'vol14': vol14,
                'wknd_x_top1': is_wknd * top1_sum28,
                'y': float(future_7[h-1]),
            })

    val_sup = pd.DataFrame(val_rows)

    # --- Dtype cleanup (keeps int/float columns consistent with the training frame) ---
    int_cols = ['dow_next','is_wknd_next','is_fri_next','is_sat_next','is_sun_next',
                'month_next','h','doy_next','is_special_any_next',
                'days_to_special_next','days_from_special_next',
                'min_days_to_special_next','is_special_window_k2_next',
                'zero_streak_28','days_since_nz']
    for c in int_cols:
        if c in val_sup.columns:
            val_sup[c] = pd.to_numeric(val_sup[c], errors='coerce').fillna(0).astype(int)

    float_cols = ['last','mean7','mean28','std7','trend','doy_sin_next','doy_cos_next',
                  'sm_dow_mean_next','sm_month_mean_next','lag7','lag14','rm3','rm14',
                  'rs14','diff_last_mean7','y',
                  'store_top1tok_sum28','store_total_sum28','zero_ratio28',
                  'mean7_over_mean28','vol14','wknd_x_top1']
    for c in float_cols:
        if c in val_sup.columns:
            val_sup[c] = pd.to_numeric(val_sup[c], errors='coerce')

    return val_sup


In [None]:
# --- 가중치 컬럼 + 안전 전처리 ---
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor

# Work on a copy of the training frame
train_sup = train_sup.copy()

# Split off the store name -> rows of the stores '담하' and '미라시아' get
# sample weight 2.0, every other store 1.0
train_sup['store'] = train_sup['store_menu'].astype(str).str.partition('_')[0]
wmap = {'담하': 2.0, '미라시아': 2.0}
train_sup['w'] = train_sup['store'].map(wmap).fillna(1.0)

# Columns that must be cast to plain integers
for _c in ['dow_next','is_wknd_next','is_fri_next','is_sat_next','is_sun_next',
           'month_next','h','doy_next','zero_streak_28','days_since_nz']:
    if _c in train_sup.columns:
        train_sup[_c] = pd.to_numeric(train_sup[_c], errors='coerce').astype('Int64').astype(int)

# Numeric columns (only those actually present are kept)
num_cols = ['last','mean7','mean28','std7','trend','y','w',
            'lag7','lag14','rm3','rm14','rs14','diff_last_mean7',
            'sm_dow_mean_next','sm_month_mean_next',
            'doy_sin_next','doy_cos_next',
            'store_top1tok_sum28','store_total_sum28',
            'zero_ratio28','mean7_over_mean28','vol14','wknd_x_top1']
num_cols = [c for c in num_cols if c in train_sup.columns]

# inf -> NaN
train_sup[num_cols] = train_sup[num_cols].replace([np.inf,-np.inf], np.nan)

# Label (y) repair: NaN -> 0; negative labels are reported and clipped to 0
train_sup['y'] = pd.to_numeric(train_sup['y'], errors='coerce').fillna(0.0)
neg_cnt = int((train_sup['y'] < 0).sum())
if neg_cnt > 0:
    print(f"[warn] Negative labels detected: {neg_cnt}. Clipping to 0.")
    train_sup.loc[train_sup['y'] < 0, 'y'] = 0.0

# Keep only the features that actually exist in the frame
features_in_use = [c for c in features if c in train_sup.columns]
missing = [c for c in features if c not in features_in_use]
if missing:
    print(f"[warn] 다음 특징이 누락되어 제외됨: {missing}")

# Columns that receive a monotonic constraint
# NOTE(review): mono_vec is ordered like features_in_use. Whether the model
# libraries see the features in that same order after AutoGluon's own
# preprocessing is not visible here - confirm before relying on the constraints.
mono_plus = {'last', 'mean7', 'store_total_sum28'}   # constraint: larger value -> y must not decrease
mono_vec = [ +1 if c in mono_plus else 0 for c in features_in_use ]

# === Very fast 5-model ensemble ===
# Each dict below is one hyperparameter configuration handed to AutoGluon.
# 1) LightGBM quantile (quantile regression, alpha = 0.80)
gbm_q80 = {
    'objective':'quantile','alpha':0.80,'learning_rate':0.05,
    'num_leaves':63,'min_data_in_leaf':60,
    'feature_fraction':0.9,'bagging_fraction':0.9,'bagging_freq':1,
    'lambda_l2':1.0,'num_boost_round':900,
    'monotone_constraints': mono_vec,
}

# 2) LightGBM GOSS (fast boosting strategy)
gbm_goss_fast = {
    'boosting_type': 'goss',
    'top_rate': 0.2, 'other_rate': 0.1,
    'learning_rate': 0.06,
    'num_leaves': 63,
    'min_data_in_leaf': 60,
    'feature_fraction': 0.9,
    'lambda_l2': 1.0,
    'num_boost_round': 1000,
}

# 3) LightGBM Tweedie (for the long-tailed target distribution)
gbm_tweedie_fast = {
    'objective': 'tweedie',
    'tweedie_variance_power': 1.3,
    'learning_rate': 0.05,
    'num_leaves': 63,
    'min_data_in_leaf': 60,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'lambda_l1': 0.0, 'lambda_l2': 1.0,
    'num_boost_round': 1200,
}

# 4) XGBoost, histogram-based (fast)
xgb_sq = {
    'objective':'reg:squarederror','learning_rate':0.06,
    'max_depth':8,'min_child_weight':6,'subsample':0.9,'colsample_bytree':0.9,
    'n_estimators':1200,'tree_method':'hist','max_bin':256,'random_state':7,
    'monotone_constraints': mono_vec,
}

# 5) ExtraTrees (tree ensemble with strong randomisation)
xt_fast = {
    'n_estimators': 400,
    'max_depth': None,
    'bootstrap': False,
}

# Plain LGBM/XGB variants (monotone-constraint only).
# NOTE(review): xgb_mono has the same values as xgb_sq and is not referenced by
# the fit call visible in this cell.
gbm_mono = {
    'learning_rate': 0.05, 'num_leaves': 63, 'min_data_in_leaf': 60,
    'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1,
    'lambda_l2': 1.0, 'num_boost_round': 1200,
    'monotone_constraints': mono_vec,
}
xgb_mono = {
    'objective': 'reg:squarederror', 'learning_rate': 0.06,
    'max_depth': 8, 'min_child_weight': 6, 'subsample': 0.9, 'colsample_bytree': 0.9,
    'n_estimators': 1200, 'tree_method': 'hist', 'max_bin': 256, 'random_state': 7,
    'monotone_constraints': mono_vec,
}

# === AutoGluon Tabular training ===
# num_bag_folds / num_stack_levels are arguments of fit() itself. They were
# previously passed inside `ag_args_fit` (which is forwarded to the individual
# models' fit), so no bagging or stacking took place - the logged leaderboard
# only shows plain, un-bagged models. Passing them to fit() enables the
# intended 3-fold bagging with one stack level.
pred = TabularPredictor(
    label='y',
    # Save next to MODEL_DIR, under a name that identifies this configuration.
    path=str(MODEL_DIR).replace('simple_gbm_direct','fast5_gbmxgbxt_stack1'),
    problem_type='regression',
    eval_metric='smape',
    sample_weight='w',          # column 'w' is used as weight, not as a feature
    weight_evaluation=True,     # validation metrics are weighted as well
).fit(
    train_data=train_sup[features_in_use + ['y','w']],
    presets='medium_quality_faster_train',
    hyperparameters={
        'GBM': [gbm_q80, gbm_mono, gbm_goss_fast, gbm_tweedie_fast],
        'XGB': [xgb_sq],
        'XT' : [xt_fast],
    },
    num_bag_folds=3,        # kept small for speed
    num_stack_levels=1,     # one stacking layer on top of the bagged base models
    time_limit=1200,
    verbosity=2,
)


# === Print the leaderboard and report the validation SMAPE ===
lb = pred.leaderboard(silent=True)
print(lb)
if 'score_val' not in lb.columns or len(lb) == 0:
    print("ℹ️ 분리된 검증 폴드가 없거나 보고할 점수가 없습니다.")
else:
    # The first leaderboard row is reported; the absolute value is taken
    # because the logged scores are negative.
    best_val_smape = abs(lb['score_val'].iloc[0])
    print(f"✅ AutoGluon 내부 검증 SMAPE(절댓값 해석): {best_val_smape:.4f}")








[warn] Negative labels detected: 98. Clipping to 0.


Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
Memory Avail:       9.25 GB / 12.67 GB (73.0%)
Disk Space Avail:   58.87 GB / 100.00 GB (58.9%)
Presets specified: ['medium_quality_faster_train']
Values in column 'w' used as sample weights instead of predictive features. Evaluation will report weighted metrics, so ensure same column exists in test data.
Beginning AutoGluon training ... Time limit = 1200s
AutoGluon will save models to "/content/drive/MyDrive/lg aimers/models/fast5_gbmxgbxt_stack1"
Train Data Rows:    586334
Train Data Columns: 38
Label Column:       y
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Mem

                 model  score_val                               eval_metric  \
0  WeightedEnsemble_L2  -0.597576  symmetric_mean_absolute_percentage_error   
1           ExtraTrees  -0.600574  symmetric_mean_absolute_percentage_error   
2           LightGBM_2  -0.606471  symmetric_mean_absolute_percentage_error   
3           LightGBM_3  -0.606811  symmetric_mean_absolute_percentage_error   
4           LightGBM_4  -0.607203  symmetric_mean_absolute_percentage_error   

   pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  \
0       0.470355  1036.620370                0.002505           0.117347   
1       0.435224  1019.536164                0.435224        1019.536164   
2       0.035168    23.520036                0.035168          23.520036   
3       0.029343    15.341935                0.029343          15.341935   
4       0.032626    16.966859                0.032626          16.966859   

   stack_level  can_infer  fit_order  
0            2       True    

In [None]:
# ===============================
# TEST 예측 (최근 28일 앵커로 일치)
# ===============================
import numpy as np
import pandas as pd
import glob, os, re
from pathlib import Path

# --- Fixed special days as MM-DD strings (01-01, 12-24, 12-25, 12-31)
SPECIAL_MMDD = {'01-01','12-24','12-25','12-31'}

def special_flags_for_dates(dates: np.ndarray):
    """
    Build special-day distance features for an array of dates.

    Each date is compared only against the SPECIAL_MMDD days of its own
    calendar year.

    Returns an int array of shape (len(dates), 5) with columns
    [is_any, to_next, from_prev, min_dist, is_win2]:
      - is_any:    1 when the date itself is a special day
      - to_next:   days until the next special day (0 on a special day)
      - from_prev: days since the previous special day (0 on a special day)
      - min_dist:  min(to_next, from_prev)
      - is_win2:   1 when min_dist <= 2
    A direction with no special day in that year is reported as 999.
    """
    sentinel = 10**9
    days = pd.to_datetime(dates).normalize().values.astype('datetime64[D]')
    year_of = pd.DatetimeIndex(days).year.values
    flags = np.zeros((len(days), 5), dtype=int)

    # (month, day) pairs parsed once from the 'MM-DD' strings
    month_day = [(int(mmdd[:2]), int(mmdd[3:])) for mmdd in SPECIAL_MMDD]

    for yr in np.unique(year_of):
        sel = (year_of == yr)
        specials = np.array(
            [np.datetime64(pd.Timestamp(year=yr, month=mm, day=dd), 'D') for mm, dd in month_day],
            dtype='datetime64[D]',
        )
        # Signed day offsets: rows = dates of this year, columns = special days
        delta = (specials[None, :] - days[sel][:, None]).astype('timedelta64[D]').astype(int)

        # Past specials are masked out for "next", future ones for "previous"
        ahead = np.where(delta < 0, sentinel, delta).min(axis=1)
        behind = -np.where(delta > 0, -sentinel, delta).max(axis=1)

        # Replace the masking sentinel by a bounded placeholder
        ahead = np.where(ahead == sentinel, 999, ahead)
        behind = np.where(behind == sentinel, 999, behind)

        nearest = np.minimum(ahead, behind)
        flags[sel] = np.column_stack((
            (nearest == 0).astype(int),
            ahead,
            behind,
            nearest,
            (nearest <= 2).astype(int),
        ))
    return flags

# --- 유틸 함수들
def split_store(sm: str):
    """Return the store part of a 'store_menu' key: the text before the first '_'."""
    store_name, _, _ = str(sm).partition('_')
    return store_name

def last_zero_streak(q):
    """Count how many consecutive zeros sit at the end of the sequence `q`."""
    # Walk backwards; the position of the first non-zero value is the streak length.
    for streak, value in enumerate(q[::-1]):
        if not (value == 0):
            return streak
    # Every element was zero (or the sequence is empty).
    return len(q)

# --- Load the per-store Top-1 keyword table (empty table when the CSV is absent)
TOP1_CSV = Path('/data/models/keyword_signal_top1/store_top1_token_beta.csv')
if TOP1_CSV.exists():
    top1_df = pd.read_csv(TOP1_CSV)
else:
    top1_df = pd.DataFrame(columns=['store','token','beta'])

# store name -> Top-1 token, both coerced to str
store2token = dict(
    (str(rec.store), str(rec.token))
    for rec in top1_df.itertuples(index=False)
)

# Accumulates one dict per predicted (test file, menu, day)
all_pred_rows = []


# ==============================
# Prediction loop over each TEST_*.csv file
# ==============================
# For every test file: compute per-store 28-row aggregates, build one feature
# row per (store_menu, horizon h = 1..7), call `pred.predict`, and append the
# clipped predictions to `all_pred_rows`.
# Relies on TEST_DIR, features_in_use and pred, none of which are defined in
# this cell.
for path in sorted(glob.glob(str(TEST_DIR / 'TEST_*.csv'))):
    test_df = pd.read_csv(path)
    # Prefix such as "TEST_00", taken from the file name
    test_prefix = re.search(r'(TEST_\d+)', os.path.basename(path)).group(1)

    # --- Basic preprocessing
    # Columns: '영업일자' (business date), '영업장명_메뉴명' (store_menu key), '매출수량' (sales quantity)
    test_df['영업일자'] = pd.to_datetime(test_df['영업일자'], errors='coerce')
    test_df['영업장명_메뉴명'] = test_df['영업장명_메뉴명'].astype(str)
    test_df['_store'] = test_df['영업장명_메뉴명'].apply(split_store)
    # Lower-cased menu part (text after the first '_'); NaN when the key has no '_'
    test_df['_menu_low'] = test_df['영업장명_메뉴명'].str.split('_', n=1).str[1].str.lower()
    test_df = test_df.sort_values(['영업장명_메뉴명', '영업일자'])

    # -----------------------------
    # (1) store_total_sum28 / store_token_sum28
    # -----------------------------
    # Sum of the last 28 rows of sales for each menu
    menu_last28_sum = (test_df.groupby('영업장명_메뉴명')
                              .tail(28)
                              .groupby('영업장명_메뉴명')['매출수량']
                              .sum())

    # Per-store total: menu sums aggregated by store
    menu2store = test_df.drop_duplicates('영업장명_메뉴명').set_index('영업장명_메뉴명')['_store']
    store_total_sum28 = (menu_last28_sum
                         .rename('sum28')
                         .to_frame()
                         .join(menu2store, how='left')
                         .groupby('_store')['sum28']
                         .sum()
                         .to_dict())

    # Per-store sum over menus containing the store's Top-1 token (last 28 rows)
    store_token_sum28 = {}
    for store, tok in store2token.items():
        # store2token values are strings, so a missing token shows up as 'nan'
        if pd.isna(tok) or str(tok).lower() == 'nan':
            store_token_sum28[store] = 0.0
            continue
        # NOTE(review): str.contains interprets the token as a regular expression by
        # default; a token with regex metacharacters would not match literally —
        # confirm this is consistent with how the training feature was built.
        mask = (test_df['_store'] == store) & (test_df['_menu_low'].str.contains(str(tok).lower(), na=False))
        if not mask.any():
            store_token_sum28[store] = 0.0
            continue
        tmp = test_df.loc[mask]
        tok_last28_sum = (tmp.groupby('영업장명_메뉴명')
                            .tail(28)
                            .groupby('영업장명_메뉴명')['매출수량']
                            .sum()
                            .sum())
        store_token_sum28[store] = float(tok_last28_sum)

    # -----------------------------
    # (2) Build features for the 7-day forecast
    # -----------------------------
    X_all, meta_all = [], []

    # Cache of the last 28 rows per menu
    last_per_menu = test_df.groupby('영업장명_메뉴명').tail(28).groupby('영업장명_메뉴명')
    last_28_map, last_date_map = {}, {}
    for name, g in last_per_menu:
        qty28 = g['매출수량'].astype(float).values
        # Menus with fewer than 28 rows are skipped and get no prediction rows
        if len(qty28) < 28:
            continue
        last_28_map[name] = qty28[-28:]
        last_date_map[name] = g['영업일자'].max()

    # Per-menu weekday / month averages over the last 28 rows (history anchor)
    hist_anchor = {}
    for sm, qty28 in last_28_map.items():
        ld = last_date_map[sm]
        # NOTE(review): the calendar is rebuilt as 28 consecutive days ending at the
        # last date; this assumes the 28 rows are contiguous daily records — confirm.
        dates28 = pd.date_range(ld - pd.Timedelta(days=27), ld, freq='D')
        dows = dates28.weekday.values
        mons = dates28.month.values
        m28  = float(np.mean(qty28))
        dmap, mmap = {}, {}
        for d in range(7):
            v = qty28[dows == d]
            if v.size: dmap[d] = float(v.mean())
        for m in np.unique(mons):
            v = qty28[mons == m]
            if v.size: mmap[int(m)] = float(v.mean())
        hist_anchor[sm] = {'dow': dmap, 'mon': mmap, 'mean28': m28}

    # --- Build the 7 feature rows for each store_menu
    for store_menu, qty28 in last_28_map.items():
        last_date = last_date_map[store_menu]

        # Basic statistics of the 28-row window
        last = qty28[-1]; mean7 = qty28[-7:].mean(); mean28 = qty28.mean()
        std7 = qty28[-7:].std(ddof=0); trend = (last+1)/(mean7+1)
        # lag7 / lag14 are the 7th- and 14th-from-last rows of the window
        lag7 = qty28[-7]; lag14 = qty28[-14]
        rm3 = qty28[-3:].mean(); rm14 = qty28[-14:].mean()
        rs14 = qty28[-14:].std(ddof=0)

        # Ratio / dispersion features (1e-6 guards against division by zero)
        zeros_28 = (qty28 == 0).sum()
        zero_ratio28 = float(zeros_28) / 28.0
        vol14 = rs14 / (rm14 + 1e-6)
        mean7_over_mean28 = (mean7 + 1e-6) / (mean28 + 1e-6)

        # Store-level sums; stores missing from either dict fall back to 0.0
        store = split_store(store_menu)
        top1_sum28 = float(store_token_sum28.get(store, 0.0))
        total_sum28 = float(store_total_sum28.get(store, 0.0))

        # The 7 target dates immediately after the last observed date
        dn = pd.date_range(last_date + pd.Timedelta(days=1), periods=7, freq='D')
        dow = dn.weekday.values
        mon = dn.month.values
        doy = dn.dayofyear.values
        doy_sin = np.sin(2*np.pi*doy/365.25)
        doy_cos = np.cos(2*np.pi*doy/365.25)
        sp = special_flags_for_dates(dn.values)  # (7,5)

        # Anchor lookup (the fallback dict only carries the window mean)
        ha = hist_anchor.get(store_menu, {'dow':{}, 'mon':{}, 'mean28': float(mean28)})

        for i in range(7):
            # Weekday / month mean for the target day; falls back to the 28-row
            # mean when that weekday / month does not occur in the window
            sd = ha['dow'].get(int(dow[i]),  ha['mean28'])
            sm = ha['mon'].get(int(mon[i]),  ha['mean28'])

            # Feature row for horizon h = i + 1
            row = {
                'store_menu': store_menu,
                'last': last, 'mean7': mean7, 'mean28': mean28, 'std7': std7, 'trend': trend,
                'lag7': lag7, 'lag14': lag14, 'rm3': rm3, 'rm14': rm14, 'rs14': rs14,
                # The next two values do not depend on i (same for all 7 rows)
                'zero_streak_28': last_zero_streak(qty28),
                # Rows since the last positive sale; 999 when the window has none
                'days_since_nz': (27 - np.where(qty28 > 0)[0].max()) if (qty28 > 0).any() else 999,
                'diff_last_mean7': last - mean7,

                'dow_next': int(dow[i]),
                'is_wknd_next': int(dow[i] >= 5),
                'is_fri_next': int(dow[i] == 4),
                'is_sat_next': int(dow[i] == 5),
                'is_sun_next': int(dow[i] == 6),
                'month_next': int(mon[i]),
                'h': int(i + 1),

                'doy_next': int(doy[i]),
                'doy_sin_next': float(doy_sin[i]),
                'doy_cos_next': float(doy_cos[i]),

                'is_special_any_next': int(sp[i, 0]),
                'days_to_special_next': int(sp[i, 1]),
                'days_from_special_next': int(sp[i, 2]),
                'min_days_to_special_next': int(sp[i, 3]),
                'is_special_window_k2_next': int(sp[i, 4]),

                # Weekday / month means anchored on the last 28 rows
                'sm_dow_mean_next': float(sd),
                'sm_month_mean_next': float(sm),

                # Newer / strengthened features
                'store_top1tok_sum28': top1_sum28,
                'store_total_sum28': total_sum28,
                'zero_ratio28': zero_ratio28,
                'mean7_over_mean28': mean7_over_mean28,
                'vol14': vol14,
                'wknd_x_top1': int(dow[i] >= 5) * top1_sum28,
            }
            X_all.append(row)
            # Metadata kept in the same order as X_all so predictions can be zipped back
            meta_all.append((last_date, store_menu, i + 1))

    # Nothing to predict for this file (no menu had 28 rows)
    if not X_all:
        continue

    X_df = pd.DataFrame(X_all)
    # Keep only the features listed in features_in_use that were actually built here
    use_cols = [c for c in features_in_use if c in X_df.columns]
    if not use_cols:
        continue

    # -----------------------------
    # (3) Run the prediction
    # -----------------------------
    yhat_all = np.asarray(pred.predict(X_df[use_cols]))
    # NaN predictions become 0 and negative predictions are clipped to 0
    yhat_all = np.nan_to_num(yhat_all, nan=0.0).clip(min=0)

    # -----------------------------
    # (4) Store the results
    # -----------------------------
    for (base_date, store_menu, h), v in zip(meta_all, yhat_all):
        all_pred_rows.append({
            'test_prefix': test_prefix,
            # Target date = last observed date + h days, as an ISO string
            'date': (pd.to_datetime(base_date) + pd.Timedelta(days=int(h))).date().isoformat(),
            '영업장명_메뉴명': store_menu,
            'pred': float(v)
        })

# Long-format DataFrame with all predictions
pred_long = pd.DataFrame(all_pred_rows)
print("✅ pred_long shape:", pred_long.shape)
print(pred_long.head())


✅ pred_long shape: (13510, 4)
  test_prefix        date            영업장명_메뉴명      pred
0     TEST_00  2024-07-14  느티나무 셀프BBQ_1인 수저세트  6.670634
1     TEST_00  2024-07-15  느티나무 셀프BBQ_1인 수저세트  3.350654
2     TEST_00  2024-07-16  느티나무 셀프BBQ_1인 수저세트  4.233986
3     TEST_00  2024-07-17  느티나무 셀프BBQ_1인 수저세트  4.424637
4     TEST_00  2024-07-18  느티나무 셀프BBQ_1인 수저세트  4.832962


In [None]:
# ===============================
# 제출 파일 생성 + 반올림 버전 저장
# ===============================
import pandas as pd
import numpy as np
from pathlib import Path

def convert_to_submission_format(pred_df: pd.DataFrame, sample_submission: pd.DataFrame):
    """
    Convert long-format predictions into the wide submission layout.

    Parameters
    ----------
    pred_df : DataFrame with columns 'test_prefix', 'date', '영업장명_메뉴명', 'pred'.
    sample_submission : template DataFrame with an '영업일자' column holding keys
        such as "TEST_00+1일" and every other column named after a store_menu.

    Returns
    -------
    A copy of `sample_submission` whose value columns (all columns other than
    the first) are float64, filled with the matching prediction (0.0 where no
    prediction exists) and sorted by '영업일자'. Neither input is modified.
    """
    df = pred_df.copy()
    df = df.rename(columns={'pred':'매출수량'})
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['매출수량'] = pd.to_numeric(df['매출수량'], errors='coerce').fillna(0.0)

    # Number the days within each (test file, menu) in date order and build the
    # "TEST_xx+n일" key. Vectorised string building also works on an empty
    # frame, where a row-wise apply(axis=1) would not yield a usable column.
    df = df.sort_values(['test_prefix','영업장명_메뉴명','date'])
    day_no = df.groupby(['test_prefix','영업장명_메뉴명']).cumcount() + 1
    df['영업일자'] = df['test_prefix'].astype(str) + '+' + day_no.astype(str) + '일'

    # (key, menu) -> predicted quantity
    pred_dict = dict(zip(zip(df['영업일자'], df['영업장명_메뉴명']), df['매출수량']))

    # Fill the template one whole column at a time. Each column is replaced by
    # a float64 array, so integer template columns never receive a float via
    # item assignment (an incompatible-dtype write that pandas deprecates),
    # and there is one assignment per column instead of one per cell.
    final_df = sample_submission.copy()
    key_col = final_df.columns[0]
    value_cols = [c for c in final_df.columns if c != key_col]
    row_keys = final_df['영업일자'].astype(str).tolist()
    for col in value_cols:
        final_df[col] = np.array(
            [pred_dict.get((key, col), 0.0) for key in row_keys],
            dtype=float,
        )

    # Sort by the submission key
    final_df = final_df.sort_values(['영업일자']).reset_index(drop=True)
    return final_df


# === Run the conversion ===
BASE = Path('/data')
sample_submission_df = pd.read_csv(BASE / 'sample_submission.csv')
submission = convert_to_submission_format(pred_long, sample_submission_df)

# Save the raw (unrounded) predictions
out_path = BASE / 'submission_raw.csv'
submission.to_csv(out_path, index=False, encoding='utf-8-sig')
print("✅ 제출 파일 저장:", out_path)

# Build and save a copy with every value column rounded
# (the first column holds the submission key and is left as is)
rounded = submission.copy()
for col in list(rounded.columns)[1:]:
    as_number = pd.to_numeric(rounded[col], errors='coerce')
    rounded[col] = as_number.fillna(0.0).round()
out_path_round = BASE / 'submission_rounded.csv'
rounded.to_csv(out_path_round, index=False, encoding='utf-8-sig')
print("✅ 반올림 제출 파일 저장:", out_path_round)


  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((key_date, col), 0.0))
  final_df.loc[i, col] = float(pred_dict.get((ke

✅ 제출 파일 저장: /content/drive/MyDrive/lg aimers/open/submission_raw2.csv
✅ 반올림 제출 파일 저장: /content/drive/MyDrive/lg aimers/open/submission_rounded2.csv


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# === Input / output paths ===
src = Path("/data/submission_rounded.csv")
dst = src.with_name(f"{src.stem}_final.csv")

# === Load the rounded submission ===
df = pd.read_csv(src)

# === Count zeros before replacing them ===
# Cells that are exactly 0 in numeric columns
numeric_part = df.select_dtypes(include=[np.number])
before_num = int(numeric_part.eq(0).sum().sum())
# Cells that are exactly the string "0" in non-numeric columns
text_part = df.select_dtypes(exclude=[np.number])
before_str = int(text_part.eq("0").sum().sum())

# === Replace every zero with one ===
# - numeric 0 -> 1
# - string "0" -> "1"
zero_to_one = {0: 1, "0": "1"}
df = df.replace(zero_to_one)

# === Save ===
df.to_csv(dst, index=False)
print(f"[done] 저장: {dst}")
print(f"[info] 치환된 0 개수: 숫자={before_num}, 문자열='{before_str}'")


[done] 저장: /content/drive/MyDrive/lg aimers/open/submission_rounded2_final.csv
[info] 치환된 0 개수: 숫자=1003, 문자열='0'
