In [18]:
# Use this in place of the earlier fetch(code, start, end).
def fetch(code):
    """Download the price history for one ticker and overwrite its CSV cache.

    The ``KRX:<code>`` symbol is requested first; if that request raises an
    ordinary exception, the plain ``code`` symbol is requested instead.
    The result is reduced to the date / close / volume columns, tagged with
    the ticker, written to ``cache_dir / "<code>.csv"`` and returned.

    Parameters
    ----------
    code : str
        Ticker code; used both as the reader symbol and as the cache file stem.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``close``, ``volume`` and ``stock_code``.

    Raises
    ------
    Exception
        Whatever the fallback request raises when it also fails.
    KeyError
        If the downloaded frame lacks ``Date``, ``Close`` or ``Volume``.

    Notes
    -----
    Relies on the notebook-level names ``fdr`` and ``cache_dir``
    (presumably FinanceDataReader and a ``pathlib.Path`` — confirm in the
    cell that defines them).
    """
    cache = cache_dir / f"{code}.csv"
    try:
        # (1) KRX source — full history, per the original author's note.
        df = fdr.DataReader(f"KRX:{code}")
    except Exception:
        # (2) Fallback source (Naver, per the original author's note).
        # Only ordinary errors fall through to it: KeyboardInterrupt and
        # SystemExit must still be able to stop a long download loop.
        df = fdr.DataReader(code)
    df = (
        df.reset_index()[["Date", "Close", "Volume"]]
          .rename(columns={"Date": "date", "Close": "close", "Volume": "volume"})
    )
    df["stock_code"] = code
    df.to_csv(cache, index=False)   # overwrite any existing cache file
    return df

In [None]:
import os, warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from datetime import timedelta
import pandas as pd, numpy as np
from tqdm.auto import tqdm

# Data root. Defaults to the original local folder; set the DIVIDEND_DATA_DIR
# environment variable to run the notebook from another machine or checkout.
BASE        = os.environ.get("DIVIDEND_DATA_DIR", "/Users/gun/Desktop/미래에셋 AI 공모전/data")
DIV_FP      = os.path.join(BASE, "dividend_ml_ready.csv")      # dividend events (input)
CACHE_DIR   = os.path.join(BASE, "price_cache")                # one <code>.csv price file per ticker
SECTOR_FP   = os.path.join(BASE, "sector_info.csv")            # stock_code -> sector table (input)
OUT_FP      = os.path.join(BASE, "module_datasets", "classification.csv")  # feature table (output)
# Make sure the output folder exists before anything is written to it.
os.makedirs(os.path.dirname(OUT_FP), exist_ok=True)

# 1️⃣ Dividend events & within-month rank
# Built as one chain: parse the filing date from the first eight characters of
# the receipt number (YYYYMMDD), drop rows where that fails, then rank
# per_share_common as a percentile among events filed in the same month.
df_div = (
    pd.read_csv(DIV_FP, dtype={"stock_code": str, "rcept_no": str})
    .assign(
        rcept_dt=lambda d: pd.to_datetime(
            d["rcept_no"].str[:8], format="%Y%m%d", errors="coerce"
        )
    )
    .dropna(subset=["rcept_dt"])
    # "period" is only a grouping key for the rank and is removed again below.
    .assign(period=lambda d: d["rcept_dt"].dt.to_period("M"))
    .assign(
        div_amount_rank=lambda d: d.groupby("period")["per_share_common"].rank(pct=True)
    )
    .drop(columns="period")
)

# 2️⃣ Sector mapping
# Every column is read as text; codes are left-padded with zeros to six characters.
sec = pd.read_csv(SECTOR_FP, dtype=str).assign(
    stock_code=lambda d: d["stock_code"].str.zfill(6)
)
# stock_code -> sector lookup (for a repeated code the last row wins).
sec_map = dict(zip(sec["stock_code"], sec["sector"]))

# 3️⃣ Load the cached full price history
# Pair every ticker that appears in the dividend events with its expected
# cache file, then read only the files that actually exist on disk.
cache_paths = (
    (ticker, os.path.join(CACHE_DIR, f"{ticker}.csv"))
    for ticker in df_div["stock_code"].unique()
)
price_data = {
    ticker: pd.read_csv(path, parse_dates=["date"])
    for ticker, path in cache_paths
    if os.path.exists(path)
}

# 4️⃣ Feature engineering (window = 1 trading day)

# Sort every cached price frame by date once, instead of re-sorting inside the
# loop for each event. Rows without a date are dropped so they can never be
# picked as the event day or the following day.
price_sorted = {
    ticker: frame.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
    for ticker, frame in price_data.items()
}

# Mean yield_common of all events filed in each calendar month, computed once
# rather than re-scanning df_div for every row.
# NOTE(review): this baseline is NOT split by sector even though the feature
# is called "sector_avg_yield_gap" — confirm whether a per-sector mean was intended.
month_mean_yield = (
    df_div.groupby(df_div["rcept_dt"].dt.to_period("M"))["yield_common"]
    .mean()
    .to_dict()
)

feats = []
for _, row in tqdm(df_div.iterrows(), total=len(df_div), desc="Classification FE"):
    code, dt = row.stock_code, row.rcept_dt
    sub = price_sorted.get(code)
    if sub is None:
        continue  # no cached prices for this ticker

    # Align the filing date to a trading day: index of the first price row
    # dated on or after dt (dt itself when it is a trading day).
    idx = int(sub["date"].searchsorted(dt))
    # Need both the aligned day and the next one; this also covers the case
    # where no price row on or after dt exists (idx == len(sub)).
    if idx + 1 >= len(sub):
        continue
    dt_adj = sub["date"].iloc[idx]

    c = sub["close"].values
    ret1 = c[idx + 1] / c[idx] - 1  # close-to-close return of the next trading day

    feats.append({
        "stock_code":       code,
        "rcept_dt":         dt_adj,
        "sector":           sec_map.get(code, ""),
        "per_share_common": row.per_share_common,
        "yield_common":     row.yield_common,
        "total_amount":     row.total_amount,
        "div_amount_rank":  row.div_amount_rank,
        # Return into the aligned day; 0.0 when there is no earlier price row.
        "gap_before":       (c[idx] / c[idx - 1] - 1) if idx > 0 else 0.0,
        "month":            dt_adj.month,
        "is_year_end":      int(dt_adj.month == 12),
        # Yield minus the mean yield of events filed in dt_adj's month
        # (NaN when no event was filed in that month).
        "sector_avg_yield_gap": (
            row.yield_common
            - month_mean_yield.get(dt_adj.to_period("M"), float("nan"))
        ),
        "up_1d":            int(ret1 > 0),  # label: 1 if the next-day return is positive
    })

# Collect the per-event feature rows into one table and persist it.
df_clf = pd.DataFrame(feats)
# Share of dividend events that produced a sample; guarded so an empty event
# table reports 0% instead of raising ZeroDivisionError before the save.
kept_ratio = len(df_clf) / len(df_div) if len(df_div) else 0.0
print(f"▶ 분류용 샘플 수: {len(df_clf):,}  (≈{kept_ratio:.1%} 유지)")
# utf-8-sig writes a BOM at the start of the file.
df_clf.to_csv(OUT_FP, index=False, encoding="utf-8-sig")
print("✅ 분류 데이터 저장 →", OUT_FP)

Classification FE: 100%|██████████| 15460/15460 [00:02<00:00, 5957.00it/s]


▶ 분류용 샘플 수: 4,022  (≈26.0% 유지)
✅ 분류 데이터 저장 → /Users/gun/Desktop/미래에셋 AI 공모전/data/module_datasets/classification.csv
