<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [1]</a>'.</span>

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [1]:
import os
import pandas as pd

# ── 1. 경로 설정
BASE       = "/Users/gun/Desktop/미래에셋 AI 공모전/data"
DIV_PATH   = os.path.join(BASE, "dividend_ml_ready.csv")
IN_DIR     = os.path.join(BASE, "module_datasets")
MODULES    = ["classification", "regression", "clustering"]

# ── 2. 배당 이벤트 로드 (corp_name 포함)
df_div = pd.read_csv(DIV_PATH, dtype={"stock_code": str, "rcept_no": str})
df_div["stock_code"] = df_div["stock_code"].str.zfill(6)
df_div["rcept_dt"]   = pd.to_datetime(df_div["rcept_no"].str[:8], format="%Y%m%d", errors="coerce")
df_div = df_div.dropna(subset=["rcept_dt"])
# keep only the columns we need for the join
df_div = df_div[["stock_code", "rcept_dt", "corp_name"]]

# ── 3. 각 모듈별 파일에 corp_name 머지 & 재저장
for mod in MODULES:
    fp = os.path.join(IN_DIR, f"{mod}.csv")
    df_mod = pd.read_csv(fp, parse_dates=["rcept_dt"], dtype={"stock_code": str})
    # merge on stock_code + rcept_dt
    df_merged = (
        df_mod
        .merge(df_div, on=["stock_code", "rcept_dt"], how="left")
        .assign(corp_name=lambda d: d["corp_name"].fillna("UNKNOWN"))
    )
    # reorder: corp_name first
    cols = ["corp_name"] + [c for c in df_merged.columns if c != "corp_name"]
    df_merged = df_merged[cols]
    # overwrite
    df_merged.to_csv(fp, index=False, encoding="utf-8-sig")
    print(f"✅ {mod}.csv → corp_name 추가 후 저장 ({len(df_merged)} rows)")

KeyError: 'corp_name'

In [None]:
import pandas as pd
from datetime import timedelta
from tqdm.auto import tqdm

BASE = "/Users/gun/Desktop/미래에셋 AI 공모전/data"
DIV_PATH = f"{BASE}/dividend_ml_ready.csv"
FULL_PATH = f"{BASE}/full_price_history.csv"

df_div = pd.read_csv(DIV_PATH, dtype={"stock_code":str, "rcept_no":str})
df_div["stock_code"] = df_div["stock_code"].str.zfill(6)
df_div["rcept_dt"] = pd.to_datetime(df_div["rcept_no"].str[:8], format="%Y%m%d", errors="coerce")
df_div = df_div.dropna(subset=["rcept_dt"]).reset_index(drop=True)

df_full = pd.read_csv(FULL_PATH, parse_dates=["date"], dtype={"stock_code":str})
df_full = df_full.sort_values(["stock_code", "date"])

events = df_div[["stock_code", "rcept_dt"]].to_records(index=False)
price_map = {code: grp.reset_index(drop=True) for code, grp in df_full.groupby("stock_code")}

# 각 모듈별 window 후보 정의
windows = {
    "classification": list(range(1, 6)),    # w=1~5
    "regression":     list(range(3, 16)),   # w=3~15
    "clustering":     list(range(10, 21)),  # w=10~20
}
total = len(df_div)
result_table = {}

for mod, win_list in windows.items():
    res = []
    for w in win_list:
        kept = 0
        for ev in tqdm(events, desc=f"{mod} w={w}", leave=False):
            code, dt = ev.stock_code, ev.rcept_dt
            sub = price_map.get(code)
            if sub is None: continue
            pos = sub["date"].searchsorted(dt)
            # 보정: dt가 영업일이 아닐 경우
            if pos >= len(sub) or sub.loc[pos, "date"] < dt:
                fut = sub["date"][sub["date"] >= dt]
                if len(fut) == 0: continue
                pos = sub["date"].searchsorted(fut.min())
            lo = max(pos - w, 0)
            hi = min(pos + w + 1, len(sub))
            if hi - lo == 2 * w + 1:
                kept += 1
        res.append((w, kept, kept/total*100))
    result_table[mod] = res

# 보기 좋게 DataFrame 변환
for mod, data in result_table.items():
    print(f"\n[{mod}] 윈도우별 이벤트 보존율")
    df = pd.DataFrame(data, columns=["window", "kept", "kept_pct"])
    df["kept_pct"] = df["kept_pct"].map("{:.2f}%".format)
    print(df.to_string(index=False))

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# ── 0. 설정
BASE         = "/Users/gun/Desktop/미래에셋 AI 공모전/data"
DIV_PATH     = os.path.join(BASE, "dividend_with_text.csv")
FULL_PATH    = os.path.join(BASE, "full_price_history.csv")
SECTOR_PATH  = os.path.join(BASE, "sector_info.csv")
OUT_DIR      = os.path.join(BASE, "module_datasets")
os.makedirs(OUT_DIR, exist_ok=True)

# ── 1. 윈도우 정의
windows = {
    "classification": 1,
    "regression":     10,
    "clustering":     10
}
reg_days = [1,2,3,4,5,6,7,10]
reg_cols = [f"ret_{d}d" for d in reg_days]

# ── 2. 데이터 로드
df_div = pd.read_csv(DIV_PATH, dtype={"stock_code":str, "rcept_no":str})
df_div["stock_code"] = df_div["stock_code"].str.zfill(6)
df_div["rcept_dt"]   = pd.to_datetime(df_div["rcept_no"].str[:8],
                                      format="%Y%m%d", errors="coerce")
df_div.dropna(subset=["rcept_dt"], inplace=True)
df_div.reset_index(drop=True, inplace=True)

# 공통 피처
df_div["period"]          = df_div["rcept_dt"].dt.to_period("M")
df_div["div_amount_rank"] = df_div.groupby("period")["per_share_common"].rank(pct=True)
df_div["month"]           = df_div["rcept_dt"].dt.month
df_div["is_year_end"]     = (df_div["month"]==12).astype(int)

sec = pd.read_csv(SECTOR_PATH, dtype=str)
sec["stock_code"] = sec["stock_code"].str.zfill(6)
sec_map = sec.set_index("stock_code")["sector"].to_dict()
df_div["sector"] = df_div["stock_code"].map(sec_map).fillna("")

common_cols = [
    "stock_code","rcept_dt","sector",
    "per_share_common","yield_common","total_amount",
    "div_amount_rank","month","is_year_end"
]

# ── 3. full history 로드 & 맵 생성
df_full = pd.read_csv(FULL_PATH, parse_dates=["date"], dtype={"stock_code":str})
df_full.sort_values(["stock_code","date"], inplace=True)
price_map = {
    code: grp.reset_index(drop=True)
    for code, grp in df_full.groupby("stock_code")
}

total_events = len(df_div)

# ── 4. 모듈별 슬라이스 & 저장
for mod, w in windows.items():
    rows = []
    kept = 0

    desc = f"{mod:14} slicing"
    for ev in tqdm(df_div.itertuples(False), total=total_events, desc=desc):
        code, dt = ev.stock_code, ev.rcept_dt
        sub = price_map.get(code)
        if sub is None:
            continue

        # ① dt 보정 (다음 거래일)
        pos = sub["date"].searchsorted(dt)
        if pos>=len(sub) or sub.loc[pos,"date"] < dt:
            fut = sub["date"][sub["date"]>=dt]
            if fut.empty:
                continue
            pos = sub["date"].searchsorted(fut.min())

        lo, hi = pos-w, pos+w+1
        if lo<0 or hi>len(sub):
            continue
        window_df = sub.iloc[lo:hi].reset_index(drop=True)
        if len(window_df) != 2*w+1:
            continue

        kept += 1
        feat = {c: getattr(ev, c) for c in common_cols}

        # 타겟 생성
        if mod=="classification":
            ret1 = window_df.close.iloc[w+1]/window_df.close.iloc[w] - 1
            feat["up_1d"] = int(ret1 > 0)
        elif mod=="regression":
            for d, col in zip(reg_days, reg_cols):
                idx = w + d
                if idx < len(window_df):
                    feat[col] = window_df.close.iloc[idx] / window_df.close.iloc[w] - 1
                else:
                    feat[col] = np.nan
        # clustering은 타겟 없음

        rows.append(feat)

    # 컬럼 순서 조정
    if mod == "regression":
        final_cols = common_cols + reg_cols
    elif mod == "classification":
        final_cols = common_cols + ["up_1d"]
    else:
        final_cols = common_cols

    df_mod = pd.DataFrame(rows)
    df_mod = df_mod[final_cols]
    out_fp = os.path.join(OUT_DIR, f"{mod}.csv")
    df_mod.to_csv(out_fp, index=False, encoding="utf-8-sig")

    pct = kept/total_events*100
    print(f"✅ [{mod}] 보존: {kept:,}/{total_events:,} ({pct:.1f}%) → {out_fp}")