In [None]:
# !pip install dart-fss
!pip -q install -U dart-fss pandas tqdm tenacity

⠦ Loading Stock Market Information

In [None]:
# !pip install -U dart-fss pandas openpyxl tqdm

import os, re, time
from dataclasses import dataclass
from typing import Optional, Dict

import pandas as pd
from tqdm import tqdm
import dart_fss as dart

# =========================
# 0) Open DART API key setup
# =========================
# Prefer a key supplied through the DART_API_KEY environment variable so the
# credential does not have to be committed with the notebook.  The literal is
# kept only as the fallback used when the variable is unset or empty.
api_key = os.environ.get("DART_API_KEY") or '********************************'
dart.set_api_key(api_key=api_key)

SLEEP = 0.15  # pause (seconds) applied around API calls as a safety margin

# =========================
# 1) Target companies (display names)
# =========================
TARGET_NAMES = [
    "삼성물산","효성중공업","현대건설","HJ중공업","DL이앤씨",
    "GS건설","대우건설","HDC현대산업개발","아이에스동서","태영건설"
]

@dataclass
class CorpRef:
    """Identifiers kept for one company in ``corp_refs``."""

    # DART corporation code (entries in this file are 8-character strings).
    corp_code: str
    # Display name used as the company label in the output tables.
    corp_name: str
    # Stock code; ``None`` when it has not been filled in.
    stock_code: Optional[str] = None

# Start from a fixed table of (display name, corp_code, stock_code); the
# display name doubles as the dictionary key and as CorpRef.corp_name.
_FIXED_CORP_CODES = (
    ("삼성물산", "00126229", "000830"),   # re-anchored to stock code 028260 further down
    ("효성중공업", "01316245", "298040"),
    ("현대건설", "00164478", "000720"),
    ("HJ중공업", "00633835", "097230"),
    ("DL이앤씨", "01524093", "375500"),
    ("GS건설", "00120030", "006360"),
    ("대우건설", "00124540", "047040"),
    ("HDC현대산업개발", "01310269", "294870"),
    ("아이에스동서", "00115977", "010780"),
    ("태영건설", "00535825", None),      # re-anchored to stock code 009410 further down
)
corp_refs: Dict[str, CorpRef] = {
    display: CorpRef(code, display, stock)
    for display, code, stock in _FIXED_CORP_CODES
}

# ==========================================
# 2) Report codes / quarter-end suffixes / account-name normalisation
# ==========================================
# Report tag -> DART report code, listed in chronological order within a year.
REPRT = {
    "1Q": "11013",
    "H1": "11012",
    "3Q": "11014",
    "ANNUAL": "11011",
}
# Processing order of the report tags (same order as the REPRT keys).
ORDER = list(REPRT)
# Report code -> "-MM-DD" suffix appended to the year to form a period-end date.
Q_END = dict(zip(REPRT.values(), ("-03-31", "-06-30", "-09-30", "-12-31")))

# Trailing parenthesised group (plus stray closing parentheses / whitespace).
NOTE_TAIL = re.compile(r"\s*\([^)]*\)\s*\)*\s*$")
# Any run of whitespace.
MULTISPACE = re.compile(r"\s+")
# Substring rewrites applied in this exact order after the regex clean-up.
_ACCOUNT_NAME_REWRITES = (
    ("영업이익(손실)", "영업이익"),
    ("분기(중간)순이익", "분기순이익"),
    ("연결분기순이익", "당기순이익"),
    ("연결당기순이익", "당기순이익"),
    ("지배기업 소유주지분 순이익", "당기순이익"),
    ("지배기업소유주지분순이익", "당기순이익"),
    ("당기순이익(손실)", "당기순이익"),
)


def clean_account_name(s: str) -> str:
    """Normalise an account label so it can be compared with the target names.

    Steps: trim, drop one trailing parenthesised group, collapse whitespace
    runs to a single space, then apply the fixed substring rewrites.
    Non-string input is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    name = MULTISPACE.sub(" ", NOTE_TAIL.sub("", s.strip()))
    for old, new in _ACCOUNT_NAME_REWRITES:
        name = name.replace(old, new)
    return name

# Output column -> cleaned account names accepted as a match for it.
TARGET_ACCOUNTS = {
    # Balance-sheet totals
    "자산총계": ["자산총계", "총자산"],
    "부채총계": ["부채총계", "총부채"],
    "자본총계": [
        "자본총계",
        "총자본",
        "지배기업의소유지분",
        "지배기업 소유주지분",
        # NOTE(review): clean_account_name() strips a trailing "(...)" group,
        # so a cleaned name can never equal the next entry as written.
        "자본총계(지배+비지배)",
        "지배기업소유주지분+비지배지분",
    ],
    # Income-statement items
    "매출액": ["매출액", "매출", "매출수익", "영업수익"],
    "영업이익": ["영업이익", "영업손익"],
    "당기순이익": ["분기순이익", "당기순이익"],
}

def parse_amount(x):
    """Convert a raw amount cell to ``float``, or ``None`` when it is not a number.

    Args:
        x: ``None``, a number, or any object whose ``str()`` is an amount.
            Strings may contain thousands separators (``"1,234"``) and
            accounting-style parentheses for negatives (``"(1,234)"``).

    Returns:
        The parsed value as ``float``; ``None`` for ``None``, blank or
        placeholder strings (``""``, ``"-"``, ``"None"``, ``"nan"``),
        unparseable text, and NaN in any form.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        value = float(x)
    else:
        s = str(x).strip()
        if s in ("", "-", "None", "nan"):
            return None
        s = s.replace(",", "")
        if s.startswith("(") and s.endswith(")"):  # (1,234) -> -1234
            s = "-" + s[1:-1].strip()
        try:
            value = float(s)
        except ValueError:
            # Only a failed numeric parse is expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            return None
    # NaN is the only float that differs from itself.  Report it as missing so
    # callers that test ``is None`` treat it like every other absent value.
    return None if value != value else value

# =========================================================
# 3) Anchoring helper: look up the exact corp_code by stock code
# =========================================================
def _resolve_by_stock(stock_code: str):
    """Find the corp-list entry whose ``stock_code`` equals ``stock_code``.

    Returns:
        ``(corp_code, corp_name)`` of the first matching entry, with
        ``corp_code`` left-padded with zeros to 8 characters and
        ``corp_name`` defaulting to ``""`` when the entry has none.

    Raises:
        ValueError: when no entry in the corp list carries that stock code.
    """
    hit = next(
        (
            corp
            for corp in dart.get_corp_list()
            if getattr(corp, "stock_code", None) == stock_code
        ),
        None,
    )
    if hit is None:
        raise ValueError(f"stock_code '{stock_code}' 회사를 찾지 못했습니다.")
    padded_code = str(getattr(hit, "corp_code")).zfill(8)
    return padded_code, getattr(hit, "corp_name", "")

def _anchor_corp_refs(anchors: Dict[str, str], label: str) -> None:
    """Point ``corp_refs`` entries at the corp_code resolved from a stock code.

    Args:
        anchors: display name -> stock code to resolve.
        label: tag used in the progress / warning messages
            (``"primary"`` or ``"legacy"``).

    Each entry is handled best-effort: if resolution fails, a warning is
    printed and whatever ``corp_refs`` already holds for that name is kept.
    """
    for display_name, sc in anchors.items():
        try:
            cc, now_official = _resolve_by_stock(sc)
            # Keep the display name as the label; only the codes are replaced.
            corp_refs[display_name] = CorpRef(cc, display_name, sc)
            print(f"[anchor {label}] {display_name} -> corp_code={cc}, stock_code={sc}, now_official='{now_official}'")
        except Exception as e:
            print(f"[warn] anchor {label} {display_name}: {e}")


# ---------------------------------------------------------
# (A) Re-anchor the main display names 삼성물산 / 태영건설 to the intended entry
#     - 삼성물산: 028260 (post-merger listing)
#     - 태영건설: 009410
# ---------------------------------------------------------
PRIMARY_STOCK_ANCHOR = {
    "삼성물산": "028260",
    "태영건설": "009410",
}
_anchor_corp_refs(PRIMARY_STOCK_ANCHOR, "primary")

# ---------------------------------------------------------
# (B) Pin the legacy (lineage) names by stock code as well
#     - 대림산업: 000210
#     - 현대산업개발: 012630
# ---------------------------------------------------------
LEGACY_STOCK_ANCHOR = {
    "대림산업": "000210",
    "현대산업개발": "012630",
}
_anchor_corp_refs(LEGACY_STOCK_ANCHOR, "legacy")

# ==========================================
# 4) Single DART report -> account extraction (CFS -> OFS fallback)
# ==========================================
# Statement(s) each target account is read from, in order of preference.
# The values are the ``sj_div`` codes of the Open DART response
# (BS = balance sheet, IS = income statement, CIS = comprehensive income).
_STATEMENT_DIVS = {
    "자산총계": ("BS",),
    "부채총계": ("BS",),
    "자본총계": ("BS",),
    "매출액": ("IS", "CIS"),
    "영업이익": ("IS", "CIS"),
    "당기순이익": ("IS", "CIS"),
}
# Flow accounts for which the year-to-date figure is wanted.
_CUMULATIVE_KEYS = frozenset({"매출액", "영업이익", "당기순이익"})


def _select_account_row(df, key, cands):
    """Return the best matching row for ``key`` or ``None`` when nothing matches.

    The search is narrowed to the statement the account belongs to (when the
    response carries ``sj_div``), candidates are tried in the order they are
    listed, and ties are broken by the numeric value of ``ord``.
    """
    pool = df
    divs = _STATEMENT_DIVS.get(key)
    if divs and "sj_div" in df.columns:
        for div in divs:
            scoped = df[df["sj_div"] == div]
            if scoped["account_nm_clean"].isin(cands).any():
                pool = scoped
                break
        # If no preferred statement contains a candidate, fall back to
        # searching every row.

    for cand in cands:
        hits = pool[pool["account_nm_clean"] == cand]
        if hits.empty:
            continue
        if "_ord_num" in hits.columns:
            hits = hits.sort_values("_ord_num", kind="stable")
        return hits.iloc[0]
    return None


def _row_amount(row, prefer_cumulative):
    """Parse the current-period amount of ``row``; ``None`` when unavailable."""
    if row is None:
        return None
    columns = ("thstrm_add_amount", "thstrm_amount") if prefer_cumulative else ("thstrm_amount",)
    for col in columns:
        if col not in row.index:
            continue
        val = parse_amount(row[col])
        # parse_amount may hand back NaN for a missing cell; treat it as absent.
        if val is not None and not pd.isna(val):
            return val
    return None


def fetch_report_accounts(corp_code: str, year: int, reprt_code: str, fs_div: str = "CFS"):
    """Fetch one report via ``fnltt_singl_acnt_all`` and extract the target accounts.

    Args:
        corp_code: DART corporation code; zero-padded to 8 characters here.
        year: business year of the report.
        reprt_code: DART report code (a key of ``Q_END``).
        fs_div: statement basis requested first.  When it is ``"CFS"`` and the
            response contains no rows, ``"OFS"`` is requested as a fallback.

    Returns:
        ``{'자산총계','부채총계','자본총계','매출액','영업이익','당기순이익','report_date'}``.
        Amounts are floats or ``None``.  For 매출액 / 영업이익 / 당기순이익 the
        year-to-date column ``thstrm_add_amount`` is used when it holds a value,
        otherwise ``thstrm_amount``; balance-sheet items use ``thstrm_amount``.
        ``report_date`` is the first ``thstrm_dt`` value if present, else
        ``f"{year}{Q_END[reprt_code]}"``.

    Notes:
        Per the Open DART field guide, interim reports carry the three-month
        figure in ``thstrm_amount`` and the cumulative one in
        ``thstrm_add_amount``.  Callers that difference consecutive reports
        need the cumulative value; differencing three-month figures produces
        nonsensical quarters (e.g. negative revenue).

        NOTE(review): exceptions raised by the client are not caught here, so
        if the client raises on "no data" instead of returning an empty list,
        the OFS fallback is never reached - confirm against dart-fss.
    """
    cc = str(corp_code).zfill(8)

    def _call(div):
        time.sleep(SLEEP)
        resp = dart.api.finance.fnltt_singl_acnt_all(
            corp_code=cc, bsns_year=str(year), reprt_code=reprt_code, fs_div=div
        )
        return resp.get("list", []) if isinstance(resp, dict) else (resp or [])

    # 1) requested basis (consolidated by default)
    items = _call(fs_div)
    # 2) separate-statement fallback
    if not items and fs_div == "CFS":
        items = _call("OFS")

    df = pd.DataFrame(items)
    if df.empty or "account_nm" not in df.columns:
        # No data: every amount is None and the date falls back to the period end.
        return {k: None for k in TARGET_ACCOUNTS} | {"report_date": f"{year}{Q_END[reprt_code]}"}

    df["account_nm_clean"] = df["account_nm"].astype(str).map(clean_account_name)
    if "ord" in df.columns:
        # ``ord`` may arrive as text; compare numerically so "10" sorts after "2".
        df["_ord_num"] = pd.to_numeric(df["ord"], errors="coerce")

    out = {}
    for key, cands in TARGET_ACCOUNTS.items():
        row = _select_account_row(df, key, cands)
        out[key] = _row_amount(row, prefer_cumulative=key in _CUMULATIVE_KEYS)

    # Report date: take it from the response when available, else the period end.
    if "thstrm_dt" in df.columns and pd.notna(df["thstrm_dt"]).any():
        rpt_dt = str(df["thstrm_dt"].dropna().iloc[0])
    else:
        rpt_dt = f"{year}{Q_END[reprt_code]}"
    out["report_date"] = rpt_dt
    return out

# ==========================================
# 5) Year -> quarter table (cumulative -> per-quarter conversion)
# ==========================================
def yearly_quarter_table(corp_name: str, corp_code: str, year: int) -> pd.DataFrame:
    """Build the four-row (Q1..Q4) table of one company for one year.

    One report is fetched per tag in ``ORDER``; a fetch that raises is
    replaced by an all-``None`` record dated at the period end.  Balance-sheet
    totals are copied from the matching report.  Income items are taken as-is
    for Q1 and, for Q2..Q4, as the difference between a report and the
    previous one (``None`` if either side is ``None``).  The net-income figure
    read from key "당기순이익" is emitted under column "분기순이익".
    """
    raw = {}
    for tag in ORDER:
        try:
            rec = fetch_report_accounts(corp_code, year, REPRT[tag], fs_div="CFS")
        except Exception:
            rec = {k: None for k in TARGET_ACCOUNTS} | {"report_date": f"{year}{Q_END[REPRT[tag]]}"}
        raw[tag] = rec

    def _delta(account, tag, prev_tag):
        # First report of the year: nothing to subtract.
        current = raw[tag][account]
        if prev_tag is None:
            return current
        previous = raw[prev_tag][account]
        if current is None or previous is None:
            return None
        return current - previous

    padded_code = str(corp_code).zfill(8)
    quarter_tags = (("Q1", "1Q"), ("Q2", "H1"), ("Q3", "3Q"), ("Q4", "ANNUAL"))

    rows = []
    prev_tag = None
    for quarter, tag in quarter_tags:
        rec = raw[tag]
        rows.append({
            "corp_name": corp_name,
            "corp_code": padded_code,
            "year": year,
            "quarter": quarter,
            "report_date": rec.get("report_date"),
            # Balance sheet: period-end balances, no differencing.
            "자산총계": rec["자산총계"],
            "부채총계": rec["부채총계"],
            "자본총계": rec["자본총계"],
            # Income statement: report minus previous report.
            "매출액": _delta("매출액", tag, prev_tag),
            "영업이익": _delta("영업이익", tag, prev_tag),
            "분기순이익": _delta("당기순이익", tag, prev_tag),
        })
        prev_tag = tag
    return pd.DataFrame(rows)

# ==========================================
# 6) Lineage fallback (applies to DL / HDC only)
# ==========================================
# display name -> (legacy name in corp_refs, last year the legacy entity is used)
LINEAGE_RULES = {
    "DL이앤씨": ("대림산업", 2020),         # legacy '대림산업' is used through 2020
    "HDC현대산업개발": ("현대산업개발", 2017), # legacy '현대산업개발' is used through 2017
}

def yearly_quarter_table_with_lineage(pretty_name: str, corp_code: str, year: int) -> pd.DataFrame:
    """Like ``yearly_quarter_table`` but retried under a legacy entity if empty.

    The legacy lookup happens only when every numeric cell of the year is
    missing AND ``LINEAGE_RULES`` has a rule for ``pretty_name`` whose cut-off
    year is not before ``year``.  The legacy rows are relabelled with
    ``pretty_name`` and the zero-padded ``corp_code`` passed in.

    Raises:
        RuntimeError: the rule names a legacy company absent from ``corp_refs``.
    """
    base = yearly_quarter_table(pretty_name, corp_code, year)
    num_cols = ["자산총계","부채총계","자본총계","매출액","영업이익","분기순이익"]

    # Any value present for the year -> keep what the primary entity returned.
    if not base[num_cols].isna().all().all():
        return base

    rule = LINEAGE_RULES.get(pretty_name)
    if not rule or not (year <= rule[1]):
        return base

    legacy_name = rule[0]
    if legacy_name not in corp_refs:
        raise RuntimeError(f"legacy '{legacy_name}' corp_refs 미정의")

    alt = yearly_quarter_table(legacy_name, corp_refs[legacy_name].corp_code, year)
    alt["corp_name"] = pretty_name
    alt["corp_code"] = str(corp_code).zfill(8)
    return alt

# ==========================================
# 7) Load 2014~2024 -> keep 2015~2024
# ==========================================
START_YEAR_LOAD, END_YEAR = 2014, 2024
NUM_COLS = ["자산총계","부채총계","자본총계","매출액","영업이익","분기순이익"]

all_dfs = []
for nm in tqdm(TARGET_NAMES, desc="Companies"):
    ref = corp_refs[nm]
    for yy in range(START_YEAR_LOAD, END_YEAR+1):
        # A failing company-year is reported and skipped; the run continues.
        try:
            df_y = yearly_quarter_table_with_lineage(nm, ref.corp_code, yy)
        except Exception as e:
            print(f"[warn] {nm} {yy}: {e}")
        else:
            all_dfs.append(df_y)

COL_ORDER = ["corp_name","corp_code","year","quarter","report_date"] + NUM_COLS
_SORT_KEYS = ["corp_name","year","quarter"]

result_df = pd.concat(all_dfs, ignore_index=True)
result_df = result_df[result_df["year"] >= 2015].copy()  # only 2015~2024 is used
result_df = result_df[COL_ORDER].sort_values(_SORT_KEYS).reset_index(drop=True)

# --- Remove the 2015 Q1~Q3 rows (listing them first) ---
to_drop_mask = result_df["year"].eq(2015) & result_df["quarter"].isin(["Q1", "Q2", "Q3"])
to_drop = result_df.loc[to_drop_mask, _SORT_KEYS].copy()

print("삭제 대상(2015 Q1~Q3) 행 수:", len(to_drop))
if not to_drop.empty:
    print(to_drop.sort_values(["corp_name","quarter"]).to_string(index=False))

# Drop them and keep the table ordered by company / year / quarter.
result_df = result_df.loc[~to_drop_mask].sort_values(_SORT_KEYS).reset_index(drop=True)

# ==========================================
# 8) Save (CSV / XLSX)
# ==========================================
os.makedirs("./dart_out", exist_ok=True)
csv_path  = "./dart_out/건설10_11년로드_2015~2024_연결_분기재무_정규화.csv"
xlsx_path = "./dart_out/건설10_11년로드_2015~2024_연결_분기재무_정규화.xlsx"

# CSV is written as UTF-8 with BOM; the workbook gets a single sheet.
result_df.to_csv(csv_path, index=False, encoding="utf-8-sig")
result_df.to_excel(xlsx_path, sheet_name="quarterly", index=False)

print(result_df.shape, "Saved:", csv_path, xlsx_path)

# ==========================================
# 9) Missing-data summary and sample
# ==========================================
def _show(frame):
    """Render ``frame`` with ``display`` if that works, else print it as text."""
    try:
        display(frame)
    except Exception:
        print(frame.to_string(index=False))


# Rows in which every numeric column is missing, grouped per company and year.
_all_na_rows = result_df[NUM_COLS].isna().all(axis=1)
missing = (
    result_df.loc[_all_na_rows, ["corp_name","year","quarter"]]
    .groupby(["corp_name","year"])
    .agg(missing_quarters=("quarter","unique"))
    .reset_index()
    .sort_values(["corp_name","year"])
)
print("\n== 결측 요약 (2015~2024) ==")
_show(missing.head(30))

# Sample of the final table
_show(result_df.head(12))


⠙ Loading Stock Market Information[anchor primary] 삼성물산 -> corp_code=00149655, stock_code=028260, now_official='삼성물산'
[anchor primary] 태영건설 -> corp_code=00153861, stock_code=009410, now_official='태영건설'
⠹ Loading Stock Market Information[anchor legacy] 대림산업 -> corp_code=00109693, stock_code=000210, now_official='DL'
[anchor legacy] 현대산업개발 -> corp_code=00164636, stock_code=012630, now_official='HDC'


Companies:   0%|          | 0/10 [00:00<?, ?it/s]

⠦ Loading Stock Market Information

Companies:  10%|█         | 1/10 [00:26<03:56, 26.25s/it]

⠋ Loading Stock Market Information

Companies:  20%|██        | 2/10 [00:49<03:14, 24.33s/it]

⠹ Loading Stock Market Information

Companies:  30%|███       | 3/10 [01:14<02:53, 24.78s/it]

⠧ Loading Stock Market Information

Companies:  40%|████      | 4/10 [01:38<02:26, 24.43s/it]

⠸ Loading Stock Market Information

Companies:  50%|█████     | 5/10 [02:17<02:27, 29.55s/it]

⠧ Loading Stock Market Information

Companies:  60%|██████    | 6/10 [02:42<01:52, 28.16s/it]

⠹ Loading Stock Market Information

Companies:  70%|███████   | 7/10 [03:05<01:19, 26.52s/it]

⠧ Loading Stock Market Information

Companies:  80%|████████  | 8/10 [03:36<00:55, 27.75s/it]

⠹ Loading Stock Market Information

Companies:  90%|█████████ | 9/10 [04:02<00:27, 27.30s/it]

⠴ Loading Stock Market Information

Companies: 100%|██████████| 10/10 [04:26<00:00, 26.62s/it]

삭제 대상(2015 Q1~Q3) 행 수: 30
corp_name  year quarter
    DL이앤씨  2015      Q1
    DL이앤씨  2015      Q2
    DL이앤씨  2015      Q3
     GS건설  2015      Q1
     GS건설  2015      Q2
     GS건설  2015      Q3
HDC현대산업개발  2015      Q1
HDC현대산업개발  2015      Q2
HDC현대산업개발  2015      Q3
    HJ중공업  2015      Q1
    HJ중공업  2015      Q2
    HJ중공업  2015      Q3
     대우건설  2015      Q1
     대우건설  2015      Q2
     대우건설  2015      Q3
     삼성물산  2015      Q1
     삼성물산  2015      Q2
     삼성물산  2015      Q3
   아이에스동서  2015      Q1
   아이에스동서  2015      Q2
   아이에스동서  2015      Q3
     태영건설  2015      Q1
     태영건설  2015      Q2
     태영건설  2015      Q3
     현대건설  2015      Q1
     현대건설  2015      Q2
     현대건설  2015      Q3
    효성중공업  2015      Q1
    효성중공업  2015      Q2
    효성중공업  2015      Q3
⠧ Loading Stock Market Information


  result_df = pd.concat(all_dfs, ignore_index=True)


(370, 11) Saved: ./dart_out/건설10_11년로드_2015~2024_연결_분기재무_정규화.csv ./dart_out/건설10_11년로드_2015~2024_연결_분기재무_정규화.xlsx

== 결측 요약 (2015~2024) ==
                                   ⠇ Loading Stock Market Information

Unnamed: 0,corp_name,year,missing_quarters
0,HDC현대산업개발,2015,[Q4]
1,HDC현대산업개발,2018,[Q1]
2,효성중공업,2015,[Q4]
3,효성중공업,2016,"[Q1, Q2, Q3, Q4]"
4,효성중공업,2017,"[Q1, Q2, Q3, Q4]"
5,효성중공업,2018,[Q1]


Unnamed: 0,corp_name,corp_code,year,quarter,report_date,자산총계,부채총계,자본총계,매출액,영업이익,분기순이익
0,DL이앤씨,1524093,2015,Q4,2015-12-31,12064890000000.0,7259125000000.0,4805769000000.0,,,
1,DL이앤씨,1524093,2016,Q1,2016-03-31,12332080000000.0,7389715000000.0,4942363000000.0,2253709000000.0,90775870000.0,31037070000.0
2,DL이앤씨,1524093,2016,Q2,2016-06-30,12378280000000.0,7324959000000.0,5053324000000.0,310076900000.0,45397330000.0,88759550000.0
3,DL이앤씨,1524093,2016,Q3,2016-09-30,12185420000000.0,7042494000000.0,5142928000000.0,-106422300000.0,-5518794000.0,-10709090000.0
4,DL이앤씨,1524093,2016,Q4,2016-12-31,12391510000000.0,7246135000000.0,5145374000000.0,7396406000000.0,288733300000.0,184107800000.0
5,DL이앤씨,1524093,2017,Q1,2017-03-31,12812120000000.0,7563966000000.0,5248157000000.0,2511359000000.0,113983700000.0,149346100000.0
6,DL이앤씨,1524093,2017,Q2,2017-06-30,13206280000000.0,7818788000000.0,5387496000000.0,594928500000.0,29052860000.0,-44812440000.0
7,DL이앤씨,1524093,2017,Q3,2017-09-30,14137480000000.0,8398251000000.0,264111500000.0,320920300000.0,54249170000.0,191409400000.0
8,DL이앤씨,1524093,2017,Q4,2017-12-31,13402450000000.0,7708148000000.0,490493000000.0,8908328000000.0,348613100000.0,212055700000.0
9,DL이앤씨,1524093,2018,Q1,2018-03-31,13689850000000.0,8038344000000.0,243905900000.0,2836063000000.0,248241700000.0,250513400000.0
