In [1]:
import pandas as pd
import requests
import json
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
import time
time.sleep(0.3)
from pykrx import stock
import dart_fss as dart
from dart_fss.errors import NotFoundConsolidated
pd.options.display.float_format = '{:.6f}'.format

In [2]:
# Company master table; the CSV's first column is used as the row index.
cdf = pd.read_csv(
    'data/company_name_and_code.csv',
    index_col=0,
)
cdf

Unnamed: 0,corp_code,corp_name,stock_code
0,365387,AJ네트웍스,95570
1,125080,AK홀딩스,6840
2,219097,BGF,27410
3,1263022,BGF리테일,282330
4,858364,BNK금융지주,138930
...,...,...,...
726,111421,휴니드테크놀러지스,5870
727,362238,휴비스,79980
728,156488,휴스틸,5010
729,103176,흥국화재,540


In [3]:
# Read the DART API key from the first cell of a local, header-less text file
# so the key never has to be typed into the notebook itself.
my_key = pd.read_csv('data/dart_key.txt', header=None).iloc[0, 0]

# The trailing semicolon stops Jupyter from echoing the call's return value.
# Without it the value shown as this cell's output is the API key, which then
# gets saved (and shared) together with the notebook.
dart.set_api_key(my_key);

'<redacted: DART API key — clear this cell output before sharing and rotate the key>'

In [4]:
def _extract_net_profit(xml_text, account_ids):
    """Pull the current-period net profit out of one OpenDART XML response.

    Args:
        xml_text: Raw XML body returned by the API.
        account_ids: Container of account ids that count as "net profit".
            Matching is by exact equality with the stripped ``account_id``.

    Returns:
        The stripped ``thstrm_amount`` text of the first ``<list>`` entry whose
        ``account_id`` is in ``account_ids`` and that has a ``thstrm_amount``
        tag, or ``None`` when the response status is not ``'000'`` or no such
        entry exists.
    """
    soup = BeautifulSoup(xml_text, "lxml-xml")

    # Any status other than '000' is treated as "no usable data".
    status = soup.find('status')
    if status is not None and status.text.strip() != '000':
        return None

    for item in soup.find_all('list'):
        id_tag = item.find('account_id')
        if id_tag is None:
            continue

        # Exact match only: a substring test would also accept an empty
        # account_id or partial ids such as 'ProfitLoss'.
        if id_tag.text.strip() not in account_ids:
            continue

        value_tag = item.find('thstrm_amount')
        if value_tag is not None:
            return value_tag.text.strip()

    return None


def _fetch_net_profit(base_url, my_key, corp_code, year, reprt_code, fs_div,
                      account_ids):
    """Request one company/year/statement type and return its net profit text.

    Returns ``None`` on any ``requests`` failure (connection error, timeout,
    HTTP error status) so that the caller can fall back or skip the company.
    """
    params = {
        'crtfc_key': my_key,
        'corp_code': corp_code,
        'bsns_year': str(year),
        'reprt_code': reprt_code,
        'fs_div': fs_div
    }

    try:
        response = requests.get(base_url, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return None

    return _extract_net_profit(response.text, account_ids)


def net_profitloss(cdf, my_key, start_year=2019, end_year=2024,
                   account_ids=('ifrs-full_ProfitLoss',)):
    """Download yearly net profit for every company in ``cdf`` from OpenDART.

    For each year and company the 'CFS' statement is requested first; only if
    that yields nothing is the 'OFS' statement requested as a fallback
    (per the OpenDART guide: CFS = consolidated, OFS = separate statements).

    Args:
        cdf: DataFrame with at least ``corp_code`` and ``corp_name`` columns.
            ``corp_code`` is converted to a string and left-padded with zeros
            to 8 characters before being sent to the API.
        my_key: OpenDART API key.
        start_year: First business year to fetch (inclusive).
        end_year: Last business year to fetch (inclusive).
        account_ids: Account id(s) treated as net profit. A single string is
            accepted as well. Defaults to ``'ifrs-full_ProfitLoss'``.

    Returns:
        A tuple with one DataFrame per year, in chronological order, each with
        the columns ``corp_code``, ``corp_name``, ``year``, ``net_profit`` and
        one row per row of ``cdf``. Companies for which no value could be
        fetched (or whose value is not numeric) get ``net_profit == 0``, so a
        zero means "missing or zero". An empty year range returns ``()``.
    """
    base_url = 'https://opendart.fss.or.kr/api/fnlttSinglAcntAll.xml'
    reprt_code = '11011'  # annual business report, per the OpenDART guide

    # Normalise to a set so that membership is an exact-id comparison.
    if isinstance(account_ids, str):
        account_ids = (account_ids,)
    wanted_ids = frozenset(account_ids)

    corp_map = cdf[['corp_code', 'corp_name']].copy()
    corp_map['corp_code'] = corp_map['corp_code'].astype(str).str.zfill(8)
    corp_codes = corp_map['corp_code']

    result_columns = ['corp_code', 'year', 'net_profit']
    year_dfs = {}

    for year in range(start_year, end_year + 1):
        print(f"\n=== {year}년 진행 ===")
        records = []

        for idx, code in enumerate(
            tqdm(corp_codes, desc=f"{year}년 기업 진행", ncols=100),
            start=1
        ):
            # 1) Try the CFS statement first.
            profit = _fetch_net_profit(
                base_url, my_key, code, year, reprt_code, 'CFS', wanted_ids
            )

            # 2) Fall back to the OFS statement only when CFS gave nothing.
            if profit is None:
                profit = _fetch_net_profit(
                    base_url, my_key, code, year, reprt_code, 'OFS', wanted_ids
                )

            if profit is not None:
                records.append({
                    'corp_code': code,
                    'year': year,
                    'net_profit': profit
                })

            # Throttle: short pause after every company, longer pause after
            # every 50 companies.
            time.sleep(0.3)
            if idx % 50 == 0:
                time.sleep(5)

        # Raw per-year results; built once from the collected records.
        if records:
            year_raw = pd.DataFrame(records, columns=result_columns)
        else:
            year_raw = pd.DataFrame(columns=result_columns)

        # Left-join onto the full company list so every company keeps a row.
        base_df = corp_map.copy()
        base_df['year'] = year

        year_df = base_df.merge(
            year_raw,
            on=['corp_code', 'year'],
            how='left'
        )

        # Non-numeric and missing amounts both end up as 0.
        year_df['net_profit'] = pd.to_numeric(
            year_df['net_profit'], errors='coerce'
        ).fillna(0)

        year_dfs[year] = year_df[
            ['corp_code', 'corp_name', 'year', 'net_profit']
        ]

    return tuple(year_dfs[year] for year in range(start_year, end_year + 1))


In [5]:
# One frame per fiscal year, returned in chronological order (2019 through 2024).
(
    adf_2019, adf_2020, adf_2021,
    adf_2022, adf_2023, adf_2024,
) = net_profitloss(cdf, my_key, start_year=2019, end_year=2024)


=== 2019년 진행 ===


2019년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [08:09<00:00,  1.49it/s]



=== 2020년 진행 ===


2020년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [08:10<00:00,  1.49it/s]



=== 2021년 진행 ===


2021년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [08:05<00:00,  1.51it/s]



=== 2022년 진행 ===


2022년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [07:59<00:00,  1.53it/s]



=== 2023년 진행 ===


2023년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [08:09<00:00,  1.49it/s]



=== 2024년 진행 ===


2024년 기업 진행: 100%|███████████████████████████████████████████| 731/731 [08:12<00:00,  1.48it/s]


In [6]:
adf_lst = [
    adf_2019, adf_2020, adf_2021,
    adf_2022, adf_2023, adf_2024,
]

# Per year: number of companies whose net_profit is exactly 0.
for yearly_frame in adf_lst:
    zero_rows = yearly_frame[yearly_frame['net_profit'] == 0]
    print(len(zero_rows))

54
43
44
43
1
1


In [7]:
# Yearly frames in chronological order, as consumed by the CSV export below.
adf_lst = [adf_2019, adf_2020, adf_2021, adf_2022, adf_2023, adf_2024]

In [8]:
# Write one CSV per fiscal year; floats are written with six decimals and the
# row index is left out.
for fiscal_year, yearly_frame in zip(range(2019, 2025), adf_lst):
    out_path = f'data/net_profit/net_profit_{fiscal_year}.csv'
    yearly_frame.to_csv(out_path, float_format='%.6f', index=False)

## 데이터 전처리

In [9]:
# Reload the yearly net-profit CSVs from disk. No dtype is passed, so column
# types are whatever pandas infers from the files.
(
    adf_2019, adf_2020, adf_2021,
    adf_2022, adf_2023, adf_2024,
) = (
    pd.read_csv(f'data/net_profit/net_profit_{fiscal_year}.csv')
    for fiscal_year in range(2019, 2025)
)

In [10]:
# Pre-processed comprehensive-income-statement files (tab-separated, cp949),
# one per fiscal year. Only the company-name and net-income columns are kept.
comprehensive_income_paths = [
    'data/data_clean/2019(연간)/2019_연간보고서_포괄손익계산서전처리.txt',
    'data/data_clean/2020연간/2020_연간보고서_포괄손익계산서전처리.txt',
    'data/data_clean/2021연간/2021_연간보고서_포괄손익계산서전처리.txt',
    'data/data_clean/2022연간/2022_연간보고서_포괄손익계산서전처리.txt',
    'data/data_clean/2023연간/2023_연간보고서_포괄손익계산서전처리.txt',
    'data/data_clean/2024연간/2024_연간보고서_포괄손익계산서전처리.txt',
]

all_19, all_20, all_21, all_22, all_23, all_24 = (
    pd.read_csv(path, sep='\t', encoding='cp949')[['회사명', '당기순이익']]
    for path in comprehensive_income_paths
)

In [11]:
# Pre-processed income-statement files (tab-separated, cp949), one per fiscal
# year. Only the company-name and net-income columns are kept.
income_statement_paths = [
    'data/data_clean/2019(연간)/2019_연간보고서_손익계산서전처리.txt',
    'data/data_clean/2020연간/2020_연간보고서_손익계산서전처리.txt',
    'data/data_clean/2021연간/2021_연간보고서_손익계산서전처리.txt',
    'data/data_clean/2022연간/2022_연간보고서_손익계산서전처리.txt',
    'data/data_clean/2023연간/2023_연간보고서_손익계산서전처리.txt',
    'data/data_clean/2024연간/2024_연간보고서_손익계산서전처리.txt',
]

new_19, new_20, new_21, new_22, new_23, new_24 = (
    pd.read_csv(path, sep='\t', encoding='cp949')[['회사명', '당기순이익']]
    for path in income_statement_paths
)

In [12]:
# Per year: the rows whose net_profit is exactly 0.
ca_19, ca_20, ca_21, ca_22, ca_23, ca_24 = (
    yearly_frame[yearly_frame['net_profit'] == 0]
    for yearly_frame in (
        adf_2019, adf_2020, adf_2021,
        adf_2022, adf_2023, adf_2024,
    )
)

In [13]:
# Row counts of the zero-net-profit subsets, 2019 through 2024, on one line.
print(*(len(zero_rows) for zero_rows in (ca_19, ca_20, ca_21, ca_22, ca_23, ca_24)))

54 43 44 43 1 1


In [14]:
# Give the comprehensive-income frames the same column names as the API frames.
adf_columns = ['corp_name', 'net_profit']
for statement_frame in (all_19, all_20, all_21, all_22, all_23, all_24):
    statement_frame.columns = adf_columns
all_19

Unnamed: 0,corp_name,net_profit
0,3S,-724725770.000000
1,AJ네트웍스,
2,AK홀딩스,
3,APS,
4,AP시스템,27615352866.000000
...,...,...
2685,흥국,
2686,흥국에프엔비,
2687,흥아해운,
2688,희림,


In [15]:
# Give the income-statement frames the same column names as the API frames.
adf_columns = ['corp_name', 'net_profit']
for statement_frame in (new_19, new_20, new_21, new_22, new_23, new_24):
    statement_frame.columns = adf_columns
new_19

Unnamed: 0,corp_name,net_profit
0,CJ,63281893000.000000
1,CJ대한통운,
2,DB하이텍,
3,DSR,
4,GS,
...,...,...
193,호텔신라,
194,화천기계,
195,효성티앤씨,
196,효성화학,92166808901.000000


In [16]:
# Work on copies so the frames loaded from the CSVs stay untouched.
(
    cadf_2019, cadf_2020, cadf_2021,
    cadf_2022, cadf_2023, cadf_2024,
) = (
    yearly_frame.copy()
    for yearly_frame in (
        adf_2019, adf_2020, adf_2021,
        adf_2022, adf_2023, adf_2024,
    )
)

In [17]:
cadf_lst = [cadf_2019, cadf_2020, cadf_2021, cadf_2022, cadf_2023, cadf_2024]
all_lst = [all_19, all_20, all_21, all_22, all_23, all_24]
new_lst = [new_19, new_20, new_21, new_22, new_23, new_24]

# Patch each year's API-derived net profit with values from the two statement
# files, matching companies by name.
#
# The loop variable is deliberately NOT called `cdf`: that name is the company
# master table loaded at the top of the notebook, and rebinding it here would
# break any later call that passes `cdf` to net_profitloss.
#
# Each patched frame is written back into `cadf_lst`; otherwise the merged
# result is discarded at the end of the iteration and the export that iterates
# over `cadf_lst` would save the unpatched data.
for position, (api_df, adf, ndf) in enumerate(zip(cadf_lst, all_lst, new_lst)):
    # Explicit column names instead of relying on merge's implicit _x/_y suffixes.
    # NOTE(review): duplicate corp_name values in either statement file would
    # duplicate rows in these left joins — confirm the names are unique.
    patched = api_df.merge(
        adf[['corp_name', 'net_profit']].rename(
            columns={'net_profit': 'net_profit_all'}
        ),
        on='corp_name',
        how='left'
    ).merge(
        ndf[['corp_name', 'net_profit']].rename(
            columns={'net_profit': 'net_profit_new'}
        ),
        on='corp_name',
        how='left'
    )

    # Both masks are computed from the values as they are BEFORE any update.
    # NOTE(review): mask_a replaces the API value whenever the comprehensive
    # income file has one, even if the API value is non-zero — confirm intended.
    mask_a = patched['net_profit_all'].notnull()
    mask_n = (patched['net_profit'] == 0) & (patched['net_profit_new'].notnull())

    # Order matters: where both masks hold, the income-statement value wins.
    patched.loc[mask_a, 'net_profit'] = patched.loc[mask_a, 'net_profit_all']
    patched.loc[mask_n, 'net_profit'] = patched.loc[mask_n, 'net_profit_new']

    patched = patched.drop(['net_profit_all', 'net_profit_new'], axis=1)
    patched.columns = ['corp_code', 'corp_name', 'year', 'net_profit']

    cadf_lst[position] = patched

    # Remaining companies whose net profit is still exactly 0.
    print(len(patched[patched['net_profit'] == 0]))

54
43
43
43
1
1


In [18]:
# Overwrite the yearly CSVs with the frames held in cadf_lst; floats are
# written with six decimals and the row index is left out.
for fiscal_year, yearly_frame in zip(range(2019, 2025), cadf_lst):
    out_path = f'data/net_profit/net_profit_{fiscal_year}.csv'
    yearly_frame.to_csv(out_path, float_format='%.6f', index=False)