### 데이터 값 찾기

* 문서 URL 목록 가져오기

In [1]:
import pandas as pd

# Load the list of financial-statement-notes document URLs (annual reports).
# Every column is read as object; presumably this keeps codes such as
# '00125530' as strings - TODO confirm against the CSV.
file_path = "D:/PythonProject/data-gatherer/dart_fs_notes/fs_notes_urls/fs_notes_url_all.csv"
fs_notes_all = pd.read_csv(file_path, dtype=object).sort_values('corp_code')
# Distinct company codes, in order of first appearance in the sorted frame
list_all_codes = fs_notes_all['corp_code'].unique()
len(list_all_codes)

342

* 이미 데이터 값을 찾은 것은 대상에서 제외

In [2]:
import os.path

# Company codes whose item values were already extracted and stored
file_path = "D:/PythonProject/data-gatherer/dart_fs_notes/items_from_section/items_all.csv"

if not os.path.exists(file_path):
    # No result file yet: nothing has been processed
    list_own_codes = []
else:
    try:
        items_all = pd.read_csv(file_path, dtype=object)
    except pd.errors.EmptyDataError:
        # The file exists but is empty: treat it as "nothing processed"
        list_own_codes = []
    else:
        list_own_codes = items_all['corp_code'].unique()

len(list_own_codes)

0

In [3]:
# Exclude the companies whose item values were already extracted.
# A filter is used instead of `set(a) - set(b)`: it removes the same codes
# (and still drops duplicates) but keeps the order of list_all_codes, whereas
# a set of strings iterates in a different order in every interpreter session,
# which made the batches built from this list non-reproducible.
already_done_codes = set(list_own_codes)
list_target_codes = [code for code in dict.fromkeys(list_all_codes)
                     if code not in already_done_codes]
len(list_target_codes)

342

* 리스트 분할

In [6]:
# Split the target codes into consecutive batches of at most n items
n = 50
list_cnt = len(list_target_codes)
list_of_lists = [list_target_codes[start:start + n] for start in range(0, list_cnt, n)]
len(list_of_lists)

7

* 항목과 값 찾기: 확정급여채무의 현재가치, 사외적립자산의 공정가치

In [7]:
import traceback
from time import sleep
from tqdm import tqdm

import sys
# Raw string: "\P" and "\d" are not valid escape sequences, so the non-raw
# literal triggers an invalid-escape warning (and is slated to become an
# error). The resulting path value is unchanged.
module_path = r"D:\PythonProject\data-gatherer\dart_fs_notes"
# Re-running the cell must not stack duplicate entries on sys.path
if module_path not in sys.path:
    sys.path.append(module_path)
import myutil, mydart

# Directory where one result CSV per company is written
path_dir = "D:/PythonProject/data-gatherer/dart_fs_notes/items_from_section/"

# Columns copied unchanged from the URL-list row into every result row
_DOC_COLUMNS = ('corp_code', 'stock_code', 'corp_cls', 'corp_name',
                'rpt_num', 'rpt_name', 'doc_title')


def _make_row(doc, doc_result=None, section_title=None, unit=None,
              val_a=None, val_b=None, item_result=None, doc_url=None):
    """Build one result record for ``doc``.

    The insertion order of the keys defines the column order of the saved
    CSV, so it must stay: document columns, doc_result, section_title, unit,
    val_a, val_b, item_result, doc_url.
    """
    row = {col: doc[col] for col in _DOC_COLUMNS}
    row['doc_result'] = doc_result
    row['section_title'] = section_title
    row['unit'] = unit
    row['val_a'] = val_a
    row['val_b'] = val_b
    row['item_result'] = item_result
    row['doc_url'] = doc_url
    return row


def _print_doc(doc):
    """Print the identifying fields of ``doc`` so a failure can be traced to its report."""
    print(doc['corp_code'], doc['corp_name'], doc['rpt_num'], doc['rpt_name'], doc['rpt_url'], doc['doc_title'], doc['doc_url'])


for corp_codes in list_of_lists:
    for corp_code in tqdm(corp_codes):
        # Kept at module level: "@corp_code" is resolved from the calling scope
        df_doc = fs_notes_all.query("corp_code == @corp_code")
        doc_count = df_doc.shape[0]
        data_list = []
        for _, doc in df_doc.iterrows():
            # Reset all per-document state BEFORE the try block so the except
            # handlers never read an unbound name or a value left over from
            # the previous document.
            val_a = val_b = 0.0
            section_title = unit = None
            doc_result = doc_url = None
            try:
                doc_result = doc['doc_result']
                if doc_result == 'No Financial Statement Notes':
                    # Nothing to download for this report
                    data_list.append(_make_row(doc, doc_result=doc_result))
                    continue

                doc_url = doc['doc_url']
                # Throttle only when a document is actually fetched
                sleep(1.0)
                soup = mydart.get_document(doc_url)
                subtitle_list = mydart.get_subtitles(soup)
                section_title = mydart.find_section_title(subtitle_list)
                if section_title is None:
                    # The target section was not found in this document
                    data_list.append(_make_row(doc, doc_result=doc_result, doc_url=doc_url))
                    continue

                section_page = mydart.get_section_html(soup, subtitle_list, section_title)
                # Monetary unit used in the section
                unit = mydart.find_unit_from_section(section_page)
                # Current-period-end amounts of the defined benefit obligation
                # and the plan assets
                val_a, val_b, item_result = mydart.find_item_from_section(section_page)
                data_list.append(_make_row(doc, doc_result=doc_result, section_title=section_title,
                                           unit=unit, val_a=val_a, val_b=val_b,
                                           item_result=item_result, doc_url=doc_url))
            except ValueError as err:
                _print_doc(doc)
                print(err)
                data_list.append(_make_row(doc, doc_result=doc_result, section_title=section_title,
                                           unit=unit, val_a=val_a, val_b=val_b,
                                           item_result='ValueError', doc_url=doc_url))
                # Stop processing the remaining documents of this company
                break
            except Exception:
                _print_doc(doc)
                print(traceback.format_exc())
                data_list.append(_make_row(doc, doc_result=doc_result, section_title=section_title,
                                           unit=unit, val_a=val_a, val_b=val_b,
                                           item_result='Exception', doc_url=doc_url))
                # Stop processing the remaining documents of this company
                break

        # Save the result only when every document of the company produced a row.
        # NOTE(review): after a failure the loop breaks, so the company is saved
        # (including its error row) only if the failing document was the last
        # one; otherwise nothing is written and the error row is discarded.
        # Confirm this is the intended retry behaviour.
        if not data_list:
            # No rows at all for this company: abandon the rest of this batch
            break
        if len(data_list) == doc_count:
            df_items = pd.DataFrame(data_list)
            file_name = corp_code + '_item_' + myutil.now_dt_str() + '.csv'
            df_items.to_csv(path_dir + file_name, index=False)

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [12:09<00:00, 14.60s/it]
100%|██████████| 50/50 [11:53<00:00, 14.26s/it]
100%|██████████| 50/50 [12:21<00:00, 14.82s/it]
100%|██████████| 50/50 [11:34<00:00, 13.89s/it]
100%|██████████| 50/50 [12:06<00:00, 14.53s/it]
100%|██████████| 50/50 [12:11<00:00, 14.62s/it]
100%|██████████| 42/42 [10:24<00:00, 14.88s/it]

00125530 SPC삼립 20230316001219 사업보고서 (2022.12) http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20230316001219 5. 재무제표 주석 http://dart.fss.or.kr/report/viewer.do?rcpNo=20230316001219&dcmNo=9066329&eleId=22&offset=816226&length=389465&dtd=dart3.xsd
Traceback (most recent call last):
  File "C:\Users\kyungho\AppData\Local\Temp\ipykernel_14440\2406801516.py", line 42, in <module>
    section_page = mydart.get_section_html(soup, subtitle_list, section_title)
  File "d:\PythonProject\data-gatherer\dart_fs_notes\mydart.py", line 124, in get_section_html
    next_title = subtitle_list[idx + 1]
IndexError: list index out of range




