### 데이터 값 찾기

* 문서 URL 목록 가져오기

In [14]:
import pandas as pd

# URL list of the financial-statement-notes documents taken from business reports.
# dtype=object makes pandas read every column without numeric inference, so
# codes such as '01535150' keep their leading zero.
file_path = "D:/PythonProject/data-gatherer/dart_fs_notes/fs_notes/fs_notes_all.csv"
fs_notes_all = pd.read_csv(file_path, dtype=object)

In [15]:
# Sort the rows by company code (ascending is the sort_values default).
fs_notes_all = fs_notes_all.sort_values("corp_code")
# Distinct company codes, in order of first appearance after the sort.
list_all_codes = fs_notes_all["corp_code"].unique()
len(list_all_codes)

53

* 이미 데이터 값을 찾은 것은 대상에서 제외

In [16]:
import os.path
# Company codes whose item values were already extracted and stored.
file_path = "D:/PythonProject/data-gatherer/dart_fs_notes/items/items_all.csv"
if not os.path.exists(file_path):
    # No results file yet: nothing has been processed.
    list_own_codes = []
else:
    items_all = pd.read_csv(file_path, dtype=object)
    list_own_codes = items_all["corp_code"].unique()

len(list_own_codes)

39

In [17]:
# Remove the company codes that already have extracted item values, leaving
# only the codes still to be processed. Going through set() also drops
# duplicates; the order of the resulting list is not guaranteed.
list_target_codes = list(set(list_all_codes) - set(list_own_codes))
len(list_target_codes)

14

* 리스트 분할

In [18]:
# Split the target codes into consecutive chunks of at most n entries each.
n = 1000
list_cnt = len(list_target_codes)
list_of_lists = [list_target_codes[start:start + n] for start in range(0, list_cnt, n)]
len(list_of_lists)

1

* 항목과 값 찾기: 확정급여채무의 현재가치, 사외적립자산의 공정가치

In [21]:
import traceback
import datetime
from time import sleep
import OpenDartReader
import os

# NOTE(review): an API key is committed in this file. Supply it through the
# DART_API_KEY environment variable instead; the literal below remains only as
# a fallback so existing runs keep working, and it should be rotated and
# removed from source control.
api_key = os.environ.get('DART_API_KEY', 'f2e08d4ed3de0ba3d5cbf59c04c223e02b1751a2')
dart = OpenDartReader(api_key)

# Find the monetary unit declared in a document.
def get_unit(text):
    """Return the monetary unit announced in *text*, or None if none is found.

    Looks for a "(단위:<unit>)" or "[단위:<unit>]" marker. The markers contain
    no spaces, so the caller is expected to pass text with spaces removed.
    Units are tried in a fixed priority order: 백만원, 천원, 원.
    """
    for candidate in ('백만원', '천원', '원'):
        if f'(단위:{candidate})' in text or f'[단위:{candidate}]' in text:
            return candidate
    return None

# Remove thousands separators and parentheses, then parse the text as a float.
def str2num(x):
    """Convert a formatted numeric string such as "1,234" or "(1,234)" to float.

    Commas are removed and leading "(" / trailing ")" characters are dropped.
    The parentheses are only stripped, not interpreted as a negative sign, so
    "(1,234)" yields 1234.0. Whitespace around the value or just inside the
    parentheses is ignored, so inputs like " (1,234) " parse instead of
    raising.

    Args:
        x: the text of a numeric table cell.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: if the cleaned text is not a valid float literal
            (for example "-" or an empty string).
    """
    cleaned = x.strip().replace(",", "")
    # Strip the parentheses after the outer whitespace so startswith/endswith
    # style checks cannot be defeated by padding; strip again for "( 1 )".
    cleaned = cleaned.lstrip("(").rstrip(")").strip()
    return float(cleaned)

# Current local time as a compact timestamp: YYYYMMDD_HHMMSS.
def now_dt_str():
    """Return the current local date and time formatted as 'YYYYMMDD_HHMMSS'."""
    return datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

In [20]:
import requests
from bs4 import BeautifulSoup

# Directory where the per-company result files are written.
path_dir = "D:/PythonProject/data-gatherer/dart_fs_notes/items/"

# Seconds to wait for a document download. Without a timeout requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30


def _make_row(doc, unit, val_a, val_b, item_result):
    """Build one output record from a document row plus the extracted values."""
    return {'corp_code': doc['corp_code'], 'stock_code': doc['stock_code'],
            'corp_cls': doc['corp_cls'], 'corp_name': doc['corp_name'],
            'rpt_num': doc['rpt_num'], 'rpt_name': doc['rpt_name'],
            'doc_title': doc['doc_title'], 'doc_result': doc['doc_result'], 'unit': unit,
            'val_a': val_a, 'val_b': val_b, 'item_result': item_result}


def _print_doc(doc):
    """Print the identifying fields of the document that failed."""
    print(doc['corp_code'], doc['corp_name'], doc['rpt_num'], doc['rpt_name'],
          doc['rpt_url'], doc['doc_title'], doc['doc_url'])


# For every target company, scan each of its notes documents for the
# "확정급여채무의 현재가치" (val_a) and "사외적립자산의 공정가치" (val_b) rows and
# write the collected records to one CSV file per company.
for corp_codes in list_of_lists:
    for corp_code in corp_codes:
        df_doc = fs_notes_all.query("corp_code == @corp_code")
        data_list = []
        for _, doc in df_doc.iterrows():
            sleep(0.1)
            try:
                if doc['doc_result'] == 'No Financial Statement Notes':
                    # Nothing to parse: record the document with empty values.
                    data_list.append(_make_row(doc, None, None, None, None))
                    continue

                # Find the monetary unit used in the notes document.
                unit = None
                response = requests.get(doc['doc_url'], timeout=REQUEST_TIMEOUT)
                if response.status_code == 200:
                    doc_text = BeautifulSoup(response.text).text
                    # get_unit() matches markers without spaces, so drop them all.
                    unit = get_unit(doc_text.replace(" ", ""))

                # Find the two amounts in the document's tables.
                val_a = val_b = 0.0
                for df in pd.read_html(doc['doc_url']):
                    if df.shape[1] < 2:
                        # No value column next to the item name; indexing
                        # column 1 would raise IndexError.
                        continue
                    for row_idx in range(df.shape[0]):
                        item = str(df.iloc[row_idx, 0])
                        if '확정급여채무의 현재가치' in item:
                            str_a = str(df.iloc[row_idx, 1])
                            # Matching rows are summed. A "-" cell contributes
                            # nothing and must not wipe the running total.
                            if str_a != "-":
                                val_a += str2num(str_a)
                        elif '사외적립자산의 공정가치' in item:
                            str_b = str(df.iloc[row_idx, 1])
                            # Not summed: the last matching row wins.
                            val_b = 0.0 if str_b == "-" else str2num(str_b)
                    # Stop scanning tables once both amounts have been found.
                    if val_a != 0.0 and val_b != 0.0:
                        break
                data_list.append(_make_row(doc, unit, val_a, val_b, 'OK'))
            except ValueError as err:
                _print_doc(doc)
                print(err)
                break
            except Exception:
                _print_doc(doc)
                print(traceback.format_exc())
                break
        # Save the result to a file.
        if not data_list:
            # NOTE(review): this stops processing the remaining companies of the
            # current chunk when the very first document of a company failed,
            # whereas a failure on a later document saves partial rows and moves
            # on. Confirm whether `continue` was intended.
            break
        df_items = pd.DataFrame(data_list)
        file_name = corp_code + '_item_' + now_dt_str() + '.csv'
        df_items.to_csv(path_dir + file_name, index=False)

01535150 SK리츠 20230314000759 사업보고서 (2022.12) http://dart.fss.or.kr/dsaf001/main.do?rcpNo=20230314000759 5. 재무제표 주석 http://dart.fss.or.kr/report/viewer.do?rcpNo=20230314000759&dcmNo=9054144&eleId=22&offset=569608&length=199453&dtd=dart3.xsd
Traceback (most recent call last):
  File "d:\PythonProject\data-gatherer\data-gatherer.venv\lib\site-packages\urllib3\connectionpool.py", line 790, in urlopen
    response = self._make_request(
  File "d:\PythonProject\data-gatherer\data-gatherer.venv\lib\site-packages\urllib3\connectionpool.py", line 536, in _make_request
    response = conn.getresponse()
  File "d:\PythonProject\data-gatherer\data-gatherer.venv\lib\site-packages\urllib3\connection.py", line 454, in getresponse
    httplib_response = super().getresponse()
  File "C:\Users\kyungho\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 1374, in getresponse
    response.begin()
  File "C:\Users\kyungho\AppData\Local\Programs\Python\Python310\lib\http\client.py", line 318, i