# DART Open API를 이용해 기업 공시 정보를 가져온다. 

DART Open API 사용 연습

해당 코드는 f-string을 사용하므로 Python 3.6 이상에서 동작한다 (3.5에서는 실행되지 않는다). 32bit/64bit 여부는 상관 없다.

In [1]:
# -*- coding: utf-8 -*-

from urllib.request import Request
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np
import pickle

import time
import datetime
import dateutil.parser
import re
import json
import xml.etree.ElementTree as elemTree
import sys

## DART Open API와 연결

DART API에는 4가지 정보가 있고, 각 정보는 더 세부적으로 나뉜다. 
1. 공시정보
2. 사업보고서 주요정보
3. 상장기업 재무정보
4. 지분공시 종합정보

In [2]:
# Read the Open DART API key from a local file so it is not hard-coded here.
# strip() removes surrounding whitespace such as the trailing newline many
# editors append; left in place it would end up inside every request URL.
with open('./DART_password.txt', 'r') as f:
    API_KEY = f.read().strip()

In [3]:
# Query-string prefix carrying the API key; request URLs are built as
# <base url> + crtfc_key + <extra parameters>.
crtfc_key = '?crtfc_key={}'.format(API_KEY)

1. 공시정보: 
    - 공시검색: 공시 유형별, 회사별, 날짜별 등 여러가지 조건으로 공시보고서 검색기능을 제공합니다.
    - 기업개황: DART에 등록되어있는 기업의 개황정보를 제공합니다.
    - 공시서류원본파일: 공시보고서 원본파일을 제공합니다.
    - 고유번호: DART에 등록되어있는 공시대상회사의 고유번호,회사명,대표자명,종목코드, 최근변경일자를 파일로 제공합니다.

기타 세부사항은 API doc에서 확인: https://opendart.fss.or.kr/guide/detail.do?apiGrpCd=DS001&apiId=2019001

In [4]:
## Base URLs of the "disclosure information" API group

(
    DART_list_json,
    DART_company_json,
    DART_document_xml,
    DART_corpCode_xml,
) = (
    'https://opendart.fss.or.kr/api/list.json',      # disclosure search
    'https://opendart.fss.or.kr/api/company.json',   # company overview
    'https://opendart.fss.or.kr/api/document.xml',   # original disclosure document file
    'https://opendart.fss.or.kr/api/corpCode.xml',   # corporation unique codes
)

In [6]:
def DART_annc_info(info_type, **kwargs):
    """Build the request URL for one of the Open DART disclosure endpoints.

    Args:
        info_type (str): Endpoint to query. One of 'list', 'company',
            'document' or 'corpCode'.

    Kwargs:
        Query parameters for the chosen endpoint (refer to the Open DART API
        guide). Values are converted with str() and percent-encoded, so
        non-ASCII or reserved characters produce a valid URL.

    Returns:
        str or None.
        The complete URL to hand over to Open DART API, or None (after
        printing the valid choices) when info_type is not recognised.

    """
    # Local import: the file's import block only pulls in urllib.request names.
    from urllib.parse import urlencode

    base_urls = {
        'list': DART_list_json,
        'company': DART_company_json,
        'document': DART_document_xml,
        'corpCode': DART_corpCode_xml,
    }

    if info_type not in base_urls:
        print('Wrong info_type. Choose from:')
        print('''
        1. "list": 공시검색
        2. "company": 기업개활
        3. "document": 공시서류원본파일
        4. "corpCode": 고유번호
        ''')
        return None

    # The API key already opens the query string with '?', so every extra
    # parameter is appended with a leading '&'.
    parameters = ''
    if kwargs:
        parameters = '&' + urlencode({k: str(v) for k, v in kwargs.items()})

    return base_urls[info_type] + crtfc_key + parameters

In [7]:
def DART_get_response(request_url):
    """Fetch a URL and parse the body as JSON, falling back to XML.

    Args:
        request_url (str): The url to request.

    Returns:
        tuple or int.
        ('json', parsed JSON object) when the body is valid JSON,
        ('xml', xml.etree.ElementTree.Element) when it is valid XML instead,
        0 (after printing the error type) when it is neither.

    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(request_url) as req:
        response = req.read().decode('utf8')

    try:
        return ('json', json.loads(response))
    except json.JSONDecodeError:
        # Not JSON: fall through and try XML. The exception class must be
        # referenced through the json module; a bare JSONDecodeError is not
        # defined in this file and would raise NameError here.
        pass

    try:
        return ('xml', elemTree.fromstring(response))
    except elemTree.ParseError:
        print("An error occurred: ", sys.exc_info()[0])
        return 0

In [8]:
# Example: disclosure-search URL for one corp_code over a fixed date window.
req_url = DART_annc_info(
    info_type='list',
    corp_code='00919966',
    bgn_de='20130801',
    end_de='20150815',
)
req_url

'https://opendart.fss.or.kr/api/list.json?crtfc_key=407c1fe7fc7a1a183002c6d5f981408662cd879e&corp_code=00919966&bgn_de=20130801&end_de=20150815'

In [9]:
# Send the example request and display the parsed (type, payload) result.
DART_get_response(request_url=req_url)

('json',
 {'status': '000',
  'message': '정상',
  'page_no': 1,
  'page_count': 10,
  'total_count': 9,
  'total_page': 1,
  'list': [{'corp_code': '00919966',
    'corp_name': '신라젠',
    'stock_code': '215600',
    'corp_cls': 'K',
    'report_nm': '분기보고서 (2015.03)',
    'rcept_no': '20150601000841',
    'flr_nm': '신라젠',
    'rcept_dt': '20150601',
    'rm': '정'},
   {'corp_code': '00919966',
    'corp_name': '신라젠',
    'stock_code': '215600',
    'corp_cls': 'K',
    'report_nm': '주요사항보고서(중요한자산양수도결정)',
    'rcept_no': '20150430001501',
    'flr_nm': '신라젠',
    'rcept_dt': '20150430',
    'rm': ''},
   {'corp_code': '00919966',
    'corp_name': '신라젠',
    'stock_code': '215600',
    'corp_cls': 'K',
    'report_nm': '[기재정정]사업보고서 (2014.12)',
    'rcept_no': '20150423000246',
    'flr_nm': '신라젠',
    'rcept_dt': '20150423',
    'rm': '연'},
   {'corp_code': '00919966',
    'corp_name': '신라젠',
    'stock_code': '215600',
    'corp_cls': 'K',
    'report_nm': '[기재정정]사업보고서 (2014.12)',
    'r

## 공시시간 크롤링

문의 결과, DART Open API는 현재 공시시간 정보를 제공하지 않는다. (아직 시범운영 기간임을 감안해야 한다.)

따라서, 최근 공시 페이지는 직접 크롤링하기로 하였다. 다소 번거롭지만 API에서 지원이 되기 전까진 DART API에서 기본적인 공시 정보를 가져오고, 분단위의 공시시간이 필요한 경우 해당 날짜의 최근 공시를 크롤링한 결과와 대조해 결과를 매칭시키도록 한다. 

In [10]:
def get_recent_annc_list_bs(date):
    """Scrape recent announcements of a specific date.

    Requests up to 10 result pages of the DART recent-announcements listing
    and collects the table rows, dropping rows that carry the site's
    "no data found" message.

    Args:
        date (str): The date to scrape. Should be in YYYY.MM.DD format.

    Returns:
        list or int.
        The list of beautifulsoup objects that each contains an announcement,
        or 0 (after printing an error) when the date format is wrong.

    """
    # fullmatch instead of match with '^...$': '$' also matches just before a
    # trailing newline, which would let 'YYYY.MM.DD\n' through into the URL.
    if not re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', date):
        print("Error: Date format should be - yyyy.mm.dd")
        return 0

    no_data_text = '검색된 자료가 없습니다.'
    recent_annc_list_bs = []

    for page in range(1, 11):
        recent_annc_url = f'http://dart.fss.or.kr/dsac001/mainK.do?selectDate={date}&currentPage={page}&sort=&series=&mdayCnt=0#'
        # Parse inside the with-block so the HTTP response is always closed.
        with urlopen(recent_annc_url) as recent_annc_req:
            recent_annc_bs = bs(recent_annc_req, 'html.parser')

        page_rows = [
            row
            for row in recent_annc_bs.select('div.table_list > table > tr')
            if no_data_text not in row.text
        ]
        # A page without announcement rows means the listing is exhausted;
        # requesting the remaining pages would add nothing to the result.
        if not page_rows:
            break
        recent_annc_list_bs += page_rows

    return recent_annc_list_bs

In [11]:
def annc_bs2data(annc_bs):
    """Extract the fields of one announcement row into a plain dict.

    Args:
        annc_bs (bs object): An announcement's beautifulsoup object.

    Returns:
        dict.
        {
            'annc_time': first HH:MM match in the 'cen_txt' cell text,
            'corp_code': first 8-digit run in the company link's onclick,
            'annc_title': report link text with escape sequences removed,
            'annc_id': first digit run in the report link's id attribute,
        }

    """
    time_cell = annc_bs.find('td', attrs={'class': 'cen_txt'})
    annc_time = re.search(r'\d\d:\d\d', time_cell.text).group()

    company_onclick = annc_bs.find('span', {'class': 'nobr1'}).a.attrs['onclick']
    corp_code = re.search(r'\d{8}', company_onclick).group()

    # The report link is the <a> whose onclick mentions openReportViewer.
    report_link = annc_bs.find('a', attrs={'onclick': re.compile(r'openReportViewer')})

    # repr() spells control characters such as tab/newline as two-character
    # escapes (backslash + letter); dropping the surrounding quotes and then
    # every backslash-plus-one-character pair strips them from the title.
    escaped_title = repr(report_link.text)[1:-1]
    annc_title = re.sub(r'\\.', '', escaped_title).strip()

    annc_id = re.search(r'\d+', report_link.attrs['id']).group()

    return {
        'annc_time': annc_time,
        'corp_code': corp_code,
        'annc_title': annc_title,
        'annc_id': annc_id,
    }

In [22]:
def recent_anncs2df(start_date, end_date, save=False, logging=False, delay=(50, 300, 2)):
    """The main function of DART recent announcements scraping. Scrape data in given date range and convert them to pandas df.

    Args:
        start_date (str or int): Start date (parsed with dateutil).
        end_date (str or int): End date (parsed with dateutil).
        save (bool): If True, pickle the result and write the failed dates
            to a text file, both in the current directory.
        logging (bool): Print logs if True.
        delay (tuple): (
                        how many iterations before long pause,
                        long pause seconds,
                        each iteration pause seconds
                        )

    Returns:
        pandas dataframe
        Pandas dataframe of announcement data in given date range.
        columns = ['corp_code', 'annc_title', 'annc_id', 'datetime']

    """
    columns = ['date', 'annc_time', 'corp_code', 'annc_title', 'annc_id']

    start_date = dateutil.parser.parse(str(start_date))
    end_date = dateutil.parser.parse(str(end_date))
    date_range = pd.date_range(start=start_date, end=end_date).tolist()

    daily_frames = []
    error_dates = []

    for i, date in enumerate(date_range):
        try:
            time.sleep(delay[2])
            # The long pause throttles the scraper, so it must happen whether
            # or not logging is enabled; only the message is optional.
            if i != 0 and i % delay[0] == 0:
                if logging:
                    print(f'sleeping for {delay[1]} seconds...')
                time.sleep(delay[1])

            anncs_of_the_day = get_recent_annc_list_bs(date.strftime('%Y.%m.%d'))
            anncs_of_the_day = [annc_bs2data(annc) for annc in anncs_of_the_day]
            # Days without announcements contribute no frame, which keeps
            # all-empty frames out of the final concat.
            if anncs_of_the_day:
                anncs_df = pd.DataFrame(anncs_of_the_day, columns=columns)
                anncs_df['date'] = date
                daily_frames.append(anncs_df)

            if logging:
                print(f'Added data of {date}')
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still stop a run.
            print(f'Error occured at {date}. Sleeping for {delay[1]} seconds...')
            error_dates.append(date)
            time.sleep(delay[1])

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # No ignore_index, so each day's rows keep their per-day positions.
    if daily_frames:
        all_anncs_df = pd.concat(daily_frames)
    else:
        all_anncs_df = pd.DataFrame(columns=columns)

    # Combine the calendar day and the HH:MM time into one timestamp column.
    day_text = pd.to_datetime(all_anncs_df['date']).dt.strftime('%Y-%m-%d')
    all_anncs_df['datetime'] = pd.to_datetime(day_text + ' ' + all_anncs_df['annc_time']).to_numpy()
    all_anncs_df = all_anncs_df.drop(['date', 'annc_time'], axis=1)

    if save:
        # Built separately: quotes nested inside an f-string delimited by the
        # same quote character are a SyntaxError before Python 3.12.
        date_span = f"{start_date.strftime('%Y.%m.%d')}-{end_date.strftime('%Y.%m.%d')}"
        all_anncs_df.to_pickle(f"./all_anncs_df_{date_span}.pkl")
        with open(f"./error_dates_{date_span}.txt", 'w') as f:
            f.writelines("{}\n".format(err_date) for err_date in error_dates)

    if error_dates:
        print('There were error with these dates:')
        for err_date in error_dates:
            print(err_date)

    return all_anncs_df

In [None]:
%%time

# Full scrape from 2014-01-01 to 2020-04-04, saving the results and printing progress.
recent_anncs2df(start_date=20140101, end_date=20200404, save=True, logging=True)

Added data of 2014-01-01 00:00:00
Added data of 2014-01-02 00:00:00
Added data of 2014-01-03 00:00:00
Added data of 2014-01-04 00:00:00
Added data of 2014-01-05 00:00:00
Added data of 2014-01-06 00:00:00
Added data of 2014-01-07 00:00:00
Added data of 2014-01-08 00:00:00
Added data of 2014-01-09 00:00:00
Added data of 2014-01-10 00:00:00
Added data of 2014-01-11 00:00:00
Added data of 2014-01-12 00:00:00
Added data of 2014-01-13 00:00:00
Added data of 2014-01-14 00:00:00
Added data of 2014-01-15 00:00:00
Added data of 2014-01-16 00:00:00
Added data of 2014-01-17 00:00:00
Added data of 2014-01-18 00:00:00
Added data of 2014-01-19 00:00:00
Added data of 2014-01-20 00:00:00
Added data of 2014-01-21 00:00:00
Added data of 2014-01-22 00:00:00
Added data of 2014-01-23 00:00:00
Added data of 2014-01-24 00:00:00
Added data of 2014-01-25 00:00:00
Added data of 2014-01-26 00:00:00
Added data of 2014-01-27 00:00:00
Added data of 2014-01-28 00:00:00
Added data of 2014-01-29 00:00:00
Added data of 