# Gathering data

1. 한국은행 금융통화위원회 의사록
2. '금리'를 포함한 연합인포맥스, 연합뉴스, 이데일리 뉴스 기사
3. 네이버 금융 각 증권사의 채권분석리포트 
4. 일별 콜금리
5. 한국은행 기준금리

In [1]:
# -*- coding: utf-8 -*-

import pandas as pd

from bs4 import BeautifulSoup as bs
from urllib.request import Request
from urllib.request import urlopen
from urllib.request import urlretrieve

import olefile
import re
import datetime

## 1. 한국은행 금융통화위원회 의사록

In [307]:
# Listing page of the Bank of Korea board scraped below; the page number is
# appended as a "&pageIndex=N" query parameter when a page is requested.
bok_mpb_url = "https://www.bok.or.kr/portal/bbs/B0000245/list.do?menuNo=200761"

In [309]:
def get_date_and_title(page_bs):
    """Extract the date and title of every entry listed on a board page.

    Every ``<span class="titlesub">`` element on the page is inspected. The
    date is the first ``YYYY.M.D``-shaped token in the span's text; the title
    is the text matched by ``.*\\)\\(`` with the trailing ``(`` removed.

    Args:
        page_bs: Parsed page exposing ``find_all`` (a BeautifulSoup object).

    Returns:
        A list of ``{'date': datetime.date, 'title': str}`` dicts, one per
        span, in page order.

    Raises:
        AttributeError: If a span's text lacks the date or the title pattern.
        ValueError: If the matched date token is not a valid calendar date.
    """
    date_pattern = re.compile(r'\d{4}\.\d+\.\d+')
    title_pattern = re.compile(r'.*\)\(')

    entries = []
    for span in page_bs.find_all('span', {'class': 'titlesub'}):
        text = span.text

        raw_date = date_pattern.search(text).group()
        parsed_date = datetime.datetime.strptime(raw_date, "%Y.%m.%d").date()

        # The pattern ends on "(" only as an anchor; it is not part of the title.
        heading = title_pattern.search(text).group()[:-1]

        # Assumes no more than one set of minutes is posted per day.
        entries.append({'date': parsed_date, 'title': heading})

    return entries

In [311]:
def find_download_link_ancestor_from_elem(elem):
    """Return the closest element, starting at *elem*, that contains a download link.

    The search begins with *elem* itself and then moves up through
    ``.parent`` until an element is found that has a descendant ``<a>`` tag
    whose ``href`` contains ``fileDown``.

    Args:
        elem: Element exposing ``find`` and ``parent`` (a BeautifulSoup tag).

    Returns:
        *elem* or the nearest ancestor that contains such a link.

    Raises:
        ValueError: If neither *elem* nor any of its ancestors contains a
            ``fileDown`` link.
    """
    file_regex = re.compile(r'fileDown')

    node = elem
    # Stop at the root (whose parent is None) instead of calling .find on None.
    while node is not None:
        if node.find('a', {'href': file_regex}):
            return node
        node = node.parent

    raise ValueError("no 'fileDown' link found in the element or any of its ancestors")

In [319]:
def download_mpb_minute(title_bs, title, date_string):
    """Download every file attached to one board entry.

    The element around *title_bs* that holds the ``fileDown`` links is
    located first. Each link is then paired, by position, with a file-type
    label (``hwp`` or ``pdf``) taken from the elements whose ``title``
    attribute mentions one of those types. A link without a label is treated
    as ``hwp``, which is also what happens when no labels are found at all.

    Files are saved as
    ``./data_files/BOK_minutes/<type>/<date_string>_-_<title>.<type>``.

    Args:
        title_bs: The entry's title element (a BeautifulSoup tag).
        title: Entry title used in the file name; characters that are not
            allowed in file names are removed.
        date_string: Date prefix used in the file name.
    """
    import os  # local import: the notebook's import cell does not provide os

    download_link_ancestor = find_download_link_ancestor_from_elem(title_bs)

    file_regex = re.compile(r'fileDown')
    link_tags = download_link_ancestor.find_all(attrs={'href': file_regex})
    href_list = [tag.attrs['href'] for tag in link_tags]

    file_type_regex = re.compile(r'hwp|pdf')
    file_type_list = []
    for label in download_link_ancestor.find_all(attrs={'title': file_type_regex}):
        # Prefer the visible text; fall back to the title attribute, which is
        # what selected this element and therefore always contains a type.
        match = file_type_regex.search(label.text) or file_type_regex.search(label.attrs['title'])
        file_type_list.append(match.group())

    href2file_type = {}
    for idx, href in enumerate(href_list):
        # Links without a matching label default to 'hwp' rather than raising.
        href2file_type[href] = file_type_list[idx] if idx < len(file_type_list) else 'hwp'

    # Same forbidden-character set the bond-report downloader strips.
    safe_title = re.sub(r'[\\/\*"\?><\|:]', "", title)

    for href in href_list:
        file_type = href2file_type[href]
        target_dir = './data_files/BOK_minutes/' + file_type
        # urlretrieve does not create missing directories.
        os.makedirs(target_dir, exist_ok=True)

        download_link = 'https://www.bok.or.kr' + href
        urlretrieve(download_link, target_dir + '/' + date_string + '_-_' + safe_title + '.' + file_type)

In [315]:
def get_mpb_minutes_files(page):
    """Download the attachments of every entry on one listing page.

    Fetches ``bok_mpb_url`` with ``&pageIndex=<page>`` appended, parses it,
    and calls ``download_mpb_minute`` once per ``<span class="titlesub">``
    entry with that entry's title and ISO-formatted date.

    Args:
        page: Page number of the board listing to process.

    Returns:
        Always ``1``.
    """
    print("Downloading MPB documents on page {}...".format(page))

    page_url = bok_mpb_url + "&pageIndex=" + str(page)
    # The response is read in full while parsing, so it can be closed right
    # afterwards instead of leaking one open connection per page.
    with urlopen(page_url) as page_req:
        page_bs = bs(page_req, 'html.parser')

    # Same query get_date_and_title runs, so both lists line up one-to-one.
    title_bs_list = page_bs.find_all('span', {'class': 'titlesub'})
    date_and_titles = get_date_and_title(page_bs)

    for title_bs, entry in zip(title_bs_list, date_and_titles):
        download_mpb_minute(title_bs, entry['title'], str(entry['date']))
    return 1

In [320]:
# Walk listing pages 1 through 31 (the upper bound of range is exclusive).
for page in range(1, 32):
    get_mpb_minutes_files(page)

Downloading MPB documents on page 1...
Downloading MPB documents on page 2...
Downloading MPB documents on page 3...
Downloading MPB documents on page 4...
Downloading MPB documents on page 5...
Downloading MPB documents on page 6...
Downloading MPB documents on page 7...
Downloading MPB documents on page 8...
Downloading MPB documents on page 9...
Downloading MPB documents on page 10...
Downloading MPB documents on page 11...
Downloading MPB documents on page 12...
Downloading MPB documents on page 13...
Downloading MPB documents on page 14...
Downloading MPB documents on page 15...
Downloading MPB documents on page 16...
Downloading MPB documents on page 17...
Downloading MPB documents on page 18...
Downloading MPB documents on page 19...
Downloading MPB documents on page 20...
Downloading MPB documents on page 21...
Downloading MPB documents on page 22...
Downloading MPB documents on page 23...
Downloading MPB documents on page 24...
Downloading MPB documents on page 25...
Downloadi

## 2. 네이버금융 각 증권사 채권분석리포트

In [2]:
# Listing page of the Naver Finance report board scraped below; the page
# number is appended as "&page=N" when a page is requested.
naver_bond_url = "https://finance.naver.com/research/debenture_list.nhn?"

In [15]:
def get_bond_info_and_download(page):
    """Download every report linked on one listing page and describe them.

    Fetches ``naver_bond_url`` with ``&page=<page>`` appended and selects the
    table rows that contain a ``td.file`` cell. For each such row the cells
    are read by position: 0 = title, 1 = company, 2 = cell holding the
    download link, 3 = date in ``yy.mm.dd`` form. The linked file is saved as
    ``./data_files/NAVER_bond_reports/<date>_<company><title>.pdf`` with
    characters that are not allowed in file names removed from the title.

    Args:
        page: Page number of the report listing to process.

    Returns:
        A list of dicts with keys ``'report_title'``, ``'company'`` and
        ``'date'`` (a ``datetime.date``), one per downloaded report.
    """
    import os  # local import: the notebook's import cell does not provide os

    print("Downloading bond reports on page {}".format(page))

    page_url = naver_bond_url + "&page=" + str(page)
    # The response is read in full while parsing, so it can be closed right
    # afterwards instead of leaking one open connection per page.
    with urlopen(page_url) as page_req:
        page_bs = bs(page_req, 'html.parser')

    file_cells = page_bs.select("div.box_type_m > table.type_1 > tr > td.file")
    report_rows = [cell.parent for cell in file_cells]

    # Compiled once per page rather than once per row.
    forbidden_regex = re.compile(r'[\\/\*"\?><\|:]')

    target_dir = './data_files/NAVER_bond_reports/'
    # urlretrieve does not create missing directories.
    os.makedirs(target_dir, exist_ok=True)

    reports_info_list = []
    for row in report_rows:
        td_elements = row.select('td')

        title = td_elements[0].text
        company = td_elements[1].text
        date = datetime.datetime.strptime(td_elements[3].text, "%y.%m.%d").date()
        download_link = td_elements[2].a.attrs['href']

        urlretrieve(download_link, target_dir + str(date) + "_" + company + forbidden_regex.sub("", title) + ".pdf")

        reports_info_list.append({
            'report_title': title,
            'company': company,
            'date': date,
        })
    return reports_info_list

In [16]:
# Walk listing pages 1 through 121 (the upper bound of range is exclusive).
for page in range(1, 122):
    get_bond_info_and_download(page)

Downloading bond reports on page 1
Downloading bond reports on page 2
Downloading bond reports on page 3
Downloading bond reports on page 4
Downloading bond reports on page 5
Downloading bond reports on page 6
Downloading bond reports on page 7
Downloading bond reports on page 8
Downloading bond reports on page 9
Downloading bond reports on page 10
Downloading bond reports on page 11
Downloading bond reports on page 12
Downloading bond reports on page 13
Downloading bond reports on page 14
Downloading bond reports on page 15
Downloading bond reports on page 16
Downloading bond reports on page 17
Downloading bond reports on page 18
Downloading bond reports on page 19
Downloading bond reports on page 20
Downloading bond reports on page 21
Downloading bond reports on page 22
Downloading bond reports on page 23
Downloading bond reports on page 24
Downloading bond reports on page 25
Downloading bond reports on page 26
Downloading bond reports on page 27
Downloading bond reports on page 28
D