# 有報キャッチャーから有価証券報告書を取得する

In [3]:
import pandas as pd
import re
import os
import pickle
from selenium import webdriver
from selenium.webdriver.support.select import Select
import datetime

## 1. 決算短信のURL一覧を取得する

In [4]:
def display_day(driver, day):
    ID_START = 'ctl00_contentsCopy_IrList1_TextBoxFrom'
    ID_END = 'ctl00_contentsCopy_IrList1_TextBoxTo'
    
    for id in [ID_START, ID_END]:
        element = driver.find_element_by_id(id)
        element.clear()
        element.send_keys(day)
    
    search_btn = driver.find_element_by_id('ctl00_contentsCopy_IrList1_ButtonSearch2')
    driver.find_element_by_id(ID_START).click()
    search_btn.click()

In [5]:
def get_doctitle_url(element, doc_num):
    code_id = 'ctl00_contentsCopy_IrList1_ListViewIR_ctrl' + str(doc_num)+ '_HyperLinkCompany'
    code_element = element.find_element_by_id(code_id)
    code = re.search(r'\d+', code_element.text).group(0)

    ym_id = 'ctl00_contentsCopy_IrList1_ListViewIR_ctrl' + str(doc_num) + '_lblDisclosureDate'
    ym_element = element.find_element_by_id(ym_id)
    ym = ym_element.text.replace('/', '')

    doc_element_id = 'ctl00_contentsCopy_IrList1_ListViewIR_ctrl' + str(doc_num) + '_lblDocName'
    doc_element = element.find_element_by_id(doc_element_id)
    doc_title = doc_element.text

    title = '_'.join([code, ym, doc_title])
    url = doc_element.get_attribute('href')
    result = [title, url]

    return result

In [6]:
def make_day_list(start_date, end_date):
    """開始日時と終了日時を設定すると、期間内の日付がすべて入ったリストが返される

    Args:
        start_date (datetime.date): 開始日時
        end_date (datetime.date): 終了日時

    Returns:
        list: 開始日時から終了日時までのすべての日付が格納されたリスト
    """
    period = int((end_date - start_date).days)
    day_list = []
    for d in range(period):
        day = start_date + datetime.timedelta(days=d)
        day = str(day).replace('-', '/')
        day_list.append(day)
    
    day_list.append(str(end_date).replace('-', '/'))
    print(f'start_date is {day_list[0]}')
    print(f'end_date is {day_list[-1]}')
    print(f'period is {period} days')

    return day_list

In [7]:
def get_pageinfo(driver, day):
    try:
        # 検索対象が見つかった場合
        table = driver.find_element_by_id('ctl00_contentsCopy_IrList1_ListViewIR_tblProducts')
        trs = table.find_elements_by_tag_name('tr')

        titles = []
        urls = []
        for i, e in enumerate(trs):
            doc_info = get_doctitle_url(e, i)
            titles.append(doc_info[0])
            urls.append(doc_info[1])
        
        df = pd.DataFrame(columns=['title', 'url'])
        df['title'] = titles
        df['url'] = urls
        print(f'{day}に提出された書類のURL取得完了')
        return df

    except:
        # 検索対象が見つからなかった場合
        print(f"{day}に提出された有価証券報告書は存在しません。")
        
        return None

In [8]:
def get_dayinfo(driver, day):
    display_day(driver, day)

    try:
        page_change_id = 'ctl00_contentsCopy_IrList1_DropDownListPage'
        page_change_element = driver.find_element_by_id(page_change_id)
        select = Select(page_change_element)
        selectOps = select.options

        df_list = []
        for o in selectOps:
            if o.text == '1ページ':
                df = get_pageinfo(driver, day)
            else:
                select.select_by_visible_text(o.text)
                df = get_pageinfo(driver, day)
            
            df_list.append(df)
    except:
        df_list = [get_pageinfo(driver, day)]
    
    return df_list

In [9]:
def get_yuho_url(start, end):
    url = 'https://ufocatch.com/Xir.aspx?m=y'
    driver = webdriver.Chrome('../driver/chromedriver.exe')
    driver.get(url)
    search_box = driver.find_element_by_id('ctl00_contentsCopy_IrList1_TextBoxSearch')
    search_box.send_keys('有価証券報告書')

    for year in range(start, end):
        start = datetime.date(year, 1, 1)
        end = datetime.date(year, 12, 31)
        period = int((end - start).days) + 1
        day_list = make_day_list(start, end)

        df_list = []
        count = 1
        for day in day_list:
            print(f'now loading: {day}, 進捗率: {count}/{period}')
            df_list_1day = get_dayinfo(driver, day)
            count += 1
            df_list.append(df_list_1day)

        df_list_year = [d2 for d in df_list for d2 in d if not d2 is None]
        df_all = pd.concat(df_list_year)

        df_all.reset_index(inplace=True)
        df_all.drop(columns='index', inplace=True)

        file_path = '../../data/EDINET/download_list/' + str(year) +\
             '_yuhochatcher_dl_list.pickle'
        with open(file_path, mode='wb') as file:
            pickle.dump(df_all, file)

In [66]:
start = 2014
end = 2017
get_yuho_url(start, end)

start_date is 2014/01/01
end_date is 2014/12/31
period is 364 days
now loading: 2014/01/01, 進捗率: 1/365
2014/01/01に提出された有価証券報告書は存在しません。
now loading: 2014/01/02, 進捗率: 2/365
2014/01/02に提出された有価証券報告書は存在しません。
now loading: 2014/01/03, 進捗率: 3/365
2014/01/03に提出された有価証券報告書は存在しません。
now loading: 2014/01/04, 進捗率: 4/365
2014/01/04に提出された有価証券報告書は存在しません。
now loading: 2014/01/05, 進捗率: 5/365
2014/01/05に提出された有価証券報告書は存在しません。
now loading: 2014/01/06, 進捗率: 6/365
2014/01/06に提出された有価証券報告書は存在しません。
now loading: 2014/01/07, 進捗率: 7/365
2014/01/07に提出された有価証券報告書は存在しません。
now loading: 2014/01/08, 進捗率: 8/365
2014/01/08に提出された有価証券報告書は存在しません。
now loading: 2014/01/09, 進捗率: 9/365
2014/01/09に提出された書類のURL取得完了
now loading: 2014/01/10, 進捗率: 10/365
2014/01/10に提出された書類のURL取得完了
now loading: 2014/01/11, 進捗率: 11/365
2014/01/11に提出された有価証券報告書は存在しません。
now loading: 2014/01/12, 進捗率: 12/365
2014/01/12に提出された有価証券報告書は存在しません。
now loading: 2014/01/13, 進捗率: 13/365
2014/01/13に提出された有価証券報告書は存在しません。
now loading: 2014/01/14, 進捗率: 14/365
2014/01/14に提出された書類

In [10]:
def save_all_dl():
    os.chdir('C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list')
    dl_list = []
    for year in range(2008, 2017):
        path = str(year) + '_yuhochatcher_dl_list.pickle'
        with open(path, mode='rb') as file:
            dl = pickle.load(file)
            dl_list.append(dl)

    dl_all = pd.concat(dl_list, axis=0)
    dl_all.reset_index(inplace=True)
    dl_all.to_csv('yuhochatcher_all_dl_list.csv', encoding='CP932')
    with open('yuhochatcher_all_dl_list.pickle', mode='wb') as file:
        pickle.dump(dl_all, file)

In [149]:
save_all_dl()

In [11]:
# ダウンロードする際の正しい書類名、URLに修正する
def fix_url(start, end):
    os.chdir('C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list')
    df_list = []
    for year in range(start, end):
        path = str(year) + '_yuhochatcher_dl_list.pickle'
        with open(path, mode='rb') as file:
            df = pickle.load(file)
        
        df = df[~df['title'].str.contains('訂正') \
            & ~df['title'].str.contains('修正') \
            & ~df['title'].str.contains('数値') \
            & ~df['title'].str.contains('データ')]
        df['title'] = df['title'].str.replace(' ', '').str.replace('　', '')
        
        url_list = list(df['url'])
        new_url_list = []
        for url in url_list:
            url_split = url.split('=')
            common = 'https://resource.ufocatch.com/data/edinet/'
            code = url_split[1].split('&')[0] # ED~~の部分だけ抜く操作
            new_url = common + code
            new_url_list.append(new_url)

        new_df = df.drop(columns='url', inplace=False)
        new_df['url'] = new_url_list

        path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list_fixed/'\
             + str(year) + '_yuhochatcher_dl_list_fixed.pickle'
        with open(path, mode='wb') as file:
            pickle.dump(new_df, file)

        df_list.append(new_df)

    path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list_fixed/'\
             + 'yuhochatcher_all_dl_list_fixed.pickle'
    df_all = pd.concat(df_list, axis=0)
    df_all.reset_index(inplace=True)
    with open(path, mode='wb') as file:
        pickle.dump(df_all, file)

In [48]:
start = 2008
end = 2017
fix_url(start, end)

## 2. URLの一覧から決算短信XBRLファイルをダウンロードする

In [3]:
import pandas as pd
import requests
import re
import os
import pickle
from selenium import webdriver
from selenium.webdriver.support.select import Select
import datetime

In [2]:
def save_file(year, n=0):
    os.chdir('C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list_fixed/')
    path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/data/EDINET/download_list_fixed/' + str(year) + '_yuhochatcher_dl_list_fixed.pickle'
    with open(path, mode='rb') as file:
        df = pickle.load(file)

    all_num = df.shape[0]
    filename_list = 'D:/Workshop_Data/securities/' + str(year) + '/' + df['title'] + '.zip'
    url_list = df['url']
    filename_list = filename_list[n:]
    url_list = url_list[n:]
    for filename, url in zip(filename_list, url_list):
        doc = requests.get(url=url)
        try:
            with open(filename, mode='wb') as file:
                file.write(doc.content)
        except:
            pass
        
        print(f'{year}: {n + 1} / {all_num} was saved.')
        n += 1

In [1]:
start = 2013
end = 2017
for year in range(start, end):
    save_file(year, n=0)

NameError: name 'save_file' is not defined