# XBRLファイルから株主データを取得する

## 1. XBRLからHTML形式で株主テーブルを取得する

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
import zipfile
import pickle
from bs4 import BeautifulSoup
from arelle import ModelManager
from arelle import Cntlr

In [2]:
def unzip(zip_path):
    """Extract every ``*.zip`` archive found directly inside ``zip_path``.

    Each archive is extracted into ``zip_path`` itself. A file that has a
    ``.zip`` extension but is not a valid archive is reported and skipped.

    Args:
        zip_path: Directory that holds the archives and receives their contents.

    Returns:
        None
    """
    zip_files = glob.glob(os.path.join(zip_path, '*.zip'))
    len_files = len(zip_files)  # loop-invariant, so computed once
    for i, f in enumerate(zip_files, start=1):
        print(f'file:{f}, loading {i} / {len_files}')
        try:
            with zipfile.ZipFile(f) as zip_f:
                zip_f.extractall(zip_path)
        except zipfile.BadZipFile:
            # Only "this is not a zip archive" is expected here; anything else
            # (disk full, permission problems, Ctrl-C) must not be hidden
            # behind this message.
            print('zipファイルではありません。')

    return None

In [None]:
# Extract all downloaded archives in place (machine-specific absolute path).
zip_path = 'D:/Workshop_Data/new_release/'
unzip(zip_path)

In [3]:
def xbrl(xbrl_path, intermediate_path, recursive=False):
    """Parse XBRL instances and pickle filer metadata plus the shareholder HTML.

    Every file matched by ``xbrl_path`` is loaded with Arelle and scanned for
    five facts (EDINET code, security code, filer name, filing date and the
    ``SummaryOfShareholdersTextBlock`` text block). A value that is not found
    stays ``0``. When a concept occurs more than once the last fact wins.

    Args:
        xbrl_path: Glob pattern selecting the ``.xbrl`` files.
        intermediate_path: Destination of the pickled DataFrame with columns
            ``edinet_code``, ``security_code``, ``firm_name``, ``filling_ymd``
            and ``stk_html`` (one row per file).
        recursive: Passed to ``glob.glob`` so that ``**`` patterns work.

    Returns:
        None. The result is written to ``intermediate_path``.
    """
    columns = ['edinet_code', 'security_code', 'firm_name', 'filling_ymd', 'stk_html']
    # Concept local name -> (label printed to the log or None, output column).
    targets = {
        'EDINETCodeDEI': ('EDINETコード', 'edinet_code'),
        'SecurityCodeDEI': ('証券コード', 'security_code'),
        'FilerNameInJapaneseDEI': ('企業名', 'firm_name'),
        'FilingDateCoverPage': ('提出日', 'filling_ymd'),
        'SummaryOfShareholdersTextBlock': (None, 'stk_html'),  # too long to print
    }
    records = []

    xbrl_files = glob.glob(xbrl_path, recursive=recursive)
    length = len(xbrl_files)

    for i, xbrl_file in enumerate(xbrl_files):
        print(f'{i + 1} / {length}')
        ctrl = Cntlr.Cntlr()
        model_manager = ModelManager.initialize(ctrl)
        record = dict.fromkeys(columns, 0)  # 0 marks "not found"
        try:
            model_xbrl = model_manager.load(xbrl_file)
            facts = model_xbrl.facts if model_xbrl is not None else []
            for fact in facts:
                concept = fact.concept
                if concept is None:
                    # Fact whose element is not defined in the loaded taxonomy.
                    continue
                target = targets.get(concept.qname.localName)
                if target is None:
                    continue
                label, column = target
                if label is not None:
                    print(f'{label}：{fact.value}')
                record[column] = fact.value
        finally:
            # Release the loaded model; otherwise every parsed filing stays in
            # memory for the whole run.
            model_manager.close()

        records.append(record)
        print(record['edinet_code'], record['firm_name'], record['filling_ymd'])

    # Same construction as before (one list per column, then transposed) so the
    # pickled frame keeps its original layout and dtypes.
    df_stk = pd.DataFrame(data=[[record[c] for record in records] for c in columns]).T
    df_stk.columns = columns
    with open(intermediate_path, mode='wb') as file:
        pickle.dump(df_stk, file)

In [None]:
# Files downloaded from EDINET
xbrl_path = 'D:/Workshop_Data/new_release/XBRL/PublicDoc/*.xbrl'
intermediate_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_0924.pickle'
xbrl(xbrl_path, intermediate_path)

# Files obtained from Yuho Catcher; the pattern uses ``**``, hence recursive=True
xbrl_path = 'D:/Workshop_Data/new_release_2/**/*.xbrl'
intermediate_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_1015.pickle'
xbrl(xbrl_path, intermediate_path, recursive=True)

In [4]:
# Reload the two intermediate pickles written by xbrl() and stack them.
loaded_frames = []
for intermediate_path in (
    'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_0924.pickle',
    'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_1015.pickle',
):
    with open(intermediate_path, mode='rb') as file:
        loaded_frames.append(pickle.load(file))
df_stk_1, df_stk_2 = loaded_frames

df_stk = pd.concat(loaded_frames, axis=0)
# xbrl() stores 0 when no shareholder text block was found; drop those rows.
df_stk = df_stk.query('not stk_html == 0')

In [5]:
class Translate:
    """Chainable character-level normaliser for a single string.

    The wrapped text lives in ``value``; every conversion method rewrites it
    and returns the instance so calls can be chained.
    """

    # 94 full-width forms starting at U+FF01 and the 94 ASCII characters
    # starting at U+0021, in matching order.
    ZENKAKU = ''.join(map(chr, range(0xff01, 0xff01 + 94)))
    HANKAKU = ''.join(map(chr, range(0x21, 0x21 + 94)))
    ZENTOHAN = str.maketrans(ZENKAKU, HANKAKU)  # full-width -> half-width
    HANTOZEN = str.maketrans(HANKAKU, ZENKAKU)  # half-width -> full-width
    # Nine circled numbers starting at U+2460 and the plain digits 1-9.
    ROUNDNUM = ''.join(map(chr, range(0x2460, 0x2460 + 9)))
    NUM = ''.join(map(str, range(1, 10)))
    ROUNDTONUM = str.maketrans(ROUNDNUM, NUM)  # circled number -> digit
    # Hand-picked replacements: both characters become the reference mark.
    ADHOC_DICT = {'注': '※', '*': '※'}
    ADHOC_TRANS = str.maketrans(ADHOC_DICT)

    def __init__(self, value):
        self.value = value

    def zentohan(self, zentohan=True):
        """Convert full-width to half-width (``True``) or the reverse (``False``).

        Any other argument prints a failure message and returns ``None``.
        """
        if zentohan is True:
            table = self.ZENTOHAN
        elif zentohan is False:
            table = self.HANTOZEN
        else:
            print('変換失敗')
            return None

        self.value = self.value.translate(table)
        return self

    def roundtonum(self):
        """Replace circled numbers with the corresponding plain digits."""
        self.value = self.value.translate(self.ROUNDTONUM)
        return self

    def adhoc_trans(self):
        """Apply the hand-picked single-character replacements."""
        self.value = self.value.translate(self.ADHOC_TRANS)
        return self

In [6]:
def fix_td(td):
    """Return the text of one table cell after ``Fix.fix_value`` normalisation."""
    return Fix(td.text).fix_value().value


def fix_tr(tr):
    """Return the normalised text of every ``<td>`` in a table row, in order."""
    return [fix_td(cell) for cell in tr.findAll('td')]


def div_to_df(div):
    """Convert the table rows found inside ``div`` into a DataFrame.

    Every ``<tr>`` becomes one row of normalised cell texts. Empty strings are
    treated as missing, rows that are entirely missing are dropped, and the
    first remaining row is promoted to the column header.

    Args:
        div: A BeautifulSoup tag (anything offering ``find_all``).

    Returns:
        The table body as a DataFrame, or an empty DataFrame when the div
        contains no usable rows.
    """
    rows = [fix_tr(tr) for tr in div.find_all('tr')]
    df = pd.DataFrame(rows)
    df = df.replace('', np.nan)
    df = df.dropna(axis=0, how='all')
    if len(df) == 0:
        # Nothing can serve as the header row; indexing row 0 would raise.
        return df

    # Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
    # argument no longer exists in pandas 2.x.
    df.columns = df.iloc[0, :]
    df = df.iloc[1:, :]

    return df

In [208]:
class Fix:
    """Chainable clean-up helper for table-cell and annotation text.

    The text (or, for ``fix_annotation_adhoc``, an iterable of texts) is kept
    in ``value``; every method rewrites it and returns the instance.
    """

    def __init__(self, value):
        self.value = value

    def fix_value(self):
        """Join lines, normalise characters via ``Translate`` and drop spaces."""
        one_line = ''.join(self.value.splitlines())
        # .value pulls the resulting str back out of the Translate object.
        text = Translate(one_line).zentohan().roundtonum().adhoc_trans().value
        for blank in (' ', '　', '\xa0', '\u3000'):
            text = text.replace(blank, '')
        self.value = text

        return self

    def fix_annotation_pre(self):
        """Remove annotation markers matched by the prefix pattern (one pass)."""
        pattern_pre = r'^[0-9]\.|^[0-9]|^\(※\)|^※|^※[0-9]|^\(※\)[0-9]\.|\(※[0-9]\)|^\(|^\)|、[0-9]'
        self.value = re.sub(pattern_pre, '', self.value, count=10)

        return self

    def fix_annotation_suff(self):
        """Remove annotation markers matched by the suffix pattern (one pass)."""
        pattern_suff = r'[0-9]$|[0-9]\.$|、$|※[0-9]$'
        self.value = re.sub(pattern_suff, '', self.value, count=10)

        return self

    def fix_annotation_adhoc(self, pattern, repl):
        """
        Replace ``pattern`` with ``repl`` in every element of ``value``.

        ``value`` is iterated, so the result stored back is always a list.
        """
        replaced = []
        for item in self.value:
            replaced.append(re.sub(pattern, repl, item))
        self.value = replaced

        return self

In [8]:
def annotation_original_to_df(html):
    """Return the annotation texts that follow the last ``<div>`` as one row.

    The sibling tags after the last ``<div>`` of ``html`` are taken one by one
    (the split into separate tags is kept), lightly cleaned, and laid out as a
    single-row DataFrame with one column per non-empty text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    siblings = soup.findAll('div')[-1].findNextSiblings()  # e.g. one <p> per annotation

    cleaned = []
    for node in siblings:
        text = Translate(node.text).roundtonum().value
        text = text.rstrip(' ').rstrip('　').replace('\u3000', '').replace('\xa0', '')
        # The replacement is the literal text "[0-9]" (not a back-reference):
        # a digit followed by a space is swapped for that marker, which a later
        # cell of this notebook uses as a split delimiter.
        text = re.sub(r'[0-9] ', r'[0-9]', text)
        if text:
            cleaned.append(text)

    return pd.DataFrame(cleaned).T

In [20]:
# Extract the annotation texts of every filing and tag them with filer metadata.
codes = df_stk['edinet_code']
names = df_stk['firm_name']
ymds = df_stk['filling_ymd']
htmls = df_stk['stk_html']
annot_list = [[c, n, y, annotation_original_to_df(h)] for c, n, y, h in zip(codes, names, ymds, htmls)]

annot_add_list = []
for edinet_code, firm_name, filling_ymd, df in annot_list:
    df.insert(loc=0, column='edinet_code', value=edinet_code)
    df.insert(loc=1, column='firm_name', value=firm_name)
    df.insert(loc=2, column='filling_ymd', value=filling_ymd)
    annot_add_list.append(df)

annot_merged = pd.concat(annot_add_list, axis=0)
# Name the annotation columns a_0, a_1, ... from the actual width of the
# merged table; a fixed count raises as soon as the data has a different
# maximum number of annotations per filing.
meta_cols = ['edinet_code', 'firm_name', 'filling_ymd']
annot_cols = meta_cols + ['a_' + str(i) for i in range(len(annot_merged.columns) - len(meta_cols))]
annot_merged.columns = annot_cols

annot_merged.sort_values(['a_0'], inplace=True)
path = '../../output/annotation_table.csv'
annot_merged.to_csv(path, encoding='cp932', header=True, index=False)

In [185]:
# Read the partly hand-corrected annotation table and split each cell into
# individual annotation texts.
path = '../../output/annotation_table_sukoshi_fixed.csv'
annot_sukoshi = pd.read_csv(path, encoding='cp932', header=0)

# Texts containing any of these words are discarded.
excluded_words = ('自己株式', '新株予約権', '総数', '潜在', 'す。')

row_list = []
for row in annot_sukoshi.itertuples():
    # row[0] is the index and row[1] the first column; positions 6 onward are
    # the cells that get split.
    pieces = [row[1]]
    for value in row[6:]:
        text = str(value)
        if '[0-9]' in text:
            pieces.extend(text.split('[0-9]'))  # literal marker used as delimiter
        else:
            pieces.extend(text.split())

    kept = []
    for piece in pieces:
        fixed = Fix(piece).fix_value().fix_annotation_pre().fix_annotation_pre().fix_annotation_pre().value
        if fixed in ('', 'nan') or fixed is None:
            continue
        if any(word in fixed for word in excluded_words):
            continue
        kept.append(fixed)
    row_list.append(kept)

annot_new_df = pd.DataFrame(row_list)
annot_new_df.dropna(how='all', axis=1, inplace=True)
path = '../../output/annot_new_fixed.csv'
annot_new_df.to_csv(path, encoding='cp932', header=True)

In [342]:
# Load the CSV that was completed by hand
path = '../../output/annot_new_fixed_full.csv'
annot_fixed_full = pd.read_csv(path, encoding='cp932', header=0, index_col='edinet_code')
annot_fixed_full.dropna(how='all', inplace=True, axis=1)


def _strip_annotation_marks(cell):
    # Two prefix passes followed by two suffix passes over the stringified cell.
    fixer = Fix(str(cell))
    fixer = fixer.fix_annotation_pre().fix_annotation_pre()
    fixer = fixer.fix_annotation_suff().fix_annotation_suff()
    return fixer.value


cols = [c for c in annot_fixed_full.columns if c.startswith('a_')]
for col in cols:
    annot_fixed_full[col] = annot_fixed_full[col].apply(_strip_annotation_marks)

path = '../../output/annot_comp.csv'
annot_fixed_full.to_csv(path, encoding='cp932', header=True, index=True)

In [None]:
def annotation_to_df(html):
    """Split the annotation tags after the last ``<div>`` into a one-row DataFrame.

    The text of every sibling tag is taken with ``'@'`` inserted between its
    text nodes, split on newlines and on ``'@'``, and the non-empty pieces of
    all tags are laid out as the columns of a single row. The raw siblings and
    the per-tag pieces are printed for inspection.
    """
    soup = BeautifulSoup(html, 'html.parser')
    siblings = soup.findAll('div')[-1].findNextSiblings()  # e.g. one <p> per annotation
    print(siblings)

    # One list of text pieces per sibling tag.
    per_tag = [
        [piece for line in tag.get_text('@').split('\n') for piece in re.split(r'[@]', line)]
        for tag in siblings
    ]
    print(per_tag)

    # Flatten and drop empty pieces.
    flat = [piece for pieces in per_tag for piece in pieces if piece]
    return pd.DataFrame(flat).T

In [None]:
    # NOTE(review): this cell is an indented fragment with no enclosing ``def``;
    # it reads ``annot_fixed_list``, which is only assigned inside
    # annotation_to_df. It looks like leftover function-body code and cannot
    # run as a stand-alone cell - confirm whether it can be deleted.
    # Separate a trailing "(...)" or ":..." part from each annotation text.
    pattern_brackets = r'(\(.+\))|(:.+)'
    brackets = [re.search(pattern=pattern_brackets, string=a) for a in annot_fixed_list]
    brackets = [b.group() for b in brackets if b]
    annot_remove_brackets = [re.sub(pattern_brackets, repl='', string=a) for a in annot_fixed_list if not a == '']
    annot_remove_brackets = [a for a in annot_remove_brackets if not a == '']
    print(brackets)
    print(annot_remove_brackets)
    annot_to_df = []
    #for a, b in zip(annot_remove_brackets, brackets):
    #    annot_to_df.append(a)
    #    annot_to_df.append(b)
    # Re-join text and bracket part pairwise; zip() stops at the shorter list.
    annot_to_df = [''.join([a, b]) for a, b in zip(annot_remove_brackets, brackets)]
    # if annot_to_df == []:
    #     annot_to_df = annot_fixed_list
    # Build the final list (strip leftovers with ad-hoc patterns, applied in order)
    ad_pattern_list = [
        r'\(※\)', r'\(※[0-9]\)', r'(※)[0-9]\.', r'^※', r'^※[0-9]', r'^※[0-9]', 
        r'[0-9]$|、', r'^[0-9]\.', r'^[0-9]', r'^\.', r'^:',
        r'。なおRed\(※1\)', r'。なおRed', r''
        ]
    for ad_pattern in ad_pattern_list:
        annot_to_df = Fix(annot_to_df).fix_annotation_adhoc(pattern=ad_pattern, repl='').value
    

In [None]:
# Spot check: run the annotation extraction on the filing with index label 84.
# NOTE(review): df_stk is a concat of two frames without an index reset, so a
# label may occur twice and this lookup may return a Series - confirm.
annotation_to_df(df_stk['stk_html'][84])

In [None]:
# Run the annotation extraction for every filing (one single-row frame each).
annot_list = [annotation_to_df(d) for d in df_stk['stk_html']]

In [347]:
# Stack the per-filing rows and dump them without header or index.
annot_merged = pd.concat(annot_list, axis=0)
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/annotation_table.csv'
annot_merged.to_csv(path, encoding='cp932', header=False, index=False)

In [None]:
# Debug cell: print the tags after the last <div> and the whole parsed
# document for the filing with index label 952.
html = df_stk['stk_html'][952]
soup = BeautifulSoup(html, 'html.parser')
div = soup.findAll('div')[-1]
annotation = div.findNextSiblings()
annot_text = [t.text for t in annotation]
print(annotation)
print(soup)

In [211]:
def table_to_df(html):
    """Parse the tables of a shareholder HTML block into one 4-column DataFrame.

    Every ``<div>`` is converted with ``div_to_df`` and the results are
    stacked. The function is best-effort: if the pieces cannot be
    concatenated, or the result does not have exactly four columns, an empty
    DataFrame is returned instead.

    Args:
        html: HTML text of the shareholder section.

    Returns:
        The stacked table with a fresh 0..n-1 index, or an empty DataFrame.
    """
    soup = BeautifulSoup(html, 'html.parser')
    df_list = [div_to_df(div) for div in soup.find_all('div')]
    try:
        df = pd.concat(df_list, axis=0)
        df.reset_index(drop=True, inplace=True)
    except Exception:
        # Best effort is kept, but KeyboardInterrupt/SystemExit are no longer
        # swallowed the way the bare ``except:`` did.
        return pd.DataFrame()  # signal "no usable table"

    if len(df.columns) == 4:
        return df
    return pd.DataFrame()  # unexpected layout

In [145]:
def table_to_df_2(html, num):
    """Parse all tables in ``html`` with ``pandas.read_html`` and stack them.

    Each table loses its all-NaN rows, gets positional column labels
    ``0..k-1`` and loses duplicated rows before the tables are concatenated.
    A ``firm_num`` column holding ``num`` is appended.

    Args:
        html: HTML text containing one or more ``<table>`` elements.
        num: Identifier written to the ``firm_num`` column.

    Returns:
        The stacked DataFrame when it has exactly five columns (four table
        columns plus ``firm_num``), otherwise ``None``.
    """
    import io  # local import: only needed to wrap literal HTML for read_html

    # Passing literal HTML as a plain string is deprecated in recent pandas;
    # a file-like wrapper works on old and new versions alike.
    source = io.StringIO(html) if isinstance(html, str) else html
    df_html_list = pd.read_html(source)

    df_list = []
    for df in df_html_list:
        df = df.dropna(how='all')
        # Plain assignment instead of set_axis(..., inplace=True): the
        # ``inplace`` argument no longer exists in pandas 2.x.
        df.columns = list(range(len(df.columns)))
        df = df.drop_duplicates()
        df_list.append(df)

    df_all = pd.concat(df_list, axis=0)
    df_all['firm_num'] = num

    if len(df_all.columns) == 5:
        return df_all
    return None

In [None]:
# table_to_df_2: parse every filing with the pandas.read_html based parser,
# using the position of the filing in df_stk as its firm number.
htmls = df_stk['stk_html']
df_htmls = [table_to_df_2(html, i) for i, html in enumerate(htmls)]
# table_to_df_2 returns None for tables that do not have the expected width.
df_htmls_merged = pd.concat(df_htmls, axis=0)

In [231]:
# table_to_df: parse every filing with the BeautifulSoup based parser and keep
# the result next to the filer metadata.
codes, names, ymds, htmls = (
    df_stk[column] for column in ('edinet_code', 'firm_name', 'filling_ymd', 'stk_html')
)
firm_infos = []
for code, name, ymd, html in zip(codes, names, ymds, htmls):
    firm_infos.append([code, name, ymd, table_to_df(html)])

path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/html_tables.pickle'
with open(path, mode='wb') as file:
    pickle.dump(firm_infos, file)

In [None]:
# Notebook display: the parsed shareholder table stored at position 450.
firm_infos[450][3]

In [260]:
def add_metadata(num, edinet_code, firm_name, filling_ymd, df):
    """Return a copy of a 4-column shareholder table with filer metadata added.

    The result has the columns ``edinet_code``, ``firm_name``,
    ``filling_ymd``, ``name``, ``loc``, ``stock``, ``rates``, ``row`` (the
    original index label of each row) and ``num``.

    Args:
        num: Running number of the firm, stored in the ``num`` column.
        edinet_code: Value for the ``edinet_code`` column.
        firm_name: Value for the ``firm_name`` column.
        filling_ymd: Value for the ``filling_ymd`` column.
        df: Shareholder table with exactly four columns. It is not modified.

    Returns:
        The extended table, or an empty DataFrame when the metadata columns
        cannot be inserted.

    Raises:
        ValueError: If ``df`` does not have exactly four columns.
    """
    # Work on a copy so the caller's table is not modified; otherwise running
    # the function a second time on the same table fails on the inserts.
    tb = df.copy()

    # Add the EDINET code and the other filer fields as leading columns.
    try:
        tb.insert(loc=0, column='edinet_code', value=edinet_code)
        tb.insert(loc=1, column='firm_name', value=firm_name)
        tb.insert(loc=2, column='filling_ymd', value=filling_ymd)
    except ValueError:
        # insert() refuses a column name that already exists.
        return pd.DataFrame()

    # Record the row label and the firm number.
    tb['row'] = tb.index
    tb['num'] = num
    # Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
    # argument no longer exists in pandas 2.x.
    tb.columns = ['edinet_code', 'firm_name', 'filling_ymd', 'name', 'loc', 'stock', 'rates', 'row', 'num']

    return tb

In [256]:
def recompose_df(df):
    """Fold "additional info" rows into the shareholder row directly above them.

    A row counts as additional info when its ``rates`` value is missing or
    starts with a parenthesised text. Such rows are removed from the table and
    their four data cells are attached to the row whose ``row`` number is one
    less, as the columns ``tmp1`` .. ``tmp4``.

    Args:
        df: Table produced by ``add_metadata``. It is not modified.

    Returns:
        The main rows left-joined with their additional-info row. The flag
        columns appear as ``r_na_x`` (main row) and ``r_na_y`` (attached row).
    """
    # Work on a copy so the flag column is not written into the caller's table.
    tb = df.copy()

    def check_additional_info(value):
        # Intended for values such as "(4.10)": parentheses holding only digits
        # or '.'; those are annotations belonging to the row above.
        # NOTE(review): the '.' in the pattern is unescaped, so any
        # parenthesised text at the start of the value matches - confirm
        # whether r'\([0-9.]+\)' was intended.
        pattern = r'\(([0-9]|.)+\)'
        if pd.isna(value):
            return 1
        elif re.match(pattern=pattern, string=value):
            return 1
        else:
            return 0

    # Flag the rows that only carry additional info (1) versus main rows (0).
    tb['r_na'] = tb['rates'].apply(check_additional_info)
    tb_r_na = tb[tb['r_na'] == 1].copy()
    # Point each flagged row at the row directly above it.
    tb_r_na['row'] = tb_r_na['row'] - 1
    # Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
    # argument no longer exists in pandas 2.x.
    tb_r_na.columns = ['edinet_code', 'firm_name', 'filling_ymd', 'tmp1', 'tmp2', 'tmp3', 'tmp4', 'row', 'num', 'r_na']

    # Keep the main rows and pull the additional info alongside them.
    tb = tb[tb['r_na'] == 0]
    new_tb = pd.merge(left=tb, right=tb_r_na, on=['edinet_code', 'firm_name', 'filling_ymd', 'num', 'row'], how='left')

    return new_tb

In [261]:
# Reload the pickled per-firm tables, then add metadata and fold the
# additional-info rows for every firm that has a non-empty table.
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/html_tables.pickle'
with open(path, mode='rb') as file:
    firm_infos = pickle.load(file)

tb_list = []
for num, fi in enumerate(firm_infos):
    if len(fi[3]) == 0:
        continue  # no table was parsed for this firm
    tb_list.append(add_metadata(num, fi[0], fi[1], fi[2], fi[3]))

tb_recomposed_list = list(map(recompose_df, tb_list))

In [504]:
# Stack the per-firm tables and order them by firm number and row number.
# The index is not reset, so each firm's own row labels are kept.
merged = pd.concat(tb_recomposed_list, axis=0)
merged.sort_values(['num', 'row'], ascending=[True, True], inplace=True)

merged_copy = merged.copy()
tmp1 = merged_copy['tmp1']
tmp2 = merged_copy['tmp2']
tmp3 = merged_copy['tmp3']
tmp4 = merged_copy['tmp4']
new_tmp1 = []
new_tmp2 = []
new_tmp3 = []
new_tmp4 = []

# NOTE(review): '|' binds loosest, so this is an alternation of five fragments
# ('\([0-9]', ',+\)', '[0-9]', ',+\([0-9]', ',+\)'), not one grouped pattern.
# With .match() it succeeds when the text starts with "(digit", a digit, or
# commas followed by ")" or "(digit" - confirm this is the intended test.
regex = r'\([0-9]|,+\)|[0-9]|,+\([0-9]|,+\)'
pattern = re.compile(regex)
# Shift the additional-info cells one column to the right when a cell that
# should hold text looks like a number.
for t1, t2, t3, t4 in zip(tmp1, tmp2, tmp3, tmp4):
    # pattern.match raises TypeError for non-string cells (e.g. NaN); the bare
    # except then keeps the cells unchanged.
    try:
        matchObj = pattern.match(t1)
        if not matchObj == None:
            new_t3 = t2
            new_t2 = t1
            new_t1 = np.nan
        else:
            new_t3 = t3
            new_t2 = t2
            new_t1 = t1
    except:
        new_t3 = t3
        new_t2 = t2
        new_t1 = t1
    
    # NOTE(review): when t2 does not match (or is not a string) new_t2/new_t3
    # are reset to the original t2/t3, which discards a shift made by the
    # first step above - confirm.
    try:
        matchObj = pattern.match(t2)
        if not matchObj == None:
            new_t4 = new_t3
            new_t3 = new_t2
            new_t2 = np.nan
        else:
            new_t4 = t4
            new_t3 = t3
            new_t2 = t2
    except:
        new_t4 = t4
        new_t3 = t3
        new_t2 = t2

    new_tmp1.append(new_t1)
    new_tmp2.append(new_t2)
    new_tmp3.append(new_t3)
    new_tmp4.append(new_t4)


new_merged = merged.copy()
new_merged['tmp1'] = new_tmp1
new_merged['tmp2'] = new_tmp2
new_merged['tmp3'] = new_tmp3
new_merged['tmp4'] = new_tmp4
#546, 16
# Turn 4000<500> into 4000(500)
# NOTE(review): .replace is called on every cell directly, so a non-string
# cell (e.g. NaN) in 'stock' or 'rates' raises AttributeError here - confirm
# the columns are always strings at this point.
new_merged['stock'] = [nm.replace('<', '(').replace('>', ')') for nm in new_merged['stock']]
new_merged['rates'] = [nm.replace('<', '(').replace('>', ')') for nm in new_merged['rates']]

# Split the name at the first '※': the name itself and the annotation reference.
# tmp1 is overwritten here with the part after '※' (NaN when there is none).
name_split = new_merged['name'].copy().apply(lambda x: str(x).split('※', maxsplit=1)[0])
annot = new_merged['name'].copy().apply(lambda x: x.split('※', maxsplit=1)[1] if len(str(x).split('※')) >= 2 else np.nan)
new_merged['name'] = name_split
new_merged['tmp1'] = annot

# Split the share count at the first '('; tmp3 is overwritten with the
# bracketed remainder.
stock_split = new_merged['stock'].copy().apply(lambda x: str(x).split('(', maxsplit=1)[0])
stock_brackets = new_merged['stock'].copy().apply(lambda x: x.split('(', maxsplit=1)[1] if len(str(x).split('(')) >= 2 else np.nan)
new_merged['stock'] = stock_split
new_merged['tmp3'] = stock_brackets

# Split the ratio at the first '('; tmp4 is overwritten with the bracketed
# remainder. Of the values built in the loop above only tmp2 is still in use
# after this point.
rates_split = new_merged['rates'].copy().apply(lambda x: str(x).split('(', maxsplit=1)[0])
rates_brackets = new_merged['rates'].copy().apply(lambda x: x.split('(', maxsplit=1)[1] if len(str(x).split('(')) >= 2 else np.nan)
new_merged['rates'] = rates_split
new_merged['tmp4'] = rates_brackets

# Remove brackets at both ends and similar leftovers.
# Every column is converted to str here (missing values become 'nan').
# str.strip takes a set of characters, so r'\(' strips backslashes and '('.
for col in new_merged.columns:
    new_merged[col] = new_merged[col].apply(lambda x: str(x).strip(r'\(').strip(r'\)').replace('※', '').replace('、', ','))

# Column-specific corrections
new_merged['stock'] = new_merged['stock'].apply(lambda x: x.replace(',', '').replace('普通株式', ''))
new_merged['rates'] = new_merged['rates'].apply(lambda x: x.replace(',', ''))
new_merged['tmp1'] = new_merged['tmp1'].apply(lambda x: x.replace('.', ',').replace('()', ',').replace(')(', ',').strip(','))

# Keep nothing but digits and commas in a tmp1 value
def degit_or_comma(value):
    """Return the digit runs of ``value`` joined by commas, or NaN if none exist."""
    digits = ','.join(re.findall(r'\d+', str(value)))
    return digits if digits != '' else np.nan


# Put commas between the digits of a multi-digit annotation number
def comma(value):
    """Split a run of digits into comma-separated single digits.

    Values that are not purely numeric, single digits, and the numbers 10 to
    14 (kept as genuine two-digit annotation numbers) are returned unchanged.
    """
    s = str(value)
    kept_whole = ('10', '11', '12', '13', '14')
    if not (str.isnumeric(s) and len(s) >= 2) or s in kept_whole:
        return value
    return ','.join(s)



# Normalise the annotation references (tmp1) and strip thousands separators
# from the bracketed share counts / ratios (tmp3, tmp4).
new_merged['tmp1'] = new_merged['tmp1'].apply(comma)
new_merged['tmp1'] = new_merged['tmp1'].apply(degit_or_comma)
new_merged['tmp3'] = new_merged['tmp3'].apply(lambda x: x.replace(',', ''))
new_merged['tmp4'] = new_merged['tmp4'].apply(lambda x: x.replace(',', ''))

# The stacked table still carries each firm's own row labels, so the same
# label occurs once per firm. Joining on such an index attaches the annotation
# numbers of unrelated rows; give every row a unique label first.
new_merged = new_merged.reset_index(drop=True)

# Annotation numbers split on commas, duplicates removed (order kept), laid
# out side by side.
annot_numbers = new_merged['tmp1']
annot_numbers_splited = [list(dict.fromkeys(str(x).split(','))) for x in annot_numbers]
annot_numbers_df = pd.DataFrame(annot_numbers_splited, index=new_merged.index)
cols = ['a_num_' + str(i + 1) for i in range(5)]
# Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
# argument no longer exists in pandas 2.x.
annot_numbers_df.columns = cols

new_merged = pd.merge(left=new_merged, right=annot_numbers_df, left_index=True, right_index=True, how='left')
path = '../../intermediate/stockholders_table_merged_1016.csv'
new_merged.to_csv(path, encoding='utf-8', header=True, index=False)

In [500]:
# Load the EDINET code <-> security code correspondence table
path = '../../data/EDINET/code_list/EdinetcodeDlInfo.csv'
syoken_df = pd.read_csv(path, encoding='cp932', skiprows=1, header=0)
# .copy() so the column assignment below works on an independent frame.
syoken_df = syoken_df[['ＥＤＩＮＥＴコード', '証券コード']].copy()
# Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
# argument no longer exists in pandas 2.x.
syoken_df.columns = ['edinet_code', 'security_code']
# Keep the first four characters of the stringified code.
# NOTE(review): NaN is truthy, so a missing code becomes the text 'nan' here;
# it is read back as missing from the CSV below - confirm this is relied upon.
syoken_df['security_code'] = syoken_df['security_code'].apply(lambda x: str(x)[0:4] if x else np.nan)
syoken_df.sort_values('security_code', inplace=True)
path = '../../data/EDINET/code_list/edinet_security_code.csv'
syoken_df.to_csv(path, encoding='cp932', header=True, index=False)

# Read it back with the EDINET code as index and the security code as text.
path = '../../data/EDINET/code_list/edinet_security_code.csv'
df_security = pd.read_csv(path, encoding='cp932', header=0, index_col=0, dtype={'security_code': str})

# Load the annotation table
path = '../../output/annot_comp.csv'
annot_fixed_full = pd.read_csv(path, encoding='cp932', header=0, index_col=0)

# Load the shareholder table
path = '../../intermediate/stockholders_table_merged_1016.csv'
new_merged = pd.read_csv(path, encoding='utf-8', header=0, index_col=0)

# Bring everything together in a single table (joined on the index of each frame)
df_comp = pd.merge(left=new_merged, right=df_security, left_index=True, right_index=True, how='left')
df_comp = pd.merge(left=df_comp, right=annot_fixed_full, left_index=True, right_index=True, how='left')
cols_comp = [
    'firm_name', 'filling_ymd', 'stockholder_name', 'loc', 'stock', 'stock_rates', 'table_row', 'table_num', 'r_na_x', 'name_infos', 'loc_infos',
    'stock_infos', 'stock_rates_infos', 'r_na_y'
]
# annot_numbers_df comes from the cell that builds the annotation-number columns.
a_num_cols = annot_numbers_df.columns
a_text_cols = [c for c in annot_fixed_full.columns if c.startswith('a_')]
# New labels, in column order: shareholder columns, annotation numbers,
# security code, annotation texts.
cols_comp = cols_comp + list(a_num_cols) + ['security_code'] + a_text_cols
# Plain assignment instead of set_axis(..., inplace=True): the ``inplace``
# argument no longer exists in pandas 2.x. The number of labels must match
# the number of columns, exactly as before.
df_comp.columns = cols_comp


sort_cols = [
    'security_code', 'firm_name', 'filling_ymd', 'stockholder_name', 'loc', 'stock',
    'stock_rates', 'stock_infos', 'stock_rates_infos', 'a_num_1',
    'a_num_2', 'a_num_3', 'a_num_4', 'a_num_5', 'a_1',
    'a_2', 'a_3', 'a_4', 'a_5', 'a_6', 'a_7', 'a_8', 'a_9', 'a_10', 'a_11',
    'a_12', 'a_13', 'a_14', 'a_15', 'a_16', 'a_17'
    ]

df_comp = df_comp[sort_cols]

# Match each annotation number with its annotation text.
# The lookups below are positional: in an itertuples() row, position 0 is the
# index and positions 1.. follow sort_cols, so 10-13 are a_num_1..a_num_4 and
# 15.. are a_1.. (annotation number n -> position n + 14).
row_list = []
for row in df_comp.itertuples():
    col_list = []
    for j in range(1, 5):
        annot_num = row[j + 9]
        # Only a non-NaN float counts as a usable annotation number.
        if type(annot_num) == float and annot_num == annot_num and annot_num is not None:
            annot_text = row[int(annot_num) + 14]
        else:
            annot_text = None

        col_list.append(annot_text)

    row_list.append(col_list)

# One a_text_k column per looked-up annotation number, aligned with df_comp.
annot_num_text_df = pd.DataFrame(row_list, index=df_comp.index)
cols = ['a_text_' + str(i + 1) for i in range(len(annot_num_text_df.columns))]
annot_num_text_df.columns = cols
df_comp = pd.concat([df_comp, annot_num_text_df], axis=1)

# Final column order; a_num_5 is not carried over.
sort_cols = [
    'security_code', 'firm_name', 'filling_ymd', 'stockholder_name', 'loc', 'stock',
    'stock_rates', 'stock_infos', 'stock_rates_infos', 'a_text_1',
    'a_text_2', 'a_text_3', 'a_text_4', 'a_num_1',
    'a_num_2', 'a_num_3', 'a_num_4', 'a_1',
    'a_2', 'a_3', 'a_4', 'a_5', 'a_6', 'a_7', 'a_8', 'a_9', 'a_10', 'a_11',
    'a_12', 'a_13', 'a_14', 'a_15', 'a_16', 'a_17'
    ]

df_comp = df_comp[sort_cols]

# Fill in missing security codes from a lookup table keyed by EDINET code
path = '../../intermediate/securities_na.csv'
edinet_security_table = pd.read_csv(path, encoding='cp932', header=0)
table_dict = dict(zip(edinet_security_table['edinet_code'], edinet_security_table['security_code']))

# Split into rows with and without a security code, fill the latter, re-join.
# "x == x" is False only for NaN, which is what the two queries rely on.
df_comp_nona = df_comp.copy().query('security_code == security_code')
df_comp_na = df_comp.copy().query('not security_code == security_code')
fixed_security_code = [table_dict[code] for code in df_comp_na.index]
df_comp_na['security_code'] = fixed_security_code
df_comp = pd.concat([df_comp_nona, df_comp_na], axis=0)

path = '../../output/stockholders_1016.csv'
df_comp.to_csv(path, encoding='utf-8', header=True, index=True)

In [508]:
# Reload the completed table and order it by security code
path = '../../output/stockholders_1016.csv'
df_comp = pd.read_csv(path, encoding='utf-8', header=0, index_col=0)
df_comp.sort_values('security_code', inplace=True)

# Column groups shared by the exports below.
key_cols = ['security_code', 'firm_name', 'filling_ymd']
holder_cols = ['stockholder_name', 'loc', 'stock', 'stock_rates', 'stock_infos', 'stock_rates_infos']
a_num_out_cols = ['a_num_' + str(i) for i in range(1, 5)]  # a_num_1 .. a_num_4
a_text_out_cols = ['a_' + str(i) for i in range(1, 18)]  # a_1 .. a_17

# Export of the whole table
path = '../../output/full_data_20211016.csv'
df_comp.to_csv(path, encoding='utf-8', header=True, index=True)

# Export of the shareholder table
path = '../../output/stockholders_20211016.csv'
df_stockholders = df_comp[key_cols + holder_cols + a_num_out_cols]
df_stockholders.to_csv(path, encoding='utf-8', header=True, index=True)

# Export of the annotation table
path = '../../output/annotation_20211016.csv'
df_annotation = df_comp[key_cols + a_text_out_cols]
df_annotation.to_csv(path, encoding='cp932', header=True, index=True)

In [455]:
# Firms whose security code is missing ("x == x" is False only for NaN),
# one row per firm name, written out for manual completion.
securities_na = df_comp.query('not security_code == security_code')
securities_na = securities_na.drop_duplicates(subset='firm_name')
securities_na.to_csv('../../intermediate/証券コードNA.csv', encoding='cp932', header=True, index=True)

In [515]:
# Check the number of firms: keep one row per firm name and write the
# per-column non-null counts of that reduced table.
path = '../../output/samples_20211016.csv'
samples = df_comp.drop_duplicates(subset='firm_name').count()
samples.to_csv(path, header=False, index=True)