# XBRLファイルから株主データを取得する

## 1. XBRLからHTML形式で株主テーブルを取得する

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
import zipfile
import pickle
from collections import OrderedDict
from bs4 import BeautifulSoup
from arelle import ModelManager
from arelle import Cntlr

In [5]:
def unzip(zip_path):
    """Extract every ``*.zip`` archive found directly in *zip_path*.

    Each archive is unpacked into *zip_path* itself, and one progress line
    is printed before each extraction.

    Args:
        zip_path: Directory that is searched (non-recursively) for ``*.zip``
            files and that also receives the extracted contents.

    Returns:
        None.
    """
    archives = glob.glob(os.path.join(zip_path, '*.zip'))
    total = len(archives)
    for position, archive in enumerate(archives, start=1):
        print(f'file:{archive}, loading {position} / {total}')
        with zipfile.ZipFile(archive) as opened:
            opened.extractall(zip_path)

In [None]:
# Directory holding the zip archives; `unzip` extracts each archive into this
# same directory.
zip_path = 'D:/Workshop_Data/new_release/'
unzip(zip_path)

In [None]:
# Parallel result lists: entry k of every list belongs to the k-th XBRL file.
edinet_code_list = []
firm_name_list = []
ymd_list = []
stk_html_list = []

xbrl_path = 'D:/Workshop_Data/new_release/XBRL/PublicDoc/*.xbrl'
xbrl_files = glob.glob(xbrl_path)
length = len(xbrl_files)

# One controller / model manager pair is reused for every file instead of
# constructing a fresh pair on each iteration.
ctrl = Cntlr.Cntlr()
model_manager = ModelManager.initialize(ctrl)

for i, xbrl_file in enumerate(xbrl_files):
    print(f'{i + 1} / {length}')

    # Reset the per-file values. Without this, a filing that lacks one of the
    # facts would silently inherit the value of the previous filing (or raise
    # NameError on the very first file). A missing fact is recorded as None so
    # the four lists always stay aligned.
    edinet_code = None
    firm_name = None
    ymd = None
    stockholder = None

    model_xbrl = model_manager.load(xbrl_file)
    try:
        for fact in model_xbrl.facts:
            local_name = fact.concept.qname.localName

            if local_name == 'EDINETCodeDEI':
                print(f'EDINETコード：{fact.value}')
                edinet_code = fact.value

            elif local_name == 'FilerNameInJapaneseDEI':
                print(f'企業名：{fact.value}')
                firm_name = fact.value

            elif local_name == 'FilingDateCoverPage':
                print(f'提出日：{fact.value}')
                ymd = fact.value

            elif local_name == 'SummaryOfShareholdersTextBlock':
                # HTML text block of the shareholder summary; parsed later.
                stockholder = fact.value
    finally:
        # Release the loaded model so that models do not accumulate in
        # memory over the whole batch.
        model_manager.close(model_xbrl)

    edinet_code_list.append(edinet_code)
    firm_name_list.append(firm_name)
    ymd_list.append(ymd)
    stk_html_list.append(stockholder)

In [2]:
# Combine the four parallel lists into one table with one row per filing
# (each list becomes a labelled row, and the transpose turns it into a column),
# then pickle the table as an intermediate result.
df_stk = pd.DataFrame(
    data=[edinet_code_list, firm_name_list, ymd_list, stk_html_list],
    index=['edinet_code', 'firm_name', 'filling_ymd', 'stk_html'],
).T
intermediate_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_0924.pickle'
with open(intermediate_path, mode='wb') as file:
    pickle.dump(df_stk, file)

NameError: name 'edinet_code_list' is not defined

In [73]:
# Reload the DataFrame that was pickled to this same path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this notebook.
intermediate_path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/xbrl_parsed_0924.pickle'
with open(intermediate_path, mode='rb') as file:
    df_stk = pickle.load(file)

In [74]:
class Translate:
    """Chainable character-normalisation helper wrapping a single string.

    The wrapped string lives in ``value``. Every successful conversion method
    updates ``value`` in place and returns the instance, so calls can be
    chained, e.g. ``Translate(s).zentohan().roundtonum().value``.
    """

    # Full-width forms U+FF01..U+FF5E.
    ZENKAKU = ''.join(map(chr, range(0xff01, 0xff01 + 94)))
    # ASCII characters U+0021..U+007E, position-aligned with ZENKAKU.
    HANKAKU = ''.join(map(chr, range(0x21, 0x21 + 94)))
    # Full-width -> half-width table.
    ZENTOHAN = str.maketrans(ZENKAKU, HANKAKU)
    # Half-width -> full-width table.
    HANTOZEN = str.maketrans(HANKAKU, ZENKAKU)
    # Circled digits U+2460..U+2468.
    ROUNDNUM = ''.join(map(chr, range(0x2460, 0x2460 + 9)))
    # Plain digits 1..9, position-aligned with ROUNDNUM.
    NUM = '123456789'
    # Circled digit -> plain digit table.
    ROUNDTONUM = str.maketrans(ROUNDNUM, NUM)
    # Hand-picked single-character replacements.
    ADHOC_DICT = {'注': '※', '*': '※'}
    # Translation table built from ADHOC_DICT.
    ADHOC_TRANS = str.maketrans(ADHOC_DICT)

    def __init__(self, value):
        self.value = value

    def zentohan(self, zentohan=True):
        """Convert between full-width and half-width characters.

        Args:
            zentohan: ``True`` converts full-width to half-width, ``False``
                converts half-width to full-width.

        Returns:
            The instance itself, or ``None`` (after printing a failure
            message) when *zentohan* is neither ``True`` nor ``False``.
        """
        if zentohan is not True and zentohan is not False:
            print('変換失敗')
            return None

        table = self.ZENTOHAN if zentohan is True else self.HANTOZEN
        self.value = self.value.translate(table)
        return self

    def roundtonum(self):
        """Replace circled digits with plain digits and return the instance."""
        converted = self.value.translate(self.ROUNDTONUM)
        self.value = converted
        return self

    def adhoc_trans(self):
        """Apply the ADHOC_DICT replacements and return the instance."""
        converted = self.value.translate(self.ADHOC_TRANS)
        self.value = converted
        return self

In [87]:
def fix_td(td):
    """Return the normalised text of one table cell.

    Args:
        td: Object exposing a ``text`` attribute (``fix_tr`` passes the
            ``td`` tags it finds in a row).

    Returns:
        The cell text after ``Fix.fix_value`` has been applied.
    """
    return Fix(td.text).fix_value().value

In [76]:
def fix_tr(tr):
    """Return the normalised texts of all ``td`` cells of one table row.

    Args:
        tr: Row element offering ``findAll`` (a BeautifulSoup tag).

    Returns:
        A list with one ``fix_td`` result per ``td`` cell, in document order.
    """
    return [fix_td(cell) for cell in tr.findAll('td')]

In [77]:
def div_to_df(div):
    """Convert the table rows found inside *div* into a DataFrame.

    Every ``tr`` is normalised with ``fix_tr``. Empty strings are treated as
    missing, rows that are entirely missing are dropped, and the first
    remaining row is promoted to the column header.

    Args:
        div: Element offering ``find_all`` (a BeautifulSoup tag).

    Returns:
        A DataFrame whose columns are labelled with the first non-empty row
        and whose rows are the remaining non-empty rows. When no usable row
        exists, an empty DataFrame without columns is returned.
    """
    rows = [fix_tr(tr) for tr in div.find_all('tr')]
    df = pd.DataFrame(rows)
    df = df.replace('', np.nan)
    df = df.dropna(axis=0, how='all')

    if df.empty:
        # There is no header row to promote; indexing row 0 would raise
        # IndexError. Return a frame without columns so that concatenating
        # it with real tables does not introduce extra columns.
        return pd.DataFrame()

    # Plain attribute assignment replaces `set_axis(..., inplace=True)`,
    # whose `inplace` argument no longer exists in pandas >= 2.0.
    df.columns = df.iloc[0, :]
    df = df.iloc[1:, :]

    return df

In [78]:
class Fix:
    """Chainable clean-up helper for table-cell and annotation text.

    The data being cleaned lives in ``value``: a string for ``fix_value`` and
    ``fix_annotation_pre_suff``, an iterable of strings for
    ``fix_annotation_adhoc``. Each method stores its result back into
    ``value`` and returns the instance.
    """

    def __init__(self, value):
        self.value = value

    def fix_value(self):
        """Normalise a single string.

        Line breaks are removed, full-width characters become half-width,
        circled digits become plain digits, the ad hoc replacements of
        ``Translate`` are applied, and finally spaces (ASCII, ideographic and
        no-break) are stripped out.
        """
        value_split = self.value.splitlines()
        value_join = ''.join(value_split)
        translate = Translate(value_join)
        # Extract the plain string held by the Translate object.
        value_fixed = translate.zentohan().roundtonum().adhoc_trans().value
        value_fixed = value_fixed.replace(' ', '').replace('　', '').replace('\xa0', '').replace('\u3000', '')
        self.value = value_fixed

        return self

    def fix_annotation_pre_suff(self):
        """Strip numbering / marker prefixes and trailing digits or commas.

        NOTE(review): ``pattern_pre`` ends with ``|``, i.e. it contains an
        empty alternative that matches at every position and therefore uses
        up the ``count=10`` budget; ``^※`` also precedes ``^※[0-9]``, and one
        ``(※)`` is an unescaped group. The patterns are kept byte-identical
        because the intended behaviour cannot be confirmed from this file.
        """
        pattern_pre = r'^[0-9]\.|^[0-9]|特別利害関係者等\(※\)|^※|^※[0-9]|^※[0-9]|特別利害関係者等(※)[0-9]\.|'
        pattern_suff = r'[0-9]$|、'
        repl = ''
        annot_fixed = re.sub(pattern=pattern_pre, repl=repl, string=self.value, count=10)
        annot_fixed = re.sub(pattern=pattern_suff, repl=repl, string=annot_fixed, count=5)
        self.value = annot_fixed

        return self

    def fix_annotation_adhoc(self, pattern):
        """Remove *pattern* from every string and drop blank results.

        Args:
            pattern: Regular expression whose matches are deleted from each
                element of ``value``.

        Returns:
            The instance; ``value`` becomes a list without ``''`` and ``' '``
            entries.
        """
        annot_list = [re.sub(pattern=pattern, repl='', string=a) for a in self.value]
        # The previous condition `not a == '' or a == ' '` parsed as
        # `(not a == '') or (a == ' ')`, so single-space entries were kept.
        annot_list = [a for a in annot_list if a not in ('', ' ')]
        self.value = annot_list

        return self

In [64]:
def annotation_to_df(html):
    """Build a one-row DataFrame from the notes that follow the last div.

    Steps:
      1. Parse *html* and take the sibling elements after the last ``div``.
      2. Split every sibling's text on whitespace, keep the fragments that
         pass the keyword filter and normalise them with ``Fix.fix_value``.
      3. Separate the bracketed / colon part of each fragment from the rest
         and join the two parts again, pairing them by position.
      4. Delete a fixed list of ad hoc patterns from the joined strings.

    NOTE(review): the keyword filter relies on operator precedence, so the
    ``自己株式`` exclusion only guards the ``特別`` test. Also, step 3 pairs two
    independently filtered lists with ``zip``, so texts and brackets coming
    from different notes can be combined. Both behaviours are preserved here
    because the ad hoc pattern list appears to be tuned to the current output.

    Args:
        html: HTML source of one shareholder section.

    Returns:
        A DataFrame with a single row holding one cleaned note per column.
    """
    soup = BeautifulSoup(html, 'html.parser')
    siblings = soup.findAll('div')[-1].findNextSiblings()

    # Whitespace-separated fragments of every sibling, in document order.
    fragments = [piece for sibling in siblings for piece in sibling.text.split()]

    annot_fixed_list = [
        Fix(piece).fix_value().value
        for piece in fragments
        if (('自己株式' not in piece) and ('特別' in piece)) or ('当社' in piece) or ('上位' in piece)
    ]

    pattern_brackets = r'(\(.+\))|(:.+)'
    found = (re.search(pattern_brackets, text) for text in annot_fixed_list)
    brackets = [hit.group() for hit in found if hit]
    bases = [re.sub(pattern_brackets, '', text) for text in annot_fixed_list if text != '']
    bases = [base for base in bases if base != '']
    annot_to_df = [base + bracket for base, bracket in zip(bases, brackets)]

    # Final list: remove the ad hoc patterns one after another.
    ad_pattern_list = [
        r'特別利害関係者等\(※\)', r'特別利害関係者等(※)[0-9]\.', r'^※', r'^※[0-9]', r'^※[0-9]', 
        r'[0-9]$|、', r'^[0-9]\.', r'^[0-9]', r'^\.', r'^:',
        r'上記の当社代表取締役社長宮下尚之の所有株式数は同役員の資産管理会社である株式会社MTMが保有する株式数を含めた実質所有株式数を記載しております。\(※\)',
        r'上記の当社代表取締役CEO中野智哉の所有株式数は同役員の資産管理会社である株式会社中野企画が保有する株式数を含めた実質所有株式数を記載しております。\(※\)',
        r'。なおRed\(※1\)',
        r''
        ]
    for ad_pattern in ad_pattern_list:
        annot_to_df = Fix(annot_to_df).fix_annotation_adhoc(pattern=ad_pattern).value

    return pd.DataFrame(annot_to_df).T

In [65]:
# One annotation frame per filing's shareholder HTML.
annot_list = list(map(annotation_to_df, df_stk['stk_html']))

In [68]:
# Stack the per-filing annotation frames row-wise and export them as a
# cp932-encoded CSV without header or index.
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/annotation_table.csv'
annot_merged = pd.concat(annot_list)
annot_merged.to_csv(path, index=False, header=False, encoding='cp932')

In [126]:
# Spot check: run the annotation extraction on the entry labelled 11 of
# df_stk['stk_html'] and display the result.
annotation_to_df(df_stk['stk_html'][11])

[['（注）１．特別利害関係者等（当社の代表取締役会長）'], ['２．特別利害関係者等（当社の代表取締役社長）'], ['３．特別利害関係者等（役員等により総株主の議決権の過半数が所有されている会社）'], ['４．特別利害関係者等（大株主上位10名）'], ['５．特別利害関係者等（当社の取締役）'], ['６．特別利害関係者等（当社子会社の代表取締役）'], ['７．特別利害関係者等（当社子会社の取締役）'], ['８．当社の従業員'], ['９．自己株式及び新株予約権者の退職に伴い取得した自己新株予約権'], ['10．株式総数に対する所有株式数の割合は、小数点以下第３位を四捨五入しております。'], ['11．(', ')内は、新株予約権による潜在株式数及びその割合であり、内数であります。']]
['(※)1.特別利害関係者等(当社の代表取締役会長)', '(当社の代表取締役社長)', '(役員等により総株主の議決権の過半数が所有されている会社)', '(大株主上位10名)', '(当社の取締役)', '(当社子会社の代表取締役)', '(当社子会社の取締役)']
['特別利害関係者等', '特別利害関係者等', '特別利害関係者等', '特別利害関係者等', '特別利害関係者等', '特別利害関係者等', '当社の従業員']
['特別利害関係者等(※)1.特別利害関係者等(当社の代表取締役会長)', '特別利害関係者等(当社の代表取締役社長)', '特別利害関係者等(役員等により総株主の議決権の過半数が所有されている会社)', '特別利害関係者等(大株主上位10名)', '特別利害関係者等(当社の取締役)', '特別利害関係者等(当社子会社の代表取締役)', '当社の従業員(当社子会社の取締役)']


Unnamed: 0,0,1,2,3,4,5,6
0,特別利害関係者等(※)1.特別利害関係者等(当社の代表取締役会長),特別利害関係者等(当社の代表取締役社長),特別利害関係者等(役員等により総株主の議決権の過半数が所有されている会社),特別利害関係者等(大株主上位10名),特別利害関係者等(当社の取締役),特別利害関係者等(当社子会社の代表取締役),当社の従業員(当社子会社の取締役)


In [323]:
# Debug view of one filing (label 39): print the texts of the elements that
# follow the last div, then the whole parsed document.
html = df_stk['stk_html'][39]
soup = BeautifulSoup(html, 'html.parser')
div = soup.findAll('div')[-1]
annotation = div.findNextSiblings()
annot_text = list(map(lambda node: node.text, annotation))
print(annot_text, soup, sep='\n')

['(注)\u3000１.\u3000「氏名又は名称」欄の※の番号は、次のとおり株主の属性を示します。', '※１\u3000特別利害関係者等（大株主上位10名）\u3000※２\u3000特別利害関係者等（当社代表取締役社長）\u3000※３\u3000特別利害関係者等（当社取締役）\u3000※４\u3000特別利害関係者等（当社監査役）※５\u3000特別利害関係者等（当社子会社取締役）\u3000※６\u3000当社従業員\u3000', '２.\u3000（\u3000\u3000）内は、新株予約権による潜在株式数及びその割合であり、内数であります。', '３.\u3000株式総数に対する所有株式数の割合は、小数点以下第3位を四捨五入しております。']

<h2 class="smt_head1">第３ 【株主の状況】</h2>
<div class="tbld" style="text-align:left;margin-left:0.1pt;">
<table cellpadding="0" cellspacing="0" class="align_left" style="border-collapse:collapse;border:solid 0pt #000000;width:479.4pt;">
<colgroup>
<col style="width:137.3pt;min-width:137.3pt;"/>
<col style="width:205.5pt;min-width:205.5pt;"/>
<col style="width:68.3pt;min-width:68.3pt;"/>
<col style="width:68.3pt;min-width:68.3pt;"/>
</colgroup>
<tr style="height:33.8pt">
<td style="vertical-align:middle;border-top-style:solid;border-top-width:0.75pt;border-bottom-style:solid;border-bottom-width:0.75pt;border-left-style:solid;border-left-width:0.75pt;border-righ

In [79]:
def table_to_df(html):
    """Convert every div of *html* into a table and stack the results.

    Args:
        html: HTML source of one shareholder section.

    Returns:
        The row-wise concatenation of ``div_to_df`` applied to each div,
        re-indexed from 0.
    """
    soup = BeautifulSoup(html, 'html.parser')
    frames = [div_to_df(div) for div in soup.findAll('div')]
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [80]:
# Display the column labels of df_stk.
df_stk.columns

Index(['edinet_code', 'firm_name', 'filling_ymd', 'stk_html'], dtype='object')

In [88]:
# For every filing keep [EDINET code, firm name, filing date, parsed table]
# and pickle the resulting list as an intermediate result.
codes = df_stk['edinet_code']
names = df_stk['firm_name']
ymds = df_stk['filling_ymd']
htmls = df_stk['stk_html']
firm_infos = [
    [*meta, table_to_df(source)]
    for *meta, source in zip(codes, names, ymds, htmls)
]
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/html_tables.pickle'
with open(path, mode='wb') as file:
    pickle.dump(firm_infos, file)

In [89]:
# Reload the per-filing tables.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this notebook.
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/html_tables.pickle'
with open(path, mode='rb') as file:
    firm_infos = pickle.load(file)

count = 0  # tables with exactly four columns
count_all = 0  # all tables seen
tb_four_list = []
for num, info_list in enumerate(firm_infos):
    edinet_code = info_list[0]  # EDINET code
    firm_name = info_list[1]  # company name
    filling_ymd = info_list[2]  # filing date
    tb = info_list[3]  # HTML table already converted to a DataFrame

    # Only tables with exactly four columns are processed; others are skipped.
    if len(tb.columns) == 4:
        count += 1
        # Prepend the filing identifiers.
        tb.insert(loc=0, column='edinet_code', value=edinet_code)
        tb.insert(loc=1, column='firm_name', value=firm_name)
        tb.insert(loc=2, column='filling_ymd', value=filling_ymd)

        # Append the row position and the filing's running number.
        tb['row'] = tb.index
        tb['num'] = num
        new_colname = ['edinet_code', 'firm_name', 'filling_ymd', 'name', 'loc', 'stock', 'rates', 'row', 'num']
        # Plain assignment replaces `set_axis(..., inplace=True)`, whose
        # `inplace` argument no longer exists in pandas >= 2.0.
        tb.columns = new_colname
        tb['r_na'] = tb['rates'].apply(lambda r: 1 if pd.isna(r) else 0)

        # Rows without a 'rates' value are split off; their 'row' is
        # decremented so that they join onto the row directly above them and
        # their four data columns arrive as tmp1..tmp4.
        # NOTE(review): presumably these are continuation lines of the
        # preceding shareholder entry; confirm against the source tables.
        tb_r_na = tb[tb['r_na'] == 1].copy()
        tb_r_na['row'] = tb_r_na['row'].apply(lambda x: x - 1)
        r_na_colname = ['edinet_code', 'firm_name', 'filling_ymd', 'tmp1', 'tmp2', 'tmp3', 'tmp4', 'row', 'num', 'r_na']
        tb_r_na.columns = r_na_colname

        tb = tb[tb['r_na'] == 0]
        new_tb = pd.merge(left=tb, right=tb_r_na, on=['edinet_code', 'firm_name', 'filling_ymd', 'num', 'row'], how='left')

        tb_four_list.append(new_tb)
    count_all += 1

print(f'{count} / {count_all}')

409 / 439


In [90]:
# Stack the per-filing tables and order them by filing number, then by row.
merged = pd.concat(tb_four_list, axis=0)
merged.sort_values(['num', 'row'], ascending=[True, True], inplace=True)

merged_copy = merged.copy()
tmp1 = merged_copy['tmp1']
tmp2 = merged_copy['tmp2']
tmp3 = merged_copy['tmp3']
tmp4 = merged_copy['tmp4']
new_tmp1 = []
new_tmp2 = []
new_tmp3 = []
new_tmp4 = []

# NOTE(review): `|` binds loosest, so this reads as five alternatives
# (`\([0-9]`, `,+\)`, `[0-9]`, `,+\([0-9]`, `,+\)`) rather than as character
# classes of digits and commas. Kept byte-identical because the rest of the
# cell was tuned against it.
regex = r'\([0-9]|,+\)|[0-9]|,+\([0-9]|,+\)'
pattern = re.compile(regex)
for t1, t2, t3, t4 in zip(tmp1, tmp2, tmp3, tmp4):
    # `pattern.match` raises TypeError for non-string cells (e.g. NaN); in
    # that case the values are passed through unchanged.
    try:
        matchObj = pattern.match(t1)
        if matchObj is not None:
            # tmp1 looks numeric: shift the values one column to the right.
            new_t3 = t2
            new_t2 = t1
            new_t1 = np.nan
        else:
            new_t3 = t3
            new_t2 = t2
            new_t1 = t1
    except TypeError:
        new_t3 = t3
        new_t2 = t2
        new_t1 = t1

    try:
        matchObj = pattern.match(t2)
        if matchObj is not None:
            new_t4 = new_t3
            new_t3 = new_t2
            new_t2 = np.nan
        else:
            new_t4 = t4
            new_t3 = t3
            new_t2 = t2
    except TypeError:
        new_t4 = t4
        new_t3 = t3
        new_t2 = t2

    new_tmp1.append(new_t1)
    new_tmp2.append(new_t2)
    new_tmp3.append(new_t3)
    new_tmp4.append(new_t4)


# tmp1, tmp3 and tmp4 assigned here are overwritten again further down in this
# cell; only tmp2 keeps the value computed by the loop above.
new_merged = merged.copy()
new_merged['tmp1'] = new_tmp1
new_merged['tmp2'] = new_tmp2
new_merged['tmp3'] = new_tmp3
new_merged['tmp4'] = new_tmp4

# Split the name: text before the first '※' stays in 'name', the remainder
# (the annotation markers) goes to 'tmp1'.
name_split = new_merged['name'].copy().apply(lambda x: str(x).split('※', maxsplit=1)[0])
annot = new_merged['name'].copy().apply(lambda x: x.split('※', maxsplit=1)[1] if len(str(x).split('※')) >= 2 else np.nan)
new_merged['name'] = name_split
new_merged['tmp1'] = annot

# Split the share count: text before the first '(' stays in 'stock', the
# bracketed remainder goes to 'tmp3'.
stock_split = new_merged['stock'].copy().apply(lambda x: str(x).split('(', maxsplit=1)[0])
stock_brackets = new_merged['stock'].copy().apply(lambda x: x.split('(', maxsplit=1)[1] if len(str(x).split('(')) >= 2 else np.nan)
new_merged['stock'] = stock_split
new_merged['tmp3'] = stock_brackets

# Split the ratio the same way; the bracketed remainder goes to 'tmp4'.
rates_split = new_merged['rates'].copy().apply(lambda x: str(x).split('(', maxsplit=1)[0])
rates_brackets = new_merged['rates'].copy().apply(lambda x: x.split('(', maxsplit=1)[1] if len(str(x).split('(')) >= 2 else np.nan)
new_merged['rates'] = rates_split
new_merged['tmp4'] = rates_brackets

# Repair one irregular row: the tmp columns of (num 94, row 98) are relabelled
# as the main columns and appended as an additional row.
adhoc_1 = new_merged.query('num == 94 and row == 98')[['tmp1', 'tmp2', 'tmp3', 'tmp4', 'row', 'num']]
# Plain assignment replaces `set_axis(..., inplace=True)`, whose `inplace`
# argument no longer exists in pandas >= 2.0.
adhoc_1.columns = ['name', 'loc', 'stock', 'rates', 'row', 'num']
new_merged = pd.concat([new_merged, adhoc_1])

# Remove brackets at both ends, drop '※' and turn '、' into ','.
# Every column is converted to str here (NaN becomes 'nan'), and
# `strip(r'\(')` strips the characters '\' and '(' from both ends.
for col in new_merged.columns:
    new_merged[col] = new_merged[col].apply(lambda x: str(x).strip(r'\(').strip(r'\)').replace('※', '').replace('、', ','))

# Column-specific clean-up.
new_merged['stock'] = new_merged['stock'].apply(lambda x: x.replace(',', ''))
new_merged['rates'] = new_merged['rates'].apply(lambda x: x.replace(',', ''))
new_merged['tmp1'] = new_merged['tmp1'].apply(lambda x: x.replace('.', ',').replace('()', ',').replace(')(', ',').strip(','))
# Keep nothing but digits and commas (used for the tmp1 column).
def degit_or_comma(value):
    """Return the digit runs of *value* joined by commas.

    Args:
        value: Any object; it is converted with ``str`` before searching.

    Returns:
        A string such as ``'1,23'`` made of every run of digits found, or
        ``np.nan`` when *value* contains no digit at all.
    """
    digit_runs = re.findall(r'\d+', str(value))
    if not digit_runs:
        return np.nan
    return ','.join(digit_runs)


# Insert commas into annotation numbers that have two or more digits.
def comma(value):
    """Separate the digits of a multi-digit number with commas.

    Args:
        value: Any object; its ``str`` form decides what happens.

    Returns:
        The characters of ``str(value)`` joined by commas when that string is
        purely numeric, has at least two characters and is neither ``'10'``
        nor ``'11'``; otherwise *value* itself, unchanged.
    """
    text = str(value)
    needs_split = text.isnumeric() and len(text) >= 2 and text not in ('10', '11')
    return ','.join(text) if needs_split else value

# tmp1: split multi-digit annotation numbers, then keep digits and commas only.
new_merged['tmp1'] = new_merged['tmp1'].apply(comma).apply(degit_or_comma)
# tmp3 / tmp4: remove thousands separators.
new_merged['tmp3'] = new_merged['tmp3'].map(lambda cell: cell.replace(',', ''))
new_merged['tmp4'] = new_merged['tmp4'].map(lambda cell: cell.replace(',', ''))

In [102]:
# Export the cleaned shareholder table as a UTF-8 CSV with a header row and
# without the index.
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/stockholders_table.csv'
new_merged.to_csv(path, encoding='utf-8', header=True, index=False)

In [104]:
# Read the exported CSV back in and display it (last expression of the cell).
path = 'C:/Users/koeci/Google ドライブ/MBA/ワークショップ/intermediate/stockholders_table.csv'
new_merged = pd.read_csv(path, encoding='utf-8')
new_merged

Unnamed: 0,edinet_code,firm_name,filling_ymd,name,loc,stock,rates,row,num,r_na_x,tmp1,tmp2,tmp3,tmp4,r_na_y
0,E00007,株式会社雪国まいたけ,2020-08-14,BainCapitalSnowHongKongLimited,"香港,アドミラルティ,クイーンズウェイ88,ワン・パシフィック・プレイス25階,スイート2501",20323500,50.90,0,0,0.0,23,,,,
1,E00007,株式会社雪国まいたけ,2020-08-14,㈱神明ホールディングス,兵庫県神戸市中央区栄町通6-1-21,19526500,48.91,1,0,0.0,34,,,,
2,E00007,株式会社雪国まいたけ,2020-08-14,足利厳,新潟県新潟市秋葉区,60700,0.15,2,0,0.0,5,,60700,0.15,
3,E00007,株式会社雪国まいたけ,2020-08-14,小室雅裕,東京都大田区,15200,0.04,3,0,0.0,6,,15200,0.04,
4,E00007,株式会社雪国まいたけ,2020-08-14,計,-,39925900,100.00,4,0,0.0,,,75900,0.19,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17501,E36976,ＰＨＣホールディングス株式会社,2021-09-07,所有株式数955株の株主1名,-,955,0.00,119,438,0.0,,,955,0.00,
17502,E36976,ＰＨＣホールディングス株式会社,2021-09-07,所有株式数910株の株主1名,-,910,0.00,120,438,0.0,,,,,
17503,E36976,ＰＨＣホールディングス株式会社,2021-09-07,所有株式数450株の株主12名,-,5400,0.00,121,438,0.0,,,5400,0.00,
17504,E36976,ＰＨＣホールディングス株式会社,2021-09-07,計,-,121935553,100.00,122,438,0.0,,,5784179,4.74,
