# Read XBRL from Taxonomy

タクソノミ定義の情報を参照して、XBRLから情報抽出を行う。


In [128]:
import os
from pathlib import Path
import edinet

# Target filing: Suntory Holdings Limited / H31.03.26 12:50
DOC_ID = "S100FGSC"

# All downloaded and extracted data is kept under ./data of the working directory.
DATA_ROOT = Path.cwd() / "data"

## Download XBRL

In [129]:
from edinet.xbrl_file import XBRLDir


# Download the document identified by DOC_ID into data/raw and wrap the
# returned path in an XBRLDir for later access.
raw_dir = DATA_ROOT.joinpath("raw")
xbrl_path = edinet.api.document.get_xbrl(
    DOC_ID,
    save_dir=raw_dir,
    expand_level="dir",
)
xbrl_dir = XBRLDir(xbrl_path)



## Download Taxonomy

Define the download URL of the taxonomy archive for each taxonomy year.

In [130]:
# Taxonomy year -> URL of the taxonomy archive, listed in ascending year order.
taxonomies = dict([
    (2013, "https://www.fsa.go.jp/search/20130821/editaxonomy2013New.zip"),
    (2014, "https://www.fsa.go.jp/search/20140310/1c.zip"),
    (2015, "https://www.fsa.go.jp/search/20150310/1c.zip"),
    (2016, "https://www.fsa.go.jp/search/20160314/1c.zip"),
    (2017, "https://www.fsa.go.jp/search/20170228/1c.zip"),
    (2018, "https://www.fsa.go.jp/search/20180228/1c_Taxonomy.zip"),
    (2019, "https://www.fsa.go.jp/search/20190228/1c_Taxonomy.zip"),
])

Confirm fiscal year and target taxonomy

In [131]:
from datetime import datetime


# Read the fiscal year end date from the DEI element of the XBRL instance.
fiscal_year_end = datetime.strptime(
    xbrl_dir.xbrl.find("jpdei_cor:CurrentFiscalYearEndDateDEI").text,
    "%Y-%m-%d")

# Walk the taxonomy years in their listed order and keep the last year whose
# March 31 lies strictly before the fiscal year end; -1 means none matched.
taxonomy_year = -1
for candidate_year in taxonomies:
    if fiscal_year_end <= datetime(candidate_year, 3, 31):
        break
    taxonomy_year = candidate_year

print(taxonomy_year)

2018


Download taxonomy

In [132]:
from zipfile import ZipFile
import requests


# Locations used for the taxonomy of the selected year:
#   external_dir  - parent folder for externally obtained data
#   expand_dir    - folder the taxonomy files are extracted into
#   taxonomy_file - temporary location of the downloaded archive
external_dir = DATA_ROOT.joinpath("external")
expand_dir = external_dir.joinpath("taxonomy").joinpath(str(taxonomy_year))
taxonomy_file = external_dir.joinpath(f"{taxonomy_year}_taxonomy.zip")

# Download when the taxonomy folder is missing OR empty. An empty folder is
# what an interrupted earlier run leaves behind, so it must not count as
# "already downloaded".
download = not expand_dir.exists() or not any(expand_dir.iterdir())

if download:
    # Resolve the URL first so an unknown year fails before anything is created.
    taxonomy_url = taxonomies[taxonomy_year]
    expand_dir.mkdir(parents=True, exist_ok=True)

    # Download. Fail loudly on an HTTP error instead of saving an error page
    # as if it were the archive.
    with requests.get(taxonomy_url, stream=True) as response:
        response.raise_for_status()
        with taxonomy_file.open(mode="wb") as archive_out:
            for chunk in response.iter_content(1024):
                archive_out.write(chunk)

    # Extract. The archive handle is deliberately not named `zip`, which would
    # rebind the builtin for every later cell of the notebook.
    with ZipFile(taxonomy_file, "r") as archive:
        for member in archive.namelist():
            # Avoid Japanese path: keep only what follows the "taxonomy" folder.
            parts = member.split("/")
            if len(parts) <= 3 or parts[2] != "taxonomy":
                continue
            relative_parts = parts[3:]
            # Skip directory entries (trailing "/") and anything that would
            # escape expand_dir.
            if not relative_parts[-1] or ".." in relative_parts:
                continue
            target = expand_dir.joinpath("/".join(relative_parts))
            target.parent.mkdir(parents=True, exist_ok=True)
            with target.open("wb") as target_out:
                target_out.write(archive.read(member))

    # The archive is only needed for extraction.
    taxonomy_file.unlink()

expand_dir

WindowsPath('c:/Users/tie301837/Documents/source/xbrl_read_tutorial/data/external/taxonomy/2018')

In [133]:
import os
from bs4 import BeautifulSoup


class TaxonomyReader():
    """Resolve schema/linkbase references to local files and parse them.

    A reference that starts with ``parent_prefix`` is looked up below
    ``parent_root``; any other reference is looked up below ``root``.
    """

    def __init__(self, root, parent_root, parent_prefix=""):
        """Store the lookup roots.

        Args:
            root: Folder used for references that do not start with
                ``parent_prefix``.
            parent_root: Folder used for references that start with
                ``parent_prefix``.
            parent_prefix: URL prefix that is mapped onto ``parent_root``.
                An empty value selects
                ``http://disclosure.edinet-fsa.go.jp/taxonomy/``.
        """
        self.root = root
        self.parent_root = parent_root
        self.parent_prefix = parent_prefix
        if not self.parent_prefix:
            self.parent_prefix = "http://disclosure.edinet-fsa.go.jp/taxonomy/"

    def read(self, path):
        """Parse the file referenced by ``path``.

        Args:
            path: File reference, optionally followed by ``#element_id``.

        Returns:
            The parsed document when ``path`` has no fragment. With a
            fragment, the first element matching the ``#element_id`` selector,
            or the empty result of ``select`` when nothing matches.

        Raises:
            FileNotFoundError: If the resolved local file does not exist.
        """
        # Split off the fragment once; everything after the first "#" is the id.
        _path, _, fragment = path.partition("#")
        element = "#" + fragment if fragment else ""

        if _path.startswith(self.parent_prefix):
            # Strip only the leading prefix, then resolve below parent_root.
            _path = os.path.join(
                self.parent_root, _path[len(self.parent_prefix):])
        else:
            # Use the fragment-free path: the "#..." part is not part of the
            # file name.
            _path = os.path.join(self.root, _path)

        # utf-8-sig drops a leading BOM if the file has one.
        with open(_path, encoding="utf-8-sig") as f:
            xml = BeautifulSoup(f, "lxml-xml")

        if element:
            xml = xml.select(element)
            if len(xml) > 0:
                xml = xml[0]

        return xml


In [134]:
taxonomy_reader = TaxonomyReader(xbrl_dir._document_folder, expand_dir)
taxonomy_reader.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_rt_2018-02-28.xsd#rol_CoverPage")

<link:roleType id="rol_CoverPage" roleURI="http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CoverPage">
<link:definition>表紙</link:definition>
<link:usedOn>link:presentationLink</link:usedOn>
<link:usedOn>link:calculationLink</link:usedOn>
<link:usedOn>link:definitionLink</link:usedOn>
<link:usedOn>link:footnoteLink</link:usedOn>
</link:roleType>

## Read Presentation Link

In [135]:
import dataclasses


@dataclasses.dataclass
class Namespace:
    """A namespace declaration taken from an ``xmlns:`` attribute."""

    # Attribute name with the leading "xmlns:" removed.
    name: str
    # Value of the "xmlns:" attribute.
    namespace: str

    @classmethod
    def read(cls, schema_tag):
        """Collect every ``xmlns:`` declaration of ``schema_tag``.

        Args:
            schema_tag: Tag object exposing ``attrs`` (iterable of attribute
                names) and item access by attribute name.

        Returns:
            A list with one instance per ``xmlns:``-prefixed attribute, in
            the iteration order of ``schema_tag.attrs``. The list is empty
            when the tag declares none.
        """
        namespace_prefix = "xmlns:"
        # Return the whole collection, not just the last declaration seen.
        return [
            cls(attr[len(namespace_prefix):], schema_tag[attr])
            for attr in schema_tag.attrs
            if attr.startswith(namespace_prefix)
        ]


In [136]:
presentation = xbrl_dir.pre

# Role URI -> href of the role declaration, taken from the roleRef tags.
role_ref_tags = presentation.find_all("link:roleRef")
role_refs = {tag["roleURI"]: tag["xlink:href"] for tag in role_ref_tags}

# Definition text of each presentation link's role -> role URI.
roles = {}
for link in presentation.find_all("link:presentationLink"):
    role_uri = link["xlink:role"]
    role_tag = taxonomy_reader.read(role_refs[role_uri])
    roles[role_tag.find("link:definition").text] = role_uri

print(roles)

{'企業内容等の開示に関する内閣府令 第三号様式 有価証券報告書': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CabinetOfficeOrdinanceOnDisclosureOfCorporateInformationEtcFormNo3AnnualSecuritiesReport', '表紙': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CoverPage', '連結経営指標等': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_BusinessResultsOfGroup', '提出会社の経営指標等': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_BusinessResultsOfReportingCompany', '大株主の状況-01': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_MajorShareholders-01', '経理の状況': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_FinancialInformation', '貸借対照表関係': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesBalanceSheet', '損益計算書関係': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesStatementOfIncome', '重要な会計方針、財務諸表': 'http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesSignificantAccountingPoliciesFinancialStatements', '貸借対照表': 'http://disclosure.edinet-fsa.go.jp/role/jppfs/rol_BalanceSheet', '損益計算書': 'http://disclosure.e

Read Structure of "企業内容等の開示に関する内閣府令 第三号様式 有価証券報告書"

In [157]:
# Pick the presentation link whose role matches the annual securities report form.
document = presentation.find(
    "link:presentationLink",
    attrs={
        "xlink:role": roles["企業内容等の開示に関する内閣府令 第三号様式 有価証券報告書"],
    },
)


class Node():
    """An entry of a parent/child hierarchy, identified by ``key``."""

    def __init__(self, key, order=0):
        """Create a parentless node with the given key and order."""
        self.key, self.parent, self.order = key, None, order

    def add_parent(self, parent):
        """Set ``parent`` as this node's parent, replacing any earlier one."""
        self.parent = parent

    @property
    def path(self):
        """Keys from the topmost ancestor down to this node, joined by "/",
        followed by a space and this node's order."""
        segments = [self.key]
        ancestor = self.parent
        while ancestor is not None:
            segments.append(ancestor.key)
            ancestor = ancestor.parent
        # Segments were collected leaf-first; the path reads root-first.
        return "/".join(reversed(segments)) + " " + str(self.order)

# Build one Node per locator referenced by the presentation arcs and link
# every child to its parent.
nodes = {}
for index, arc in enumerate(document.find_all("link:presentationArc")):
    if not arc["xlink:arcrole"].endswith("parent-child"):
        print("Unexpected arctype.")
        continue

    parent_key, child_key, child_order = (
        arc["xlink:from"], arc["xlink:to"], arc["order"])

    child_node = nodes.get(child_key)
    if child_node is None:
        child_node = nodes[child_key] = Node(child_key, child_order)
    else:
        # Already created (as somebody's parent): take the order from this arc.
        child_node.order = child_order

    # A parent seen for the first time gets the arc index as its order.
    if parent_key not in nodes:
        nodes[parent_key] = Node(parent_key, index)

    child_node.add_parent(nodes[parent_key])

# Index the nodes by their path string.
keys = {node.path: node for node in nodes.values()}

# Print the node keys in path order, prefixed by one "_" per path segment.
for path_key in sorted(keys):
    depth = path_key.count("/") + 1
    print("_" * depth + keys[path_key].key)


_AcquisitionsByResolutionOfBoardOfDirectorsMeetingHeading
__AcquisitionsByResolutionOfBoardOfDirectorsMeetingNATextBlock
_AcquisitionsByResolutionOfShareholdersMeetingHeading
__AcquisitionsByResolutionOfShareholdersMeetingNATextBlock
_AcquisitionsEtcOfTreasurySharesHeading
__AcquisitionsByResolutionOfBoardOfDirectorsMeetingHeading_2
__AcquisitionsByResolutionOfShareholdersMeetingHeading_2
__AcquisitionsNotBasedOnResolutionOfShareholdersMeetingOrBoardOfDirectorsMeetingHeading_2
__ClassesOfSharesEtcHeading_2
__DisposalsOrHoldingOfAcquiredTreasurySharesHeading_2
_AcquisitionsNotBasedOnResolutionOfShareholdersMeetingOrBoardOfDirectorsMeetingHeading
__AcquisitionsNotBasedOnResolutionOfShareholdersMeetingOrBoardOfDirectorsMeetingNATextBlock
_AnnexedDetailedScheduleOfPropertyPlantAndEquipmentEtcHeading
__AnnexedDetailedScheduleOfPropertyPlantAndEquipmentEtcTextBlock
_AnnexedDetailedScheduleOfProvisionsHeading
__AnnexedDetailedScheduleOfProvisionsTextBlock
_AnnexedDetailedScheduleOfSecuritiesH

## Read Document Schema

Get namespaces

In [37]:
import dataclasses


@dataclasses.dataclass
class Namespace:
    """Record describing a namespace by name, namespace and location.

    NOTE(review): an earlier cell of this notebook also defines a dataclass
    named ``Namespace`` (two fields plus a ``read`` classmethod). Running
    this cell rebinds the name to this three-field version.
    """
    # presumably the namespace prefix - TODO confirm against the code that fills it
    name: str
    # presumably the namespace URI - TODO confirm
    namespace: str
    # presumably where the namespace's schema can be found - TODO confirm
    location: str

Read from taxonomy