# Read XBRL from Taxonomy

タクソノミ定義の情報を参照して、XBRLから情報抽出を行う。


In [1]:
import os
from pathlib import Path
import edinet

# サントリーホールディングス株式会社/H31.03.26 12:50
DOC_ID = "S100FGSC"

# Data Folder
DATA_ROOT = Path.cwd().joinpath("data")

## Download XBRL

In [2]:
from edinet.xbrl_file import XBRLDir


# Download and load document
xbrl_path = edinet.api.document.get_xbrl(
    DOC_ID, save_dir=DATA_ROOT.joinpath("raw"),
    expand_level="dir")

xbrl_dir = XBRLDir(xbrl_path)



## Download Taxonomy

Get taxonomy file

In [3]:
taxonomies = {
    2013: "https://www.fsa.go.jp/search/20130821/editaxonomy2013New.zip",
    2014: "https://www.fsa.go.jp/search/20140310/1c.zip",
    2015: "https://www.fsa.go.jp/search/20150310/1c.zip",
    2016: "https://www.fsa.go.jp/search/20160314/1c.zip",
    2017: "https://www.fsa.go.jp/search/20170228/1c.zip",
    2018: "https://www.fsa.go.jp/search/20180228/1c_Taxonomy.zip",
    2019: "https://www.fsa.go.jp/search/20190228/1c_Taxonomy.zip"
}

Confirm fiscal year and target taxonomy

In [4]:
from datetime import datetime


fiscal_year_end = xbrl_dir.xbrl.find("jpdei_cor:CurrentFiscalYearEndDateDEI").text
fiscal_year_end = datetime.strptime(fiscal_year_end, "%Y-%m-%d")
taxonomy_year = -1

for y in taxonomies:
    boarder_date = datetime(y, 3, 31)
    if fiscal_year_end > boarder_date:
        taxonomy_year = y
    else:
        break

print(taxonomy_year)

2018


Download taxonomy

In [5]:
from zipfile import ZipFile
import requests


external_dir = DATA_ROOT.joinpath("external")
expand_dir = external_dir.joinpath("taxonomy").joinpath(str(taxonomy_year))
taxonomy_file = external_dir.joinpath(f"{taxonomy_year}_taxonomy.zip")

download = False

if not external_dir.exists():
    external_dir.mkdir(parents=True, exist_ok=True)
    download = True

if not expand_dir.exists():
    expand_dir.mkdir(parents=True, exist_ok=True)
    download = True

if download:
    # Download
    external_dir.mkdir(parents=True, exist_ok=True)
    r = requests.get(taxonomies[taxonomy_year], stream=True)
    with taxonomy_file.open(mode="wb") as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)

    # Extract
    with ZipFile(taxonomy_file, "r") as zip:
        for f in zip.namelist():
            # Avoid Japanese path 
            dirs = f.split("/")
            if dirs[2] == "taxonomy":
                _to = expand_dir.joinpath("/".join(dirs[3:]))
                _to.parent.mkdir(parents=True, exist_ok=True)
                with _to.open("wb") as _to_f:
                    _to_f.write(zip.read(f))

    taxonomy_file.unlink()

expand_dir

WindowsPath('c:/Users/tie301837/Documents/source/xbrl_read_tutorial/data/external/taxonomy/2018')

In [6]:
import os
from bs4 import BeautifulSoup


class TaxonomyReader():


    def __init__(self, root, parent_root, parent_prefix=""):
        self.root = root
        self.parent_root = parent_root
        self.parent_prefix = parent_prefix
        self._cache = {}
        if not self.parent_prefix:
            self.parent_prefix = "http://disclosure.edinet-fsa.go.jp/taxonomy/"
    
    def read(self, path, label="ja", verbose=True):
        _path = path
        element = ""
        use_parent = False

        if "#" in _path:
            _path, element = _path.split("#")

        if _path.startswith(self.parent_prefix):
            use_parent = True
            _path = _path.replace(self.parent_prefix, "")
            _path = os.path.join(self.parent_root, _path)
        else:
            _path = os.path.join(self.root, _path)

        xml = None
        if _path in self._cache:
            xml = self._cache[_path]
        else:
            with open(_path, encoding="utf-8-sig") as f:
                xml = BeautifulSoup(f, "lxml-xml")
            self._cache[_path] = xml
        
        if element:
            xml = xml.select(f"#{element}")
            if len(xml) > 0:
                xml = xml[0]
            
            if xml.name != "element":
                return Element(xml, None)

            label_ext = "_lab.xml"
            if label == "en":
                label_ext = "_lab-en.xml"
            elif label == "g":
                label_ext = "_gla.xml"

            label = None
            label_dir = os.path.dirname(_path)
            if use_parent:
                label_dir = os.path.join(label_dir, "label")

            for f in os.listdir(label_dir):
                label_path = os.path.join(label_dir, f)
                if f.endswith(label_ext):
                    break

            if label_path in self._cache:
                label_xml = self._cache[label_path]
            else:
                with open(label_path, encoding="utf-8-sig") as f:
                    label_xml = BeautifulSoup(f, "lxml-xml")
                self._cache[label_path] = label_xml
            
            reference = label_xml.find("link:labelArc", {"xlink:from": element})
            if reference is None:
                href = f"{os.path.basename(_path)}#{element}"
                if use_parent:
                    href = f"../{href}"
                location = label_xml.find("link:loc", {"xlink:href": href})
                reference = label_xml.find("link:labelArc", {"xlink:from": location["xlink:label"]})

            if reference:
                labels = label_xml.find_all("link:label", {"id": reference["xlink:to"]})

                if len(labels) > 1:
                    for lb in labels:
                        if lb["xlink:role"].endswith("verboseLabel") and verbose:
                            label = lb
                            break
                        else:
                            label = lb

                elif len(labels) > 0:
                    label = labels[0]
            
            return Element(xml, label)

        return xml


class Element():

    def __init__(self, tag, label):
        self.tag = tag
        self.label = label
    



In [7]:
taxonomy_reader = TaxonomyReader(xbrl_dir._document_folder, expand_dir)
taxonomy_reader.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_rt_2018-02-28.xsd#rol_CoverPage").tag

<link:roleType id="rol_CoverPage" roleURI="http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CoverPage">
<link:definition>表紙</link:definition>
<link:usedOn>link:presentationLink</link:usedOn>
<link:usedOn>link:calculationLink</link:usedOn>
<link:usedOn>link:definitionLink</link:usedOn>
<link:usedOn>link:footnoteLink</link:usedOn>
</link:roleType>

In [8]:
taxonomy_reader.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_cor_2018-02-28.xsd#jpcrp_cor_NetAssetsPerShareSummaryOfBusinessResults").label

<link:label id="label_NetAssetsPerShareSummaryOfBusinessResults" xlink:label="label_NetAssetsPerShareSummaryOfBusinessResults" xlink:role="http://www.xbrl.org/2003/role/label" xlink:type="resource" xml:lang="ja">１株当たり純資産額</link:label>

In [9]:
taxonomy_reader.read("jpcrp030000-asr-001_E22559-000_2018-12-31_01_2019-03-26.xsd#jpcrp030000-asr_E22559-000_NotesRegardingBusinessRestructuringExpenseTextBlock").label

<link:label id="label_NotesRegardingBusinessRestructuringExpenseTextBlock" xlink:label="label_NotesRegardingBusinessRestructuringExpenseTextBlock" xlink:role="http://www.xbrl.org/2003/role/label" xlink:type="resource" xml:lang="ja">組織再編関連費用の注記</link:label>

## Read Presentation Link

In [10]:
import dataclasses


@dataclasses.dataclass
class Namespace:
    name: str
    namespace: str

    @classmethod
    def read(cls, schema_tag):
        namespaces = []
        namespace_prefix = "xmlns:"
        for a in schema_tag.attrs:
            if a.startswith(namespace_prefix):
                name = a[len(namespace_prefix):]
                definition = schema_tag[a]
                n = Namespace(name, definition)
                namespaces.append(n)

        return n


In [16]:
presentation = xbrl_dir.pre
role_ref_tags = presentation.find_all("link:roleRef")
role_refs = {}
for t in role_ref_tags:
    role_refs[t["roleURI"]] = t["xlink:href"]


roles = {}
for p in presentation.find_all("link:presentationLink"):
    role = p["xlink:role"]
    role_href = role_refs[role]
    role_def = taxonomy_reader.read(role_href).tag.find("link:definition").text
    roles[role_def] = role

# Show roles
for r in roles:
    print(f"{r}\t{roles[r]}")


企業内容等の開示に関する内閣府令 第三号様式 有価証券報告書	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CabinetOfficeOrdinanceOnDisclosureOfCorporateInformationEtcFormNo3AnnualSecuritiesReport
表紙	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CoverPage
連結経営指標等	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_BusinessResultsOfGroup
提出会社の経営指標等	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_BusinessResultsOfReportingCompany
大株主の状況-01	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_MajorShareholders-01
経理の状況	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_FinancialInformation
貸借対照表関係	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesBalanceSheet
損益計算書関係	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesStatementOfIncome
重要な会計方針、財務諸表	http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_NotesSignificantAccountingPoliciesFinancialStatements
貸借対照表	http://disclosure.edinet-fsa.go.jp/role/jppfs/rol_BalanceSheet
損益計算書	http://disclosure.edinet-fsa.go.jp/role/jppfs/rol_StatementOfIncome
株主資本等変動計算書	http:

Read Structure of "貸借対照表"

In [26]:
document = presentation.find(
                "link:presentationLink",
                {"xlink:role": roles["貸借対照表"]})


class Node():

    def __init__(self, name, reference, order=0):
        self.name = name
        self.alias = name
        self.reference = reference
        self.parent = None
        self.order = order

    def add_parent(self, parent):
        self.parent = parent
    
    @property
    def path(self):
        path = str(self.order) + " " + self.name
        p = self.parent
        while p is not None:
            path = p.name + "/" + path
            p = p.parent
        return path

nodes = {}
node_by_ref = {}
for i, arc in enumerate(document.find_all("link:presentationArc")):
    if not arc["xlink:arcrole"].endswith("parent-child"):
        print("Unexpected arctype.")
        continue

    parent = None
    child = None
    
    if arc["xlink:from"] in nodes:
        parent = nodes[arc["xlink:from"]]
    else:
        node = document.find("link:loc", {"xlink:label": arc["xlink:from"]})
        parent = Node(node["xlink:label"],  node["xlink:href"], 0)

    if arc["xlink:to"] in nodes:
        child = nodes[arc["xlink:to"]]
    else:
        node = document.find("link:loc", {"xlink:label": arc["xlink:to"]})
        child = Node(node["xlink:label"],  node["xlink:href"], arc["order"])

    # Name base
    if parent.name in node:
        child.add_parent(parent)
    else:
        nodes[parent.name] = parent
    nodes[child.name] = child

    # Reference base
    if parent.reference in node_by_ref:
        alias = node_by_ref[parent.reference].name
        node_by_ref[parent.reference].name = parent.name
        node_by_ref[parent.reference].alias = alias
        child.add_parent(node_by_ref[parent.reference])
    else:
        node_by_ref[parent.reference] = parent
    node_by_ref[child.reference] = child


In [27]:
import re


def natural_sort(t):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", key)]
    return sorted(t, key=alphanum_key)


path_of_node = {}
for ref in nodes:
    n = nodes[ref]
    path = n.path
    path_of_node[path] = n


for path in natural_sort(path_of_node.keys()):
    indent = "_" * len(path.split("/"))
    print(indent + path)


_0 AssetsAbstract
_0 BalanceSheetHeading
_0 BalanceSheetLineItems
_0 BalanceSheetTable
_0 CapitalSurplusAbstract
_0 ConsolidatedOrNonConsolidatedAxis
_0 CurrentAssetsAbstract
_0 CurrentLiabilitiesAbstract
_0 IntangibleAssetsAbstract
_0 InvestmentsAndOtherAssetsAbstract
_0 LiabilitiesAbstract
_0 NetAssetsAbstract
_0 NoncurrentAssetsAbstract
_0 NoncurrentLiabilitiesAbstract
_0 OtherRetainedEarningsAbstract
_0 PropertyPlantAndEquipmentAbstract
_0 RetainedEarningsAbstract
_0 ShareholdersEquityAbstract
_0 ValuationAndTranslationAdjustmentsAbstract
_1.0 BalanceSheetTable
__BalanceSheetHeading/2.0 BalanceSheetLineItems
___BalanceSheetHeading/BalanceSheetLineItems/1.0 AssetsAbstract
___BalanceSheetHeading/BalanceSheetLineItems/2.0 LiabilitiesAbstract
___BalanceSheetHeading/BalanceSheetLineItems/3.0 NetAssetsAbstract
___BalanceSheetHeading/BalanceSheetLineItems/4.0 LiabilitiesAndNetAssets
____BalanceSheetHeading/BalanceSheetLineItems/AssetsAbstract/1.0 CurrentAssetsAbstract
____BalanceSheetHead

Append Names of tag

In [24]:
for path in natural_sort(path_of_node.keys()):
    indent = "_" * len(path.split("/"))
    node = path_of_node[path]
    label= taxonomy_reader.read(node.reference).label.text

    print(indent + label)


_資産の部
_貸借対照表
_貸借対照表
_貸借対照表
_資本剰余金
_連結個別
_流動資産
_流動負債
_無形固定資産
_投資その他の資産
_負債の部
_純資産の部
_固定資産
_固定負債
_その他利益剰余金
_有形固定資産
_利益剰余金
_株主資本
_評価・換算差額等
_貸借対照表
__貸借対照表
___資産の部
___負債の部
___純資産の部
___負債純資産
____流動資産
____固定資産
____資産
_____現金及び預金
_____その他
_____流動資産
_____売掛金
_____短期貸付金
_____前渡金
_____繰延税金資産
_____未収入金
_____有形固定資産
_____無形固定資産
_____投資その他の資産
_____固定資産
_____繰延資産
______借地権
______その他
______無形固定資産
______投資有価証券
______その他
______貸倒引当金
______投資その他の資産
______関係会社株式
______関係会社長期貸付金
______長期前払費用
______前払年金費用
______建物（純額）
______機械及び装置（純額）
______工具、器具及び備品（純額）
______土地
______建設仮勘定
______その他（純額）
______有形固定資産
____流動負債
____固定負債
____負債
_____短期借入金
_____従業員預り金
_____預り金
_____賞与引当金
_____その他
_____流動負債
_____1年内返済予定の長期借入金
_____1年内償還予定の社債
_____未払金
_____未払費用
_____未払消費税等
_____未払法人税等
_____社債
_____長期借入金
_____繰延税金負債
_____退職給付引当金
_____その他
_____固定負債
____株主資本
____評価・換算差額等
____純資産
_____資本金
_____資本剰余金
_____利益剰余金
_____自己株式
_____株主資本
______資本準備金
______その他資本剰余金
______資本剰余金
______その他利益剰余金
______利益剰余金
_______特別償却準備金
_______固定資産圧縮積立金
_______

## Read Document Schema

Get namespaces

In [0]:
import dataclasses


@dataclasses.dataclass
class Namespace:
    name: str
    namespace: str
    location: str

Read from taxonomy