# Read XBRL from Taxonomy

タクソノミ定義の情報を参照して、XBRLから情報抽出を行う。


In [1]:
import os
from pathlib import Path
import edinet

# サントリーホールディングス株式会社/H31.03.26 12:50
DOC_ID = "S100FGSC"

# Data Folder
DATA_ROOT = Path.cwd().joinpath("data")

## Download XBRL

In [2]:
from edinet.xbrl_file import XBRLDir


# Download and load document
xbrl_path = edinet.api.document.get_xbrl(
    DOC_ID, save_dir=DATA_ROOT.joinpath("raw"),
    expand_level="dir")

xbrl_dir = XBRLDir(xbrl_path)



## Download Taxonomy

Get taxonomy file

In [3]:
taxonomies = {
    2013: "https://www.fsa.go.jp/search/20130821/editaxonomy2013New.zip",
    2014: "https://www.fsa.go.jp/search/20140310/1c.zip",
    2015: "https://www.fsa.go.jp/search/20150310/1c.zip",
    2016: "https://www.fsa.go.jp/search/20160314/1c.zip",
    2017: "https://www.fsa.go.jp/search/20170228/1c.zip",
    2018: "https://www.fsa.go.jp/search/20180228/1c_Taxonomy.zip",
    2019: "https://www.fsa.go.jp/search/20190228/1c_Taxonomy.zip"
}

Confirm fiscal year and target taxonomy

In [4]:
from datetime import datetime


fiscal_year_end = xbrl_dir.xbrl.find("jpdei_cor:CurrentFiscalYearEndDateDEI").text
fiscal_year_end = datetime.strptime(fiscal_year_end, "%Y-%m-%d")
taxonomy_year = -1

for y in taxonomies:
    boarder_date = datetime(y, 3, 31)
    if fiscal_year_end > boarder_date:
        taxonomy_year = y
    else:
        break

print(taxonomy_year)

2018


Download taxonomy

In [5]:
from zipfile import ZipFile
import requests


external_dir = DATA_ROOT.joinpath("external")
expand_dir = external_dir.joinpath("taxonomy").joinpath(str(taxonomy_year))
taxonomy_file = external_dir.joinpath(f"{taxonomy_year}_taxonomy.zip")

download = False

if not external_dir.exists():
    external_dir.mkdir(parents=True, exist_ok=True)
    download = True

if not expand_dir.exists():
    expand_dir.mkdir(parents=True, exist_ok=True)
    download = True

if download:
    # Download
    external_dir.mkdir(parents=True, exist_ok=True)
    r = requests.get(taxonomies[taxonomy_year], stream=True)
    with taxonomy_file.open(mode="wb") as f:
        for chunk in r.iter_content(1024):
            f.write(chunk)

    # Extract
    with ZipFile(taxonomy_file, "r") as zip:
        for f in zip.namelist():
            # Avoid Japanese path 
            dirs = f.split("/")
            if dirs[2] == "taxonomy":
                _to = expand_dir.joinpath("/".join(dirs[3:]))
                _to.parent.mkdir(parents=True, exist_ok=True)
                with _to.open("wb") as _to_f:
                    _to_f.write(zip.read(f))

    taxonomy_file.unlink()

expand_dir

WindowsPath('c:/Users/tie301837/Documents/source/xbrl_read_tutorial/data/external/taxonomy/2018')

In [6]:
class Element():

    def __init__(self, name, element, location, taxonomy):
        self.name = name
        self.element = element
        self.location = location
        self.taxonomy = taxonomy

    def definition(self):
        def_dir = self.taxonomy.root
        path, element_name = self.location.split("#")
        location = self.location

        if path.startswith(self.taxonomy.reference_prefix):
            path = path.replace(self.taxonomy.reference_prefix, "")
            path = os.path.join(self.taxonomy.reference_root, path)
        else:
            path = os.path.join(self.taxonomy.root, path)
        
        xml = self.taxonomy._read_from_cache(path)
        _def = xml.find("xsd:element", {"id": element_name})
        return _def

    def label(self, kind="ja", verbose=True):
        label_ext = "_lab.xml"
        if kind == "en":
            label_ext = "_lab-en.xml"
        elif kind == "g":
            label_ext = "_gla.xml"

        label = None

        label_dir = self.taxonomy.root
        path, element_name = self.location.split("#")
        location = self.location

        if path.startswith(self.taxonomy.reference_prefix):
            path = path.replace(self.taxonomy.reference_prefix, "")
            label_dir_reference = os.path.join(
                                    self.taxonomy.reference_root,
                                    f"{os.path.dirname(path)}/label")
            label_dir = label_dir_reference
            location = f"../{os.path.basename(path)}#{element_name}"

        targets = []
        for f in os.listdir(label_dir):
            label_path = os.path.join(label_dir, f)
            if not label_path.endswith(label_ext):
                continue

            label_xml = self.taxonomy._read_from_cache(label_path)
            targets = self._read_link(
                xml=label_xml, arc_name="link:labelArc", location=location,
                target_name="link:label", target_attribute="id")

        if len(targets) > 1:
            for lb in targets:
                if lb["xlink:role"].endswith("verboseLabel") and verbose:
                    label = lb
                    break
                else:
                    label = lb

        elif len(targets) > 0:
            label = targets[0]
        
        return label

    def _read_link(self, xml, arc_name, location="",
                   target_name="", target_attribute=""):

        # link: href: absolute path to element definition by url format.
        # name: underscore separated name. when used by tag, it is splited by ":"
        # name is solved by namespace so
        # name => link is good approach.

        location = location if location else self.location
        label = xml.find("link:loc", {"xlink:href": location})
        arc = None

        if label is not None:
            arc = xml.find(arc_name, {"xlink:from": label["xlink:label"]})
        else:
            arc = xml.find(arc_name, {"xlink:label": self.name})

        if arc is None:
            return []

        target_name = target_name if target_name else "link:loc"
        target_attribute = target_attribute if target_attribute else "xlink:label"
        targets = []
        if arc is not None:
            targets = xml.find_all(target_name, {target_attribute: arc["xlink:to"]})

        return targets


In [7]:
import os
from bs4 import BeautifulSoup


class Taxonomy():

    def __init__(self, root, reference_root, reference_prefix=""):
        self.root = root
        self.reference_root = reference_root
        self.reference_prefix = reference_prefix
        self._cache = {}
        if not self.reference_prefix:
            self.reference_prefix = "http://disclosure.edinet-fsa.go.jp/taxonomy/"

    def _read_from_cache(self, path):
        xml = None
        if path in self._cache:
            xml = self._cache[path]
        else:
            with open(path, encoding="utf-8-sig") as f:
                xml = BeautifulSoup(f, "lxml-xml")
            self._cache[path] = xml
        return self._cache[path]

    def read(self, href):
        path = href
        element = ""
        use_parent = False

        if "#" in path:
            path, element = path.split("#")

        if path.startswith(self.reference_prefix):
            path = path.replace(self.reference_prefix, "")
            path = os.path.join(self.reference_root, path)
        else:
            path = os.path.join(self.root, path)
        
        xml = self._read_from_cache(path)

        if element:
            xml = xml.select(f"#{element}")
            if len(xml) > 0:
                xml = xml[0]
            xml = Element(element, xml, href, self)

        return xml


In [8]:
taxonomy = Taxonomy(xbrl_dir._document_folder, expand_dir)
taxonomy.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_rt_2018-02-28.xsd#rol_CoverPage").element

<link:roleType id="rol_CoverPage" roleURI="http://disclosure.edinet-fsa.go.jp/role/jpcrp/rol_CoverPage">
<link:definition>表紙</link:definition>
<link:usedOn>link:presentationLink</link:usedOn>
<link:usedOn>link:calculationLink</link:usedOn>
<link:usedOn>link:definitionLink</link:usedOn>
<link:usedOn>link:footnoteLink</link:usedOn>
</link:roleType>

In [9]:
taxonomy.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_cor_2018-02-28.xsd#jpcrp_cor_NetAssetsPerShareSummaryOfBusinessResults").label()

<link:label id="label_NetAssetsPerShareSummaryOfBusinessResults" xlink:label="label_NetAssetsPerShareSummaryOfBusinessResults" xlink:role="http://www.xbrl.org/2003/role/label" xlink:type="resource" xml:lang="ja">１株当たり純資産額</link:label>

In [10]:
taxonomy.read("http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_cor_2018-02-28.xsd#jpcrp_cor_NetAssetsPerShareSummaryOfBusinessResults").definition()

<xsd:element abstract="false" id="jpcrp_cor_NetAssetsPerShareSummaryOfBusinessResults" name="NetAssetsPerShareSummaryOfBusinessResults" nillable="true" substitutionGroup="xbrli:item" type="num:perShareItemType" xbrli:periodType="instant"/>

In [11]:
taxonomy.read("jpcrp030000-asr-001_E22559-000_2018-12-31_01_2019-03-26.xsd#jpcrp030000-asr_E22559-000_NotesRegardingBusinessRestructuringExpenseTextBlock").label(kind="en")

<link:label id="label_NotesRegardingBusinessRestructuringExpenseTextBlock" xlink:label="label_NotesRegardingBusinessRestructuringExpenseTextBlock" xlink:role="http://www.xbrl.org/2003/role/label" xlink:type="resource" xml:lang="en">Notes regarding business restructuring expense</link:label>

In [12]:
taxonomy.read("jpcrp030000-asr-001_E22559-000_2018-12-31_01_2019-03-26.xsd#jpcrp030000-asr_E22559-000_NotesRegardingBusinessRestructuringExpenseTextBlock").definition()

<xsd:element abstract="false" id="jpcrp030000-asr_E22559-000_NotesRegardingBusinessRestructuringExpenseTextBlock" name="NotesRegardingBusinessRestructuringExpenseTextBlock" nillable="true" substitutionGroup="xbrli:item" type="nonnum:textBlockItemType" xbrli:periodType="duration"/>

## Read Presentation Link

List up links

In [13]:
role_ref_tags = xbrl_dir.xbrl.find_all("link:roleRef")
role_ref_elements = [t.element for t in role_ref_tags]
role_refs = {}
for e in role_ref_elements:
    role_refs[e["roleURI"]] = e["xlink:href"]


roles = {}
for r in role_refs:
    role_name = taxonomy.read(role_refs[r]).element.find("link:definition").text
    roles[role_name] = r

# Show roles
for r in roles:
    print(f"{r}\t{roles[r]}")


貸借対照表	http://disclosure.edinet-fsa.go.jp/role/jppfs/rol_BalanceSheet
損益計算書	http://disclosure.edinet-fsa.go.jp/role/jppfs/rol_StatementOfIncome
注記番号	http://disclosure.edinet-fsa.go.jp/role/jppfs/role/NotesNumber


Read one of the link

* Define `Node` class to deal with tree structure

In [14]:
class Node():

    def __init__(self, element, order=0):
        self.element = element
        self.parent = None
        self.order = order

    def add_parent(self, parent):
        self.parent = parent

    @property
    def name(self):
        return self.element["xlink:href"].split("#")[-1]

    @property
    def label(self):
        return self.element["xlink:label"]

    @property
    def location(self):
        return self.element["xlink:href"]

    @property
    def depth(self):
        return len(self.get_parents())

    @property
    def path(self):
        parents = list(reversed(self.get_parents()))
        if len(parents) == 0:
            return self.name
        else:
            path = str(self.order) + " " + self.name
            for p in parents:
                path = p.name + "/" + path
            return path

    def get_parents(self):
        parents = []
        if self.parent is None:
            return parents
        else:
            p = self.parent
            while p is not None:
                parents.insert(0, p)
                p = p.parent
            return parents


Read link (`.pre`) file. 

In [15]:
pre_def = xbrl_dir.pre.find(
            "link:presentationLink", {"xlink:role": roles["貸借対照表"]})

nodes = {}
for i, arc in enumerate(pre_def.find_all("link:presentationArc")):
    if not arc["xlink:arcrole"].endswith("parent-child"):
        print("Unexpected arctype.")
        continue

    parent = Node(pre_def.find("link:loc", {"xlink:label": arc["xlink:from"]}), i)
    child = Node(pre_def.find("link:loc", {"xlink:label": arc["xlink:to"]}), arc["order"])

    if child.name not in nodes:
        nodes[child.name] = child
    else:
        nodes[child.name].order = arc["order"]

    if parent.name not in nodes:
        nodes[parent.name] = parent

    nodes[child.name].add_parent(nodes[parent.name])


* Convert tree structure to table (`pandas` dataframe). 
* Read label from `lab.xml` file.
* Read element definition from  `.xsd` file. 

In [16]:
import pandas as pd


parent_depth = -1
for name in nodes:
    if parent_depth < nodes[name].depth:
        parent_depth= nodes[name].depth

data = []
for name in nodes:
    n = nodes[name]
    item = {}
    parents = n.get_parents()
    parents = parents + ([""] * (parent_depth - len(parents)))

    for i, p in enumerate(parents):
        name = p if isinstance(p, str) else p.name
        order = "0" if isinstance(p, str) else p.order
        item[f"parent_{i}"] = name
        item[f"parent_{i}_order"] = order

    item["element"] = n.name
    item["order"] = n.order
    item["depth"] = n.depth

    # Label
    item["label"] = taxonomy.read(n.location).label().text

    # Definition
    _def = taxonomy.read(n.location).definition()
    item["abstract"] = _def["abstract"]
    item["type"] = _def["type"]

    if "xbrli:periodType" in _def.attrs:
        item["period_type"] = _def["xbrli:periodType"]

    if "xbrli:balance" in _def.attrs:
        item["balance"] = _def["xbrli:balance"]

    data.append(item)


data = pd.DataFrame(data)
data.sort_values(by=[c for c in data.columns if c.endswith("order")], inplace=True)

In [17]:
data.head(10)

Unnamed: 0,parent_0,parent_0_order,parent_1,parent_1_order,parent_2,parent_2_order,parent_3,parent_3_order,parent_4,parent_4_order,parent_5,parent_5_order,element,order,depth,label,abstract,type,period_type,balance
0,jppfs_cor_BalanceSheetHeading,0,,0.0,,0.0,,0,,0,,0,jppfs_cor_BalanceSheetTable,1.0,1,貸借対照表,True,xbrli:stringItemType,duration,
4,jppfs_cor_BalanceSheetHeading,0,,0.0,,0.0,,0,,0,,0,jppfs_cor_BalanceSheetLineItems,2.0,1,貸借対照表,True,xbrli:stringItemType,duration,
2,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetTable,1.0,,0.0,,0,,0,,0,jppfs_cor_ConsolidatedOrNonConsolidatedAxis,1.0,2,連結個別,True,xbrli:stringItemType,duration,
3,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetTable,1.0,jppfs_cor_ConsolidatedOrNonConsolidatedAxis,1.0,,0,,0,,0,jppfs_cor_NonConsolidatedMember,1.0,3,非連結又は個別,True,nonnum:domainItemType,duration,
5,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,,0.0,,0,,0,,0,jppfs_cor_AssetsAbstract,1.0,2,資産の部,True,xbrli:stringItemType,duration,
40,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,,0.0,,0,,0,,0,jppfs_cor_LiabilitiesAbstract,2.0,2,負債の部,True,xbrli:stringItemType,duration,
62,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,,0.0,,0,,0,,0,jppfs_cor_NetAssetsAbstract,3.0,2,純資産の部,True,xbrli:stringItemType,duration,
83,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,,0.0,,0,,0,,0,jppfs_cor_LiabilitiesAndNetAssets,4.0,2,負債純資産,False,xbrli:monetaryItemType,instant,credit
6,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,,0,,0,,0,jppfs_cor_CurrentAssetsAbstract,1.0,3,流動資産,True,xbrli:stringItemType,duration,
15,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,,0,,0,,0,jppfs_cor_NoncurrentAssetsAbstract,2.0,3,固定資産,True,xbrli:stringItemType,duration,


## Read XBRL according to structure

Read namespace

In [18]:
xbrl = xbrl_dir.xbrl
schema = xbrl.find("xbrli:xbrl")
namespaces = {}
for a in schema.element.attrs:
    if a.startswith("xmlns:"):
        namespaces[a.replace("xmlns:", "")] = schema.element.attrs[a]

namespaces

{'link': 'http://www.xbrl.org/2003/linkbase',
 'jpdei_cor': 'http://disclosure.edinet-fsa.go.jp/taxonomy/jpdei/2013-08-31/jpdei_cor',
 'iso4217': 'http://www.xbrl.org/2003/iso4217',
 'jpcrp030000-asr_E22559-000': 'http://disclosure.edinet-fsa.go.jp/jpcrp030000/asr/001/E22559-000/2018-12-31/01/2019-03-26',
 'xbrldi': 'http://xbrl.org/2006/xbrldi',
 'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
 'xlink': 'http://www.w3.org/1999/xlink',
 'jpcrp_cor': 'http://disclosure.edinet-fsa.go.jp/taxonomy/jpcrp/2018-02-28/jpcrp_cor',
 'xbrli': 'http://www.xbrl.org/2003/instance',
 'jppfs_cor': 'http://disclosure.edinet-fsa.go.jp/taxonomy/jppfs/2018-02-28/jppfs_cor'}

Read elements according to structure from link file

In [19]:
xbrl_data = []

for i, row in data.iterrows():
    tag_name = row["element"]

    for n in namespaces:
        if tag_name.startswith(n):
            tag_name = f"{n}:{tag_name.replace(n + '_', '')}"
            break

    tag = xbrl.find(tag_name)
    element = tag.element
    if element is None:
        continue

    item = {}
    for k in data.columns:
        item[k] = row[k]

    for i in range(parent_depth):
        parent_label = data[data["element"] == row[f"parent_{i}"]]["label"]
        item[f"parent_{i}_name"] = "" if len(parent_label) == 0 else parent_label.tolist()[0]
    
    item["value"] = element.text
    item["unit"] = element["unitRef"]

    context_id = element["contextRef"]
    if context_id.endswith("NonConsolidatedMember"):
        item["individual"] = True
    else:
        item["individual"] = False

    context = xbrl.find("xbrli:context", {"id": context_id})
    if item["period_type"] == "duration":
        item["period"] = context.find("xbrli:endDate").text
        item["period_begin"] = context.find("xbrli:startDate").text
    else:
        item["period"] = context.find("xbrli:instant").text
        item["period_begin"] = None

    xbrl_data.append(item)


xbrl_data = pd.DataFrame(xbrl_data)

In [20]:
xbrl_data.head(10)

Unnamed: 0,parent_0,parent_0_order,parent_1,parent_1_order,parent_2,parent_2_order,parent_3,parent_3_order,parent_4,parent_4_order,...,parent_1_name,parent_2_name,parent_3_name,parent_4_name,parent_5_name,value,unit,individual,period,period_begin
0,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,,0.0,,0.0,,0,...,貸借対照表,,,,,2180181000000,JPY,True,2017-12-31,
1,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,,0.0,,0,...,貸借対照表,資産の部,,,,2180181000000,JPY,True,2017-12-31,
2,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,182641000000,JPY,True,2017-12-31,
3,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,3809000000,JPY,True,2017-12-31,
4,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,1177345000000,JPY,True,2017-12-31,
5,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,3841000000,JPY,True,2017-12-31,
6,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,945063000000,JPY,True,2017-12-31,
7,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,1558000000,JPY,True,2017-12-31,
8,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,1883000000,JPY,True,2017-12-31,
9,jppfs_cor_BalanceSheetHeading,0,jppfs_cor_BalanceSheetLineItems,2.0,jppfs_cor_AssetsAbstract,1.0,jppfs_cor_CurrentAssetsAbstract,1.0,,0,...,貸借対照表,資産の部,流動資産,,,38548000000,JPY,True,2017-12-31,


In [21]:
xbrl_data.to_csv("xbrl_data.csv", index=False, sep="\t")