# Get Intangible Asset

* 「企業結合等関係注記」から、無形資産の情報を取得する

In [1]:
import os
import pandas as pd
import xbrr


ROOT = os.path.join(os.getcwd(), "../data")
intangibles = pd.read_csv(os.path.join(ROOT, "raw/intangibles.csv"))
print(len(intangibles))

920


In [2]:
intangibles.head(5)

Unnamed: 0,is_ifrs,note,cash,edinet_code,sec_code,filer_name,fiscal_year,submit_date,doc_id
0,False,"<h6>\n （企業結合等関係）\n</h6>\n<p style=""margin-left...",-1180000000,E00006,13770,株式会社　サカタのタネ,2018,2019-08-27,S100GUE8
1,False,"<p style=""page-break-before:always; line-heigh...",-2839000000,E00011,19110,住友林業株式会社,2018,2019-06-21,S100G299
2,False,"<h6 class=""smt_head5"">\n (企業結合等関係)\n</h6>\n<p ...",-1080000000,E00014,13320,日本水産株式会社,2018,2019-06-26,S100G50V
3,False,"<h6>\n （企業結合等関係）\n</h6>\n<p style=""text-align:...",-21000000,E00017,13520,株式会社ホウスイ,2018,2019-06-24,S100G4CT
4,True,"<p style=""margin-left: 6px; text-align: left"">...",-1527000000,E00023,57130,住友金属鉱山株式会社,2018,2019-06-25,S100G7YI


In [3]:
import re
import unicodedata
from bs4 import BeautifulSoup


class Section():

    def __init__(self, title, contents=()):
        self.title = title
        self.contents = list(contents)
    
    def add_content(self, content):
        self.contents.append(content)

    def new_title(self, title):
        self.title = title
        self.contents = []

    def __repr__(self):
        if len(self.contents) == 0:
            content = ""
        else:
            content = self.contents[0][:100]
        return f"<Section (title:{self.title}, content:{content})>"


def parse(note):
    html = BeautifulSoup(note, "html.parser")
    pattern = None
    indents = []
    for br in html.find_all("br"):
        br.replace_with("\n")

    hint = "企業結合の概要"
    sections = []
    section = None
    for p in html.find_all("p"):
        content = unicodedata.normalize("NFKC", p.text.strip())
        if pattern is not None and re.search(pattern, content):
            styles = p["style"].split(";")
            if len(indents) == len([s for s in styles if s in indents]):
                sections.append(Section(section.title, section.contents))
                section.new_title(content)
                continue

        if content.endswith(hint):
            index = content.replace(hint, "")
            index = index.replace("(", "\(").replace(")", "\)").replace(".", "\.")
            pattern = re.compile("^" + index.replace("1", "\d"))
            for s in p["style"].split(";"):
                if s in ["padding-left", "margin-left", "text-indent"]:
                    indents.append(s)
            section = Section(content)
        elif section is not None:
            section.add_content(content)


    return sections

In [4]:
parsed = parse(intangibles["note"].iloc[0])
print(parsed)

assert len(parsed) > 0, "Missing section"

[<Section (title:1.企業結合の概要, content:(1) 被取得企業の名称及びその事業の内容)>, <Section (title:2.連結財務諸表に含まれている被取得企業の業績の期間, content:2018年10月31日から2019年5月31日まで)>, <Section (title:3.被取得企業の取得原価及び対価の種類ごとの内訳, content:)>, <Section (title:4.主要な取得関連費用の内容及び金額, content:コンサルティング会社に対する報酬等      32百万円)>, <Section (title:5.発生したのれんの金額、発生原因、償却方法及び償却期間, content:(1) 発生したのれんの金額)>, <Section (title:6.企業結合日に受け入れた資産及び引き受けた負債の額並びにその主な内訳, content:流動資産)>, <Section (title:7.企業結合が連結会計年度の開始の日に完了したと仮定した場合の当連結会計年度の連結損益計算書に及ぼす影響の概算額及びその算定方法, content:当連結会計年度における概算額の算定が困難であるため、記載しておりません。)>, <Section (title:1.取引の概要, content:(1) 対象となった事業の名称及び当該事業の内容)>]


In [5]:
has_intangible = []
for index, row in intangibles.iterrows():
    if "のれん以外" in row["note"]:
        has_intangible.append(True)
    else:
        has_intangible.append(False)

if "has_intangible" not in intangibles.columns:
    intangibles["has_intangible"] = has_intangible

print(f"{len(intangibles[intangibles['has_intangible']])} / {len(intangibles)} companies recognize intangibles")

30 / 920 companies recognize intangibles


In [6]:
import re


assets = {
    "顧客関連資産": re.compile("顧客(関連)?資産"),
    "技術関連資産": re.compile("技術(関連)?資産"),
    "商標権": re.compile("商標権"),
    "借地権": re.compile("借地権")
}
price_pattern = re.compile("\d(\d|,)+?(.+?)円")
year_pattern = re.compile("\d+?年")

breakdowns = []
for index, row in intangibles[intangibles['has_intangible']].iterrows():
    html = BeautifulSoup(row["note"], "html.parser")
    for br in html.find_all("br"):
        br.replace_with("\n")
    content = unicodedata.normalize("NFKC", html.text.strip())
    content = content.replace("\n", " ")
    content = " ".join(content.split())
    start = content.index("のれん以外")
    description = content[start:]
    for a in assets:
        m = re.search(assets[a], description)
        price = ""
        year = ""
        if m is not None:
            detail = description[m.start():]
            price = re.search(price_pattern, detail)
            year = re.search(year_pattern, detail)
            if price is not None:
                price = price.group(0).replace(",", "")
            if year is not None:
                year = year.group(0).replace("年", "")

            if price or year:
                item = {}
                for k in intangibles.columns:
                    if k != "note":
                        item[k] = row[k]
                
                item["asset"] = a
                item["price"] = price
                item["year"] = year
                breakdowns.append(item)

breakdowns = pd.DataFrame(breakdowns)
print(len(breakdowns))

50


In [7]:
breakdowns.head(5)

Unnamed: 0,is_ifrs,cash,edinet_code,sec_code,filer_name,fiscal_year,submit_date,doc_id,has_intangible,asset,price,year
0,False,-2770000000,E00058,18120,鹿島建設株式会社,2018,2019-06-26,S100G4O0,True,顧客関連資産,1107百万円,5
1,False,-2770000000,E00058,18120,鹿島建設株式会社,2018,2019-06-26,S100G4O0,True,その他関連資産,1107百万円,5
2,False,-687000000,E00703,79150,ＮＩＳＳＨＡ株式会社,2018,2019-03-22,S100FF10,True,商標権,88 百万円,7
3,False,-588000000,E00766,40410,日本曹達株式会社,2018,2019-06-27,S100G9AK,True,商標権,4999百万円,10
4,False,-3328000000,E00872,34010,帝人株式会社,2018,2019-06-20,S100G16A,True,顧客関連資産,2907百万円,10


In [8]:
breakdowns.to_csv("asset_breakdowns.csv", encoding="shift_jis", index=False)