In [1]:
import csv
from tqdm.notebook import tqdm

https://www.e-stat.go.jp/classifications/terms/40?search_method=keyword&search_word=&komokuSearchFlg_dummy=1&komokuSearchFlg=1&op=all_search&revision=03&search_kind=40&base_code=&form_build_id=form-0uf_6jsXlpBIzMadxx80ylBKh-31FKHUI7sAxBcpT40&form_id=main_form&searchboxShow1=1&searchboxShow2=0&searchboxShow3=0&source=main&page=&srchcndId= からCSVをダウンロードし、作業ディレクトリに `icd10.csv` という名前で保存します。

In [2]:
from collections import OrderedDict

def write_roman(num):
    """Return the Roman-numeral spelling of the integer ``num``.

    The value is consumed greedily from the largest symbol to the smallest;
    the subtractive pairs (CM, CD, XC, XL, IX, IV) are table entries of their
    own. ``num == 0`` produces an empty string.
    """
    # (value, symbol) pairs in strictly descending order of value.
    numerals = (
        (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"),
        (100, "C"), (90, "XC"), (50, "L"), (40, "XL"),
        (10, "X"), (9, "IX"), (5, "V"), (4, "IV"),
        (1, "I"),
    )

    pieces = []
    remaining = num
    for value, symbol in numerals:
        repeat = remaining // value
        pieces.append(symbol * repeat)
        remaining -= value * repeat
        # Nothing left to encode, so the smaller symbols can be skipped.
        if remaining <= 0:
            break

    return "".join(pieces)

In [3]:
# Roman numerals for 1 through 22 (I .. XXII); isroman() tests codes against this list.
roman_class = list(map(write_roman, range(1, 23)))

In [4]:
roman_class

['I',
 'II',
 'III',
 'IV',
 'V',
 'VI',
 'VII',
 'VIII',
 'IX',
 'X',
 'XI',
 'XII',
 'XIII',
 'XIV',
 'XV',
 'XVI',
 'XVII',
 'XVIII',
 'XIX',
 'XX',
 'XXI',
 'XXII']

In [5]:
def isroman(code: str) -> bool:
    """Return True when ``code`` is exactly one of the Roman numerals in ``roman_class``."""
    return code in roman_class

def isrange(code: str) -> bool:
    """Return True when ``code`` contains a hyphen, as range codes such as ``A00-A09`` do."""
    return "-" in code

def isau(code: str) -> bool:
    """Return True when ``code`` contains no ``.`` (e.g. ``A00`` rather than ``A00.0``).

    Roman and range codes contain no ``.`` either, so this identifies the
    alpha_upper level only after ``isroman`` and ``isrange`` have been ruled out.
    """
    return "." not in code

In [6]:
# Organize the hierarchy of classification codes.
# roman -> range -> alpha_upper -> alpha_lower
#
# `labels` maps every code to the text in the second CSV column; `triples`
# collects (parent, child) pairs for each pair of adjacent levels.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm that it matches the encoding of the downloaded CSV.

labels = {}
triples = {"roman-range":[], "range-au":[], "au-al":[]}
# newline="" lets the csv module handle line endings itself, so quoted fields
# that contain line breaks are parsed correctly.
with open("icd10.csv","r", newline="") as f:
    csv_file = csv.reader(f)
    # Most recently seen code at each level; a new code at one level resets
    # every level below it.
    current_roman = ""
    current_range = ""
    current_au = ""
    current_al = ""

    for r in tqdm(csv_file):
        # Skip blank lines (csv.reader yields [] for them), the title row and
        # the column-header row.
        if not r or r in [['疾病、傷害及び死因の統計分類（基本分類）(ICD-10(2013年版))'], ['分類コード', '項目名']]:
            continue

        code = r[0]
        labels[code] = r[1]

        # The order of these checks matters: isau() is also true for roman and
        # range codes, so those two levels must be tested first.
        if isroman(code):
            current_roman = code
            current_range = ""
            current_au = ""
            current_al = ""
        elif isrange(code):
            current_range = code
            current_au = ""
            current_al = ""
            triples["roman-range"].append((current_roman, current_range))
        elif isau(code):
            current_au = code
            current_al = ""
            triples["range-au"].append((current_range, current_au))
        else:
            # Everything left contains a "." and belongs to the lowest level.
            current_al = code
            triples["au-al"].append((current_au, current_al))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [9]:
triples["range-au"]

[('A00-A09', 'A00'),
 ('A00-A09', 'A01'),
 ('A00-A09', 'A02'),
 ('A00-A09', 'A03'),
 ('A00-A09', 'A04'),
 ('A00-A09', 'A05'),
 ('A00-A09', 'A06'),
 ('A00-A09', 'A07'),
 ('A00-A09', 'A08'),
 ('A00-A09', 'A09'),
 ('A15-A19', 'A15'),
 ('A15-A19', 'A16'),
 ('A15-A19', 'A17†'),
 ('A15-A19', 'A18'),
 ('A15-A19', 'A19'),
 ('A20-A28', 'A20'),
 ('A20-A28', 'A21'),
 ('A20-A28', 'A22'),
 ('A20-A28', 'A23'),
 ('A20-A28', 'A24'),
 ('A20-A28', 'A25'),
 ('A20-A28', 'A26'),
 ('A20-A28', 'A27'),
 ('A20-A28', 'A28'),
 ('A30-A49', 'A30'),
 ('A30-A49', 'A31'),
 ('A30-A49', 'A32'),
 ('A30-A49', 'A33'),
 ('A30-A49', 'A34'),
 ('A30-A49', 'A35'),
 ('A30-A49', 'A36'),
 ('A30-A49', 'A37'),
 ('A30-A49', 'A38'),
 ('A30-A49', 'A39'),
 ('A30-A49', 'A40'),
 ('A30-A49', 'A41'),
 ('A30-A49', 'A42'),
 ('A30-A49', 'A43'),
 ('A30-A49', 'A44'),
 ('A30-A49', 'A46'),
 ('A30-A49', 'A48'),
 ('A30-A49', 'A49'),
 ('A50-A64', 'A50'),
 ('A50-A64', 'A51'),
 ('A50-A64', 'A52'),
 ('A50-A64', 'A53'),
 ('A50-A64', 'A54'),
 ('A50-A64',

In [10]:
# Print every (alpha_upper, alpha_lower) pair whose parent code contains "E".
for parent, child in triples["au-al"]:
    if "E" in parent:
        print((parent, child))

('E00', 'E00.0')
('E00', 'E00.1')
('E00', 'E00.2')
('E00', 'E00.9')
('E01', 'E01.0')
('E01', 'E01.1')
('E01', 'E01.2')
('E01', 'E01.8')
('E03', 'E03.0')
('E03', 'E03.1')
('E03', 'E03.2')
('E03', 'E03.3')
('E03', 'E03.4')
('E03', 'E03.5')
('E03', 'E03.8')
('E03', 'E03.9')
('E04', 'E04.0')
('E04', 'E04.1')
('E04', 'E04.2')
('E04', 'E04.8')
('E04', 'E04.9')
('E05', 'E05.0')
('E05', 'E05.1')
('E05', 'E05.2')
('E05', 'E05.3')
('E05', 'E05.4')
('E05', 'E05.5')
('E05', 'E05.8')
('E05', 'E05.9')
('E06', 'E06.0')
('E06', 'E06.1')
('E06', 'E06.2')
('E06', 'E06.3')
('E06', 'E06.4')
('E06', 'E06.5')
('E06', 'E06.9')
('E07', 'E07.0')
('E07', 'E07.1')
('E07', 'E07.8')
('E07', 'E07.9')
('E10', 'E10.0')
('E10', 'E10.1')
('E10', 'E10.2†')
('E10', 'E10.3†')
('E10', 'E10.4†')
('E10', 'E10.5')
('E10', 'E10.6')
('E10', 'E10.7')
('E10', 'E10.8')
('E10', 'E10.9')
('E11', 'E11.0')
('E11', 'E11.1')
('E11', 'E11.2†')
('E11', 'E11.3†')
('E11', 'E11.4†')
('E11', 'E11.5')
('E11', 'E11.6')
('E11', 'E11.7')
('E11', 

In [12]:
# Every code collected in `labels`, in the order the codes were inserted.
lst = list(labels)
lst

['I',
 'A00-A09',
 'A00',
 'A00.0',
 'A00.1',
 'A00.9',
 'A01',
 'A01.0',
 'A01.1',
 'A01.2',
 'A01.3',
 'A01.4',
 'A02',
 'A02.0',
 'A02.1',
 'A02.2',
 'A02.8',
 'A02.9',
 'A03',
 'A03.0',
 'A03.1',
 'A03.2',
 'A03.3',
 'A03.8',
 'A03.9',
 'A04',
 'A04.0',
 'A04.1',
 'A04.2',
 'A04.3',
 'A04.4',
 'A04.5',
 'A04.6',
 'A04.7',
 'A04.8',
 'A04.9',
 'A05',
 'A05.0',
 'A05.1',
 'A05.2',
 'A05.3',
 'A05.4',
 'A05.8',
 'A05.9',
 'A06',
 'A06.0',
 'A06.1',
 'A06.2',
 'A06.3',
 'A06.4',
 'A06.5†',
 'A06.6†',
 'A06.7',
 'A06.8',
 'A06.9',
 'A07',
 'A07.0',
 'A07.1',
 'A07.2',
 'A07.3',
 'A07.8',
 'A07.9',
 'A08',
 'A08.0',
 'A08.1',
 'A08.2',
 'A08.3',
 'A08.4',
 'A08.5',
 'A08.5a',
 'A08.5b',
 'A09',
 'A09.0',
 'A09.9',
 'A15-A19',
 'A15',
 'A15.0',
 'A15.1',
 'A15.2',
 'A15.3',
 'A15.4',
 'A15.5',
 'A15.6',
 'A15.7',
 'A15.8',
 'A15.9',
 'A16',
 'A16.0',
 'A16.1',
 'A16.2',
 'A16.3',
 'A16.4',
 'A16.5',
 'A16.7',
 'A16.8',
 'A16.9',
 'A17†',
 'A17.0†',
 'A17.1†',
 'A17.8†',
 'A17.9†',
 'A18',

In [20]:
# Entity master: one "code,id,label" row per entry of `labels`, where id is
# the position of the code in `labels`.
# csv.writer quotes any field that contains a comma, a quote character or a
# line break; joining the fields with "," by hand would write such a label
# out as a malformed row.

# newline="" is what the csv module expects for files it writes to.
with open("entity2id_withJPlabel.csv","w",encoding="UTF-8",newline="") as f:
    lst = list(labels.keys())
    # Rows end with "\n" instead of the csv default "\r\n".
    writer = csv.writer(f, lineterminator="\n")
    for i, code in enumerate(lst):
        writer.writerow([code, i, labels[code]])

In [23]:
# Training input entity2id.txt: the first line holds the number of entities,
# then one "<code><TAB><id>" line per entity, ids counting up from 0.

with open("entity2id.txt","w",encoding="UTF-8") as f:
    lst = list(labels.keys())
    f.write(str(len(lst)) + "\n")
    for idx, code in enumerate(lst):
        f.write(code + "\t" + str(idx) + "\n")

In [24]:
# Training input relation2id.txt: the first line holds the number of
# relations, then one "<relation><TAB><id>" line per relation.
# NOTE(review): "P279" is presumably the Wikidata "subclass of" property id -- confirm.
with open("relation2id.txt","w",encoding="UTF-8") as f:
    relations = ["P279"]
    f.write(str(len(relations)) + "\n")
    f.writelines(
        name + "\t" + str(idx) + "\n"
        for idx, name in enumerate(relations)
    )

In [29]:
# NOTE(review): `count` is only assigned in the cell that follows this one
# (In [28]); when the notebook is run top to bottom on a fresh kernel this
# cell raises NameError unless `count` was defined earlier.
count

0

In [28]:
# Training input triple2id.txt: the first line holds the number of triples,
# then one "<child id><TAB><parent id><TAB>0" line per (parent, child) pair.
# The trailing 0 is the id given to the single relation in relation2id.txt.
count = sum(len(pairs) for pairs in triples.values())

# Map each code to its position in `labels` once, rather than scanning a list
# with .index() for both ends of every pair. The positions are the ids written
# to entity2id.txt, which also enumerates the keys of `labels`.
entity_ids = {code: idx for idx, code in enumerate(labels)}

with open("triple2id.txt","w",encoding="UTF-8") as f:
    f.write(str(count) + "\n")
    for pairs in triples.values():
        for parent, child in pairs:
            s = str(entity_ids[child]) + "\t" + str(entity_ids[parent]) + "\t" + "0" + "\n"
            f.write(s)

In [26]:
triples.keys()

dict_keys(['roman-range', 'range-au', 'au-al'])