-
Notifications
You must be signed in to change notification settings - Fork 0
/
listlegal.py
36 lines (31 loc) · 1.02 KB
/
listlegal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import glob
import os
from collections import defaultdict, OrderedDict

from acdh_tei_pyutils.tei import TeiReader
from lxml import etree as ET
from tqdm import tqdm
# Enrich the index of legal texts with back-references to the editions that
# mention them.
#
# Step 1: scan every edition file for rs[@type="law"]/@ref values and record,
#         per referenced value, the title and file name of the edition.
# Step 2: for every tei:bibl in the index whose @corresp equals one of those
#         values, append one tei:ref child per edition (@target = file name,
#         text = edition title).
# Step 3: write the index back to the file it was read from.

# The index is read from and written back to this same path.
LIST_LEGAL = "./data/indices/listlegal.xml"
TEI_NS = "http://www.tei-c.org/ns/1.0"

legal_doc = TeiReader(LIST_LEGAL)
files = glob.glob('./data/editions/D_*.xml')

# Maps each referenced value to the set of (title, file name) pairs of the
# editions mentioning it. Tuples instead of a "title|file" string, so a title
# that contains "|" cannot corrupt the target.
refs = defaultdict(set)
for path in tqdm(files, total=len(files)):
    try:
        doc = TeiReader(path)
    except Exception as err:
        # Best effort: an unreadable edition must not abort the whole run,
        # but it should not disappear silently either. `Exception` (rather
        # than a bare except) lets Ctrl-C / SystemExit propagate.
        tqdm.write(f"skipping {path}: {err}")
        continue
    law_refs = doc.any_xpath('.//tei:rs[@type="law"]/@ref')
    if not law_refs:
        continue
    # basename() copes with whichever path separator glob returned.
    xml_id = os.path.basename(path)
    # The title is the same for every reference in this document, so it is
    # looked up once. Fall back to the file name when the document has no
    # title element or the title has no leading text.
    titles = doc.any_xpath('.//tei:title')
    raw_title = titles[0].text if titles else None
    title = " ".join((raw_title or xml_id).split())
    for law_ref in law_refs:
        refs[law_ref].add((title, xml_id))
ref_lookup = OrderedDict(sorted(refs.items()))

for bibl in legal_doc.any_xpath('.//tei:bibl'):
    # .get() returns None for a bibl without @corresp instead of raising.
    corresp = bibl.get('corresp')
    match = ref_lookup.get(corresp)
    if not match:
        continue
    # The index is rewritten in place, so a second run would otherwise append
    # the same references again; skip targets already present on this bibl.
    known_targets = {
        child.get('target') for child in bibl.findall(f"{{{TEI_NS}}}ref")
    }
    # Sorted so the order of the generated refs is stable between runs
    # (set iteration order of strings is not).
    for title, xml_id in sorted(match):
        if xml_id in known_targets:
            continue
        ref = ET.Element(f"{{{TEI_NS}}}ref")
        ref.attrib['target'] = xml_id
        ref.text = title
        bibl.append(ref)
legal_doc.tree_to_file(LIST_LEGAL)