In [6]:
import xml.etree.ElementTree as ET
from pathlib import Path
import re
import csv

NAMESPACES = {
    'EurLex': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0'},
    'Normattiva': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0'},
    'PDL': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0/WD17'}
}


In [None]:
p = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/PDL_18-19/18PDL0001700_PD.xml'
p = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/Normattiva/20201128_20G00181_VIGENZA_20220101.xml'
p = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/EurLex/32010L0035.xml'

tree = ET.parse(p) 
root = tree.getroot() 
namespace = NAMESPACES['EurLex']
definition = root.find(".//akn:definition", namespace)

definition_head = definition.find('.//akn:definitionHead', namespace)
definition_body_elements = definition.findall('.//akn:definitionBody', namespace)

definendum_id = definition_head.attrib.get('href', '')
#definendum = root.find(f".//akn:def[@eId='{definendum_id}']", namespace).text

full_def = root.find(f".//*[@defines='{definendum_id}']")
"".join(full_def.itertext())

In [3]:
NAMESPACES = {
    'EurLex': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0'},
    'Normattiva': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0'},
    'PDL': {'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0/WD17'}
}


def checkXML(xmlfile, dataset):
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    namespace = NAMESPACES[dataset]

    definitions_el = root.findall('.//akn:definitions', namespace)
    if definitions_el:
        return True
    else:
        return False


datasets = ['EurLex', 'Normattiva', 'PDL']

for dataset in datasets:
    extracted, er, c = 0, 0, 0
    target = Path('../data/datasets/' + dataset)

    for file in target.rglob('*.xml'):
        c += 1
        try:
            definitions = checkXML(file, dataset)
            if definitions:
                extracted += 1
        except Exception as e:
            #print(e, file)
            er += 1
            continue

    print(f'{dataset}: found definitions in {extracted} out of {c} files. {er} errors')

EurLex: found definitions in 889 out of 15283 files. 0 errors
Normattiva: found definitions in 401 out of 3195 files. 0 errors
PDL: found definitions in 78 out of 3709 files. 0 errors


In [9]:
def parseXML(xmlfile, dataset):
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    namespace = NAMESPACES[dataset]

    definitions_el = root.findall('.//akn:definitions', namespace)
    if not definitions_el:
        return None

    definitions = []
    for definition in root.findall('.//akn:definition', namespace):
        try:
            definendum, definiens, references, full_def = parse_definition(definition, root, namespace)
            definitions.append({
                'def_n': definition.find('.//akn:definitionHead', namespace).attrib.get('href', ''),
                'label': definition.find('.//akn:definitionHead', namespace).attrib.get('refersTo', ''),
                'definendum': clean_definendum(definendum),
                'definiens': clean_definiens(definiens),
                'full_definition': clean_full_def(full_def),
                'references': references,
                'provenance': dataset,
                'document': xmlfile.name
            })
        except Exception as e:
            print(xmlfile, e, definition.attrib.get('refersTo', ''))
    return definitions


def parse_definition(definition, root, namespace):
    """Extracts definendum, definiens, and references from a single definition."""
    definition_head = definition.find('.//akn:definitionHead', namespace)
    definition_body_elements = definition.findall('.//akn:definitionBody', namespace)

    definendum_id = definition_head.attrib.get('href', '').lstrip('#')
    definendum = root.find(f".//akn:def[@eId='{definendum_id}']", namespace).text
    
    try:
        full_def = "".join(root.find(f".//*[@defines='#{definendum_id}']").itertext())
    except Exception as e:
        full_def = None

    definiens = []
    references = []
    for body in definition_body_elements:
        body_text, body_references = extract_body_and_references(body, root, namespace)
        definiens.append(body_text)
        references.extend(body_references)

    return definendum, ' '.join(definiens), references, full_def


def extract_body_and_references(body, root, namespace):
    """Extracts text and references from a single definition body."""
    body_id = body.attrib.get('href', '').lstrip('#')
    body_element = root.find(f".//akn:defBody[@eId='{body_id}']", namespace)
    body_text = ''.join(body_element.itertext())
    references = [
        ref.attrib.get('href', '') for ref in body_element.findall('.//akn:ref', namespace)
    ]
    return body_text, references


def clean_definendum(text: str):
    text = text.strip()
    if text.startswith(("«", "\'", "\"")) and text.endswith(("»", "\'", "\"")):
        text = text[1:-1]

    return text.strip()


def clean_definiens(text: str):
    if text.startswith(':'):
        text = text.lstrip(':')
    if text.startswith(','):
        text = text.lstrip(',')

    text = text.strip().replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)

    return text


def clean_full_def(text: str):
    text = text.strip().replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('""', '')
    
    return text

def save_definitions(definitions, output_file):
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    Path(output_file).touch()
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=definitions[0].keys(), delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        for row in definitions:
            writer.writerow(row)

In [None]:
# single file test

t = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/Normattiva/20210227_21G00022_VIGENZA_20240101.xml'
t2 = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/EurLex/32014L0049.xml'
t3 = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/EurLex/32021R0403.xml'
t4 = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/EurLex/32010L0035.xml'
t5 = '/home/leo/Desktop/dhdk/Master thesis/.project/data/datasets/EurLex/32018R0545.xml'

definitions = parseXML(Path(t5), 'EurLex')

#save_definitions(definitions, '/home/leo/Desktop/dhdk/Master thesis/.project/data/definitions/definitions.tsv')
definitions

<div class="alert">
  <p>Do we need to format the lists (e.g. with (a), (b), etc.)?? Can it be beneficial to RAG?</p>
</div> 


Ciao Michele, due dubbi:
1. Forse ne avevamo già parlato ma quello non c'è un dataset migliore per EurLex? Perché ci sono errori di annotazione abbastanza importanti (tipo blocchi di definizioni con attributo Source "unibio" che vanno ad interferire con altri blocchi con Source "unibo", questo in 72 casi, oppure definizioni che a causa della presenza di virgolette singole e doppie (e.g. ‘status free from “disease”  ’ means a disease-free status of...) sono uscite annotate male e risultano come stringhe vuote).
2. Conviene fare gli embedding di definendum e definiens ("x means y") o di separarli (o quantomeno di fare l'embedding dei definendi e della stringa completa in due db diversi)?

In [10]:
datasets = ['EurLex', 'Normattiva', 'PDL']

extracted, er, c = 0, 0, 0
for dataset in datasets:
    target = Path('../data/datasets/' + dataset)

    for file in target.rglob('*.xml'):
        c += 1
        try:
            definitions = parseXML(file, dataset)
            if definitions:
                extracted += 1
                save_definitions(definitions, Path('../data/definitions') / file.name.replace('.xml', '.tsv'))
        except Exception as e:
            #print(e, file)
            er += 1
            continue

print(f'extracted definitions from {extracted} files. {er} errors out of {c} files.')

../data/datasets/EurLex/32014L0040.xml 'NoneType' object has no attribute 'text' #roll-your-ownTobacco
../data/datasets/EurLex/32021R1173.xml 'NoneType' object has no attribute 'text' #high-endSupercomputer
../data/datasets/EurLex/32021R1173.xml 'NoneType' object has no attribute 'text' #hyper-connected
../data/datasets/EurLex/32021R1173.xml 'NoneType' object has no attribute 'text' #industrial-gradeSupercomputer
../data/datasets/EurLex/32021R1173.xml 'NoneType' object has no attribute 'text' #mid-rangeSupercomputer
../data/datasets/EurLex/32010R1061.xml 'NoneType' object has no attribute 'text' #left-onMode
../data/datasets/EurLex/32010R1061.xml 'NoneType' object has no attribute 'text' #end-user
../data/datasets/EurLex/32012R0600.xml 'NoneType' object has no attribute 'itertext' detectionRisk
../data/datasets/EurLex/32012R0600.xml 'NoneType' object has no attribute 'itertext' accreditation
../data/datasets/EurLex/32012R0600.xml 'NoneType' object has no attribute 'itertext' verifier
.

72 EurLex documents have a typo in the declaration of the definitions. There are two distinct <definitions> tags that, one of which is sourced to "unibio" and the second to "unibo". Inside of each element, the ids of the definitions start in both cases from 1, resulting in duplicate references.

In [None]:
# insert tsv into sqlite db
for file in Path('../data/definitions').rglob('*.tsv'):
    with open(file, 'r') as f:
        for line in f:
            try:
                label, showAs, definition_body = line.strip().split('\t')
                #print(label, showAs, definition_body)
            except:
                print(line)

In [None]:
import sqlite3

def add_definitions(conn, project):
    pass


def populate_sqlite_db(definitions_folder, db_file):
    with sqlite3.connect(db_file) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE definitions(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                project_id INTEGER,
                label TEXT NOT NULL,
                showAs TEXT NOT NULL,
                definition_body TEXT NOT NULL,
                FOREIGN KEY(project_id) REFERENCES projects(id)
            )
        ''')

        for project in definitions_folder.glob('*'):
            project_id = add_project(conn, project)
            for definition_file in project.glob('*.tsv'):
                add_definitions(conn, project_id, definition_file)



with sqlite3.connect('../definitions.db') as conn:
    cursor = conn.cursor()
    sql = ''' INSERT INTO projects(name,begin_date,end_date)
            VALUES(?,?,?) '''

    cursor.execute(insert_statement)
    conn.commit()

In [5]:
import polars as pl

df = pl.read_csv('../data/definitions/*.tsv', separator='\t')#, has_header=False, schema={'label': pl.Utf8, 'showAs': pl.Utf8, 'definition_body': pl.Utf8}, truncate_ragged_lines=True)
df.head()

label,definendum,definiens,full_definition,references,provenance,document
str,str,str,str,str,str,str
"""#georeferenziazione""","""georeferenziazione""","""tecnica di attribuzione di coo…","""a) georeferenziazione : tecnic…","""[]""","""PDL""","""18PDL0001470_PD.xml"""
"""#sistemaInformativoGeografico""","""sistema informativo geografico""","""sistema informatico, hardware …","""b) sistema informativo geograf…","""[]""","""PDL""","""18PDL0001470_PD.xml"""
"""#gestoreDelSottoprodottoUmido(…","""gestore del sottoprodotto umid…","""il soggetto pubblico o privato…","""a) gestore del sottoprodotto u…","""[]""","""PDL""","""18PDL0001550_PD.xml"""
"""#scartiAlimentari""","""scarti alimentari""","""sottoprodotti della produzione…","""b) scarti alimentari : sottopr…","""[]""","""PDL""","""18PDL0001550_PD.xml"""
"""#compostaggio""","""compostaggio""","""fermentazione della materia or…","""c) compostaggio : fermentazion…","""[]""","""PDL""","""18PDL0001550_PD.xml"""


In [7]:
df.select(pl.len()) # starting len

len
u32
14729


#### Null filtering

In [8]:
df.filter(pl.all_horizontal(pl.all().is_null())) # all nulls

label,definendum,definiens,full_definition,references,provenance,document
str,str,str,str,str,str,str


In [10]:
df.filter(pl.col('definendum').is_null())

label,definendum,definiens,full_definition,references,provenance,document
str,str,str,str,str,str,str
"""#""",,"""juveniles"" means:""",""""" "" juveniles"" means: specimen…","""[]""","""EurLex""","""32010R0724.xml"""
"""#""",,"""information"" means:""","""(1) "" inside information "" mea…","""[]""","""EurLex""","""32011R1227.xml"""
"""#""",,"""transitional measures provided…",""""" "" transitional measures prov…","""[]""","""EurLex""","""32012R0284.xml"""
"""#""",,"""transitional measures provided…",""""" "" transitional measures prov…","""[]""","""EurLex""","""32012R0996.xml"""
"""#""",,"""low ALs"" means levels which re…",""""" "" low ALs"" means levels whic…","""[]""","""EurLex""","""32013L0035.xml"""
…,…,…,…,…,…,…
"""#""",,"""Black Sea"" means maritime wate…",""""" "" Black Sea"" means maritime …","""['/akn/eu/act/regulation/ep/20…","""EurLex""","""32017R0087.xml"""
"""#""",,"""settlement instruction"" means …",""""" "" settlement instruction"" me…","""[]""","""EurLex""","""32017R0389.xml"""
"""#""",,"""trading venue operator"" means …",""""" "" trading venue operator"" me…","""[]""","""EurLex""","""32017R1005.xml"""
"""#""",,"""composite product"" means compo…",""""" "" composite product"" means c…","""[]""","""EurLex""","""32019R0759.xml"""


In [11]:
df.filter(pl.col('definiens').is_null())

label,definendum,definiens,full_definition,references,provenance,document
str,str,str,str,str,str,str
"""#portaleDeiServiziTelematici""","""portale dei servizi telematici""",,"""b) portale dei servizi telemat…","""[]""","""Normattiva""","""20110418_011G0087_VIGENZA_2024…"
"""#incaricoGliIncarichi""","""«incarico», gli incarichi""",,"""h) «incarico», gli incarichi :""","""[]""","""Normattiva""","""20201215_20G00190_ORIGINALE.xm…"
"""#enteCreditizio""","""ente creditizio""",,"""s) «ente creditizio» : ((il so…","""[]""","""Normattiva""","""20230317_23G00035_VIGENZA_2023…"


In [109]:
df.filter(pl.col('label').is_null())

label,showAs,definition_body
str,str,str


In [110]:
df = df.drop_nulls()
df.select(pl.len())


len
u32
9858


#### Drop duplicates

In [111]:
df = df.unique()
df.select(pl.len())

len
u32
9298


In [145]:

pl.Config.set_tbl_rows(30)
pl.Config.set_fmt_str_lengths(300)
#df.filter(pl.col('showAs') == 'insect')
df.filter(pl.col('showAs').str.contains('insects'))

label,showAs,definition_body
str,str,str


### Unwanted characters

"«", " ", "'"

In [118]:
df.filter(pl.col('showAs').str.starts_with("normativa di "))

label,showAs,definition_body
str,str,str
"""#normativaDiArmonizzazioneDellUnioneEuropea""","""normativa di armonizzazione dell'Unione europea""","""la normativa dell'Unione europea che armonizza le condizioni di commercializzazione dei prodotti."""
"""#normativaDiArmonizzazioneDellUnione""","""normativa di armonizzazione dell'Unione""","""qualunque normativa dell'Unione che armonizza le condizioni di commercializzazione dei prodotti;"""
"""#normativaDiArmonizzazioneDellUnioneEuropea""","""normativa di armonizzazione dell'Unione europea""","""la normativa dell'Unione europea che armonizza le condizioni di commercializzazione dei prodotti;"""
"""#normativaDiArmonizzazioneDellUnione""","""normativa di armonizzazione dell'Unione""","""la normativa dell'Unione europea che armonizza le condizioni di commercializzazione dei prodotti;"""
"""#normativaDiArmonizzazioneDellUnione""","""normativa di armonizzazione dell'Unione""","""normativa dell'Unione europea che armonizza le condizioni di commercializzazione del dispositivo antiabbandono quale prodotto;"""
"""#normativaDiArmonizzazioneDellUnione""","""normativa di armonizzazione dell'Unione""","""la normativa dell'Unione che armonizza le condizioni di commercializzazione dei prodotti;"""


In [136]:
df.group_by('showAs').agg(pl.len()).sort('len', descending=True)

showAs,len
str,u32
"""competent authority""",29
"""manufacturer""",26
"""placing on the market""",24
"""autorita' competente""",23
"""making available on the market""",20
"""importer""",19
"""distributor""",18
"""immissione sul mercato""",17
"""conformity assessment""",17
"""technical specification""",16


---

In [17]:
from LegalDefAgent.src.config import DB_CONFIG

AttributeError: 'dict' object has no attribute 'XML_DATA_DIR'

In [None]:
from LegalDefAgent.src.utils import setup_logging

setup_logging()

builder.build_database()

2024-12-08 16:20:34,555 - INFO - Extracting definitions from XML files in dataset EurLex...
2024-12-08 16:20:54,081 - INFO - Extracting definitions from XML files in dataset Normattiva...
2024-12-08 16:21:11,368 - INFO - Extracting definitions from XML files in dataset PDL...
2024-12-08 16:21:12,471 - INFO - Extracted definitions from 1348 files. 0 errors out of 22187 files.
2024-12-08 16:21:12,801 - DEBUG - https://huggingface.co:443 "GET /api/models/BAAI/bge-m3/revision/main HTTP/1.1" 200 4793


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

2024-12-08 16:21:13,918 - INFO - loading existing colbert_linear and sparse_linear---------
2024-12-08 16:21:13,991 - INFO - Building vector database...
2024-12-08 16:21:14,007 - INFO - Pass in the local path /home/leo/Desktop/dhdk/Master thesis/.project/LegalDefAgent/vec_db/definitions_vectors.db, and run it using milvus-lite


KeyboardInterrupt: 