# v2 Evaluation

1. Quantitative TOGLIENDOLI DAI RETRIEVED
    1. "Quale è la definizione di \"{term}\" nella Legislazione Italiana?".format(term=item['term']) x 4000+ di **normattiva** dal vectorstore. 
    2. "What's the definition of \"{term}\" in the European Legislation?".format(term=item['term']) x Lexdrafter 1330 escludendo. 

2. Qualitative
    1. "Quale è la definizione di \"{term}\" nella Legislazione Italiana?".format(term=item['term']) x quanti termini tradotti da salvo
    2. "What's the definition of \"{term}\" in the European Legislation?".format(term=item['term']) x

    se ci sono in entrambi i dataset li si esclude da entrambi

---

## Quantitative

In [1]:
import polars as pl

In [2]:
defs = pl.read_csv("../../data/definitions_corpus/definitions.csv")

definendums_list = (defs.select(
    pl.col('label')
    .str.replace('#', '')
    .str.replace(r'([a-zà-ÿ])([A-Z])', r'${1} ${2}', n=-1)
    .str.to_lowercase()
)
)['label'].to_list()

print(definendums_list)



### LexDrafter

In [3]:
lexdrafter_df = pl.read_json('../datasets/lexdrafter/corpus.json', schema={"celex_id": pl.String, "term": pl.String, "term_definition": pl.String})

lexdrafter_df

celex_id,term,term_definition
str,str,str
"""32019R2014""","""mains""","""'mains' means the electricity …"
"""32019R2014""","""electric mains""","""'electric mains' means the ele…"
"""32019R2014""","""automatic washing machine""","""'automatic washing machine' me…"
"""32019R2014""","""household washing machine""","""'household washing machine' me…"
"""32019R2014""","""household washer-dryer""","""'household washer-dryer' means…"
…,…,…
"""31998D0181""","""GATT""","""'GATT' means ‘GATT 1947’ or ‘G…"
"""31998D0181""","""Intellectual property""","""'Intellectual property' includ…"
"""31998D0181""","""Energy Charter Protocol""","""'Energy Charter Protocol' mean…"
"""31998D0181""","""Protocol""","""'Protocol' means a treaty, the…"


In [4]:
from rapidfuzz import process, fuzz

lexdrafter_retrieval = []
lexdrafter_generation = []

for entry in lexdrafter_df.rename({'celex_id': 'CELEX_ID', 'term': 'Term', 'term_definition': 'original_definition'}).to_dicts():
    x = process.extract(entry['Term'].lower(), definendums_list, scorer=fuzz.ratio, limit=1, score_cutoff=95)
    if x: 
        lexdrafter_retrieval.append(entry)
    else:
        lexdrafter_generation.append(entry)

print(len(lexdrafter_retrieval), len(lexdrafter_generation))

707 623


In [1]:
from LegalDefAgent.src.agents import definitions_agent_eval
import uuid
from pathlib import Path
import json
import ast
import time
import logging

async def get_definition(question, model):
    inputs = {"messages": [("user", question)]}
    configurable = {"configurable": {"user_id": "1", "thread_id": uuid.uuid4().hex, "model": model}}

    async for msg in definitions_agent_eval.astream(inputs, configurable, stream_mode="values"):
        chat = msg

    return chat

  from .autonotebook import tqdm as notebook_tqdm


2025-05-07 09:07:41,062 - INFO - Logging configured
2025-05-07 09:07:41,669 - INFO - PyTorch version 2.6.0 available.
2025-05-07 09:07:41,672 - INFO - Polars version 1.21.0 available.


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 41817.59it/s]


2025-05-07 09:07:49,231 - INFO - loading existing colbert_linear and sparse_linear---------


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 59381.37it/s]


2025-05-07 09:07:53,710 - INFO - loading existing colbert_linear and sparse_linear---------
2025-05-07 09:07:54,835 - INFO - Logging configured


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 135445.77it/s]


2025-05-07 09:08:00,687 - INFO - loading existing colbert_linear and sparse_linear---------


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 53773.13it/s]


2025-05-07 09:08:06,428 - INFO - loading existing colbert_linear and sparse_linear---------
2025-05-07 09:08:06,501 - INFO - Logging configured


In [3]:
model = 'together-llama-3.3-70B-Instruct-Turbo'
question = "What's the definition of \"standard rating conditions\" in the European Legislation?"

res = await get_definition(question, model)
response_json = json.loads(res['messages'][-1].content)
response_json


: 

: 

In [6]:
model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'quantitative_lexdrafter'
task = 'full'

Path('logs').mkdir(exist_ok=True)

logging.basicConfig(
    filename=f'logs/evaluation_log_{model}_{dataset_name}_{task}.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)


last = 0

output_file = f'results/{model}_{dataset_name}_{task}.json'

if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())
        last = results[-1]['#']
    logging.info(f"Loaded {len(results)} existing results")
else:
    results = []
    logging.info("Starting new results file")

dataset = lexdrafter_generation + lexdrafter_retrieval

for i, item in enumerate(dataset[last:], last+1):
    result = {"entry": item, "#": i}
    term = item['Term']
    question = "What's the definition of \"{term}\" in the European Legislation?".format(term=term)
    try:
        logging.info(f"Processing term {i}/{len(dataset)}: {term}")
        res = await get_definition(question, model)
        response_json = json.loads(res['messages'][-1].content)
        result['response'] = response_json
        logging.info(f"Received response for term {i}: {response_json}")
        results.append(result)

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f'{i}/{len(dataset)}', term)
        time.sleep(2)
    
    except Exception as e:
        logging.error(f"Error processing term {i} ({term}): {str(e)}")
        item['response'] = {"error": str(e)}
        print(f'{i}/{len(dataset)} Error', term)
        results.append(item)
    
    logging.info(f"Processing complete. Processed {len(results)} terms.")


logging.info(f"Processing complete. Processed {len(results)} terms.")

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


505/1330 contingency
506/1330 generation shift key
507/1330 remedial action
508/1330 reliability margin
509/1330 market time
510/1330 congestion income
511/1330 market congestion
512/1330 physical congestion
513/1330 clearing price
514/1330 scheduled exchange
515/1330 scheduled exchange calculator
516/1330 day-ahead market time-frame
517/1330 day-ahead firmness deadline
518/1330 day-ahead market gate closure time
519/1330 intraday market time-frame
520/1330 intraday cross-zonal gate opening time
521/1330 intraday cross-zonal gate closure time
522/1330 capacity management module
523/1330 non-standard intraday product
524/1330 shipping agent
525/1330 firmness
526/1330 force majeure
527/1330 economic surplus for the single day-ahead or intraday coupling
528/1330 nominal heat output
529/1330 minimum heat output
530/1330 maximum continuous heat output
531/1330 Pr
532/1330 η
533/1330 roll-in cabinet
534/1330 nominal heat output
535/1330 minimum heat output
536/1330 heavy-duty cabinet
537/133

In [None]:
# retry failed items

import ast 

model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'quantitative_lexdrafter'
task = 'full'

output_file = f'results/{model}_{dataset_name}_{task}.json'

with open(f'/home/leo/Desktop/dhdk/Master thesis/project/evaluation/v2/results/together-llama-3.3-70B-Instruct-Turbo_quantitative_lexdrafter_full.json', 'r') as json_file:
    data = json.load(json_file)

retry_results = []

for i, item in enumerate(data):
    if 'error' in item['response']:
        try:
            term = item['Term']
            question = "What's the definition of \"{term}\" in the European Legislation?".format(term=term)
            logging.info(f"Processing item {i}/{len(data)}: {item['Term']}")
            res = await get_definition(question, model)
            response_json = json.loads(res['messages'][-1].content)
            item['response'] = response_json
            print(response_json)
            retry_results.append(item)


            logging.info(item)
            print(f'{i}/{len(data)}', item)
            time.sleep(2)
        
        except Exception as e:
            logging.error(f"Error processing item {i} ({item['Term']}): {str(e)}")
            item['response'] = {"error": str(e)}
            print(f'{i}/{len(data)} Error', item)
            results.append(item)

2025-05-07 09:10:39,026 - INFO - Processing item 5/1329: digital photo frame
2025-05-07 09:10:42,876 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-07 09:10:42,902 - INFO - Searching for definitions: definendum='digital photo frame', question='What\'s the definition of "digital photo frame" in the European Legislation?', legislation='EU', date_filters=''
2025-05-07 09:10:45,065 - INFO - Retrieved 10 definitions from vectorstore
2025-05-07 09:10:45,066 - INFO - Filtering by legislation: EU
2025-05-07 09:10:45,390 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-05-07 09:10:45,392 - INFO - Retrying request to /chat/completions in 4.000000 seconds
2025-05-07 09:10:51,579 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 200 OK"
{'generated_definition': {'generated_definition': 'digital photo frame means a device designed to display digital imag

In [5]:
import json 

with open(f'/home/leo/Desktop/dhdk/Master thesis/project/evaluation/v2/results/together-llama-3.3-70B-Instruct-Turbo_quantitative_lexdrafter_full.json', 'r') as json_file:
    data = json.load(json_file)

errors = 0
predictions = []
references = []
references_bleurt = []

for item in data:
    print(item)
    if item['response'].get('error', None) or item.get('error', None) or item['response'].get('type', None):
        errors += 1
        continue

    try:
        if item.get('entry', None):
            predictions.append(item['response']['generated_definition']['generated_definition'])
            references.append(item['entry']['original_definition'])
        else:
            predictions.append(item['response']['generated_definition']['generated_definition'])
            references.append(item['original_definition'])
    except KeyError:
        errors += 1
        continue

{'entry': {'CELEX_ID': '32019R2014', 'Term': 'electric mains', 'original_definition': "'electric mains' means the electricity supply from the grid of 230 (±10 %) volts of alternating current at 50 Hz;"}, '#': 1, 'response': {'generated_definition': {'generated_definition': 'electric mains: means the electricity supply from the grid of 230 (± 10 %) volt of alternating current at 50 Hz', 'sources': ['mains: means the electricity supply from the grid of 230 (± 10 %) volts of alternating current at 50 Hz;or "electric mains"', 'mains: means the electricity supply from the grid of 230 (± 10 %) volts of alternating current at 50 Hz;or "electric mains"', 'mains: means the electricity supply from the grid of 230 (± 10 %) volts of alternating current at 50 Hz;or "electric mains"', 'mains: means the electricity supply from the grid of 230 (±10 %) volts of alternating current at 50 Hz;or "electric mains"', 'mains: means the electricity supply from the grid of 230 (± 10 %) volt of alternating curre

In [3]:
errors, len(predictions), len(references), len(data)

(38, 1291, 1291, 1329)

In [2]:
import pickle

with open('lexdrafter_predictions.pkl', 'wb') as f:
    pickle.dump(predictions, f)

with open('lexdrafter_references.pkl', 'wb') as f:
    pickle.dump(references, f)

#### BLEU

#### Measures

In [18]:
!pip install -q evaluate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [19]:
import evaluate

bleu = evaluate.load("bleu")

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
# Compute the BLEU score
bleu4_results = bleu.compute(predictions=predictions, references=references, max_order=4)
print(f"BLEU-4: {bleu4_results['bleu']}")

# Compute the BLEU score
bleu3_results = bleu.compute(predictions=predictions, references=references, max_order=3)
print(f"BLEU-3: {bleu3_results['bleu']}")

# Compute the BLEU score
blue2_results = bleu.compute(predictions=predictions, references=references, max_order=2)
print(f"BLEU-2: {blue2_results['bleu']}")

# Compute the BLEU score
blue1_results = bleu.compute(predictions=predictions, references=references, max_order=1)
print(f"BLEU-1: {blue1_results['bleu']}")

BLEU-4: 0.09925569700044896
BLEU-3: 0.12410359979231986
BLEU-2: 0.1687923533985763
BLEU-1: 0.26623412632396165


In [36]:
!pip install -q rouge_score bert-score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [33]:
import evaluate

rouge = evaluate.load("rouge")

In [34]:
rouge_results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-1: {rouge_results['rouge1']}")
print(f"ROUGE-2: {rouge_results['rouge2']}")
print(f"ROUGE-L: {rouge_results['rougeL']}")
print(f"ROUGE-L-sum: {rouge_results['rougeLsum']}")

ROUGE-1: 0.35896147599687084
ROUGE-2: 0.1909227456551807
ROUGE-L: 0.3082640997686694
ROUGE-L-sum: 0.3086035391273647


In [6]:
import evaluate
bertscore = evaluate.load("bertscore")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
bertscore_results = bertscore.compute(predictions=predictions, references=references, model_type="nlpaueb/legal-bert-base-uncased")

KeyError: 'nlpaueb/legal-bert-base-uncased'

In [8]:
precision_scores = bertscore_results['precision']

mean_precision = sum(precision_scores) / len(precision_scores)

print(f"BERTScore_precision: {bertscore_results['precision']}")
print("--------")
print(f"Mean BERTScore precision: {mean_precision}")

BERTScore_precision: [0.9703935384750366, 0.7072979807853699, 0.7395894527435303, 0.8255431652069092, 0.8038264513015747, 0.7502430081367493, 0.7438927292823792, 0.8174741864204407, 0.7674136161804199, 0.7744236588478088, 0.8503953814506531, 0.8004714250564575, 0.7624220848083496, 0.787544846534729, 0.7613939046859741, 0.8597382307052612, 0.7462673187255859, 0.8152337074279785, 0.8256646394729614, 0.6427185535430908, 0.9747970104217529, 0.6985601782798767, 0.6306087970733643, 0.6939095258712769, 0.7172672748565674, 0.836570143699646, 0.7341911196708679, 0.7370988726615906, 0.7425327301025391, 0.846203625202179, 0.7980053424835205, 0.8708533644676208, 0.7003936171531677, 0.9784262776374817, 0.7798240780830383, 0.7744236588478088, 0.7467228174209595, 0.7580801844596863, 0.7758642435073853, 0.774465024471283, 0.7841202020645142, 0.7841556072235107, 0.7992233037948608, 0.8311782479286194, 0.7839040756225586, 0.7692473530769348, 0.6930981874465942, 0.8409447073936462, 0.9336578845977783, 0.

In [9]:
recall_scores = bertscore_results['recall']

mean_recall = sum(recall_scores) / len(recall_scores)

print(f"BERTScore_recall: {bertscore_results['recall']}")
print("--------")
print(f"Mean BERTScore recall: {mean_recall}")

BERTScore_recall: [0.9443389177322388, 0.8143723011016846, 0.8549717664718628, 0.8383463025093079, 0.777347207069397, 0.8219079375267029, 0.7449568510055542, 0.8024176955223083, 0.725195586681366, 0.8120999932289124, 0.8457667231559753, 0.7894755601882935, 0.7158576250076294, 0.8020091652870178, 0.7493349313735962, 0.7945057153701782, 0.8646516799926758, 0.804591178894043, 0.8462163209915161, 0.7946552634239197, 0.9116789698600769, 0.7469905614852905, 0.810017466545105, 0.797533392906189, 0.7292066812515259, 0.7595916390419006, 0.7311683297157288, 0.731065571308136, 0.748099684715271, 0.8046199083328247, 0.8026447892189026, 0.7965332269668579, 0.8326758742332458, 0.9520540237426758, 0.7806786894798279, 0.8120999932289124, 0.7787611484527588, 0.7839158773422241, 0.8665916919708252, 0.7529631853103638, 0.7453611493110657, 0.7762752771377563, 0.779544472694397, 0.8332756757736206, 0.870888352394104, 0.8373579978942871, 0.7654491662979126, 0.8550490140914917, 0.9302338361740112, 0.95422631

In [10]:
f1_scores = bertscore_results['f1']

mean_f1 = sum(f1_scores) / len(f1_scores)

print(f"BERTScore_F1: {bertscore_results['f1']}")
print("--------")
print(f"Mean BERTScore F1: {mean_f1}")

BERTScore_F1: [0.9571889638900757, 0.7570679187774658, 0.7931060791015625, 0.8318954706192017, 0.7903650999069214, 0.7844420671463013, 0.7444244027137756, 0.8098759651184082, 0.7457075715065002, 0.7928144931793213, 0.848074734210968, 0.7949354648590088, 0.7384064793586731, 0.794711172580719, 0.7553163170814514, 0.8258358240127563, 0.8011095523834229, 0.8098775148391724, 0.8358141779899597, 0.7106567025184631, 0.9421820044517517, 0.7219641208648682, 0.709141731262207, 0.7421216368675232, 0.7231876850128174, 0.7962245941162109, 0.73267662525177, 0.73406982421875, 0.7453058362007141, 0.8248879909515381, 0.8003184199333191, 0.8320370316505432, 0.7608277201652527, 0.9650600552558899, 0.7802512049674988, 0.7928144931793213, 0.762405514717102, 0.7707815766334534, 0.818722128868103, 0.763562798500061, 0.7642495632171631, 0.7801955342292786, 0.7892612814903259, 0.8322256207466125, 0.8251100778579712, 0.8018589615821838, 0.7274791598320007, 0.8479382395744324, 0.9319427013397217, 0.9643445611000

### Italian

In [7]:
import polars as pl

In [8]:
italian_df = pl.read_csv('../../data/definitions_corpus/definitions.csv').filter(pl.col('dataset') != 'EurLex')

In [9]:
italian_df = italian_df.with_columns(pl.col('label').str.replace('#', '').str.replace(r'([a-zà-ÿ])([A-Z])', r'${1} ${2}', n=-1).str.to_lowercase())
italian_dict = italian_df.select(["label", "definition_text"]).rename({"label": "term", "definition_text": "term_definition"}).to_dicts()
italian_dict, len(italian_dict)

([{'term': 'accordi internazionali',
   'term_definition': "accordi internazionali: gli accordi vigenti in materia di cooperazione o di reciproca assistenza tecnico - militare nel settore della difesa, conclusi dall'Italia con uno o piu' Stati esteri;"},
  {'term': 'intese internazionali',
   'term_definition': "intese internazionali: protocolli, memorandum, intese, o altri documenti comunque denominati, posti in essere dal Ministero della difesa con uno o piu' Stati esteri, discendenti dagli accordi di cui alla lettera a);"},
  {'term': 'attivita di supporto tecnico-amministrativo',
   'term_definition': "attivita' di supporto tecnico-amministrativo: qualsiasi attivita' di assistenza tecnica, ingegneristica, logistica, manutentiva, addestrativa, formativa, amministrativa, legale, nonche' di coordinamento della contrattualistica e degli aspetti connessi alla gestione finanziaria, anche nella fase di pianificazione e definizione dell'esigenza e del relativo impatto sui costi, discendent

In [10]:
from LegalDefAgent.src.agents import definitions_agent_eval
import uuid
from pathlib import Path
import json
import ast
import time
import random
import logging

async def get_definition(question, model):
    inputs = {"messages": [("user", question)]}
    configurable = {"configurable": {"user_id": "1", "thread_id": uuid.uuid4().hex, "model": model}}

    async for msg in definitions_agent_eval.astream(inputs, configurable, stream_mode="values"):
        chat = msg

    return chat

In [12]:
random.Random(42).shuffle(italian_dict)

In [13]:
model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'quantitative_italian'
task = 'full'

Path('logs').mkdir(exist_ok=True)

logging.basicConfig(
    filename=f'logs/evaluation_log_{model}_{dataset_name}_{task}.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

last = 0

output_file = f'results/{model}_{dataset_name}_{task}.json'

if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())
        last = results[-1]['#']
    logging.info(f"Loaded {len(results)} existing results")
else:
    results = []
    logging.info("Starting new results file")

dataset = italian_dict

for i, item in enumerate(dataset[last:], last+1):
    result = {"entry": item, "#": i}
    term = item['term']
    question = "Quale è la definizione di \"{term}\" nella legislazione italiana?".format(term=term)
    try:
        logging.info(f"Processing term {i}/{len(dataset)}: {term}")
        res = await get_definition(question, model)
        response_json = json.loads(res['messages'][-1].content)
        result['response'] = response_json
        logging.info(f"Received response for term {i}: {response_json}")
        results.append(result)

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f'{i}/{len(dataset)}', term)
        time.sleep(2)
    
    except Exception as e:
        logging.error(f"Error processing term {i} ({term}): {str(e)}")
        item['response'] = {"error": str(e)}
        print(f'{i}/{len(dataset)} Error', term)
        results.append(item)
    
    logging.info(f"Processing complete. Processed {len(results)} terms.")


logging.info(f"Processing complete. Processed {len(results)} terms.")

1/4985 organismo nazionale di accreditamento
2/4985 la lettera dd)esostituita dalla seguente
3/4985 conto termico
4/4985 immissione sul mercato
5/4985 organizzazione per lapprovvigionamento
6/4985 soggetti competenti in materia ambientale
7/4985 nave ancorata
8/4985 fondo di solidarieta
9/4985 anticipo finanziario agaranzia pensionistica(ape)
10/4985 trasferimento
11/4985 enti di promozione sportiva
12/4985 flusso di co2
13/4985 Error rete
14/4985 ispezioni
15/4985 marca temporale
16/4985 sezione speciale
17/4985 verifica
18/4985 diritto speciale
19/4985 macchine mobili non stradali destinate ad esclusivo uso professionale
20/4985 esponenti
21/4985 cei
22/4985 persona adeguatamente formata
23/4985 fonti secondarie
24/4985 societa figlia
25/4985 oggetto esclusivo oprincipale
26/4985 commissari straordinari
27/4985 rete sistema informativo
28/4985 nsis
29/4985 sistema informativo geografico
30/4985 tipo di veicolo
31/4985 operatore
32/4985 soggetto interessato
33/4985 non proliferazione 

CancelledError: 

In [61]:
import json 

with open(f'/home/leo/Desktop/dhdk/Master thesis/project/evaluation/v2/results/together-llama-3.3-70B-Instruct-Turbo_quantitative_italian_last80.json', 'r') as json_file:
    data = json.load(json_file)

errors = 0
predictions = []
references = []
references_bleurt = []

for item in data:
    print(item['entry'])
    if item['response'].get('error', None) or item.get('error', None) or item['response'].get('type', None):
        errors += 1
        continue

    try:
        if item.get('entry', None):
            print('sfa', item['entry']['response'])
            predictions.append(item['entry']['generated_definition']['generated_definition'])
            references.append(item['entry']['original_definition'])
        else:
            predictions.append(item['response']['generated_definition']['generated_definition'])
            references.append(item['original_definition'])
    except KeyError:
        errors += 1
        continue

{'term': 'organismo nazionale di accreditamento', 'term_definition': "organismo nazionale di accreditamento: unico organismo autorizzato dallo Stato a svolgere attivita' di accreditamento nel territorio nazionale, di cui all'articolo 4, della legge 23 luglio 2009, n. 99;"}
{'term': 'la lettera dd)esostituita dalla seguente', 'term_definition': 'la lettera dd) e\' sostituita dalla seguente: "dd) reti di comunicazione elettronica: i sistemi di trasmissione e, se del caso, le apparecchiature di commutazione o di instradamento e altre risorse, inclusi gli elementi di rete non attivi, che consentono di trasmettere segnali via cavo, via radio, a mezzo di fibre ottiche o con altri mezzi elettromagnetici, comprese le reti satellitari, le reti terrestri mobili e fisse (a commutazione di circuito e a commutazione di pacchetto, compresa Internet), le reti utilizzate per la diffusione circolare dei programmi sonori e televisivi, i sistemi per il trasporto della corrente elettrica, nella misura in 

KeyError: 'entry'

In [58]:
len(predictions), len(references)

(0, 0)

In [55]:
errors

80

#### BLEU

#### Measures

In [None]:
!pip install -q evaluate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import evaluate

bleu = evaluate.load("bleu")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Compute the BLEU score
bleu4_results = bleu.compute(predictions=predictions, references=references, max_order=4)
print(f"BLEU-4: {bleu4_results['bleu']}")

# Compute the BLEU score
bleu3_results = bleu.compute(predictions=predictions, references=references, max_order=3)
print(f"BLEU-3: {bleu3_results['bleu']}")

# Compute the BLEU score
blue2_results = bleu.compute(predictions=predictions, references=references, max_order=2)
print(f"BLEU-2: {blue2_results['bleu']}")

# Compute the BLEU score
blue1_results = bleu.compute(predictions=predictions, references=references, max_order=1)
print(f"BLEU-1: {blue1_results['bleu']}")

BLEU-4: 0.09925569700044896
BLEU-3: 0.12410359979231986
BLEU-2: 0.1687923533985763
BLEU-1: 0.26623412632396165


In [None]:
!pip install -q rouge_score bert-score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import evaluate

rouge = evaluate.load("rouge")

In [None]:
rouge_results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-1: {rouge_results['rouge1']}")
print(f"ROUGE-2: {rouge_results['rouge2']}")
print(f"ROUGE-L: {rouge_results['rougeL']}")
print(f"ROUGE-L-sum: {rouge_results['rougeLsum']}")

ROUGE-1: 0.35896147599687084
ROUGE-2: 0.1909227456551807
ROUGE-L: 0.3082640997686694
ROUGE-L-sum: 0.3086035391273647


In [None]:
import evaluate
bertscore = evaluate.load("bertscore")

In [None]:
bertscore_results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")

In [None]:
f1_scores = bertscore_results['f1']

mean_f1 = sum(f1_scores) / len(f1_scores)

print(f"BERTScore_F1: {bertscore_results['f1']}")
print("--------")
print(f"Mean BERTScore F1: {mean_f1}")

BERTScore_F1: [0.9571889638900757, 0.7570679187774658, 0.7931060791015625, 0.8318954706192017, 0.7903650999069214, 0.7844420671463013, 0.7444244027137756, 0.8098759651184082, 0.7457075715065002, 0.7928144931793213, 0.848074734210968, 0.7949354648590088, 0.7384064793586731, 0.7947112917900085, 0.7553163170814514, 0.8258358240127563, 0.8011095523834229, 0.8098775148391724, 0.8358141779899597, 0.7106567025184631, 0.9421820044517517, 0.7219641208648682, 0.709141731262207, 0.7421216368675232, 0.7231876850128174, 0.7962245941162109, 0.73267662525177, 0.73406982421875, 0.7453058362007141, 0.8248879909515381, 0.8003184199333191, 0.8320370316505432, 0.7608277201652527, 0.9650600552558899, 0.7802512049674988, 0.7928144931793213, 0.762405514717102, 0.7707815766334534, 0.818722128868103, 0.763562798500061, 0.7642495632171631, 0.7801955342292786, 0.7892612814903259, 0.8322256207466125, 0.8251100778579712, 0.8018589615821838, 0.7274791598320007, 0.8479382395744324, 0.9319427013397217, 0.964344561100

## Qualitative

In [5]:
import polars as pl

In [6]:
evs = pl.read_csv('./Definition-Definizione.csv')
evs

Definition,Definizione
str,str
"""Non-personal Data""","""Dato non personale"""
"""Connected device""","""Dispositivo connesso"""
"""IoT device""","""Dispositivo IoT"""
"""Open Data""","""Dato aperto"""
"""Satellite Data""","""Dato satelittare"""
…,…
"""Traffic Data""","""Dati relativi al Traffico """
"""GPS Data""","""Dati GPS"""
"""International Cooperation Poli…","""Politica di Cooperazione Inter…"
"""National Monitoring Body""","""Ente nazionale di controllo"""


In [26]:
from rapidfuzz import process, fuzz

en_list = []
it_list = []

for entry in evs.iter_rows():
    x = process.extract(entry[0].lower(), definendums_list, scorer=fuzz.token_sort_ratio, limit=1, score_cutoff=95)
    y = process.extract(entry[1].lower(), definendums_list, scorer=fuzz.token_sort_ratio, limit=1, score_cutoff=95)
    if x:
        print('found in eu: ', entry[0])
        pass
    elif y:
        print('found in it: ', entry[1])
        pass
    else:
        en_list.append(entry[0])
        it_list.append(entry[1])


print(len(en_list), len(it_list))

found in eu:  National Competition Authority
found in eu:  Organic Product
found in it:  Analisi del rischio
found in eu:  Electric Vehicle
found in eu:  Hybrid Vehicle
found in eu:  Traffic Data
73 73


### English

In [8]:
en_list

['Non-personal Data',
 'Connected device',
 'IoT device',
 'Open Data',
 'Satellite Data',
 'Cookie',
 'Synthetic Data',
 'International Data Transfer',
 'Data Sharing activity',
 'Data Buyer',
 'Dangerous Product',
 'Dangerous Product Importer',
 'Intermediary Platform',
 'Internet Platform',
 'Content Creator',
 'Digital Internal Market',
 'Data Monetization',
 'Smart Contract',
 'Gender-based Crime',
 'Victim-blaming',
 'Hate Speech',
 'Stalking',
 'Ransomware',
 'International Terrorism',
 'Cooperation Duties',
 'Law Enforcement Policy',
 'Fair compensation',
 'Law Enforcement Officer ',
 'Cryptocurrency',
 'Decentralised Finance',
 'Autonomous Trading Agent',
 'Automated trading',
 'Multi-year Agreement',
 'Restrictive Practice',
 'Confidential Information',
 'Non-animal origin ',
 'Imported Good',
 'Air Pollution',
 'Non-organic Origin',
 'Animal Welfare',
 'National Health Authority ',
 'Plant-based Product',
 'Risk Assessment',
 'Risk Evaluation',
 'Autonomous Vehicle',
 'Air q

In [None]:
from LegalDefAgent.src.agents import definitions_agent_eval
import uuid
from pathlib import Path
import json
import ast
import time
import logging

async def get_definition(question, model):
    inputs = {"messages": [("user", question)]}
    configurable = {"configurable": {"user_id": "1", "thread_id": uuid.uuid4().hex, "model": model}}

    async for msg in definitions_agent_eval.astream(inputs, configurable, stream_mode="values"):
        chat = msg

    return chat

  from .autonotebook import tqdm as notebook_tqdm


2025-05-06 19:40:30,338 - INFO - Logging configured
2025-05-06 19:40:30,982 - INFO - PyTorch version 2.6.0 available.
2025-05-06 19:40:30,986 - INFO - Polars version 1.21.0 available.


In [11]:
model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'qualitative_en'
task = 'generation'

Path('logs').mkdir(exist_ok=True)

logging.basicConfig(
    filename=f'logs/evaluation_log_{model}_{dataset_name}_{task}.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)


last = 0

output_file = f'results/{model}_{dataset_name}_{task}.json'

if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())
        last = results[-1]['#']
    logging.info(f"Loaded {len(results)} existing results")
else:
    results = []
    logging.info("Starting new results file")

dataset = en_list

for i, term in enumerate(dataset[last:], last+1):
    result = {"term": term, "#": i}
    question = "What's the definition of \"{term}\" in the European Legislation?".format(term=term)
    try:
        logging.info(f"Processing term {i}/{len(dataset)}: {term}")
        res = await get_definition(question, model)
        response_json = json.loads(res['messages'][-1].content)
        result['response'] = response_json
        logging.info(f"Received response for term {i}: {response_json}")
        results.append(result)

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f'{i}/{len(dataset)}', term)
        time.sleep(2)
    
    except Exception as e:
        logging.error(f"Error processing term {i} ({term}): {str(e)}")
        term['response'] = {"error": str(e)}
        print(f'{i}/{len(dataset)} Error', term)
        results.append(term)
    
    logging.info(f"Processing complete. Processed {len(results)} terms.")


logging.info(f"Processing complete. Processed {len(results)} terms.")

56/74 Place of Production
57/74 International Distance Selling 
58/74 Lead Consumer Authority 
59/74 Financial Stability 
60/74 Credit Risk Assessment
61/74 Credit Risk Evaluation
62/74 International Investment
63/74 Dominant Position
64/74 Automated Creditworthiness Decision
65/74 National Bond Authority 
66/74 Market Disruption Crisis
67/74 Videosurveillance
68/74 Trojan
69/74 EU Criminal Record 
70/74 Human Trafficking 
71/74 GPS Data
72/74 International Cooperation Policy
73/74 National Monitoring Body
74/74 Law Enforcement Data Sharing


---

### Italian

In [25]:
it_list

['Dato non personale',
 'Dispositivo connesso',
 'Dispositivo IoT',
 'Dato aperto',
 'Dato satelittare',
 'Cookie',
 'Dato Sintetico',
 'Trasferimento internazionale di dati',
 'Attività di Condivisione di Dati',
 'Acquirente di dati',
 'Prodotto pericoloso',
 'Importatore di un prodotto pericolo',
 'Piattaforma di Intermediazione',
 'Piattaforma su Internet',
 'Creatore di Contenuti',
 'Mercato Digitale Interno',
 'Monetizzazione di Dati',
 'Contratto Intelligente',
 'Crimine di Genere',
 'Colpevolizzazione della Vittima',
 "Discorso d'odio",
 'Stalking',
 'Ransomware',
 'Terrorismo internazionale',
 'Obblighi di Cooperazione',
 'Politica delle Attività di Contrasto',
 'Equa Compensazione',
 "Ufficiale delle Forze dell'Ordine",
 'Criptovaluta',
 'Finanza Decentralizzata',
 'Agente autonomo di trading ',
 'Trading automatizzato',
 'Contratto Pluriennale',
 'Pratica Restrittiva',
 'Informazione Confidenziale',
 'Origine Non Animale',
 'Bene Importato',
 "Inquinamento dell'Aria",
 'Origi

In [13]:
model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'qualitative_it'
task = 'generation'

Path('logs').mkdir(exist_ok=True)

logging.basicConfig(
    filename=f'logs/evaluation_log_{model}_{dataset_name}_{task}.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)


last = 0

output_file = f'results/{model}_{dataset_name}_{task}.json'

if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())
        last = results[-1]['#']
    logging.info(f"Loaded {len(results)} existing results")
else:
    results = []
    logging.info("Starting new results file")

dataset = it_list

for i, term in enumerate(dataset[last:], last+1):
    result = {"term": term, "#": i}
    question = "Quale è la definizione di \"{term}\" nella legislazione italiana?".format(term=term)
    try:
        logging.info(f"Processing term {i}/{len(dataset)}: {term}")
        res = await get_definition(question, model)
        response_json = json.loads(res['messages'][-1].content)
        result['response'] = response_json
        logging.info(f"Received response for term {i}: {response_json}")
        results.append(result)

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f'{i}/{len(dataset)}', term)
        time.sleep(2)
    
    except Exception as e:
        logging.error(f"Error processing term {i} ({term}): {str(e)}")
        term['response'] = {"error": str(e)}
        print(f'{i}/{len(dataset)} Error', term)
        results.append(term)
    
    logging.info(f"Processing complete. Processed {len(results)} terms.")


logging.info(f"Processing complete. Processed {len(results)} terms.")

2025-05-06 12:34:15,296 - INFO - Logging configured
65/74 Autorità Nazionale per i Titoli di Stato
66/74 Crisi da Instabilità del Mercato
67/74 Videosorveglianza
68/74 Trojan
69/74 Casellario Giudiziale Europeo
70/74 Traffico di Esseri Umani
71/74 Dati GPS
72/74 Politica di Cooperazione Internazionale
73/74 Ente nazionale di controllo
74/74 Condivisione di dati per le attività di contrasto


In [53]:
# to excel

import json 


# v2
def parse_definitions_to_df(data) -> pl.DataFrame:
    """
    Parse JSON data containing legal definitions into a Polars DataFrame.
    
    Args:
        data: List of dictionaries containing legal definitions
        
    Returns:
        Polars DataFrame with columns: celex_id, term, number, response_type, definition, sources
    """
    parsed_data = []
    
    for item in data:
        print(item)
        try:
            row = {
                'term': item.get('term'),
                #'number': item.get('#'),
                'generated definition': item['response']['generated_definition'].get('generated_definition') if 'generated_definition' in item['response'] else None,
                'sources': item['response']['generated_definition'].get('sources') if 'generated_definition' in item['response'] else None,
            }
        except Exception as e:
            row = {
                'term': item.get('term'),
                #'number': item.get('#'),
                'generated definition': item['response']['generated_definition'],
                'sources': item['response']['sources']
            }

        
        parsed_data.append(row)
    
    df = pl.DataFrame(parsed_data)
    
    df = df.with_columns([
        #pl.col('number').cast(pl.Int64),
        pl.col('term').cast(pl.Utf8),
        pl.col('generated definition').cast(pl.Utf8),
        pl.col('sources')
    ])
    
    return df


with open('./results/together-llama-3.3-70B-Instruct-Turbo_qualitative_it_generation.json') as file:
    results = json.load(file)

df = parse_definitions_to_df(results).to_pandas().to_excel(f'together-llama-3.3-70B-Instruct-Turbo_qualitative_it_generation.xlsx', index=True)
df

{'term': 'Dato non personale', '#': 1, 'response': {'generated_definition': {'generated_definition': "Dato non personale: qualsiasi informazione che non riguarda una persona fisica identificata o identificabile, ai sensi dell'articolo 4 del Regolamento (UE) 2016/679 del Parlamento europeo e del Consiglio del 27 aprile 2016.", 'sources': ['dato personale: qualunque informazione relativa a persona fisica, persona giuridica, ente od associazione, identificati o identificabili, anche indirettamente, mediante riferimento a qualsiasi altra informazione, ivi compreso un numero di identificazione personale;', 'dati personali: qualsiasi informazione riguardante una persona fisica identificata o identificabile («interessato»);', "dati personali: qualsiasi informazione riguardante una persona fisica identificata o identificabile in relazione a nome, numero di identificazione, dati relativi all'ubicazione, identificativo online, uno o piu' elementi caratteristici della sua identita' fisica, fisiol

In [41]:
parsed_data

NameError: name 'parsed_data' is not defined

In [37]:
print(df)

None
