In [3]:
import polars as pl

Create 2 datasets:

1. Definitions that ARE in the corpus
    This is used to evaluate the retrieval. Possibly divided by language?

2. Definitions that ARE NOT in the corpus
    This is used to evaluate the generation.


Evaluation metrics:
- BLEU
This is used to measure lexical similarity
- BertScore
This is used to measure semantic similarity

Possibly also:
- BLEURT
This is used to measure the quality of the generated definitions. Or maybe just use 

## Datasets preparation

In [4]:
defs = pl.read_csv("../data/definitions_corpus/definitions.csv")
definendums_list = (defs.select(
    pl.col('label')
    .str.replace('#', '')
    .str.replace(r'([a-zà-ÿ])([A-Z])', r'${1} ${2}', n=-1)  # Add space between lowercase and uppercase letters
    .str.to_lowercase()  # Convert to lowercase after splitting
)
)['label'].to_list()
definendums_list

['obstacles to trade',
 'injury',
 'adverse trade effects',
 'union industry',
 'union enterprise',
 'services',
 'ancillary services undertaking',
 'authorisation',
 'branch',
 'close links',
 'competent authority',
 'commodity and emission allowance dealer',
 'control',
 'compliance with the group capital test',
 'credit institution',
 'derivatives',
 'financial institution',
 'gender neutral remuneration policy',
 'group',
 'consolidated situation',
 'group supervisor',
 'home member state',
 'host member state',
 'initial capital',
 'investment firm',
 'investment firm group',
 'investment holding company',
 'investment services and activities',
 'management body',
 'management body in its supervisory function',
 'mixed financial holding company',
 'means aparent undertaking other than afinancial holding company',
 'senior management',
 'parent undertaking',
 'subsidiary',
 'systemic risk',
 'union parent investment firm',
 'union parent investment holding company',
 'union parent 

In [3]:
defs.filter(pl.col('label') == '#datoPersonale')

id,definition_text,def_n,label,dataset,document_id,frbr_work,frbr_expression
i64,str,str,str,str,str,str,str
14797,"""dato personale: qualunque info…","""#def_3""","""#datoPersonale""","""Normattiva""","""20120713_012G0119_ORIGINALE.xm…","""/akn/it/act/decreto/ministeroG…","""/akn/it/act/decreto/ministeroG…"
16385,"""dato personale: qualsiasi info…","""#def_2""","""#datoPersonale""","""PDL""","""19PDL0023580_PD.xml""","""/akn/it/bill/propostaDiLegge/2…","""/akn/it/bill/propostaDiLegge/2…"


### LEOS

In [4]:
leos_eval_dataset = pl.read_excel("./datasets/leos/Evaluation_LEOS_annotato_def.xlsx")

In [5]:
leos_dict = leos_eval_dataset.select(["CELEX", "Definition"]).rename({"CELEX": "CELEX_ID", "Definition": "Term"}).to_dicts()

In [6]:
from rapidfuzz import process, fuzz

leos_retrieval = []
leos_generation = []

for el in leos_dict:
    x = process.extract(el['Term'].lower(), definendums_list, scorer=fuzz.token_sort_ratio, limit=1, score_cutoff=95)
    if x: 
        print(el, x)
        leos_retrieval.append(el)
    else:
        leos_generation.append(el)

{'CELEX_ID': '32014L0065', 'Term': 'National Competition Authority'} [('national competition authority', 100.0, 768)]
{'CELEX_ID': '32015L0412', 'Term': 'Organic Product'} [('organic product', 100.0, 8540)]
{'CELEX_ID': '32016R1628', 'Term': 'Electric Vehicle'} [('electric vehicle', 100.0, 1632)]
{'CELEX_ID': '32016R1630', 'Term': 'Hybrid Vehicle'} [('hybrid vehicle', 100.0, 5229)]
{'CELEX_ID': '32016L0685', 'Term': 'Traffic Data'} [('traffic data', 100.0, 10445)]


In [7]:
print(len(leos_retrieval), len(leos_generation))

5 85


### Edge cases

In [9]:
import polars as pl

pl.Config.set_tbl_rows(80)
defs.group_by(['label', 'dataset']).agg(pl.all()).sort('label').filter(pl.col('label').is_duplicated()).group_by('label').agg(pl.all()).sort('label').to_pandas().to_excel('duplicates.xlsx', header=True)

In [167]:
defs.filter(pl.col('label') == '#client')

id,definition_text,def_n,label,dataset,document_id,frbr_work,frbr_expression
i64,str,str,str,str,str,str,str
1162,"""client: means any natural or l…","""#def_9""","""#client""","""EurLex""","""32014L0065.xml""","""/akn/eu/act/directive/2014-05-…","""/akn/eu/act/directive/2014-05-…"
4507,"""local firm: means a firm deali…","""#def_4""","""#client""","""EurLex""","""32013R0575.xml""","""/akn/eu/act/regulation/2013-06…","""/akn/eu/act/regulation/2013-06…"
8088,"""client: means any counterparty…","""#def_4""","""#client""","""EurLex""","""32019R2033.xml""","""/akn/eu/act/regulation/2019-11…","""/akn/eu/act/regulation/2019-11…"
8907,"""client: means a client as defi…","""#def_1""","""#client""","""EurLex""","""32017R2154.xml""","""/akn/eu/act/regulation/2017-09…","""/akn/eu/act/regulation/2017-09…"
9221,"""client: means any prospective …","""#def_7""","""#client""","""EurLex""","""32020R1503.xml""","""/akn/eu/act/regulation/2020-10…","""/akn/eu/act/regulation/2020-10…"
9308,"""client: means a client as defi…","""#def_17""","""#client""","""EurLex""","""32021R0023.xml""","""/akn/eu/act/regulation/2020-12…","""/akn/eu/act/regulation/2020-12…"
9931,"""client: means a client as defi…","""#def_7""","""#client""","""EurLex""","""32014R0600.xml""","""/akn/eu/act/regulation/2014-05…","""/akn/eu/act/regulation/2014-05…"
10329,"""client: means an undertaking w…","""#def_15""","""#client""","""EurLex""","""32012R0648.xml""","""/akn/eu/act/regulation/2012-07…","""/akn/eu/act/regulation/2012-07…"
11209,"""client: means any natural or l…","""#def_1""","""#client""","""EurLex""","""32010L0043.xml""","""/akn/eu/act/directive/2010-07-…","""/akn/eu/act/directive/2010-07-…"
11421,"""credit institution: means an u…","""#def_b""","""#client""","""EurLex""","""32013R0575R(02).xml""","""/akn/eu/act/regulation/2003-06…","""/akn/eu/act/regulation/2003-06…"


In [164]:
import re

result = {}

date_pattern = r"'date':\s*'([^']+)'"
definition_pattern = r"'definition':\s*'([^']+)'"

with open("../notebooks/definition_timeline.csv", "r") as f:
    file = f.read()
    for line in file.split('\n')[1:]:
        definendum = re.search(r'""\s*([^"]*)\s*""', line).group(1).strip()
        if definendum not in result:
            result[definendum] = {}

        tl = {}
        for el in line.split('","'):
            date_match = re.search(date_pattern, el).group(1)
            definition_match = re.search(definition_pattern, el).group(1)
            result[definendum][date_match] = definition_match

result

{'fishing activities': {'2012-07-08': '6. "" fishing activities "" means fishing, including joint fishing operations, fish processing operations, the transhipment or landing of fish or fish products and any other commercial activity in preparation for or related to fishing;',
  '2016-02-05': '6. "" fishing activities "" means fishing, including joint fishing operations, fish processing operations, the transhipment or landing of fishery resources or products thereof and any other commercial activity in preparation for, or related to, fishing, including packaging, transporting, refuelling or resupplying;'},
 'port': {'2012-07-08': '13. "" port "" means any place used for landing or a place close to the shore designated by a Contracting Party for transhipping fishery resources.',
  '2016-02-05': '13. "" port "" means any place on shore used for landing or for the provision of services in relation to, or in support of, fishing activities, or a place on or close to the shore designated by a

In [166]:
def json_to_excel(json_data, output_path='definitions_history.xlsx'):
    """
    Convert definitions JSON to an Excel file using Polars.
    
    Args:
        json_data (dict): Dictionary containing definitions data
        output_path (str): Path for the output Excel file
    """
    # Create lists to store the flattened data
    records = []
    
    # Flatten the nested JSON structure
    for term, date_definitions in json_data.items():
        for date, definition in date_definitions.items():
            records.append({
                'Term': term,
                'Date': date,
                'Definition': definition
            })
    
    # Create a Polars DataFrame
    df = pl.DataFrame(records)
    
    # Sort the DataFrame by Term and Date
    df = df.sort(['Term', 'Date'])
    
    # Create two different views of the data
    
    # View 2: Pivot table view (terms as rows, dates as columns)
    pivot_df = df.pivot(
        values='Definition',
        index='Term',
        columns='Date'
    )
    
    return df


df = json_to_excel(result)
df.sort('Term')

df.to_pandas().to_excel('definitions_with_modifications.xlsx', header=True)


  pivot_df = df.pivot(


### LexDrafter

In [8]:
import polars as pl


lexdrafter_df = pl.read_json('./datasets/lexdrafter/llama_definition_combined.json')['celex_id', 'term', 'original_definition']

#lexdrafter_df = lexdrafter_df.unique(['term', 'original_definition'], keep='first', maintain_order=True)
lexdrafter_df

celex_id,term,original_definition
str,str,str
"""32019R2016""","""0-star compartment""","""'0-star compartment' means a f…"
"""32019R2019""","""0-star compartment""","""'0-star compartment' means a f…"
"""32019R2016""","""1-star compartment""","""'1-star compartment' means a f…"
"""32019R2019""","""1-star compartment""","""'1-star compartment' means a f…"
"""32019R2016""","""2-star compartment""","""'2-star compartment' means a f…"
…,…,…
"""32015R1189""","""woody biomass""","""'woody biomass' means biomass …"
"""32013R0347""","""works""","""'works' means the purchase, su…"
"""32013R0543""","""year-ahead forecast margin""","""'year-ahead forecast margin' m…"
"""32017R1485""","""year-ahead""","""'year-ahead' means the year pr…"


In [9]:
import json

# Load the JSON data from files
with open("llama_definition_combined.json", "r") as file:
    larger_data = json.load(file)

with open("vicuna_definition_combined.json", "r") as file:
    smaller_data = json.load(file)

# Convert smaller data to a set of tuples for faster lookup
smaller_data_set = {
    (item["term"], item["celex_id"])
    for item in smaller_data
    if item["generated_definition"] != "NO JSON AS AN OUTPUT OBTAINED"
}

# Select records from larger data that are also in smaller data
common_records = [
    item for item in larger_data if (item["term"], item["celex_id"]) in smaller_data_set
]

# Optionally, save the common records to a new JSON file
with open("llama_subset_records.json", "w") as file:
    json.dump(common_records, file, indent=4)

# Print the number of common records found
print(f"Number of common records: {len(common_records)}")

FileNotFoundError: [Errno 2] No such file or directory: 'llama_definition_combined.json'

In [12]:
celex_list = [el.split('.')[0] for el in defs['document_id'].to_list()]
for entry in lexdrafter_df.to_dicts():
    if entry['celex_id'] in celex_list:
        print(entry)

{'celex_id': '32019R2016', 'term': '0-star compartment', 'original_definition': "'0-star compartment' means a frozen compartment with a target temperature and storage conditions of 0 °C, as set out in Annex IV, Table 3;"}
{'celex_id': '32019R2019', 'term': '0-star compartment', 'original_definition': "'0-star compartment' means a frozen compartment with a target temperature and storage conditions of 0 °C, as set out in Annex III, Table 3;"}
{'celex_id': '32019R2016', 'term': '1-star compartment', 'original_definition': "'1-star compartment' means a frozen compartment with a target temperature and storage conditions of - 6 °C, as set out in Annex IV, Table 3;"}
{'celex_id': '32019R2019', 'term': '1-star compartment', 'original_definition': "'1-star compartment' means a frozen compartment with a target temperature and storage conditions of - 6 °C, as set out in Annex III, Table 3;"}
{'celex_id': '32019R2016', 'term': '2-star compartment', 'original_definition': "'2-star compartment' mean

In [None]:
lexdrafter_retrieval = []
lexdrafter_generation = []

for entry in lexdrafter_df.to_dicts():
    x = process.extract(entry['term'].lower(), definendums_list, scorer=fuzz.token_sort_ratio, limit=1, score_cutoff=95)
    if x: 
        print(entry, x)
        lexdrafter_retrieval.append(entry)
    else:
        lexdrafter_generation.append(entry)

{'celex_id': '32019R2016', 'term': '0-star compartment', 'original_definition': "'0-star compartment' means a frozen compartment with a target temperature and storage conditions of 0 °C, as set out in Annex IV, Table 3;"} [('0-star compartment', 100.0, 3413)]
{'celex_id': '32019R2019', 'term': '0-star compartment', 'original_definition': "'0-star compartment' means a frozen compartment with a target temperature and storage conditions of 0 °C, as set out in Annex III, Table 3;"} [('0-star compartment', 100.0, 3413)]
{'celex_id': '32019R2016', 'term': '1-star compartment', 'original_definition': "'1-star compartment' means a frozen compartment with a target temperature and storage conditions of - 6 °C, as set out in Annex IV, Table 3;"} [('1-star compartment', 100.0, 3414)]
{'celex_id': '32019R2019', 'term': '1-star compartment', 'original_definition': "'1-star compartment' means a frozen compartment with a target temperature and storage conditions of - 6 °C, as set out in Annex III, Tab

In [None]:
print(len(lexdrafter_retrieval), len(lexdrafter_generation))

607 596


---

### Run the evaluation

In [8]:
from LegalDefAgent.src.agents import definitions_agent_eval
import uuid

async def get_definition(definendum, model):
    inputs = {"messages": [("user", f"what's the definition of {definendum}?")]}
    configurable = {"configurable": {"user_id": "1", "thread_id": uuid.uuid4().hex, "model": model}}

    async for msg in definitions_agent_eval.astream(inputs, configurable, stream_mode="values"):
        chat = msg

    return chat


##model = 'groq-llama-3.3-70b-versatile'
#model = 'gpt-4o-mini'
#definendum = 'dati personali before 2017?'

#chat = await get_definition(definendum, model)
#chat

2025-02-02 18:54:43,804 - INFO - Logging configured
2025-02-02 18:54:45,022 - INFO - PyTorch version 2.3.1 available.
2025-02-02 18:54:45,024 - INFO - Polars version 1.1.0 available.


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

2025-02-02 18:54:48,202 - INFO - loading existing colbert_linear and sparse_linear---------


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

2025-02-02 18:54:49,581 - INFO - loading existing colbert_linear and sparse_linear---------


In [12]:
import json
import logging
from pathlib import Path
import time
import ast


DATASET_MAP = {
    'leos': {
        'generation': leos_generation,
        'retrieval': leos_retrieval
    },
    'lexdrafter': {
        #'generation': lexdrafter_generation,
        #'retrieval': lexdrafter_retrieval
    }
}

#model = 'groq-llama-3.3-70b-versatile'
model = 'together-llama-3.3-70B-Instruct-Turbo'
dataset_name = 'leos'
task = 'retrieval'


Path('logs').mkdir(exist_ok=True)

logging.basicConfig(
    filename=f'logs/evaluation_log_{model}_{dataset_name}_{task}.txt',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True
)

last = 0

output_file = f'{model}_{dataset_name}_{task}.json'

if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())
        last = results[-1]['#']
    logging.info(f"Loaded {len(results)} existing results")
else:
    results = []
    logging.info("Starting new results file")

dataset = DATASET_MAP[dataset_name][task]

for i, item in enumerate(dataset[last:], last+1):
    item.update({"#": i})
    try:
        logging.info(f"Processing item {i}/{len(dataset)}: {item['Term']}")
        res = await get_definition(item['Term'], model)
        response_json = json.loads(res['messages'][-1].content)
        item['response'] = response_json
        results.append(item)

        with open(output_file, 'w') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        logging.info(item)
        print(f'{i}/{len(dataset)}', item)
        time.sleep(2)
    
    except Exception as e:
        logging.error(f"Error processing item {item['CELEX_ID']} ({item['Term']}): {str(e)}")
        item['response'] = {"error": str(e)}
        print(f'{i}/{len(dataset)} Error', item)
        results.append(item)

    #logging.error(f"Error processing item {item['CELEX_ID']} ({item['Term']}): {str(e)}")
    #item['response'] = {"error": str(e)}
    #print('Error', item)
    #results.append(item)

logging.info(f"Processing complete. Processed {len(results)} items.")

1/5 {'CELEX_ID': '32014L0065', 'Term': 'National Competition Authority', '#': 1, 'response': {'retrieved_definition': [{'dataset': 'EurLex', 'document_id': '32019L0001', 'date': '2018-12-11', 'definition': '(1) " national competition authority " means an authority designated by a Member State pursuant to Article 35 of Regulation (EC) No 1/2003 as being responsible for the application of Articles 101 and 102'}]}}
2/5 {'CELEX_ID': '32015L0412', 'Term': 'Organic Product', '#': 2, 'response': {'retrieved_definition': [{'dataset': 'EurLex', 'document_id': '32018R0848', 'date': '2018-06-14', 'definition': '(2) " organic product " means a product resulting from organic production, other than a product produced during the conversion period referred to in Article 10. The products of hunting or fishing of wild animals are not considered as organic products;'}]}}
3/5 {'CELEX_ID': '32016R1628', 'Term': 'Electric Vehicle', '#': 3, 'response': {'retrieved_definition': [{'dataset': 'EurLex', 'documen

In [11]:
with open(output_file, 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

In [128]:
# retry failed items

output_file = f'{model}_generation.json'
if Path(output_file).exists():
    with open(output_file, 'r') as f:
        results = ast.literal_eval(f.read())

for i, item in enumerate(results):
    if 'error' in item['response']:
        try:
            logging.info(f"Processing item {i}/{len(results)}: {item['term']}")
            res = await get_definition(item['term'], model)
            item['response'] = res['response']

            with open(output_file, 'w') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            logging.info(item)
            print(item)
            time.sleep(2)

        except Exception as e:
            logging.error(f"Error processing item {item['celex_id']} ({item['term']}): {str(e)}")
            item['response'] = {"error": str(e)}
            print('Error', item)
            results.append(item)


{'celex_id': '32017R1485', 'term': 'ACE', 'original_definition': "'ACE' means the sum of the power control error (‘ΔP’), that is the real-time difference between the measured actual real time power interchange value (‘P’) and the control program (‘P0’) of a specific LFC area or LFC block and the frequency control error (‘K*Δf’), that is the product of the K-factor and the frequency deviation of that specific LFC area or LFC block, where the area control error equals ΔP+K*Δf;", '#': 2, 'response': {'generated_definition': '"ACE" means an acronym for "Adverse Childhood Experiences," which refers to a range of traumatic events occurring before the age of 18 that can have lasting effects on an individual\'s health and well-being.'}}
{'celex_id': '32009R0107', 'term': 'Conditional access', 'original_definition': "'Conditional access' means a provider-controlled broadcasting service requiring a market subscription television service.", '#': 8, 'response': {'most_relevant_definition_text': '(

----

#### Format excel

In [39]:
results

[{'CELEX_ID': '32016R0679',
  'Term': 'Non-personal Data',
  '#': 1,
  'response': {'retrieved_definition': [{'dataset': 'EurLex',
     'document_id': '32017R1004',
     'date': '2021-07-14',
     'definition': '(6) " detailed data " means data based on primary data in a form which does not allow natural persons or legal entities to be identified directly or indirectly;'}]}},
 {'CELEX_ID': '32016R0680',
  'Term': 'Connected device',
  '#': 2,
  'response': {'generated_definition': {'generated_definition': 'Connected device: means any electronic device that is equipped with the capability to connect to a network, enabling it to send, receive, or exchange data with other devices or systems, including but not limited to smartphones, smart home appliances, wearables, and industrial equipment.',
    'sources': ['active device: means any device, the operation of which depends on a source of energy other than that generated by the human body for that purpose, or by gravity, and which acts by 

In [15]:
def parse_definitions_to_df(data) -> pl.DataFrame:
    """
    Parse JSON data containing legal definitions into a Polars DataFrame.
    
    Args:
        data: List of dictionaries containing legal definitions
        
    Returns:
        Polars DataFrame with columns: celex_id, term, number, response_type, definition, sources
    """
    parsed_data = []
    
    for item in data:
        row = {
            'celex_id': item.get('CELEX_ID'),
            'term': item.get('Term'),
            'number': item.get('#'),
            'response_type': None,
            'definition': None,
            'sources': None
        }
        
        # Handle response parsing
        response = item.get('response', {})
        
        # Check if it's a retrieved definition
        if 'retrieved_definition' in response:
            row['response_type'] = 'retrieved'
            retrieved_def = response['retrieved_definition']
            if retrieved_def and isinstance(retrieved_def, list) and len(retrieved_def) > 0:
                row['definition'] = retrieved_def[0].get('definition')
        
        # Check if it's a generated definition
        elif 'generated_definition' in response:
            row['response_type'] = 'generated'
            generated_def = response['generated_definition']
            if isinstance(generated_def, dict):
                row['definition'] = generated_def.get('generated_definition')
                row['sources'] = ';\n '.join(generated_def.get('sources', []))
        
        elif 'error' in response:
            row['response_type'] = 'error'
        
        parsed_data.append(row)
    
    # Create Polars DataFrame
    df = pl.DataFrame(parsed_data)
    
    # Ensure consistent column types
    df = df.with_columns([
        pl.col('celex_id').cast(pl.Utf8),
        pl.col('term').cast(pl.Utf8),
        pl.col('number').cast(pl.Int64),
        pl.col('response_type').cast(pl.Utf8),
        pl.col('definition').cast(pl.Utf8),
        pl.col('sources').cast(pl.Utf8)
    ])
    
    return df


parse_definitions_to_df(results).to_pandas().to_excel(f'{model}_{dataset_name}_{task}.xlsx', index=True)

# Calculate Metrics

## Retrieval

---

## Generation

In [None]:
!pip install -q evaluate

In [32]:
import json

model = 'gpt-4o-mini'
dataset_name = 'leos'

# Load the JSON data
with open(f'../evaluation/{model}_{dataset_name}_generation.json', 'r') as json_file:
    data = json.load(json_file)

In [35]:
# Initialize lists to store predictions and references
predictions = []
references = []

error_rate = 0

for item in data:
  celex_id = item["CELEX_ID"]
  term = item['Term']
  if item['response'].get('error', None):
    error_rate += 1
    continue

  if item['response'].get('generated_definition', None):
  
    # Add the generated text to predictions list
    predictions.append(item['response']['generated_definition'])

 #   references.append(item['original_definition'])
  

print(f"Error rate: {error_rate}/{len(data)}")
print(f"Generation rate: {len(predictions)}/{len(data)}")


Error rate: 0/85
Generation rate: 74/85


---

### BLEU

In [174]:
import evaluate
bleu = evaluate.load("bleu")

In [175]:
# Compute the BLEU score
bleu4_results = bleu.compute(predictions=predictions, references=references, max_order=4)
print(f"BLEU-4: {bleu4_results['bleu']}")

BLEU-4: 0.03377430498596426


In [176]:
# Compute the BLEU score
bleu3_results = bleu.compute(predictions=predictions, references=references, max_order=3)
print(f"BLEU-3: {bleu3_results['bleu']}")

BLEU-3: 0.05462286464508013


In [177]:
# Compute the BLEU score
blue2_results = bleu.compute(predictions=predictions, references=references, max_order=2)
print(f"BLEU-2: {blue2_results['bleu']}")

BLEU-2: 0.10474183559431799


In [178]:
# Compute the BLEU score
blue1_results = bleu.compute(predictions=predictions, references=references, max_order=1)
print(f"BLEU-1: {blue1_results['bleu']}")

BLEU-1: 0.22034534534534533


---

### Rouge

In [179]:
!pip install rouge_score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [180]:
import evaluate

rouge = evaluate.load("rouge")

In [181]:
rouge_results = rouge.compute(predictions=predictions, references=references)

print(f"ROUGE-1: {rouge_results['rouge1']}")
print(f"ROUGE-2: {rouge_results['rouge2']}")
print(f"ROUGE-L: {rouge_results['rougeL']}")
print(f"ROUGE-L-sum: {rouge_results['rougeLsum']}")

ROUGE-1: 0.28700232593864694
ROUGE-2: 0.1126557239478996
ROUGE-L: 0.24237744353515672
ROUGE-L-sum: 0.24267500762796892


---

### BertScore

In [182]:
import evaluate
bertscore = evaluate.load("bertscore")

In [183]:
bertscore_results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")

#bertscore_results = bertscore.compute(predictions=predictions, references=references, model_type="nlpaueb/legal-bert-base-uncased")



In [184]:
# Extract the F1 scores
f1_scores = bertscore_results['f1']

# Calculate the mean of F1 scores
mean_f1 = sum(f1_scores) / len(f1_scores)

print(f"BERTScore_F1: {bertscore_results['f1']}")
print("--------")
print(f"Mean BERTScore F1: {mean_f1}")

BERTScore_F1: [0.6628682613372803, 0.7948177456855774, 0.7066606283187866, 0.7370871901512146, 0.8117414712905884, 0.8624147176742554, 0.821397066116333, 0.8440598845481873, 0.8292337656021118, 0.7387688159942627, 0.7683778405189514, 0.7282744646072388, 0.7693001627922058, 0.8510336875915527, 0.8086915016174316, 0.7795334458351135, 0.7334291338920593, 0.6941763758659363, 0.7499181628227234, 0.8104115724563599, 0.751677930355072, 0.8395684957504272, 0.7646101117134094, 0.7377980947494507, 0.7628700137138367, 0.8065521121025085, 0.7722948789596558, 0.8634458184242249, 0.8530982732772827, 0.7596115469932556, 0.845868706703186, 0.7957202792167664, 0.7315095663070679, 0.8110401034355164, 0.8180986046791077, 0.8063411116600037, 0.7652119398117065, 0.8566089868545532, 0.7643646597862244, 0.804564893245697, 0.7928428053855896, 0.8262407779693604, 0.7473896741867065, 0.7301179766654968, 0.8012781143188477, 0.8491677641868591, 0.843391478061676, 0.8081800937652588, 0.8018554449081421, 0.78462553

In [185]:
# Extract the precision scores
precision_scores = bertscore_results['precision']

# Calculate the mean of precision scores
mean_precision = sum(precision_scores) / len(precision_scores)

print(f"BERTScore_precision: {bertscore_results['precision']}")
print("--------")
print(f"Mean BERTScore precision: {mean_precision}")

BERTScore_precision: [0.6905890703201294, 0.7803431749343872, 0.7009632587432861, 0.7631615400314331, 0.7751771211624146, 0.8396382331848145, 0.7815644145011902, 0.8053774833679199, 0.7711076140403748, 0.7797475457191467, 0.7952648401260376, 0.7289170026779175, 0.7562060356140137, 0.8284916877746582, 0.7748162150382996, 0.7513279318809509, 0.713668942451477, 0.6842294931411743, 0.7461778521537781, 0.7817378044128418, 0.766782820224762, 0.7985286116600037, 0.7924431562423706, 0.7410317063331604, 0.7846895456314087, 0.7840816974639893, 0.7780098915100098, 0.8402496576309204, 0.8162318468093872, 0.7808570265769958, 0.8427900075912476, 0.7731714248657227, 0.7429155111312866, 0.7729219198226929, 0.8111650943756104, 0.7988394498825073, 0.7817986011505127, 0.8675676584243774, 0.8157909512519836, 0.7866048812866211, 0.7815214991569519, 0.7864677309989929, 0.7311968803405762, 0.7303738594055176, 0.8313219547271729, 0.8258938789367676, 0.8291853070259094, 0.7853443026542664, 0.7710431218147278, 

In [186]:
# Extract the recall scores
recall_scores = bertscore_results['recall']

# Calculate the mean of recall scores
mean_recall = sum(recall_scores) / len(recall_scores)

print(f"BERTScore_recall: {bertscore_results['recall']}")
print("--------")
print(f"Mean BERTScore recall: {mean_recall}")

BERTScore_recall: [0.6372870206832886, 0.8098393678665161, 0.7124512791633606, 0.7127357125282288, 0.8519259691238403, 0.8864614367485046, 0.865507960319519, 0.8866457343101501, 0.8968374133110046, 0.7018821239471436, 0.7432494163513184, 0.7276331186294556, 0.7828558683395386, 0.8748366832733154, 0.8456642627716064, 0.8099392652511597, 0.7543147802352905, 0.7044167518615723, 0.7536962032318115, 0.8412688970565796, 0.7371565699577332, 0.8850553631782532, 0.7386659383773804, 0.734592616558075, 0.7422310709953308, 0.8303483128547668, 0.7666632533073425, 0.8879589438438416, 0.893452525138855, 0.7394915223121643, 0.8489699363708496, 0.8196238875389099, 0.7204484939575195, 0.8531131744384766, 0.8251515626907349, 0.8139850497245789, 0.749314546585083, 0.845923662185669, 0.719037652015686, 0.8233640789985657, 0.8044969439506531, 0.870250940322876, 0.7643160223960876, 0.7298622131347656, 0.7733301520347595, 0.8737913370132446, 0.8580928444862366, 0.8323836922645569, 0.8352330923080444, 0.790676

---

### BLEURT
>BLEURT is an evaluation metric for Natural Language Generation. It takes a pair of sentences as input, a reference and a candidate, and it returns a score that indicates to what extent the candidate is fluent and conveys the meaning of the reference.