In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [24]:
# Try to avoid OOM error when training the model
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [25]:
!pip install --upgrade unsloth peft bitsandbytes accelerate trl

Collecting trl
  Using cached trl-0.26.2-py3-none-any.whl.metadata (11 kB)


In [26]:
!pip install evaluate



# **IMPORTS**

In [27]:
from unsloth import FastLanguageModel
import torch
from torch.nn.utils.rnn import pad_sequence

In [28]:
import scipy
import json
import re
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import evaluate
from scipy.stats import spearmanr, kendalltau
from datasets import Dataset
from tqdm import tqdm
import glob


from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training, PeftModel

# **CARGO LOS DATOS**

In [29]:
def load_jsonl(path):
    data = []
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                data.append(json.loads(line))
        return pd.DataFrame(data)
    except FileNotFoundError:
        print(f"Error: No se encontró el archivo {path}")
        return pd.DataFrame()

#df_es = load_jsonl("/kaggle/input/basse-es-jsonl/BASSE_es.jsonl")
df_eu = load_jsonl("/kaggle/input/basse-eu-jsonl/BASSE_eu.jsonl")
df_test_gl = load_jsonl("/kaggle/input/basse-gl-jsonl/BASSE.gl.jsonl")

# **DIVIDO EL CONJUNTO DE DATOS**

In [30]:
# 1 -> Separamos test sets para ES (10%)
#train_dev_es, test_es = train_test_split(df_es, test_size=0.1, random_state=42, shuffle=True)
train_dev_eu, test_eu = train_test_split(df_eu, test_size=0.2, random_state=42, shuffle=True)
# 2 -> Creamos train sets (80% total) y dev sets (10% total) para ES y EU
#train_es, dev_es = train_test_split(train_dev_es, test_size=0.1111, random_state=42, shuffle=True)
train_eu, dev_eu = train_test_split(train_dev_eu, test_size=8/36, random_state=42, shuffle=True)

# 3 -> Ajustamos el tamaño del train set (train_set_length ==> modeloES = modeloEU = modeloES-EU)
#train_es_fewshot = train_es.head(20)
#train_eu_fewshot = train_eu.head(20)

# 4 -> Shuffle ES + EU
df_train = train_eu
df_dev = dev_eu

# 5 -> Ponemos nombres claros a los test
df_test_eu = test_eu.reset_index(drop=True)
# df_test_gl 

print(f"TRAIN:    {len(df_train)}")
print(f"DEV:      {len(df_dev)}")
print("-" * 30)
print(f"TEST EU:  {len(df_test_eu)}")
print(f"TEST GL:  {len(df_test_gl)}")

TRAIN:    28
DEV:      8
------------------------------
TEST EU:  9
TEST GL:  15


# **COMO UN RESUMEN TIENE MAS DE UNA ANOTACIÓN, HAGO LA MEDIA**

In [31]:
def extraer_resumenes(df):
    res = []

    for _, row in df.iterrows():
        original = row.get('original_document', '')
        
        model_data = row.get('model_summaries', {})
        
        if isinstance(model_data, dict):
            for model_name, contenido in model_data.items():
                summary = contenido.get("summ", None)
                anns = contenido.get("anns", {})
                
                consistency_vals = anns.get("Consistency", None)

                if summary and consistency_vals:
                    if isinstance(consistency_vals, list):
                        score = np.mean(consistency_vals)
                    else:
                        score = float(consistency_vals)
                        
                    media = int(round(score))
                    
                    res.append((summary, media, original))

    return pd.DataFrame(res, columns=["summary", "consistency", "original_document"])

In [32]:
df_train_redondeo = extraer_resumenes(df_train)
df_dev_redondeo = extraer_resumenes(df_dev)
df_test_eu_redondeo = extraer_resumenes(df_test_eu)
df_test_gl_redondeo = extraer_resumenes(df_test_gl)

In [33]:
print(f"Ejemplos procesados TRAIN: {len(df_train_redondeo)}")
print(f"Ejemplos procesados DEV: {len(df_dev_redondeo)}")
print(f"Ejemplos procesados TEST eu: {len(df_test_eu_redondeo)}")
print(f"Ejemplos procesados TEST gl: {len(df_test_gl_redondeo)}")

Ejemplos procesados TRAIN: 624
Ejemplos procesados DEV: 168
Ejemplos procesados TEST eu: 198
Ejemplos procesados TEST gl: 150


# **CONSTRUYO EL PROMPT**

In [34]:
def construct_metric_prompt_simple(row, target_metric):
    """Construye un prompt de instrucción para evaluar un resumen usando una métrica específica"""
    
    summary = row['summary']
    original_document = row['original_document']
    score = row[target_metric]
    
    prompt = f"""[INSTRUCCIÓN]
As an expert evaluator, analyze the following summary.
Evaluate solely the {target_metric} criterion on a scale of 1 to 5, where 1 is the lowest score and 5 is the highest.

Criterion: Consistency: the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts. That is to say, does the summary hallucinate or make up information? If the summary contains information not found in the original document, we penalize it. For temporal expressions (today, yesterday, this year), if the expression is consistent with the original information, we assume that the summary is consistent and do not penalize.",

Score 1: The summary does not contain any ideas from the original text.
Score 2: The summary contains a large amount of incorrect information.
Score 3: The summary contains several incorrect pieces of information.
Score 4: The summary contains once incorrect piece of information.	
Score 5: The summary is completely factual.

Provide only the score for the criterion indicated below in the exact format. **Do not add any justification, explanation, or additional text**, just the score.

Expected output format:
- [CONSISTENCY]: score

[SUMMARY] This is the summary to evaluate:
{summary}

[ORIGINAL DOCUMENT] This is the original document on which the summary is based.
    {original_document}

Provide your evaluation in the exact format: [CONSISTENCY]: (N) where N is a number from 1 to 5
"""
    
    response = f"[{target_metric.upper()}]: {int(round(score))}"
    return {"prompt": prompt, "completion": response}

In [35]:
# Función prepare_metric_dataset ahora puede usar los 3 test sets
def prepare_metric_datasets(df_train, df_dev, df_test_eu, df_test_gl, target_metric):
    def build_data(df, split_name):
        data = []
        for idx, row in tqdm(df.iterrows(), total=len(df), desc=f"Preparando {split_name} para {target_metric}"):
            data.append(construct_metric_prompt_simple(row, target_metric))
        return data

    train_data = build_data(df_train, "train")
    dev_data = build_data(df_dev, "dev")
    test_eu_data = build_data(df_test_eu, "test_eu")
    test_gl_data = build_data(df_test_gl, "test_gl")


    train_dataset = Dataset.from_dict({
        "prompt": [d["prompt"] for d in train_data],
        "completion": [d["completion"] for d in train_data]
    })
    
    dev_dataset = Dataset.from_dict({
        "prompt": [d["prompt"] for d in dev_data],
        "completion": [d["completion"] for d in dev_data]
    })
    
    test_eu_dataset = Dataset.from_dict({
        "prompt": [d["prompt"] for d in test_eu_data],
        "completion": [d["completion"] for d in test_eu_data]
    })

 
    test_gl_dataset = Dataset.from_dict({
        "prompt": [d["prompt"] for d in test_gl_data],
        "completion": [d["completion"] for d in test_gl_data]
    })

    return train_dataset, dev_dataset, test_eu_dataset, test_gl_dataset


In [36]:
target_metric = "consistency"
#train_dataset, dev_dataset, test_dataset = prepare_metric_datasets(df_train_redondeo, df_dev_redondeo, df_test_redondeo, target_metric)
train_dataset, dev_dataset, test_eu_dataset,test_gl_dataset = prepare_metric_datasets(df_train_redondeo, df_dev_redondeo, df_test_eu_redondeo, df_test_gl_redondeo, target_metric)

print(f"\nDataset de entrenamiento: {len(train_dataset)} ejemplos")
print(f"Dataset de desarrollo: {len(dev_dataset)} ejemplos")
print(f"Dataset de prueba es: {len(test_eu_dataset)} ejemplos")
print(f"Dataset de prueba gl: {len(test_gl_dataset)} ejemplos")

Preparando train para consistency: 100%|██████████| 624/624 [00:00<00:00, 19122.83it/s]
Preparando dev para consistency: 100%|██████████| 168/168 [00:00<00:00, 18581.38it/s]
Preparando test_eu para consistency: 100%|██████████| 198/198 [00:00<00:00, 19796.24it/s]
Preparando test_gl para consistency: 100%|██████████| 150/150 [00:00<00:00, 19814.99it/s]


Dataset de entrenamiento: 624 ejemplos
Dataset de desarrollo: 168 ejemplos
Dataset de prueba es: 198 ejemplos
Dataset de prueba gl: 150 ejemplos





# **CARGO EL MODELO**

In [37]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit", 
    max_seq_length = 5020,
    dtype = None,
    load_in_4bit = True,  
)
FastLanguageModel.for_training(model)

==((====))==  Unsloth 2025.12.9: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layer

In [38]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, #0.1
    bias = "none",    
    use_gradient_checkpointing = "unsloth", 
    random_state = 3407,
    use_rslora = False,  
    loftq_config = None, 
)


In [39]:
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2888


In [40]:
def preprocess(example):
    prompt = example["prompt"].strip()
    completion = example["completion"].strip()

    full_text = f"{prompt}\n### Completion:\n{completion}"
    input_text = f"{prompt}\n### Completion:\n"

    tokenized = tokenizer(full_text, return_tensors="pt", padding=False, truncation=True)
    input_ids = tokenized["input_ids"][0]
    attention_mask = tokenized["attention_mask"][0]

    labels = input_ids.clone()
    prompt_len = len(tokenizer(input_text, return_tensors="pt").input_ids[0])
    labels[:prompt_len] = -100 

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }


In [41]:
train_dataset = train_dataset.map(preprocess)
dev_dataset = dev_dataset.map(preprocess)

Map:   0%|          | 0/624 [00:00<?, ? examples/s]

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

In [42]:
class DataCollator:
    def __call__(self, features):
        # Asegurar que todos son tensores
        input_ids = [torch.tensor(f["input_ids"], dtype=torch.long) for f in features]
        attention_mask = [torch.tensor(f["attention_mask"], dtype=torch.long) for f in features]
        labels = [torch.tensor(f["labels"], dtype=torch.long) for f in features]

        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
        labels = pad_sequence(labels, batch_first=True, padding_value=-100)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

In [43]:
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1, #2 
    gradient_accumulation_steps=8, #4
    per_device_eval_batch_size=1,         # I added this
    eval_accumulation_steps=10,            # I added this: Move eval results to CPU periodically
    gradient_checkpointing=True, # I added this. O meu input é longo e non quero erro OOM
    learning_rate=5e-5,
    num_train_epochs=3,
    fp16=True,
    bf16=False,
    logging_steps=1,
    eval_strategy="steps",    
    eval_steps=20,
    save_strategy="steps",          
    save_steps=20,
    save_total_limit=1,             
    load_best_model_at_end=True,   
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to=None,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=DataCollator(),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# **ENTRENO EL MODELO**

In [44]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 624 | Num Epochs = 3 | Total steps = 234
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 20,971,520 of 7,262,703,616 (0.29% trained)


Step,Training Loss,Validation Loss
20,0.0934,0.119325
40,0.0863,0.112477
60,0.0874,0.11517
80,0.1251,0.114196
100,0.08,0.118593
120,0.1141,0.110708
140,0.1194,0.134036
160,0.098,0.111625
180,0.0947,0.117965
200,0.1334,0.111999


In [45]:
# Grab one batch from the dataloader
dataloader = trainer.get_train_dataloader()
iterator = iter(dataloader)
batch = next(iterator)

print(f"Input IDs shape: {batch['input_ids'].shape}")
print(f"Labels shape:    {batch['labels'].shape}")

if batch['input_ids'].shape != batch['labels'].shape:
    print("CRITICAL: Mismatch detected before training started!")

Input IDs shape: torch.Size([1, 2672])
Labels shape:    torch.Size([1, 2672])


# **GUARDO EL MODELO ENTRENADO**

In [46]:
trainer.save_model("./output/final_model")
tokenizer.save_pretrained("./output/final_model")

('./output/final_model/tokenizer_config.json',
 './output/final_model/special_tokens_map.json',
 './output/final_model/tokenizer.model',
 './output/final_model/added_tokens.json',
 './output/final_model/tokenizer.json')

In [47]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name =  "./output/final_model",
    max_seq_length = 5020,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.12.9: Fast Mistral patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Will load ./output/final_model as a legacy tokenizer.


# **LAS REFERENCIAS**

In [48]:
def get_references(eval_dataset):
    respuestas = []
    for x in eval_dataset:
        respuesta = [x["response"]]
        match = re.search(r"\[CONSISTENCY\]:\s*(\d+)", respuesta[0])
        if match:
            respuestas.append(int(match.group(1)))
        else: 
            respuestas.append(None)

    return respuestas

In [49]:
test_eval_dataset_eu = []
for i in range(len(test_eu_dataset)):
    test_eval_dataset_eu.append({
        "prompt": test_eu_dataset[i]["prompt"],
        "response": test_eu_dataset[i]["completion"]
    })

In [50]:
referencias_eu = get_references(test_eval_dataset_eu)
print("Referencias EU:", referencias_eu)

Referencias EU: [3, 4, 2, 3, 2, 5, 3, 4, 3, 3, 5, 2, 3, 4, 5, 3, 5, 5, 4, 4, 5, 5, 4, 4, 5, 4, 3, 4, 3, 5, 4, 5, 4, 4, 4, 4, 4, 1, 4, 3, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 3, 4, 5, 4, 2, 5, 5, 5, 5, 5, 5, 2, 4, 5, 5, 4, 4, 4, 3, 5, 5, 4, 5, 5, 5, 5, 5, 5, 3, 5, 4, 2, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 5, 4, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 4, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 5, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4, 4, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5]


In [51]:
test_eval_dataset_gl = []
for i in range(len(test_gl_dataset)):
    test_eval_dataset_gl.append({
        "prompt": test_gl_dataset[i]["prompt"],
        "response": test_gl_dataset[i]["completion"]
    })
    
referencias_gl = get_references(test_eval_dataset_gl)
print("Referencias GL:", referencias_gl)

Referencias GL: [5, 4, 5, 2, 4, 4, 5, 5, 5, 3, 4, 4, 3, 3, 3, 4, 5, 4, 4, 5, 5, 5, 5, 2, 2, 2, 5, 5, 5, 4, 5, 4, 5, 3, 4, 5, 5, 4, 5, 5, 5, 5, 5, 3, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 2, 3, 5, 5, 5, 5, 4, 5, 5, 2, 1, 4, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 5, 5, 4, 4, 5, 2, 3, 4, 5, 5, 5, 5, 4, 4, 5, 1, 3, 4, 5, 5, 5, 5, 5, 5, 5, 1, 2, 2, 5, 5, 5, 5, 5, 5, 4, 1, 4, 4, 5, 5, 5, 1, 5, 5, 5, 1, 4, 1, 5, 5, 5, 5, 5, 4, 5, 1, 2, 4, 5, 5, 5, 5, 5, 4, 5, 2, 4, 3, 5, 4, 4, 5]


# **EVALUO EL MODELO**

In [52]:
def evaluate_metric_model(model, dataset, tokenizer, metric):
    all_predictions = []
    model.eval()
    all_predictions = []
    all_references = []

    
    # Iterate over the dataset
    for idx, example in enumerate(tqdm(dataset, desc=f"Evaluating {metric} model")):

        prompt = example["prompt"] 
        
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=5020,
            padding=True
        )
        inputs = inputs.to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=50,
                do_sample=False,  # Generación determinista (greedy)
                pad_token_id=tokenizer.eos_token_id
            )
        
        prompt_length = len(inputs["input_ids"][0])
        new_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
        match = re.search(r"\[CONSISTENCY\]:\s*(\d+)", generated_text)
        print(generated_text)
        if match:
            all_predictions.append(float(match.group(1)))
        else:
            all_predictions.append(None)

    
    return all_predictions



In [53]:
predictions_eu = evaluate_metric_model(model, test_eu_dataset,  tokenizer, "Consistency")

Evaluating Consistency model:   1%|          | 1/198 [00:03<11:09,  3.40s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   1%|          | 2/198 [00:05<09:22,  2.87s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   2%|▏         | 3/198 [00:08<08:49,  2.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   2%|▏         | 4/198 [00:10<08:32,  2.64s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   3%|▎         | 5/198 [00:13<08:55,  2.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   3%|▎         | 6/198 [00:17<09:32,  2.98s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   4%|▎         | 7/198 [00:20<09:33,  3.00s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   4%|▍         | 8/198 [00:23<09:22,  2.96s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   5%|▍         | 9/198 [00:26<09:09,  2.91s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   5%|▌         | 10/198 [00:28<08:51,  2.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   6%|▌         | 11/198 [00:31<08:51,  2.84s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   6%|▌         | 12/198 [00:34<08:36,  2.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   7%|▋         | 13/198 [00:36<08:19,  2.70s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   7%|▋         | 14/198 [00:39<08:11,  2.67s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   8%|▊         | 15/198 [00:41<08:05,  2.65s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   8%|▊         | 16/198 [00:44<07:53,  2.60s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   9%|▊         | 17/198 [00:46<07:44,  2.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   9%|▉         | 18/198 [00:49<07:36,  2.54s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  10%|▉         | 19/198 [00:51<07:33,  2.54s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  10%|█         | 20/198 [00:54<07:25,  2.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  11%|█         | 21/198 [00:56<07:11,  2.44s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  11%|█         | 22/198 [00:59<07:51,  2.68s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  12%|█▏        | 23/198 [01:03<08:23,  2.88s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  12%|█▏        | 24/198 [01:06<08:47,  3.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  13%|█▎        | 25/198 [01:09<09:00,  3.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  13%|█▎        | 26/198 [01:13<08:59,  3.14s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  14%|█▎        | 27/198 [01:16<09:06,  3.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  14%|█▍        | 28/198 [01:19<09:10,  3.24s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  15%|█▍        | 29/198 [01:27<12:47,  4.54s/it]

[COHERENCE]: (N) where N is a number from 1 to 5
[SUPPORT]: (N) where N is a number from 1 to 5
[FLUENCY]: (N) where N is


Evaluating Consistency model:  15%|█▌        | 30/198 [01:30<11:56,  4.27s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  16%|█▌        | 31/198 [01:34<11:16,  4.05s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  16%|█▌        | 32/198 [01:37<10:41,  3.86s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  17%|█▋        | 33/198 [01:41<10:11,  3.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  17%|█▋        | 34/198 [01:45<10:15,  3.75s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  18%|█▊        | 35/198 [01:48<09:55,  3.66s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  18%|█▊        | 36/198 [01:52<10:04,  3.73s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  19%|█▊        | 37/198 [01:55<09:44,  3.63s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  19%|█▉        | 38/198 [01:59<09:59,  3.75s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  20%|█▉        | 39/198 [02:03<09:33,  3.61s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  20%|██        | 40/198 [02:06<09:23,  3.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  21%|██        | 41/198 [02:09<09:06,  3.48s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  21%|██        | 42/198 [02:13<08:51,  3.41s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  22%|██▏       | 43/198 [02:14<07:32,  2.92s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  22%|██▏       | 44/198 [02:16<06:38,  2.59s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  23%|██▎       | 45/198 [02:18<05:58,  2.34s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  23%|██▎       | 46/198 [02:20<05:28,  2.16s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  24%|██▎       | 47/198 [02:22<05:20,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  24%|██▍       | 48/198 [02:24<05:03,  2.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  25%|██▍       | 49/198 [02:25<04:49,  1.94s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  25%|██▌       | 50/198 [02:27<04:45,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  26%|██▌       | 51/198 [02:29<04:56,  2.02s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  26%|██▋       | 52/198 [02:31<04:44,  1.95s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  27%|██▋       | 53/198 [02:33<04:49,  1.99s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  27%|██▋       | 54/198 [02:35<04:37,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  28%|██▊       | 55/198 [02:37<04:26,  1.86s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  28%|██▊       | 56/198 [02:39<04:22,  1.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  29%|██▉       | 57/198 [02:40<04:18,  1.84s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  29%|██▉       | 58/198 [02:42<04:07,  1.77s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  30%|██▉       | 59/198 [02:44<03:58,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  30%|███       | 60/198 [02:45<03:51,  1.67s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  31%|███       | 61/198 [02:47<03:52,  1.70s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  31%|███▏      | 62/198 [02:49<03:44,  1.65s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  32%|███▏      | 63/198 [02:50<03:39,  1.63s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  32%|███▏      | 64/198 [02:53<04:25,  1.98s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  33%|███▎      | 65/198 [02:56<04:55,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  33%|███▎      | 66/198 [02:59<05:23,  2.45s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  34%|███▍      | 67/198 [03:02<05:34,  2.56s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  34%|███▍      | 68/198 [03:05<05:52,  2.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  35%|███▍      | 69/198 [03:08<06:11,  2.88s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  35%|███▌      | 70/198 [03:11<06:13,  2.92s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  36%|███▌      | 71/198 [03:14<06:27,  3.05s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  36%|███▋      | 72/198 [03:17<06:14,  2.98s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  37%|███▋      | 73/198 [03:20<06:03,  2.91s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  37%|███▋      | 74/198 [03:23<06:01,  2.91s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  38%|███▊      | 75/198 [03:25<05:52,  2.87s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  38%|███▊      | 76/198 [03:28<05:47,  2.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  39%|███▉      | 77/198 [03:31<05:41,  2.82s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  39%|███▉      | 78/198 [03:34<05:34,  2.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  40%|███▉      | 79/198 [03:36<05:31,  2.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  40%|████      | 80/198 [03:39<05:21,  2.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  41%|████      | 81/198 [03:42<05:18,  2.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  41%|████▏     | 82/198 [03:45<05:18,  2.75s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  42%|████▏     | 83/198 [03:47<05:08,  2.69s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  42%|████▏     | 84/198 [03:50<05:01,  2.65s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  43%|████▎     | 85/198 [03:53<05:23,  2.86s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  43%|████▎     | 86/198 [03:56<05:37,  3.01s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  44%|████▍     | 87/198 [04:00<05:46,  3.12s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  44%|████▍     | 88/198 [04:03<05:50,  3.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  45%|████▍     | 89/198 [04:07<06:04,  3.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  45%|████▌     | 90/198 [04:10<06:01,  3.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  46%|████▌     | 91/198 [04:14<05:57,  3.34s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  46%|████▋     | 92/198 [04:17<05:47,  3.27s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  47%|████▋     | 93/198 [04:20<05:45,  3.29s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  47%|████▋     | 94/198 [04:23<05:44,  3.32s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  48%|████▊     | 95/198 [04:27<05:51,  3.41s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  48%|████▊     | 96/198 [04:30<05:46,  3.39s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  49%|████▉     | 97/198 [04:34<05:40,  3.37s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  49%|████▉     | 98/198 [04:37<05:38,  3.39s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  50%|█████     | 99/198 [04:40<05:35,  3.39s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  51%|█████     | 100/198 [04:44<05:27,  3.34s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  51%|█████     | 101/198 [04:47<05:33,  3.44s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  52%|█████▏    | 102/198 [04:51<05:24,  3.38s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  52%|█████▏    | 103/198 [04:54<05:17,  3.34s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  53%|█████▎    | 104/198 [04:57<05:14,  3.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  53%|█████▎    | 105/198 [05:00<05:07,  3.31s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  54%|█████▎    | 106/198 [05:03<04:41,  3.06s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  54%|█████▍    | 107/198 [05:05<04:23,  2.90s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  55%|█████▍    | 108/198 [05:08<04:16,  2.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  55%|█████▌    | 109/198 [05:11<04:02,  2.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  56%|█████▌    | 110/198 [05:14<04:07,  2.82s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  56%|█████▌    | 111/198 [05:16<03:55,  2.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  57%|█████▋    | 112/198 [05:19<03:55,  2.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  57%|█████▋    | 113/198 [05:21<03:39,  2.59s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  58%|█████▊    | 114/198 [05:24<03:40,  2.63s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  58%|█████▊    | 115/198 [05:26<03:33,  2.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  59%|█████▊    | 116/198 [05:29<03:30,  2.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  59%|█████▉    | 117/198 [05:31<03:24,  2.53s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  60%|█████▉    | 118/198 [05:34<03:28,  2.60s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  60%|██████    | 119/198 [05:37<03:22,  2.56s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  61%|██████    | 120/198 [05:39<03:25,  2.63s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  61%|██████    | 121/198 [05:42<03:14,  2.53s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  62%|██████▏   | 122/198 [05:44<03:06,  2.45s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  62%|██████▏   | 123/198 [05:46<03:03,  2.45s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  63%|██████▎   | 124/198 [05:49<03:01,  2.45s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  63%|██████▎   | 125/198 [05:51<02:54,  2.40s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  64%|██████▎   | 126/198 [05:53<02:49,  2.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  64%|██████▍   | 127/198 [05:55<02:39,  2.25s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  65%|██████▍   | 128/198 [05:58<02:36,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  65%|██████▌   | 129/198 [06:00<02:29,  2.17s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  66%|██████▌   | 130/198 [06:02<02:27,  2.18s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  66%|██████▌   | 131/198 [06:04<02:22,  2.13s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  67%|██████▋   | 132/198 [06:06<02:21,  2.15s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  67%|██████▋   | 133/198 [06:08<02:17,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  68%|██████▊   | 134/198 [06:10<02:21,  2.21s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  68%|██████▊   | 135/198 [06:13<02:16,  2.17s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  69%|██████▊   | 136/198 [06:15<02:15,  2.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  69%|██████▉   | 137/198 [06:17<02:09,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  70%|██████▉   | 138/198 [06:19<02:15,  2.26s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  70%|███████   | 139/198 [06:21<02:07,  2.16s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  71%|███████   | 140/198 [06:24<02:07,  2.20s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  71%|███████   | 141/198 [06:26<02:02,  2.14s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  72%|███████▏  | 142/198 [06:28<02:02,  2.18s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  72%|███████▏  | 143/198 [06:30<01:57,  2.13s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  73%|███████▎  | 144/198 [06:32<01:53,  2.10s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  73%|███████▎  | 145/198 [06:34<01:50,  2.08s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  74%|███████▎  | 146/198 [06:36<01:45,  2.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  74%|███████▍  | 147/198 [06:38<01:43,  2.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  75%|███████▍  | 148/198 [06:40<01:41,  2.02s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  75%|███████▌  | 149/198 [06:42<01:37,  1.99s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  76%|███████▌  | 150/198 [06:44<01:34,  1.97s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  76%|███████▋  | 151/198 [06:46<01:33,  1.99s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  77%|███████▋  | 152/198 [06:48<01:34,  2.06s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  77%|███████▋  | 153/198 [06:50<01:32,  2.06s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  78%|███████▊  | 154/198 [06:52<01:32,  2.11s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  78%|███████▊  | 155/198 [06:54<01:30,  2.09s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  79%|███████▉  | 156/198 [06:57<01:29,  2.14s/it]

[CONSISTENCY]: 4


Evaluating Consistency model:  79%|███████▉  | 157/198 [06:59<01:27,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  80%|███████▉  | 158/198 [07:01<01:28,  2.21s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  80%|████████  | 159/198 [07:04<01:29,  2.31s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  81%|████████  | 160/198 [07:06<01:24,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  81%|████████▏ | 161/198 [07:08<01:22,  2.24s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  82%|████████▏ | 162/198 [07:10<01:24,  2.34s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  82%|████████▏ | 163/198 [07:12<01:18,  2.25s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  83%|████████▎ | 164/198 [07:15<01:16,  2.25s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  83%|████████▎ | 165/198 [07:17<01:12,  2.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  84%|████████▍ | 166/198 [07:19<01:13,  2.30s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  84%|████████▍ | 167/198 [07:21<01:08,  2.22s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  85%|████████▍ | 168/198 [07:24<01:06,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  85%|████████▌ | 169/198 [07:26<01:02,  2.17s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  86%|████████▌ | 170/198 [07:28<00:59,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  86%|████████▋ | 171/198 [07:30<00:55,  2.07s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  87%|████████▋ | 172/198 [07:32<00:55,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  87%|████████▋ | 173/198 [07:34<00:51,  2.06s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  88%|████████▊ | 174/198 [07:36<00:48,  2.02s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  88%|████████▊ | 175/198 [07:37<00:44,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  89%|████████▉ | 176/198 [07:39<00:42,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  89%|████████▉ | 177/198 [07:41<00:39,  1.87s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  90%|████████▉ | 178/198 [07:43<00:37,  1.89s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  90%|█████████ | 179/198 [07:45<00:35,  1.86s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  91%|█████████ | 180/198 [07:47<00:33,  1.87s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  91%|█████████▏| 181/198 [07:48<00:31,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  92%|█████████▏| 182/198 [07:50<00:28,  1.81s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  92%|█████████▏| 183/198 [07:52<00:26,  1.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  93%|█████████▎| 184/198 [07:54<00:25,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  93%|█████████▎| 185/198 [07:56<00:23,  1.80s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  94%|█████████▍| 186/198 [07:57<00:21,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  94%|█████████▍| 187/198 [07:59<00:19,  1.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  95%|█████████▍| 188/198 [08:01<00:17,  1.77s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  95%|█████████▌| 189/198 [08:03<00:15,  1.76s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  96%|█████████▌| 190/198 [08:04<00:14,  1.76s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  96%|█████████▋| 191/198 [08:06<00:12,  1.75s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  97%|█████████▋| 192/198 [08:08<00:10,  1.80s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  97%|█████████▋| 193/198 [08:10<00:08,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  98%|█████████▊| 194/198 [08:11<00:06,  1.74s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  98%|█████████▊| 195/198 [08:13<00:05,  1.70s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  99%|█████████▉| 196/198 [08:15<00:03,  1.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  99%|█████████▉| 197/198 [08:16<00:01,  1.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model: 100%|██████████| 198/198 [08:18<00:00,  2.52s/it]

[CONSISTENCY]: 5





In [54]:
print("EU test:", predictions_eu)

EU test: [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, None, 4.0, 4.0, 5.0, 5.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]

In [55]:
cantidad_none = predictions_eu.count(None)
total_eu = len(predictions_eu)
porcentaje_eu = cantidad_none / total_eu

print(f"Cantidad de valores None: {cantidad_none}, Total de predicciones: {total_eu}")
print(f"Porcentaje de Nones: {porcentaje_eu:.2%}")

Cantidad de valores None: 1, Total de predicciones: 198
Porcentaje de Nones: 0.51%


## **QUITO LOS NONE**

In [56]:
filtrados = [(h, p) for h, p in zip(referencias_eu, predictions_eu) if h is not None and p is not None]
hum_limpio, predictions_limpio = zip(*filtrados)

In [57]:
print(hum_limpio)
print(predictions_limpio)

(3, 4, 2, 3, 2, 5, 3, 4, 3, 3, 5, 2, 3, 4, 5, 3, 5, 5, 4, 4, 5, 5, 4, 4, 5, 4, 3, 4, 5, 4, 5, 4, 4, 4, 4, 4, 1, 4, 3, 3, 4, 4, 4, 4, 5, 3, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 3, 4, 5, 4, 2, 5, 5, 5, 5, 5, 5, 2, 4, 5, 5, 4, 4, 4, 3, 5, 5, 4, 5, 5, 5, 5, 5, 5, 3, 5, 4, 2, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 3, 4, 5, 4, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 5, 5, 4, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 5, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4, 4, 5, 5, 5, 4, 4, 4, 4, 5, 5, 5, 5, 5)
(5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.

## **MÉTRICAS**

## **SPEARMANR**

In [58]:
# Mide si el modelo puede ordenar los modelos de peor a mejor 
# en comparación con las puntuaciones humanas (no exactamente la misma puntuación)
s, p = spearmanr(hum_limpio, predictions_limpio)
print(f"Spearman EU: {s:.3f}, p-value: {p:.3f}")

Spearman ES: -0.027, p-value: 0.709


## **KENDALLTAU**

In [59]:
# Mide la capacidad para ordenar los resumenes según su calidad, 
# pero se calcula por parejas (de estos dos cuál es el mejor?)
tau, p_value = scipy.stats.kendalltau(hum_limpio, predictions_limpio)
print(f"Kendalltau EU: {tau:.3f}, p-value: {p_value:.3f}")

Kendalltau ES: -0.025, p-value: 0.708


## **MAE**

In [60]:
# Error medio absoluto
# Mide la distancia media entre las puntuaciones predichas por los modelos 
# y las dadas por los humanos
# Es decir, menos de 0,5 (ej.: humano -> 5, modelo -> 4.60)
mae = np.mean(np.abs(np.array(hum_limpio) - np.array(predictions_limpio)))
print(f"MAE EU: {mae}")

MAE ES: 0.7106598984771574


# **TEST GL**

In [61]:
predictions_gl = evaluate_metric_model(model, test_gl_dataset,  tokenizer, "Consistency")
print("GL test:", predictions_gl)

Evaluating Consistency model:   1%|          | 1/150 [00:01<03:41,  1.49s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   1%|▏         | 2/150 [00:02<03:41,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   2%|▏         | 3/150 [00:04<03:38,  1.49s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   3%|▎         | 4/150 [00:05<03:39,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   3%|▎         | 5/150 [00:07<03:37,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   4%|▍         | 6/150 [00:08<03:35,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   5%|▍         | 7/150 [00:10<03:30,  1.47s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   5%|▌         | 8/150 [00:11<03:29,  1.48s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   6%|▌         | 9/150 [00:13<03:23,  1.44s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   7%|▋         | 10/150 [00:14<03:17,  1.41s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   7%|▋         | 11/150 [00:17<04:14,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   8%|▊         | 12/150 [00:20<04:53,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   9%|▊         | 13/150 [00:22<05:05,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:   9%|▉         | 14/150 [00:25<05:15,  2.32s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  10%|█         | 15/150 [00:27<05:19,  2.37s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  11%|█         | 16/150 [00:30<05:22,  2.40s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  11%|█▏        | 17/150 [00:32<05:14,  2.37s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  12%|█▏        | 18/150 [00:34<05:16,  2.40s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  13%|█▎        | 19/150 [00:37<05:16,  2.42s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  13%|█▎        | 20/150 [00:39<05:13,  2.41s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  14%|█▍        | 21/150 [00:41<04:52,  2.27s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  15%|█▍        | 22/150 [00:43<04:36,  2.16s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  15%|█▌        | 23/150 [00:45<04:18,  2.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  16%|█▌        | 24/150 [00:47<04:06,  1.96s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  17%|█▋        | 25/150 [00:48<03:56,  1.89s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  17%|█▋        | 26/150 [00:50<03:51,  1.87s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  18%|█▊        | 27/150 [00:52<03:45,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  19%|█▊        | 28/150 [00:54<03:45,  1.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  19%|█▉        | 29/150 [00:55<03:33,  1.77s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  20%|██        | 30/150 [00:57<03:25,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  21%|██        | 31/150 [00:59<03:23,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  21%|██▏       | 32/150 [01:00<03:23,  1.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  22%|██▏       | 33/150 [01:02<03:17,  1.69s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  23%|██▎       | 34/150 [01:04<03:17,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  23%|██▎       | 35/150 [01:06<03:17,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  24%|██▍       | 36/150 [01:07<03:15,  1.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  25%|██▍       | 37/150 [01:09<03:06,  1.65s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  25%|██▌       | 38/150 [01:10<03:02,  1.63s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  26%|██▌       | 39/150 [01:12<02:56,  1.59s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  27%|██▋       | 40/150 [01:13<02:54,  1.59s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  27%|██▋       | 41/150 [01:15<03:09,  1.74s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  28%|██▊       | 42/150 [01:18<03:17,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  29%|██▊       | 43/150 [01:19<03:19,  1.86s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  29%|██▉       | 44/150 [01:22<03:23,  1.92s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  30%|███       | 45/150 [01:24<03:25,  1.96s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  31%|███       | 46/150 [01:26<03:26,  1.98s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  31%|███▏      | 47/150 [01:28<03:21,  1.96s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  32%|███▏      | 48/150 [01:29<03:18,  1.95s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  33%|███▎      | 49/150 [01:31<03:15,  1.94s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  33%|███▎      | 50/150 [01:33<03:13,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  34%|███▍      | 51/150 [01:37<04:11,  2.54s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  35%|███▍      | 52/150 [01:45<06:30,  3.99s/it]

[CONSISTENCY]: 5

[ORIGINAL DOCUMENT] This is the original document on which the summary is based.
    O pasado xoves presentouse en Lugo a plataforma


Evaluating Consistency model:  35%|███▌      | 53/150 [01:48<06:22,  3.95s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  36%|███▌      | 54/150 [01:52<06:15,  3.91s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  37%|███▋      | 55/150 [01:56<06:06,  3.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  37%|███▋      | 56/150 [02:00<05:58,  3.81s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  38%|███▊      | 57/150 [02:03<05:47,  3.74s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  39%|███▊      | 58/150 [02:07<05:42,  3.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  39%|███▉      | 59/150 [02:10<05:33,  3.67s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  40%|████      | 60/150 [02:14<05:26,  3.62s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  41%|████      | 61/150 [02:16<04:51,  3.27s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  41%|████▏     | 62/150 [02:19<04:26,  3.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  42%|████▏     | 63/150 [02:21<03:58,  2.74s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  43%|████▎     | 64/150 [02:23<03:42,  2.58s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  43%|████▎     | 65/150 [02:25<03:30,  2.47s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  44%|████▍     | 66/150 [02:28<03:22,  2.41s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  45%|████▍     | 67/150 [02:30<03:11,  2.31s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  45%|████▌     | 68/150 [02:32<03:07,  2.29s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  46%|████▌     | 69/150 [02:34<03:00,  2.23s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  47%|████▋     | 70/150 [02:36<02:55,  2.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  47%|████▋     | 71/150 [02:38<02:38,  2.01s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  48%|████▊     | 72/150 [02:40<02:30,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  49%|████▊     | 73/150 [02:41<02:18,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  49%|████▉     | 74/150 [02:43<02:11,  1.74s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  50%|█████     | 75/150 [02:44<02:07,  1.70s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  51%|█████     | 76/150 [02:46<02:03,  1.66s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  51%|█████▏    | 77/150 [02:47<01:56,  1.60s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  52%|█████▏    | 78/150 [02:49<01:53,  1.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  53%|█████▎    | 79/150 [02:50<01:50,  1.55s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  53%|█████▎    | 80/150 [02:52<01:47,  1.54s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  54%|█████▍    | 81/150 [02:53<01:45,  1.53s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  55%|█████▍    | 82/150 [02:55<01:43,  1.52s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  55%|█████▌    | 83/150 [02:56<01:39,  1.48s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  56%|█████▌    | 84/150 [02:58<01:38,  1.49s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  57%|█████▋    | 85/150 [02:59<01:37,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  57%|█████▋    | 86/150 [03:01<01:37,  1.52s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  58%|█████▊    | 87/150 [03:02<01:35,  1.51s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  59%|█████▊    | 88/150 [03:04<01:31,  1.48s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  59%|█████▉    | 89/150 [03:05<01:28,  1.46s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  60%|██████    | 90/150 [03:06<01:26,  1.44s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  61%|██████    | 91/150 [03:09<01:38,  1.67s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  61%|██████▏   | 92/150 [03:11<01:43,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  62%|██████▏   | 93/150 [03:13<01:45,  1.85s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  63%|██████▎   | 94/150 [03:15<01:47,  1.91s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  63%|██████▎   | 95/150 [03:17<01:48,  1.97s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  64%|██████▍   | 96/150 [03:19<01:48,  2.01s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  65%|██████▍   | 97/150 [03:22<01:58,  2.24s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  65%|██████▌   | 98/150 [03:24<01:53,  2.19s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  66%|██████▌   | 99/150 [03:26<01:47,  2.11s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  67%|██████▋   | 100/150 [03:28<01:42,  2.05s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  67%|██████▋   | 101/150 [03:29<01:36,  1.96s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  68%|██████▊   | 102/150 [03:31<01:32,  1.93s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  69%|██████▊   | 103/150 [03:33<01:26,  1.83s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  69%|██████▉   | 104/150 [03:35<01:23,  1.81s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  70%|███████   | 105/150 [03:36<01:20,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  71%|███████   | 106/150 [03:38<01:18,  1.78s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  71%|███████▏  | 107/150 [03:40<01:15,  1.76s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  72%|███████▏  | 108/150 [03:42<01:12,  1.72s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  73%|███████▎  | 109/150 [03:43<01:09,  1.68s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  73%|███████▎  | 110/150 [03:45<01:05,  1.63s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  74%|███████▍  | 111/150 [03:47<01:10,  1.81s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  75%|███████▍  | 112/150 [03:49<01:13,  1.94s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  75%|███████▌  | 113/150 [03:51<01:13,  1.99s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  76%|███████▌  | 114/150 [03:53<01:13,  2.05s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  77%|███████▋  | 115/150 [03:55<01:12,  2.06s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  77%|███████▋  | 116/150 [03:58<01:10,  2.07s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  78%|███████▊  | 117/150 [04:00<01:08,  2.07s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  79%|███████▊  | 118/150 [04:02<01:06,  2.08s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  79%|███████▉  | 119/150 [04:04<01:04,  2.09s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  80%|████████  | 120/150 [04:06<01:02,  2.07s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  81%|████████  | 121/150 [04:07<00:54,  1.89s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  81%|████████▏ | 122/150 [04:09<00:50,  1.79s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  82%|████████▏ | 123/150 [04:10<00:45,  1.69s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  83%|████████▎ | 124/150 [04:12<00:41,  1.59s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  83%|████████▎ | 125/150 [04:13<00:39,  1.56s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  84%|████████▍ | 126/150 [04:15<00:36,  1.53s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  85%|████████▍ | 127/150 [04:16<00:34,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  85%|████████▌ | 128/150 [04:18<00:32,  1.49s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  86%|████████▌ | 129/150 [04:19<00:30,  1.44s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  87%|████████▋ | 130/150 [04:20<00:28,  1.42s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  87%|████████▋ | 131/150 [04:23<00:32,  1.73s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  88%|████████▊ | 132/150 [04:25<00:35,  1.97s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  89%|████████▊ | 133/150 [04:28<00:36,  2.12s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  89%|████████▉ | 134/150 [04:30<00:35,  2.22s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  90%|█████████ | 135/150 [04:33<00:34,  2.30s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  91%|█████████ | 136/150 [04:35<00:32,  2.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  91%|█████████▏| 137/150 [04:38<00:30,  2.38s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  92%|█████████▏| 138/150 [04:40<00:28,  2.40s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  93%|█████████▎| 139/150 [04:42<00:25,  2.35s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  93%|█████████▎| 140/150 [04:44<00:23,  2.31s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  94%|█████████▍| 141/150 [04:46<00:18,  2.03s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  95%|█████████▍| 142/150 [04:47<00:14,  1.86s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  95%|█████████▌| 143/150 [04:49<00:11,  1.71s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  96%|█████████▌| 144/150 [04:50<00:09,  1.65s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  97%|█████████▋| 145/150 [04:52<00:08,  1.60s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  97%|█████████▋| 146/150 [04:53<00:06,  1.57s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  98%|█████████▊| 147/150 [04:55<00:04,  1.50s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  99%|█████████▊| 148/150 [04:56<00:02,  1.46s/it]

[CONSISTENCY]: 5


Evaluating Consistency model:  99%|█████████▉| 149/150 [04:57<00:01,  1.43s/it]

[CONSISTENCY]: 5


Evaluating Consistency model: 100%|██████████| 150/150 [04:59<00:00,  1.99s/it]

[CONSISTENCY]: 5
GL test: [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]





In [62]:
cantidad_none_gl = predictions_gl.count(None)
total_gl = len(predictions_gl)
porcentaje_gl = cantidad_none_gl / total_gl

print(f"Cantidad de valores None: {cantidad_none_gl}, Total de predicciones: {total_gl}")
print(f"Porcentaje de Nones: {porcentaje_gl:.2%}")

Cantidad de valores None: 0, Total de predicciones: 150
Porcentaje de Nones: 0.00%


## **QUITO LOS NONE**

In [63]:
filtrados_gl = [(h, p) for h, p in zip(referencias_gl, predictions_gl) if h is not None and p is not None]
hum_limpio_gl, predictions_limpio_gl = zip(*filtrados_gl)

In [64]:
print(hum_limpio_gl)
print(predictions_limpio_gl)

(5, 4, 5, 2, 4, 4, 5, 5, 5, 3, 4, 4, 3, 3, 3, 4, 5, 4, 4, 5, 5, 5, 5, 2, 2, 2, 5, 5, 5, 4, 5, 4, 5, 3, 4, 5, 5, 4, 5, 5, 5, 5, 5, 3, 4, 4, 5, 5, 5, 5, 5, 5, 4, 4, 2, 3, 5, 5, 5, 5, 4, 5, 5, 2, 1, 4, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 5, 5, 5, 5, 4, 4, 5, 2, 3, 4, 5, 5, 5, 5, 4, 4, 5, 1, 3, 4, 5, 5, 5, 5, 5, 5, 5, 1, 2, 2, 5, 5, 5, 5, 5, 5, 4, 1, 4, 4, 5, 5, 5, 1, 5, 5, 5, 1, 4, 1, 5, 5, 5, 5, 5, 4, 5, 1, 2, 4, 5, 5, 5, 5, 5, 4, 5, 2, 4, 3, 5, 4, 4, 5)
(5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0

## **MÉTRICAS**

## **SPEARMANR**

In [65]:
# Mide si el modelo puede ordenar los modelos de peor a mejor 
# en comparación con las puntuaciones humanas (no exactamente la misma puntuación)
s, p = spearmanr(hum_limpio_gl, predictions_limpio_gl)
print(f"Spearman GL: {s:.3f}, p-value: {p:.3f}")

Spearman GL: nan, p-value: nan


  s, p = spearmanr(hum_limpio_gl, predictions_limpio_gl)


## **KENDALLTAU**

In [66]:
# Mide la capacidad para ordenar los resumenes según su calidad, 
# pero se calcula por parejas (de estos dos cuál es el mejor?)
tau, p_value = scipy.stats.kendalltau(hum_limpio_gl, predictions_limpio_gl)
print(f"Kendalltau GL: {tau:.3f}, p-value: {p_value:.3f}")

Kendalltau GL: nan, p-value: nan


## **MAE**

In [67]:
# Error medio absoluto
# Mide la distancia media entre las puntuaciones predichas por los modelos 
# y las dadas por los humanos
# Es decir, menos de 0,5 (ej.: humano -> 5, modelo -> 4.60)
mae = np.mean(np.abs(np.array(hum_limpio_gl) - np.array(predictions_limpio_gl)))
print(f"MAE GL: {mae}")

MAE GL: 0.8466666666666667
