## Libraries

In [2]:
import pandas as pd

import torch
print('Cuda available:', torch.cuda.is_available())
cuda_id = torch.cuda.current_device()
print('Cuda_id: ', cuda_id)
print(torch.cuda.get_device_name(cuda_id))

Cuda available: True
Cuda_id:  0
NVIDIA GeForce RTX 4060 Laptop GPU


## Data Loading

In [3]:
#%% Data Loading

import os
import pandas as pd
file_path = 'backup/df_output_SP.pkl'
def save_data(file_path, data):
    if os.path.exists(file_path):
        print(f"The file {file_path} exist and will returned")
        data = pd.read_pickle(file_path)
        return data
    else: 
        data.to_pickle(file_path)
        print(f"Data has been saved to {file_path}.")
        return data

df_output_SP = save_data('backup/df_output_SP.pkl', None)
df_output_EN = save_data('backup/df_output_EN.pkl', None)

The file backup/df_output_SP.pkl exist and will returned
The file backup/df_output_EN.pkl exist and will returned


In [4]:
df_output_EN = df_output_EN.sample(n=700, random_state=42)
df_output_SP = df_output_SP.sample(n=700, random_state=42) 

## Distinct-N

In [10]:
#%% distinct_n
from distinct_n.metrics import distinct_n_sentence_level, distinct_n_corpus_level

n = 1  # For Monograms

corpus = list(df_output_SP['output_SIM'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_SP SIM: {score}")

corpus = list(df_output_SP['output_MAP'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_SP MAP: {score}")

corpus = list(df_output_EN['output_SIM'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_EN SIM: {score}")

corpus = list(df_output_EN['output_MAP'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_EN MAP: {score}")

n = 2  # For bigrams

corpus = list(df_output_SP['output_SIM'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_SP SIM: {score}")

corpus = list(df_output_SP['output_MAP'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_SP MAP: {score}")

corpus = list(df_output_EN['output_SIM'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_EN SIM: {score}")

corpus = list(df_output_EN['output_MAP'])
score = distinct_n_corpus_level([sentence.split() for sentence in corpus], n)
print(f"Average Distinct-{n} score for the df_output_EN MAP: {score}")

Average Distinct-1 score for the df_output_SP SIM: 0.9345773934589631
Average Distinct-1 score for the df_output_SP MAP: 0.9189624512900665
Average Distinct-1 score for the df_output_EN SIM: 0.9463274876485652
Average Distinct-1 score for the df_output_EN MAP: 0.9445213093654956
Average Distinct-2 score for the df_output_SP SIM: 0.9248617945026216
Average Distinct-2 score for the df_output_SP MAP: 0.9330922492114592
Average Distinct-2 score for the df_output_EN SIM: 0.9047808834668024
Average Distinct-2 score for the df_output_EN MAP: 0.8901043383056046


## Ent-N

In [13]:
import math
from collections import Counter

def calculate_entropy_corpus(corpus, n):
    # Combine all sentences into one list of words
    combined_words = []
    for sentence in corpus:
        combined_words.extend(sentence.split())
    
    # Generate n-grams
    ngrams = [tuple(combined_words[i:i+n]) for i in range(len(combined_words) - n + 1)]
    
    # Count the frequency of each n-gram
    ngram_freq = Counter(ngrams)
    
    # Calculate the total number of n-grams
    total_ngrams = sum(ngram_freq.values())
    
    # Calculate the probability of each n-gram
    ngram_probs = {ngram: freq / total_ngrams for ngram, freq in ngram_freq.items()}
    
    # Calculate the entropy
    entropy = -sum(prob * math.log(prob, 2) for prob in ngram_probs.values())
    
    return entropy

n = 1

corpus = list(df_output_SP['output_MAP'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_SP MAP: {entropy}")

corpus = list(df_output_SP['output_SIM'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_SP SIM: {entropy}")

corpus = list(df_output_EN['output_MAP'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_EN MAP: {entropy}")

corpus = list(df_output_EN['output_SIM'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_EN SIM: {entropy}")

n = 2

corpus = list(df_output_SP['output_MAP'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_SP MAP: {entropy}")

corpus = list(df_output_SP['output_SIM'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_SP SIM: {entropy}")

corpus = list(df_output_EN['output_MAP'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_EN MAP: {entropy}")

corpus = list(df_output_EN['output_SIM'])
entropy = calculate_entropy_corpus(corpus, n)
print(f"Entropy-{n} for the df_output_EN SIM: {entropy}")

Entropy-1 for the df_output_SP MAP: 8.918680497118206
Entropy-1 for the df_output_SP SIM: 8.40171141669043
Entropy-1 for the df_output_EN MAP: 9.3317688882827
Entropy-1 for the df_output_EN SIM: 9.07602335936366
Entropy-2 for the df_output_SP MAP: 12.220774474826445
Entropy-2 for the df_output_SP SIM: 11.473927504071174
Entropy-2 for the df_output_EN MAP: 12.610284421048435
Entropy-2 for the df_output_EN SIM: 12.075933281349133


## Rouge Score

In [35]:
import pandas as pd
import numpy as np
from rouge_score import rouge_scorer

# Function to calculate ROUGE-2 score for each row
scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)

def calculate_rouge(row):
    scores = scorer.score(row['reference'], row['generated'])
    return scores['rouge2'].fmeasure

n = 2
df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_SIM'])})
df['rouge2'] = df.apply(calculate_rouge, axis=1)
print(f"Rouge-{n} for the df_output_SP SIM: {np.mean(df['rouge2'])}")

df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_MAP'])})
df['rouge2'] = df.apply(calculate_rouge, axis=1)
print(f"Rouge-{n} for the df_output_SP MAP: {np.mean(df['rouge2'])}")

df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_SIM'])})
df['rouge2'] = df.apply(calculate_rouge, axis=1)
print(f"Rouge-{n} for the df_output_EN SIM: {np.mean(df['rouge2'])}")

df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_MAP'])})
df['rouge2'] = df.apply(calculate_rouge, axis=1)
print(f"Rouge-{n} for the df_output_EN MAP: {np.mean(df['rouge2'])}")


Rouge-2 for the df_output_SP SIM: 0.060004415053185024
Rouge-2 for the df_output_SP MAP: 0.07194993280934014
Rouge-2 for the df_output_EN SIM: 0.0035095042297832186
Rouge-2 for the df_output_EN MAP: 0.0033139187237304145


## Bleu

In [36]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Function to calculate BLEU score for each row
def calculate_bleu(row):
    reference = [row['reference'].split()]  # BLEU expects a list of reference lists
    generated = row['generated'].split()
    smoothing_function = SmoothingFunction().method1
    score = sentence_bleu(reference, generated, smoothing_function=smoothing_function)
    return score

df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_SIM'])})
df['Bleu'] = df.apply(calculate_bleu, axis=1)
print(f"Bleu for the df_output_SP SIM: {np.mean(df['Bleu'])}")

df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_MAP'])})
df['Bleu'] = df.apply(calculate_bleu, axis=1)
print(f"Bleu for the df_output_SP MAP: {np.mean(df['Bleu'])}")

df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_SIM'])})
df['Bleu'] = df.apply(calculate_bleu, axis=1)
print(f"Bleu for the df_output_EN SIM: {np.mean(df['Bleu'])}")

df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_MAP'])})
df['Bleu'] = df.apply(calculate_bleu, axis=1)
print(f"Bleu for the df_output_EN MAP: {np.mean(df['Bleu'])}")

Bleu for the df_output_SP SIM: 0.031695689377054345
Bleu for the df_output_SP MAP: 0.045401016044072505
Bleu for the df_output_EN SIM: 0.007081544131689892
Bleu for the df_output_EN MAP: 0.006494955602031048


## Bert Score

In [9]:
from tqdm import tqdm 
# Initialize tqdm for pandas apply 
tqdm.pandas(desc="Processing rows")

import warnings 
from transformers import logging

warnings.simplefilter(action='ignore', category=FutureWarning)
logging.set_verbosity_error()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from bert_score import score

In [44]:
def calculate_bertscore(row):
    reference = [row['reference']]
    generated = [row['generated']]
    precision, recall, f1 = score(generated, reference, lang='es')
    return f1.mean().item()  # Use .item() to get the value from the tensor

In [None]:
df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_SIM'])})
df['BERTScore'] = df.apply(calculate_bertscore, axis=1)
print(f"BERTScore for the df_output_SP SIM: {np.mean(df['BERTScore'])}")



Bleu for the df_output_SP SIM: 0.7065473560776029


In [45]:
df = pd.DataFrame({'reference': list(df_output_SP['y_text']), 'generated': list(df_output_SP['output_MAP'])})
df['BERTScore'] = df.apply(calculate_bertscore, axis=1)
print(f"BERTScore for the df_output_SP MAP: {np.mean(df['BERTScore'])}")




BERTScore for the df_output_SP MAP: 0.7068764520706711


In [46]:
def calculate_bertscore(row):
    reference = [row['reference']]
    generated = [row['generated']]
    precision, recall, f1 = score(generated, reference, lang='en')
    return f1.mean().item() 

In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from bert_score import score

# Assuming calculate_bertscore is already defined
def calculate_bertscore(reference, generated):
    precision, recall, f1 = score([generated], [reference], lang='en')
    return f1.mean().item()  # Use .item() to get the value from the tensor

In [11]:
df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_SIM'])})
df['BERTScore'] = 0.0

# Use tqdm with a loop for progress visualization
for i in tqdm(range(len(df)), desc="Processing rows"):
    df.at[i, 'BERTScore'] = calculate_bertscore(df.at[i, 'reference'], df.at[i, 'generated'])

# Print the mean BERTScore
print(f"BERTScore for the df_output_EN SIM: {np.mean(df['BERTScore'])}")


Processing rows: 100%|██████████| 700/700 [28:13<00:00,  2.42s/it]

BERTScore for the df_output_EN SIM: 0.841197082230023





In [12]:
df = pd.DataFrame({'reference': list(df_output_EN['y_text']), 'generated': list(df_output_EN['output_MAP'])})
df['BERTScore'] = 0.0

# Use tqdm with a loop for progress visualization
for i in tqdm(range(len(df)), desc="Processing rows"):
    df.at[i, 'BERTScore'] = calculate_bertscore(df.at[i, 'reference'], df.at[i, 'generated'])

# Print the mean BERTScore
print(f"BERTScore for the df_output_EN MAP: {np.mean(df['BERTScore'])}")


Processing rows: 100%|██████████| 700/700 [28:14<00:00,  2.42s/it]

BERTScore for the df_output_EN MAP: 0.835725862128394





## GRUEN

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
config_class, model_class, tokenizer_class = BertConfig, BertForSequenceClassification, BertTokenizer


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model_name = "dccuchile/bert-base-spanish-wwm-cased"
saved_pretrained_CoLA_model_dir = "./model_esCola"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ftt = 'EsCoLA' # 'EsCoLA' 'CoLA'

config = config_class.from_pretrained(saved_pretrained_CoLA_model_dir, num_labels=2, finetuning_task=ftt)
tokenizer = BertTokenizer.from_pretrained(saved_pretrained_CoLA_model_dir, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(saved_pretrained_CoLA_model_dir, from_tf=bool('.ckpt' in model_name), config=config).to(device)

#%% gruen_score
def gruen_score(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        grammaticality_score = predictions.item()
    return grammaticality_score


In [21]:
from tqdm import tqdm
tqdm.pandas(desc="Processing GRUEN scores")
import numpy as np

In [28]:
df_output_SP['GRUEN_score'] = df_output_SP['output_SIM'].progress_apply(lambda x: gruen_score(x, tokenizer, model, device))
print(f"gruen_score for the df_output_SP SIM: {np.mean(df_output_SP['GRUEN_score'])}")

df_output_SP['GRUEN_score'] = df_output_SP['output_MAP'].progress_apply(lambda x: gruen_score(x, tokenizer, model, device))
print(f"gruen_score for the df_output_SP MAP: {np.mean(df_output_SP['GRUEN_score'])}")

Processing GRUEN scores: 100%|██████████| 700/700 [00:03<00:00, 191.44it/s]


gruen_score for the df_output_SP SIM: 0.93


Processing GRUEN scores: 100%|██████████| 700/700 [00:05<00:00, 133.95it/s]

gruen_score for the df_output_SP MAP: 0.9671428571428572





In [29]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model_name = "bert-base-cased"
saved_pretrained_CoLA_model_dir = "./tmp/grammar_cola"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ftt = 'CoLA' # 'EsCoLA' 'CoLA'

config = config_class.from_pretrained(saved_pretrained_CoLA_model_dir, num_labels=2, finetuning_task=ftt)
tokenizer = BertTokenizer.from_pretrained(saved_pretrained_CoLA_model_dir, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(saved_pretrained_CoLA_model_dir, from_tf=bool('.ckpt' in model_name), config=config).to(device)

#%% gruen_score
def gruen_score(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        grammaticality_score = predictions.item()
    return grammaticality_score


  return torch.load(checkpoint_file, map_location=map_location)


In [30]:
df_output_EN['GRUEN_score'] = df_output_EN['output_SIM'].progress_apply(lambda x: gruen_score(x, tokenizer, model, device))
print(f"gruen_score for the df_output_EN SIM: {np.mean(df_output_EN['GRUEN_score'])}")

df_output_EN['GRUEN_score'] = df_output_EN['output_MAP'].progress_apply(lambda x: gruen_score(x, tokenizer, model, device))
print(f"gruen_score for the df_output_EN MAP: {np.mean(df_output_EN['GRUEN_score'])}")

Processing GRUEN scores: 100%|██████████| 700/700 [00:05<00:00, 126.95it/s]


gruen_score for the df_output_EN SIM: 0.8628571428571429


Processing GRUEN scores: 100%|██████████| 700/700 [00:04<00:00, 144.04it/s]

gruen_score for the df_output_EN MAP: 0.84





In [37]:
df_output_EN.rename(columns={'x_text': 'Text', 'output_SIM': 'SIM', 'output_MAP': 'MaP'}, inplace=True)
df_output_EN[['Text','SIM', 'MaP']][:5].to_clipboard()


In [39]:
df_output_SP.rename(columns={'x_text': 'Text', 'output_SIM': 'SIM', 'output_MAP': 'MaP'}, inplace=True)
df_output_SP[['Text','SIM', 'MaP']][:5].to_clipboard(encoding='utf-8')