# Evaluating LLM performance

## Loading metrics with evaluate

In [30]:
import evaluate

# Load the metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

## Describing metrics

In [31]:
# Obtain a description of each metric
print(accuracy.description)
print(precision.description)
print(recall.description)
print(f1.description)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative


Precision is the fraction of correctly labeled positive examples out of all of the examples that were labeled as positive. It is computed via the equation:
Precision = TP / (TP + FP)
where TP is the True positives (i.e. the examples correctly labeled as positive) and FP is the False positive examples (i.e. the examples incorrectly labeled as positive).


Recall is the fraction of the positive examples that were correctly labeled by the model as positive. It can be computed with the equation:
Recall = TP / (TP + FN)
Where TP is the true positives and FN is the false negatives.


The F1 score is the harmonic mean of the precision and recall. It can be computed with the equation:
F1 = 2 * (precision * recall) / (precision + recall)



In [32]:
# See the required data types
print(f"The required data types for accuracy are: {accuracy.features}.")
print(f"The required data types for precision are: {precision.features}.")
print(f"The required data types for recall are: {recall.features}.")
print(f"The required data types for f1 are: {f1.features}.")

The required data types for accuracy are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for precision are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for recall are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for f1 are: {'predictions': Value('int32'), 'references': Value('int32')}.


## Using evaluate metrics

In [33]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load a pretrained model and tokenizer (this is an example using a sentiment analysis model)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare some example text data
texts = ["I love this movie!", "This was a terrible experience.", "The food was okay."]

# Tokenize the inputs
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the model outputs
outputs = model(**inputs)

# Now outputs.logits can be used with torch.argmax

In [34]:
import torch

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Extract the new predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

print(predicted_labels)

validate_labels = [1, 0, 1]  # Example true labels for the validation set

# Compute the metrics by comparing real and predicted labels
print(accuracy.compute(references=validate_labels, predictions=predicted_labels))
print(precision.compute(references=validate_labels, predictions=predicted_labels))
print(recall.compute(references=validate_labels, predictions=predicted_labels))
print(f1.compute(references=validate_labels, predictions=predicted_labels))


[1, 0, 1]
{'accuracy': 1.0}
{'precision': 1.0}
{'recall': 1.0}
{'f1': 1.0}


# Metrics for language tasks: perplexity and BLEU

## Evaluating perplexity

In [35]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2"  # you can also use "gpt2-medium", "gpt2-large", or "gpt2-xl" for larger models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

input_text = "Current trends show that by 2030"

# Encode the input text, generate and decode it
input_text_ids = tokenizer.encode(input_text, return_tensors="pt")
output = model.generate(input_text_ids, max_length=20)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)

# Load and compute the perplexity score
perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(model_id="gpt2", predictions=generated_text)
print("Perplexity: ", results['mean_perplexity'])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:  Current trends show that by 2030, the number of people living in poverty will be at its lowest level


  0%|          | 0/7 [00:00<?, ?it/s]

Perplexity:  3441.6679486083985


## BLEU translations

In [36]:
from transformers import pipeline
import evaluate

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

input_sentence_1 = "Hola, ¿cómo estás?"

reference_1 = [["Hello, how are you?", "Hi, how are you?"]]

# Translate the first input sentence then calucate the BLEU metric for translation quality
translated_output = translator(input_sentence_1, clean_up_tokenization_spaces=True)

translated_sentence = translated_output[0]['translation_text']

print("Translated:", translated_sentence)

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=[translated_sentence], references=reference_1)
print(results)

Device set to use cpu


Translated: Hey, how are you?
{'bleu': 0.7598356856515925, 'precisions': [0.8333333333333334, 0.8, 0.75, 0.6666666666666666], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 6, 'reference_length': 6}


In [37]:
from transformers import pipeline
import evaluate

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

input_sentences_2 = ["Hola, ¿cómo estás?", "Estoy genial, gracias."]

references_2 = [
  ["Hello, how are you?", "Hi, how are you?"],
  ["I'm great, thanks.", "I'm great, thank you."]
]

# Translate the input sentences, extract the translated text, and compute BLEU score
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

translated_outputs = translator(input_sentences_2, clean_up_tokenization_spaces=True)

predictions = [translated_output['translation_text'] for translated_output in translated_outputs]
print(predictions)

bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references_2)
print(results)

Device set to use cpu
Device set to use cpu


['Hey, how are you?', "I'm great, thanks."]
{'bleu': 0.8627788640890415, 'precisions': [0.9090909090909091, 0.8888888888888888, 0.8571428571428571, 0.8], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 11, 'reference_length': 11}


# Metrics for language tasks: ROUGE, METEOR,EM

## Evaluating with ROUGE

In [41]:
import evaluate

# Load the rouge metric
rouge = evaluate.load("rouge")

predictions = ["""Pluto is a dwarf planet in our solar system, located in the Kuiper Belt beyond Neptune, and was formerly considered the ninth planet until its reclassification in 2006."""]
references = ["""Pluto is a dwarf planet in the solar system, located in the Kuiper Belt beyond Neptune, and was previously deemed as a planet until it was reclassified in 2006."""]

# Calculate the rouge scores between the predicted and reference summaries
results = rouge.compute(predictions=predictions, references=references)
print("ROUGE results: ", results)

ROUGE results:  {'rouge1': np.float64(0.7719298245614034), 'rouge2': np.float64(0.6181818181818182), 'rougeL': np.float64(0.736842105263158), 'rougeLsum': np.float64(0.736842105263158)}


## Evaluating with METEOR

In [42]:
import evaluate

meteor = evaluate.load("meteor")

generated = ["The burrow stretched forward like a narrow corridor for a while, then plunged abruptly downward, so quickly that Alice had no chance to stop herself before she was tumbling into an extremely deep shaft."]
reference = ["The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well."]

# Compute and print the METEOR score
results = meteor.compute(predictions=generated, references=reference)
print("Meteor: ", results['meteor'])

Meteor:  0.37180012567275916


[nltk_data] Downloading package wordnet to /home/jorge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jorge/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jorge/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Evaluating with EM

In [43]:
import evaluate

# Load the metric
exact_match = evaluate.load("exact_match")

predictions = ["It's a wonderful day", "I love dogs", "DataCamp has great AI courses", "Sunshine and flowers"]
references = ["What a wonderful day", "I love cats", "DataCamp has great AI courses", "Sunsets and flowers"]

# Compute the exact match and print the results
results = exact_match.compute(references=references, predictions=predictions)
print("EM results: ", results)

EM results:  {'exact_match': np.float64(0.25)}
