# Evaluating LLM performance

## Loading metrics with evaluate

In [1]:
import evaluate

# Load the four classification metrics in one pass. The generator yields them
# in the same order as the target names on the left-hand side.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "precision", "recall", "f1")
)

## Describing metrics

In [2]:
# Show the built-in description of every metric, one after another.
# A single print with a newline separator emits the same text as four calls.
print(
    *(metric.description for metric in (accuracy, precision, recall, f1)),
    sep="\n",
)


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative


Precision is the fraction of correctly labeled positive examples out of all of the examples that were labeled as positive. It is computed via the equation:
Precision = TP / (TP + FP)
where TP is the True positives (i.e. the examples correctly labeled as positive) and FP is the False positive examples (i.e. the examples incorrectly labeled as positive).


Recall is the fraction of the positive examples that were correctly labeled by the model as positive. It can be computed with the equation:
Recall = TP / (TP + FN)
Where TP is the true positives and FN is the false negatives.


The F1 score is the harmonic mean of the precision and recall. It can be computed with the equation:
F1 = 2 * (precision * recall) / (precision + recall)



In [3]:
# Report the input schema (`features`) that each metric declares,
# one line per metric.
print(
    f"The required data types for accuracy are: {accuracy.features}.",
    f"The required data types for precision are: {precision.features}.",
    f"The required data types for recall are: {recall.features}.",
    f"The required data types for f1 are: {f1.features}.",
    sep="\n",
)

The required data types for accuracy are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for precision are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for recall are: {'predictions': Value('int32'), 'references': Value('int32')}.
The required data types for f1 are: {'predictions': Value('int32'), 'references': Value('int32')}.


## Using evaluate metrics

In [4]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load a pretrained model and tokenizer (this is an example using a sentiment analysis model)
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prepare some example text data
texts = ["I love this movie!", "This was a terrible experience.", "The food was okay."]

# Tokenize the inputs; padding/truncation let all texts be passed as one batch of tensors
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the model outputs. This is inference only, so the forward pass runs under
# torch.no_grad(): no autograd graph is recorded and the logits carry no
# gradient history.
with torch.no_grad():
    outputs = model(**inputs)

# Now outputs.logits can be used with torch.argmax

In [None]:
import torch

# The metric objects `accuracy`, `precision`, `recall` and `f1` loaded in the
# first code cell are reused here; loading them a second time would only repeat
# the same `evaluate.load` calls.

# Extract the new predictions: argmax along dim=1 of the logits gives one
# predicted class index per input text
# (logits are expected to be (batch, num_labels) -- TODO confirm).
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

print(predicted_labels)

validate_labels = [1, 0, 1]  # Example true labels for the validation set

# Fail early with a clear message if the example labels and the model
# predictions fall out of sync (e.g. after editing `texts`).
assert len(predicted_labels) == len(validate_labels), (
    "predicted_labels and validate_labels must have the same length"
)

# Compute the metrics by comparing real and predicted labels
print(accuracy.compute(references=validate_labels, predictions=predicted_labels))
print(precision.compute(references=validate_labels, predictions=predicted_labels))
print(recall.compute(references=validate_labels, predictions=predicted_labels))
print(f1.compute(references=validate_labels, predictions=predicted_labels))
