# Guidelines and standard metrics for evaluating LLMs

In [None]:
from transformers import pipeline
from sklearn.metrics import accuracy_score
import evaluate

## Calculating accuracy

In [None]:
# Build the sentiment classifier. The checkpoint is named explicitly instead of
# relying on the task's implicit default, so the results stay reproducible and
# the "no model was supplied" warning is avoided. The label mapping further
# down expects this model's "POSITIVE" / "NEGATIVE" label strings.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Small labelled test set: 1 = positive sentiment, 0 = negative sentiment
test_examples = [
    {"text": "I am making a good use of this product!", "label": 1},
    {"text": "The service was disappointing.", "label": 0},
    {"text": "I learned a lot from this book.", "label": 1},
    {"text": "The book cover broke after two days of use.", "label": 0},
]

# Pass the four input texts (without labels) to the pipeline
predictions = sentiment_analysis([example["text"] for example in test_examples])

# Ground-truth labels, in the same order as the texts passed to the pipeline
true_labels = [example["label"] for example in test_examples]
# Convert the pipeline's string labels to the 0/1 encoding of the test set;
# anything other than "POSITIVE" is treated as negative (0)
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

# Load the accuracy metric
accuracy = evaluate.load("accuracy")

# Compare the predicted labels against the references and show the result
result = accuracy.compute(references=true_labels, predictions=predicted_labels)
print(result)

## Beyond accuracy: describing metrics

In [None]:
# Load the four classification metrics in one pass: accuracy, precision,
# recall and F1 score (same names, same load order)
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

# Print the description of every metric, one after another
for metric in (accuracy, precision, recall, f1):
    print(metric.description)

## Beyond accuracy: using metrics

In [None]:
# Review texts to classify; test_labels below holds their ground-truth
# sentiment in the same order (1 = positive, 0 = negative)
test_examples = [
    "Fantastic hotel, exceeded expectations!",
    "Quiet despite central location, great stay.",
    "Friendly staff, welcoming atmosphere.",
    "Spacious, comfy room—a perfect retreat.",
    "Cleanliness could improve, overall decent stay.",
    "Disappointing stay, noisy and unclean room.",
    "Terrible service, unfriendly staff, won't return.",
]

test_labels = [1, 1, 1, 1, 0, 0, 0]

# Load the precision, recall and F1 score metrics
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Pass the examples to the pipeline, and obtain a list of predicted labels.
# The checkpoint is named explicitly instead of relying on the task's implicit
# default, so the results stay reproducible; the mapping below expects this
# model's "POSITIVE" / "NEGATIVE" label strings.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# test_examples is already a list of strings, so it is passed as-is
predictions = sentiment_analysis(test_examples)
# Anything other than "POSITIVE" is treated as negative (0)
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

# Compute the metrics by comparing real and predicted labels
print(precision.compute(references=test_labels, predictions=predicted_labels))
print(recall.compute(references=test_labels, predictions=predicted_labels))
print(f1.compute(references=test_labels, predictions=predicted_labels))