In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print one path per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **LLM Evaluation Framework**
This notebook shows how to build an LLM evaluation framework using the **"Evaluate"** library from Hugging Face.
![](https://huggingface.co/datasets/evaluate/media/resolve/main/evaluate-banner.png)

A library for easily evaluating machine learning models and datasets.

With a single line of code, you get access to dozens of evaluation methods for different domains. Be it on your local machine or in a distributed training setup, you can evaluate your models in a consistent and reproducible way!


In [None]:
%%capture
!pip install evaluate
!pip install rouge_score

In [None]:
import warnings
warnings.filterwarnings("ignore")

### Example of Evaluate

In [None]:
import evaluate
# Load the exact-match metric, score one prediction against its reference, and print the result.
exact_match_metric = evaluate.load('exact_match')
exact_match_result = exact_match_metric.compute(references=['hello'], predictions=['hell'])
print(exact_match_result)

# Types of evaluations

There are different aspects of a typical machine learning pipeline that can be evaluated and for each aspect 🤗 Evaluate provides a tool:

- **Metric**: A metric is used to evaluate a model’s performance and usually involves the model’s predictions as well as some ground truth labels. You can find all integrated metrics at evaluate-metric.
- **Comparison**: A comparison is used to compare two models. This can for example be done by comparing their predictions to ground truth labels and computing their agreement. You can find all integrated comparisons at evaluate-comparison.
- **Measurement**: The dataset is as important as the model trained on it. With measurements one can investigate a dataset’s properties. You can find all integrated measurements at evaluate-measurement.

Each metric, comparison, and measurement is a separate Python module, but for using any of them, there is a single entry point: **evaluate.load()**

![](https://i1.wp.com/dataaspirant.com/wp-content/uploads/2020/08/2_6_classification_evaluation_metrics.png?resize=1536%2C1099&ssl=1)

In [None]:
import evaluate
# Load the "accuracy" module without specifying a module type.
accuracy = evaluate.load("accuracy")
# "word_length" is requested as a measurement through the explicit module_type argument.
word_length = evaluate.load("word_length", module_type="measurement")

In [None]:
# A single-element list of input text for the word_length measurement.
data = ["hello world"]
results = word_length.compute(data=data)
print(results)

In [None]:
# Two of the four predictions (the last two positions) equal their references.
accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])

## If you have the model's actual results (references) and its predictions, you can calculate the accuracy batch by batch as shown below:

In [None]:
# Feed the metric one batch at a time, then compute the score over everything that was added.
reference_batches = [[0, 1], [0, 1]]
prediction_batches = [[1, 0], [0, 1]]
for batch_refs, batch_preds in zip(reference_batches, prediction_batches):
    accuracy.add_batch(references=batch_refs, predictions=batch_preds)
accuracy.compute()

# Combining several evaluations

In [None]:
# Combine four classification metrics into a single object.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

In [None]:
# The first two predictions equal their references; the third does not.
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])

### Saving the Result in File

In [None]:
# Compute accuracy for four predictions against their references.
result = accuracy.compute(references=[0,1,0,1], predictions=[1,0,0,1])

# Extra metadata to store next to the metric values.
hyperparams = {"model": "bert-base-uncased"}
# The metric result and the hyperparameters are unpacked into keyword arguments of evaluate.save.
evaluate.save("/kaggle/working/",experiment="run 42", **result, **hyperparams)

In [None]:
from transformers import pipeline
from datasets import load_dataset
from evaluate import evaluator
import evaluate

# Run on the first GPU when one is available and fall back to the CPU (-1) otherwise,
# so the cell also works in sessions without an accelerator.
import torch

device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb", device=device)
# A fixed shuffle seed makes the 1,000-example evaluation subset reproducible across runs.
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
metric = evaluate.load("accuracy")

In [None]:
# Build an evaluator for the "text-classification" task.
task_evaluator = evaluator("text-classification")
# label_mapping maps the label strings NEGATIVE/POSITIVE to the integers 0/1.
results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},)
print(results)

Calculating the value of the metric alone is often not enough to know if a model performs significantly better than another one. With bootstrapping evaluate computes confidence intervals and the standard error which helps estimate how stable a score is:

In [None]:
# Same pipeline, data, and metric arguments, now with the bootstrap strategy and 200 resamples.
results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
                       label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
                       strategy="bootstrap", n_resamples=200)
print(results)

The evaluator expects a "text" and "label" column for the data input. If your dataset differs you can provide the columns with the keywords input_column="text" and label_column="label". Currently only "text-classification" is supported with more tasks being added in the future.

# Visualize the Evaluation of Different Models

In [None]:
import evaluate
from evaluate.visualization import radar_plot

# Scores to compare on the radar plot: one dict of metric values per entry in model_names.
# Stored under its own name so the `data` dataset used by the evaluator cells is not overwritten.
model_scores = [
    {"accuracy": 0.99, "precision": 0.8, "f1": 0.95, "latency_in_seconds": 33.6},
    {"accuracy": 0.98, "precision": 0.87, "f1": 0.91, "latency_in_seconds": 11.2},
    {"accuracy": 0.98, "precision": 0.78, "f1": 0.88, "latency_in_seconds": 87.6},
    {"accuracy": 0.88, "precision": 0.78, "f1": 0.81, "latency_in_seconds": 101.6},
]
model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]
plot = radar_plot(data=model_scores, model_names=model_names)
plot.show()

# Choosing a Metric for your task
So you’ve trained your model and want to see how well it’s doing on a dataset of your choice. Where do you start?

There is no “one size fits all” approach to choosing an evaluation metric, but some good guidelines to keep in mind are:

![](https://uptrain-assets.s3.ap-south-1.amazonaws.com/images/how-to-evaluate-llm-blog/dimensions_of_llm_evaluations.webp)

## Categories of Metrics
There are 3 high-level categories of metrics:

- **Generic metrics**, which can be applied to a variety of situations and datasets, such as precision and accuracy.
- **Task-specific metrics**, which are limited to a given task, such as Machine Translation (often evaluated using metrics BLEU or ROUGE) or Named Entity Recognition (often evaluated with seqeval).
- **Dataset-specific metrics**, which aim to measure model performance on specific benchmarks: for instance, the GLUE benchmark has a dedicated evaluation metric.

## Generic metrics
Many of the metrics used in the Machine Learning community are quite generic and can be applied in a variety of tasks and datasets.

This is the case for metrics like accuracy and precision, which can be used for evaluating labeled (supervised) datasets, as well as perplexity, which can be used for evaluating different kinds of (unsupervised) generative tasks.

In [None]:
# Load the precision metric and score two predictions that both equal their references.
precision_metric = evaluate.load("precision")
results = precision_metric.compute(references=[0, 1], predictions=[0, 1])
print(results)

## Task-specific metrics
Popular ML tasks like Machine Translation and Named Entity Recognition have specific metrics that can be used to compare models.

- For example, a series of different metrics have been proposed for text generation, ranging from BLEU and its derivatives such as GoogleBLEU and GLEU, but also ROUGE, MAUVE, etc.

You can find the right metric for your task by:

- Looking at the Task pages to see what metrics can be used for evaluating models for a given task.
- Checking out leaderboards on sites like Papers With Code (you can search by task and by dataset).
- Reading the metric cards for the relevant metrics and seeing which ones are a good fit for your use case. For example, see the BLEU metric card or SQuAD metric card.
- Looking at papers and blog posts published on the topic and seeing what metrics they report. This can change over time, so try to pick papers from the last couple of years!

In [None]:
def evaluate_machine_translation(hypotheses, references):
    """
    Score machine-translation output with the BLEU metric.

    Args:
    hypotheses (List[str]): Translated sentences produced by the system.
    references (List[List[str]]): For every hypothesis, the list of its reference translations.

    Returns:
    float: The value stored under the "bleu" key of the metric result.
    """
    # Load the BLEU metric, score the hypotheses, and keep only the "bleu" entry of the result.
    return evaluate.load("bleu").compute(
        predictions=hypotheses, references=references
    )["bleu"]

In [None]:
# Example hypotheses (translated sentences); each one omits an article that its reference contains
hypotheses = ["The cat sat on mat.", "The dog played in garden."]

# Example references: one list of reference translations per hypothesis (a single reference each here)
references = [["The cat sat on the mat."], ["The dog played in the garden."]]

# Calculate the BLEU score over both hypotheses
bleu_score = evaluate_machine_translation(hypotheses, references)

# Print the BLEU score
print("BLEU Score:", bleu_score)

This is the overall BLEU score. BLEU combines the modified n-gram precisions of the generated translations or summaries against the reference translations or summaries (n typically ranges from 1 to 4) as a geometric mean, and multiplies the result by a brevity penalty that penalizes outputs shorter than the references. A score of approximately 0.51 (51.15 on a 0–100 scale) is therefore an aggregate measure of n-gram agreement rather than the literal percentage of matching n-grams; higher values mean the generated text is closer to the references.

In [None]:
from sklearn.metrics import classification_report

def evaluate_ner(true_labels, predicted_labels):
    """
    Evaluate the performance of a Named Entity Recognition system label by label.

    Args:
    true_labels (list of lists): True labels for each sentence in the dataset.
    predicted_labels (list of lists): Predicted labels for each sentence in the dataset.

    Returns:
    classification_report (str): Text summary of precision, recall, and F1 score for each class.

    Raises:
    ValueError: If the number of sentences differs, or if a sentence has a different
        number of true and predicted labels.
    """
    # Materialize the inputs so that any iterable of iterables is accepted and can be measured.
    true_sentences = [list(sentence) for sentence in true_labels]
    predicted_sentences = [list(sentence) for sentence in predicted_labels]

    if len(true_sentences) != len(predicted_sentences):
        raise ValueError(
            f"Got {len(true_sentences)} sentences of true labels but "
            f"{len(predicted_sentences)} sentences of predicted labels."
        )

    # Check every sentence pair: flattening would otherwise silently pair labels of different
    # tokens whenever per-sentence lengths differ but the total label counts happen to match.
    for index, (true_sentence, predicted_sentence) in enumerate(
        zip(true_sentences, predicted_sentences)
    ):
        if len(true_sentence) != len(predicted_sentence):
            raise ValueError(
                f"Sentence {index} has {len(true_sentence)} true labels but "
                f"{len(predicted_sentence)} predicted labels."
            )

    # Flatten the lists of labels
    true_labels_flat = [label for sentence in true_sentences for label in sentence]
    predicted_labels_flat = [label for sentence in predicted_sentences for label in sentence]

    # Generate classification report
    return classification_report(true_labels_flat, predicted_labels_flat)

In [None]:
# NOTE(review): the true labels use 'N' in two positions where the predictions use 'O'.
# If 'N' was meant to be 'O', those two tokens are currently counted as mismatches — confirm.
true_labels = [['N', 'B-PER', 'I-PER', 'O'], ['B-LOC', 'I-LOC', 'N']]
predicted_labels = [['O', 'B-PER', 'I-PER', 'O'], ['B-LOC', 'I-LOC', 'O']]

# Build and print the per-label report for the two example sentences.
evaluation_result = evaluate_ner(true_labels, predicted_labels)
print(evaluation_result)

In [None]:
import evaluate

def simple_summarizer(text):
    """Return the part of ``text`` before its first period (the whole text if it has none)."""
    first_sentence, _, _ = text.partition(".")
    return first_sentence

# Load the ROUGE metric
rouge_scorer = evaluate.load("rouge")

# Source text and the reference summary the prediction is scored against
text = "Today is a beautiful day. The sun is shining and the birds are singing. I am going for a walk in the park."
reference = "The weather is pleasant today."

# Summarize with the first-sentence baseline
prediction = simple_summarizer(text)

# Score the single prediction against the single reference
rouge_results = rouge_scorer.compute(predictions=[prediction], references=[reference])

# Print the ROUGE scores (might be very low due to the simplistic summarizer)
for score_label, score_key in (
    ("ROUGE-L score:", "rougeL"),
    ("ROUGE-1 score:", "rouge1"),
    ("ROUGE-2 score:", "rouge2"),
):
    print(score_label, rouge_results[score_key])

- ROUGE-L score: This is the ROUGE-L score, which is based on the Longest Common Subsequence (LCS) between the generated summaries and the reference summaries. The score of 0.20000000000000004 (about 0.2) indicates that only a small part of the generated summary appears, in the same order, in the reference summary, so there is significant room for improvement.

- ROUGE-1 score: This is the ROUGE-1 score, which measures the overlap of unigrams (individual words) between the generated summaries and the reference summaries. The score of 0.4000000000000001 suggests that approximately 40% of the unigrams in the generated summaries match those in the reference summaries.

- ROUGE-2 score: This is the ROUGE-2 score, which measures the overlap of bigrams (pairs of adjacent words) between the generated summaries and the reference summaries. The score of 0.0 suggests that there is no overlap of bigrams between the generated and reference summaries, indicating that the generated summaries did not contain any two-word sequences that matched those in the reference summaries.

## Dataset-specific metrics
Some datasets have specific metrics associated with them — this is especially the case for popular benchmarks like GLUE and SQuAD.

**GLUE is actually a collection of different subsets on different tasks, so first you need to choose the one that corresponds to the "Natural Language Inference" NLI task, such as mnli, which is described as “crowdsourced collection of sentence pairs with textual entailment annotations”** The example below uses the dataset-specific metric for SQuAD instead.


In [None]:
from evaluate import load
# Load the SQuAD metric.
squad_metric = load("squad")
# One prediction and one reference that share the same 'id'.
predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
results = squad_metric.compute(predictions=predictions, references=references)
results

# Using Evaluator
The Evaluator classes allow you to evaluate a triplet of model, dataset, and metric. The model is wrapped in a pipeline, which is responsible for handling all preprocessing and post-processing. Out of the box, Evaluators support transformers pipelines for the supported tasks.

In [None]:
from datasets import load_dataset
from evaluate import evaluator
from transformers import AutoModelForSequenceClassification, pipeline

# Take the first 1,000 examples of the IMDB test split after a shuffle with a fixed seed (42).
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
task_evaluator = evaluator("text-classification")

# No device argument is passed to the pipeline here.
pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb")

# No metric argument is passed, so the evaluator falls back to its default metric
# (presumably accuracy for text classification — confirm against the Evaluate docs).
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1}
)
print(eval_results)

# Running Evaluation Suite on GPT2 LLM

In [None]:
from evaluate import EvaluationSuite
# Load the evaluation suite by its identifier.
suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite')
# Run the suite on the model identified by "gpt2" and keep what it returns.
results = suite.run("gpt2")

In [None]:
# Display the value returned by the evaluation-suite run as the cell output.
results