# Evaluating Prompting flan-T5-base

In [5]:
## Config
random_seed = 100
data_path = "/kaggle/working/"

In [3]:
%%capture
!pip install -U datasets
!pip install transformers datasets evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11
!pip install bert_score

In [4]:
import evaluate
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import pandas as pd
from bert_score import score
import pickle
import os

## Load Data and Model

In [6]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [7]:
# Few-shot examples from training set
train_examples = dataset["train"].select(range(2))

# Sample a subset of the test set for evaluation
test_sample = dataset["test"]
references = [example["summary"] for example in test_sample]

In [None]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2024)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Zero-Shot, One-Shot and Few-Shot Prompts

In [8]:
def build_zero_shot_prompt(doc):
    """Builds a zero-shot prompt."""
    prompt_template = "Summarize the input text.\n\n ### INPUT TEXT\nDocument:{}\nSummary:[Fill the summary]"
    return prompt_template.format(doc)

def build_one_shot_prompt(doc, train_example):
    """Builds a one-shot prompt with one example."""
    prompt = ""
    prompt += "Task: Summarize the input text. An example is provided below. \n"
    prompt += f"### EXAMPLE:\nDocument: {train_example['document'].strip()}\nSummary: {train_example['summary'].strip()}\n\n"
    prompt += f"### INPUT TEXT:\nDocument: {doc.strip()}\nSummary:[Fill the summary]"
    return prompt

def build_few_shot_prompt(doc, few_shots):
    """Builds a few-shot prompt with multiple examples."""
    prompt = ""
    prompt += "Task: Summarize the input text. Examples are provided below. \n"
    for ex in few_shots:
        prompt += f"### EXAMPLE:\nDocument: {ex['document'].strip()}\nSummary: {ex['summary'].strip()}\n\n"
    prompt += f"### INPUT TEXT:\nDocument: {doc.strip()}\nSummary:[Fill the summary]"
    return prompt

In [10]:
zero_shot_prompts = []
one_shot_prompts = []
few_shot_prompts = []

for document in test_sample:

    # Zero shot prompts
    prompt = build_zero_shot_prompt(document["document"])
    zero_shot_prompts.append(prompt)

    # # One shot prompts
    # prompt = build_one_shot_prompt(document["document"], train_examples[0])
    # one_shot_prompts.append(prompt)

    # # Few shot prompts
    # prompt = build_few_shot_prompt(document["document"], train_examples)
    # few_shot_prompts.append(prompt)

### Generate model ops

In [None]:
# Move the model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device)

In [None]:
def generate_prompt_output(prompts, model, device):
  # Generate few-shot predictions in batches
  batch_size = 20
  preds = []

  # Select a larger subset for demonstration
  subset = zero_shot_prompts

  for i in tqdm(range(0, len(prompts), batch_size)):
      batch_subset = []
      for j in range(i, min(i + batch_size, len(prompts))):
          batch_subset.append(prompts[j])

      batch_prompts = batch_subset

      # Tokenize and move inputs to the correct device
      inputs = tokenizer(batch_prompts, return_tensors="pt", truncation=True, max_length=2024, padding=True)
      inputs = {k: v.to(device) for k, v in inputs.items()}  # Fix: move input tensors to the device

      # Generate predictions
      outputs = model.generate(**inputs, max_length=64)  # Do NOT call .to(device) here

      # Decode predictions
      batch_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
      preds.extend(batch_preds)

  return preds

In [None]:
zero_shot_results = generate_prompt_output(zero_shot_prompts, model, device)

file_name = "zero_shot_testset.pkl"
file_path = os.path.join(data_path, file_name)

try:
    with open(file_path, 'wb') as f:
        pickle.dump(zero_shot_results, f)
    print(f"Successfully saved the list as pickle to: {file_path}")
except Exception as e:
    print(f"An error occurred while saving the pickle file: {e}")

100%|██████████| 567/567 [1:03:21<00:00,  6.70s/it]

Successfully saved the list as pickle to: /kaggle/working/zero_shot_testset.pkl





In [None]:
one_shot_results = generate_prompt_output(one_shot_prompts, model, device)

file_name = "one_shot_testset.pkl"
file_path = os.path.join(data_path, file_name)

try:
    with open(file_path, 'wb') as f:
        pickle.dump(one_shot_results, f)
    print(f"Successfully saved the list as pickle to: {file_path}")
except Exception as e:
    print(f"An error occurred while saving the pickle file: {e}")

100%|██████████| 567/567 [1:25:43<00:00,  9.07s/it]

Successfully saved the list as pickle to: /kaggle/working/one_shot_testset.pkl





In [None]:
# # few_shot_results = generate_prompt_output(few_shot_prompts, model, device)

file_path = "/kaggle/input/few-shot-testset-pkl/few_shot_testset.pkl"

try:
    with open(file_path, 'rb') as f:
        few_shot_results = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/few-shot-testset-pkl/few_shot_testset.pkl


### Calculate Evaluation metrics

In [None]:
# Ensure you have the necessary evaluation metrics loaded
rouge = evaluate.load("rouge")

# --- Calculate Metrics ---

# Zero-shot
zero_shot_rouge = rouge.compute(predictions=zero_shot_results, references=references)
_, _, zero_shot_bertscore_f1 = score(zero_shot_results, references, lang="en", verbose=True)

# One-shot
one_shot_rouge = rouge.compute(predictions=one_shot_results, references=references)
_, _, one_shot_bertscore_f1 = score(one_shot_results, references, lang="en", verbose=True)

# Few-shot
few_shot_rouge = rouge.compute(predictions=few_shot_results, references=references)
_, _, few_shot_bertscore_f1 = score(few_shot_results, references, lang="en", verbose=True, device= device)

# --- Prepare Data for DataFrame ---

results = {
    ('Zero-shot', model_name): {
        'ROUGE-1': zero_shot_rouge['rouge1'],
        'ROUGE-2': zero_shot_rouge['rouge2'],
        'ROUGE-L': zero_shot_rouge['rougeL'],
        'BERTScore F1': zero_shot_bertscore_f1.mean().item()
    },
    ('One-shot', model_name): {
        'ROUGE-1': one_shot_rouge['rouge1'],
        'ROUGE-2': one_shot_rouge['rouge2'],
        'ROUGE-L': one_shot_rouge['rougeL'],
        'BERTScore F1': one_shot_bertscore_f1.mean().item()
    },
    ('Few-shot', model_name): {
        'ROUGE-1': few_shot_rouge['rouge1'],
        'ROUGE-2': few_shot_rouge['rouge2'],
        'ROUGE-L': few_shot_rouge['rougeL'],
        'BERTScore F1': few_shot_bertscore_f1.mean().item()
    }
}

# --- Create DataFrame ---

df_results = pd.DataFrame.from_dict(results, orient='index')

# Set the index names
df_results.index.names = ['Prompt Type', 'Model']

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]



done in 125.71 seconds, 90.16 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 123.51 seconds, 91.77 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]



done in 124.03 seconds, 91.38 sentences/sec


In [None]:
# Display the DataFrame
print(df_results)

                                  ROUGE-1   ROUGE-2   ROUGE-L  BERTScore F1
Prompt Type Model                                                          
Zero-shot   google/flan-t5-base  0.338143  0.119003  0.266704      0.897449
One-shot    google/flan-t5-base  0.338109  0.119825  0.267820      0.897942
Few-shot    google/flan-t5-base  0.337940  0.119473  0.268085      0.897745


In [None]:
file_name = "results.pkl"
file_path = os.path.join(data_path, file_name)

try:
    df_results.to_pickle(file_path)
    print(f"Successfully saved the DataFrame as pickle to: {file_path}")
except Exception as e:
    print(f"An error occurred while saving the pickle file: {e}")

Successfully saved the DataFrame as pickle to: /kaggle/working/results.pkl


In [None]:

print(zero_shot_results[1578],'\n',
one_shot_results[1578], '\n',
few_shot_results[1578])

A lifeboat crew in St Abbs have reopened their pagers after a row over the closure of the station. 
 A lifeboat station in the Borders has been reopened after a row over the closure of its lifeboat. 
 A lifeboat station in St Abbs has been reopened after a row over the closure of the station.


In [None]:
print(zero_shot_prompts[1578],'\n', 50*'===', '\n',
one_shot_prompts[1578], '\n', 50*'===', '\n',
few_shot_prompts[1578])

Summarize the input text.

 ### INPUT TEXT
Document:The volunteers were angry at the RNLI's decision to shut down the station later this year.
They had said they would no longer use the lifeboat to respond to emergencies, and would instead use their own boats.
But the crew agreed to take back their pagers at a meeting on Friday night.
In a statement, the crew members said they felt they had to do so ahead of the busy summer diving season, but they pledged to continue campaigning to save the St Abbs station.
There has been a lifeboat station in St Abbs for more than 100 years. The local volunteers have been credited with saving hundred of lives in and around the seaside town on the east coast of the Borders.
But following a review the RNLI announced last week that the St Abbs boat was no longer needed and in future cover would be provided with an additional boat in nearby Eyemouth.
Supporters of the station have argued that closing it would put lives at risk.
Summary:[Fill the summary] 

# Evaluate T5 Fine-tuned and pretrained

In [1]:
import shutil

model_path = "/content/t5-small-xsum-finetuned_10000_training_10epoch.zip"
model_name = "t5-small-xsum-finetuned_10000_training_10epoch"

# Unzip the archive created earlier
shutil.unpack_archive(model_path, model_name, format='zip')

In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer from unzipped directory
model_path = "t5-small-xsum-finetuned_10000_training_10epoch"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
base_model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer1 = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:

def generate_batch(model, prompts, max_length=64):
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_length=max_length)
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)

# Run both models
original_outputs = generate_batch(original_model, prompts)
fine_tuned_outputs = generate_batch(fine_tuned_model, prompts)
