# Prompting flan-T5-base

In [8]:
%%capture
!pip install -U datasets
!pip install transformers datasets evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11

In [9]:
import evaluate
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [10]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")
test_sample = dataset["test"][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [12]:
def generate_with_strategy(doc, strategy="greedy"):
    input_ids = tokenizer("summarize: " + doc, return_tensors="pt", truncation=True).input_ids.to(model.device)

    if strategy == "greedy":
        output = model.generate(input_ids, max_length=64)
    elif strategy == "beam":
        output = model.generate(input_ids, max_length=64, num_beams=5, early_stopping=True)
    elif strategy == "topk":
        output = model.generate(input_ids, max_length=64, do_sample=True, top_k=50)
    elif strategy == "topp":
        output = model.generate(input_ids, max_length=64, do_sample=True, top_p=0.9)
    else:
        raise ValueError("Unknown strategy")

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [13]:
strategies = ["greedy", "beam", "topk", "topp"]
subset = dataset["test"].select(range(100))
references = [example["summary"] for example in subset]


for i in range(3):  # Test on 3 samples
    doc = subset[i]["document"]
    ref = subset[i]["summary"]

    print(f"\n🔹 Example {i+1}")
    print("="*80)
    print("📄 Document:\n", doc[:800], "..." if len(doc) > 800 else "")
    print("\n✅ Reference Summary:\n", ref)

    for strat in strategies:
        pred = generate_with_strategy(doc, strategy=strat)
        print(f"\n🧠 {strat.upper()} Output:\n{pred}")
    print("="*80)


🔹 Example 1
📄 Document:
 Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post tra ...

✅ Reference Summary:
 There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.

🧠 GREEDY Output:
Prison is a "dangerous" place fo

In [14]:
def generate_summary(doc, prompt_template="Please write a short summary of the following article:\n\n{}",
                     max_input=512, max_output=64):
    prompt = prompt_template.format(doc)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input, padding=True).to(model.device)
    outputs = model.generate(**inputs, max_length=max_output)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [15]:
rouge = evaluate.load("rouge")

# Evaluate on a 100-sample test slice
subset = dataset["test"].select(range(3))

predictions = [generate_summary(example["document"]) for example in subset]
references = [example["summary"] for example in subset]

rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)

ROUGE Results: {'rouge1': np.float64(0.3500233426704015), 'rouge2': np.float64(0.14166666666666664), 'rougeL': np.float64(0.282796451914099), 'rougeLsum': np.float64(0.282796451914099)}


In [16]:
# Show a few test examples with model predictions
num_examples = 3
for i in range(num_examples):
    doc = subset[i]["document"]
    ref = subset[i]["summary"]
    pred = generate_summary(doc)

    print(f"\n🔹 Example {i+1}")
    print("=" * 80)
    print("📄 Document:\n", doc[:800], "..." if len(doc) > 800 else "")  # truncate long doc
    print("\n✅ Reference Summary:\n", ref)
    print("🤖 Model Summary:\n", pred)
    print("=" * 80)


🔹 Example 1
📄 Document:
 Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post tra ...

✅ Reference Summary:
 There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
🤖 Model Summary:
 Prisoners are being put out of w

### Few shot

In [17]:
# Few-shot examples from training set
train_examples = dataset["train"].select(range(2))

def build_few_shot_prompt(target_doc, few_shots):
    prompt = ""
    for ex in few_shots:
        prompt += f"Document: {ex['document'].strip()}\nSummary: {ex['summary'].strip()}\n\n"
    prompt += f"Document: {target_doc.strip()}\nSummary:"
    return prompt

In [18]:
def generate_few_shot(doc, few_shots):
    prompt = build_few_shot_prompt(doc, few_shots)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512, padding=True).to(model.device)
    outputs = model.generate(**inputs, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
input = []
subset = dataset["test"].select(range(100))
for i in range(len(subset)):
  doc = subset[i]["document"]
  few = train_examples
  input.append(build_few_shot_prompt(doc,few))

inputs = tokenizer(input, return_tensors="pt", truncation=True, max_length=512, padding=True).to(model.device)
outputs = model.generate(**inputs, max_length=64)
ops = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
ops

In [19]:
# Generate few-shot predictions (dynamic prompt per example)
few_shot_preds = []



for idx, example in tqdm(enumerate(subset)):
    summary = generate_few_shot(example["document"], train_examples)
    few_shot_preds.append(summary)
    if idx >= 2:
      break

few_shot_rouge = rouge.compute(predictions=few_shot_preds, references=references)
print("Few-shot ROUGE:", few_shot_rouge)

2it [00:14,  7.10s/it]

Few-shot ROUGE: {'rouge1': np.float64(0.09479409479409479), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.07459207459207459), 'rougeLsum': np.float64(0.07459207459207459)}





In [20]:
for i in range(3):
    print(f"\n🔹 Example {i+1}")
    print("="*80)
    print("📄 Document:\n", subset[i]["document"][:800], "..." if len(subset[i]["document"]) > 800 else "")
    print("\n✅ Reference Summary:\n", subset[i]["summary"])
    print("🤖 Few-shot Summary:\n", few_shot_preds[i])
    print("="*80)


🔹 Example 1
📄 Document:
 Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing would be cheaper than jailing homeless repeat offenders.
The Welsh Government said more people than ever were getting help to address housing problems.
Changes to the Housing Act in Wales, introduced in 2015, removed the right for prison leavers to be given priority for accommodation.
Prison Link Cymru, which helps people find accommodation after their release, said things were generally good for women because issues such as children or domestic violence were now considered.
However, the same could not be said for men, the charity said, because issues which often affect them, such as post tra ...

✅ Reference Summary:
 There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
🤖 Few-shot Summary:
 The Dumfries and Galloway are

### BertScore

In [21]:
!pip install bert-score



In [22]:
from bert_score import score

In [23]:
P, R, F1 = score(few_shot_preds, references, lang="en", verbose=True)
print("BERTScore:")
print(f"  Precision: {P.mean().item():.4f}")
print(f"  Recall: {R.mean().item():.4f}")
print(f"  F1 Score: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.84 seconds, 0.44 sentences/sec
BERTScore:
  Precision: 0.8544
  Recall: 0.8602
  F1 Score: 0.8573


In [25]:
P_few_shot, R_few_shot, F1_few_shot = score(few_shot_preds, references, lang="en", verbose=True)
print("Few-shot BERTScore:")
print(f"  Precision: {P_few_shot.mean().item():.4f}")
print(f"  Recall: {R_few_shot.mean().item():.4f}")
print(f"  F1 Score: {F1_few_shot.mean().item():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.52 seconds, 0.54 sentences/sec
Few-shot BERTScore:
  Precision: 0.8544
  Recall: 0.8602
  F1 Score: 0.8573


In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
sentence  = 'Hello World!'
tokenizer = AutoTokenizer.from_pretrained('bert-large-uncased')
model     = BertModel.from_pretrained('bert-large-uncased')

inputs    = tokenizer(sentence, return_tensors="pt").to(device)
model     = model.to(device)
outputs   = model(**inputs)