In [None]:
!pip install evaluate
!pip install rouge_score
!pip install transformers
!pip install bert-score

# Checking the CNN/DailyMail test dataset

In [1]:
import pandas as pd

# Load the test CSV into a DataFrame and print its (rows, columns) shape.
df = pd.read_csv("CNN-dailymail_dataset/test.csv")
print(df.shape)


(11490, 3)


In [3]:
# Preview the first rows to see the available columns (id, article, highlights).
df.head()

Unnamed: 0,id,article,highlights
0,92c514c913c0bdfe25341af9fd72b29db544099b,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,2003841c7dc0e7c5b1a248f9cd536d727f27a45a,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,91b7d2311527f5c2b63a65ca98d21d9c92485149,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,caabf9cbdf96eb1410295a673e953d304391bfbb,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,3da746a7d9afcaa659088c8366ef6347fe6b53ea,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [2]:
# Examining the first article and its summary
print(df.iloc[0]['article'], '\n')
print(df.iloc[0]['highlights'])

Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than fighting for space 

# Gathering the results in a CSV file

In [1]:
import json
import boto3

In [8]:
# Name of the deployed endpoint to invoke; replace the placeholder before running.
endpoint_name = 'place_your_endpoint_name_here'


def query_endpoint(encoded_text):
    """Send already-encoded text to the configured endpoint and return the raw response.

    Args:
        encoded_text: The request body, passed through unchanged as ``Body``.

    Returns:
        Whatever ``invoke_endpoint`` returns for the module-level ``endpoint_name``.
    """
    runtime = boto3.client('runtime.sagemaker')
    return runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/x-text',
        Body=encoded_text,
    )

def parse_response(response):
    """Extract the summary text from an endpoint response.

    Args:
        response: A mapping whose ``'Body'`` entry has a ``read()`` method
            returning a JSON document.

    Returns:
        The value stored under ``'summary_text'`` in the decoded JSON.
    """
    raw_payload = response['Body'].read()
    decoded = json.loads(raw_payload)
    return decoded['summary_text']

def get_summary(input_text):
    """Summarise ``input_text`` by querying the endpoint and parsing its reply.

    The text is UTF-8 encoded, sent through ``query_endpoint``, and the raw
    response is handed to ``parse_response``.

    Args:
        input_text: The text to summarise (a ``str``).

    Returns:
        The value ``parse_response`` extracts from the endpoint response.

    Raises:
        Exception: If the endpoint call fails with error code ``'ModelError'``,
            or if parsing the response raises ``TypeError``/``KeyError``. The
            original exception is attached as ``__cause__``.
        Any other exception raised while querying the endpoint is re-raised
        unchanged.
    """
    try:
        query_response = query_endpoint(input_text.encode('utf-8'))
    except Exception as e:
        # Not every exception carries a `.response` dict (botocore's ClientError
        # does; a plain network or encoding error does not). Look the code up
        # defensively so the original error is not masked by an AttributeError.
        error_info = getattr(e, 'response', None)
        error_code = None
        if isinstance(error_info, dict):
            error_details = error_info.get('Error') or {}
            if isinstance(error_details, dict):
                error_code = error_details.get('Code')

        if error_code == 'ModelError':
            raise Exception(
                 f"To use this notebook, please launch the endpoint again. Error: {e}."
            ) from e
        raise

    try:
        summary_text = parse_response(query_response)
    except (TypeError, KeyError) as e:
        # Keep the generic Exception type callers may rely on, but preserve the cause.
        raise Exception(e) from e

    return summary_text


In [9]:
# Quick manual check: run get_summary on a short sample paragraph.
article = """
Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018, 
in an area of more than 105 square kilometres (41 square miles). The City of Paris is the centre and seat of government of the region and province of Île-de-France, 
or Paris Region, which has an estimated population of 12,174,880, or about 18 percent of the population of France as of 2017.
"""

# The returned summary string is displayed as the cell output.
get_summary(article)

' Paris is the capital and most populous city of France, with an estimated population of 2,175,601 residents as of 2018 . It is the centre and seat of government of the region and province of Île-de-France, the Paris Region, which has about 18 percent of the population of France .'

In [None]:
import csv
import time
from transformers import AutoTokenizer

# Input and output file paths
input_csv = "CNN-dailymail_dataset/test.csv"
output_csv = "CNN-dailymail_dataset/test_results_distilbart-cnn-12-6.csv"

# The tokenizer is used locally only to count tokens; the summaries themselves
# come from get_summary().
model_name = "sshleifer/distilbart-cnn-12-6"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Articles that tokenize to more than this many ids are skipped instead of being
# summarised (treated here as the model's context window).
max_input_tokens = 1024

count = 0  # rows read from the input CSV; also the article index shown in the log
invoc = 0  # rows actually summarised and written to the output CSV

with open(input_csv, mode='r', newline='', encoding='utf-8') as infile, \
     open(output_csv, mode='w', newline='', encoding='utf-8') as outfile:

    reader = csv.DictReader(infile)
    # The output keeps every input column and appends the generated summary.
    # `reader.fieldnames` is None for an empty input file, so fall back to [].
    fieldnames = list(reader.fieldnames or []) + ['model_summary']
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)
    writer.writeheader()
    start_time = time.time()

    for row in reader:
        article = row['article']
        # Only the token count is needed, so no extra outputs are requested;
        # truncation is disabled so the full length is measured.
        input_ids = tokenizer(article, truncation=False)["input_ids"]

        # Do not summarize article if longer than model's context window
        if len(input_ids) > max_input_tokens:
            print(f"Article {count}: rejected too long")
        else:
            model_summary = get_summary(article)

            # Add new field and write to output CSV
            row['model_summary'] = model_summary
            writer.writerow(row)
            invoc += 1
            print(f"Article {count}: Summarized")

        count += 1

    end_time = time.time()

elapsed_time = end_time - start_time
print(f"Loop took {elapsed_time:.4f} seconds to run {invoc} invocations")
print(f"Results saved to {output_csv}")


# Evaluating the outputs

In [1]:
import evaluate
from bert_score import BERTScorer

2025-10-03 07:33:42.803381: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Calculating metrics with example data
# https://huggingface.co/spaces/evaluate-metric/rouge
# https://huggingface.co/spaces/evaluate-metric/meteor
# https://pypi.org/project/bert-score/

# These two metric objects are module-level and are reused by later cells
# (evaluate_model reads `rouge` and `meteor` as globals).
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Two toy prediction/reference pairs; predictions[i] is scored against references[i].
predictions = [
    "The cat is on the mat",
    "There is a cat playing outside"
]
references = [
    "A cat is sitting on the mat",
    "The cat is playing in the garden"
]

# Rouge and Meteor score
rouge_result = rouge.compute(predictions=predictions, references=references)
meteor_result = meteor.compute(predictions=predictions, references=references)

print("ROUGE:", rouge_result)          # e.g., ROUGE-1, ROUGE-2, ROUGE-L
print("METEOR:", meteor_result)        # METEOR score

# BERTScore mean calculation
# score() returns precision, recall and F1 values; their means are reported.
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score(predictions, references)
print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

[nltk_data] Downloading package wordnet to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sagemaker-
[nltk_data]     user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


ROUGE: {'rouge1': 0.6153846153846153, 'rouge2': 0.2727272727272727, 'rougeL': 0.5384615384615384, 'rougeLsum': 0.5384615384615384}
METEOR: {'meteor': 0.45942028985507244}
BERTScore Precision: 0.7764, Recall: 0.7414, F1: 0.7585


In [21]:
import pandas as pd

def evaluate_model(filepath: str) -> None:
    """
    Read a CSV of reference and predicted summaries, then print ROUGE, METEOR and BERTScore.

    The file must contain a 'highlights' column (references) and a 'model_summary'
    column (predictions). The module-level `rouge` and `meteor` metric objects
    must already be loaded (e.g. with `evaluate.load(...)`) before this is called.

    Empty CSV fields are scored as empty strings: pandas would otherwise read them
    as NaN (a float), which the text metrics cannot process.

    Parameters
    ----------
    filepath : str
        Path to the results CSV.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    scores_df = pd.read_csv(filepath, usecols=['highlights', 'model_summary'])
    # Replace NaN (empty fields) with "" and make sure every value is a string,
    # so one empty summary does not abort the whole evaluation.
    scores_df = scores_df.fillna("").astype(str)
    references = scores_df["highlights"].tolist()
    summaries = scores_df["model_summary"].tolist()

    rouge_result = rouge.compute(predictions=summaries, references=references)
    meteor_result = meteor.compute(predictions=summaries, references=references)
    scorer = BERTScorer(model_type='bert-base-uncased')
    P, R, F1 = scorer.score(summaries, references)

    print("ROUGE:", rouge_result) # e.g., ROUGE-1, ROUGE-2, ROUGE-L
    print("METEOR:", meteor_result) # METEOR score
    print(f"BERTScore Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}") # BERTScore mean calculation

In [22]:
# Score the results file written by the distilbart-cnn-12-6 summarisation run.
evaluate_model('CNN-dailymail_dataset/test_results_distilbart-cnn-12-6.csv')

ROUGE: {'rouge1': 0.43929470113957625, 'rouge2': 0.21863213112457253, 'rougeL': 0.3084659206447926, 'rougeLsum': 0.37471154157799225}
METEOR: {'meteor': 0.43127707155349815}
BERTScore Precision: 0.6292, Recall: 0.6880, F1: 0.6560


In [23]:
# Score a second results file for comparison. NOTE(review): the run that produced
# this file is not shown in the notebook; presumably the summarisation cell was
# re-run with a different endpoint and output path. TODO confirm.
evaluate_model('CNN-dailymail_dataset/test_results_bart-large-cnn-samsum.csv')

ROUGE: {'rouge1': 0.42978785824928817, 'rouge2': 0.20492213736104845, 'rougeL': 0.3011838626506055, 'rougeLsum': 0.36580389684290815}
METEOR: {'meteor': 0.4047433686675237}
BERTScore Precision: 0.6327, Recall: 0.6708, F1: 0.6496
