# Importing packages and defining a function to calculate the ROUGE score

In [None]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig

from datasets import load_metric
# Load the ROUGE metric once; `calc_rouge_scores` reuses this module-level object.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("rouge")

def calc_rouge_scores(candidates, references):
    """Score generated summaries against reference summaries with ROUGE.

    Args:
        candidates: Model-generated summaries, passed to the metric as
            ``predictions``.
        references: Reference summaries, passed to the metric as
            ``references``.

    Returns:
        A dict keyed by the names the metric reports, where each value is the
        ``mid`` F-measure scaled to a percentage and rounded to one decimal.
    """
    raw_scores = metric.compute(
        predictions=candidates,
        references=references,
        use_stemmer=True,
    )

    percentages = {}
    for rouge_name, aggregate in raw_scores.items():
        # Keep only the mid-point F-measure, expressed as a percentage.
        percentages[rouge_name] = round(aggregate.mid.fmeasure * 100, 1)
    return percentages

# Importing the data

In [None]:
# Load the evaluation data; later cells read its "article" and "highlights" columns.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
file_path = "C:\\Users\\cxs3\\Downloads\\test.csv"
df = pd.read_csv(file_path)

# Importing the t5 model

In [None]:
# Checkpoint identifier used for both the model and the tokenizer.
model_name='google/flan-t5-base'
# Load the sequence-to-sequence model for that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Load the tokenizer for the same checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Applying Zero Shot Inference:

### The model's maximum input length is 512 tokens. Any longer input will cause indexing errors.


In [None]:
# Upper bound on the tokenized prompt length; passed to the tokenizer as `max_length`.
MAX_TOKENS = 512
# Number of leading rows of `df` that are summarized and scored.
text_size = 300

In [None]:
# Zero-shot baseline: summarize the first `text_size` articles with a bare
# "Summarize:" instruction and collect the decoded outputs in order.
summerization = []

for article in df["article"].iloc[:text_size]:
    prompt = f"Summarize: {article}"

    # The model receives the full prompt (not the raw article); anything past
    # MAX_TOKENS tokens is truncated so the input fits the model.
    inputs = tokenizer(
        prompt,
        max_length=MAX_TOKENS,
        truncation=True,
        return_tensors='pt',
    )
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)

    # Only one sequence is generated per article; decode it to plain text.
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    summerization.append(output)

In [None]:
# Human-written reference summaries for the first five rows
list(df["highlights"][:5])

['Experts question if  packed out planes are putting passengers at risk .\nU.S consumer advisory group says minimum space must be stipulated .\nSafety tests conducted on planes with more leg room than airlines offer .',
 "Drunk teenage boy climbed into lion enclosure at zoo in west India .\nRahul Kumar, 17, ran towards animals shouting 'Today I kill a lion!'\nFortunately he fell into a moat before reaching lions and was rescued .",
 "Nottingham Forest are close to extending Dougie Freedman's contract .\nThe Forest boss took over from former manager Stuart Pearce in February .\nFreedman has since lead the club to ninth in the Championship .",
 'Fiorentina goalkeeper Neto has been linked with Liverpool and Arsenal .\nNeto joined Firoentina\xa0from Brazilian outfit Atletico Paranaense in 2011 .\nHe is also wanted by PSG and Spanish clubs, according to his agent .\nCLICK HERE for the latest Liverpool news .',
 "Tell-all interview with the reality TV star, 69, will air on Friday April 24 .\

In [None]:
# Model-generated summaries for the first five articles
summerization[:5]

['Experts say the shrinking space on planes is putting our health and safety in danger.',
 'A teenager who climbed into a lion enclosure at a zoo in India has been rescued by zoo staff.',
 "Dougie Freedman is set to sign a new two-year deal at Nottingham Forest. Forest's owners are pleased with the job he has done at the City Ground.",
 'Brazilian goalkeeper is wanted by Liverpool and PSG. Brazilian is wanted by PSG and Spain clubs.',
 "Bruce Jenner will speak out in a 'far-ranging' interview with Sawyer. The former Olympian and reality TV star will speak out in a 'far-ranging' interview with Sawyer. The interview comes amid growing"]

In [None]:
# ROUGE scores of the zero-shot summaries against the matching human highlights
calc_rouge_scores(summerization,df["highlights"][:text_size])

{'rouge1': 34.0, 'rouge2': 14.7, 'rougeL': 24.8, 'rougeLsum': 29.9}

# Prompt Engineering

In [None]:
# Same pipeline as the zero-shot baseline, but with a more explicit
# instruction wrapped around each article.
summerization = []

for article in df["article"].iloc[:text_size]:
    # modified prompt
    prompt = f"Summarize the following news article:\n\n\n{article}\n\n\nProvide a concise summary:"

    # Tokenize the prompt (truncated to MAX_TOKENS tokens), generate, then decode.
    inputs = tokenizer(
        prompt,
        max_length=MAX_TOKENS,
        truncation=True,
        return_tensors='pt',
    )
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    summerization.append(output)

In [None]:
# Model-generated summaries for the first five articles
summerization[:5]

['Experts say the shrinking space on planes is putting our health and safety in danger.',
 "A 17-year-old boy has been rescued after he climbed into a lions' enclosure at a zoo in Ahmedabad.",
 "Dougie Freedman is set to sign a new two-year deal at Nottingham Forest. Forest's owners are pleased with the job he has done at the City Ground.",
 'Brazilian goalkeeper is wanted by Liverpool and PSG. Brazilian is wanted by PSG and Spain clubs.',
 "Bruce Jenner will speak out in a 'far-ranging' interview with Sawyer. The former Olympian and reality TV star will speak out in a 'far-ranging' interview with Sawyer. The interview comes amid growing"]

In [None]:
# ROUGE scores of the prompt-engineered summaries against the matching human highlights
calc_rouge_scores(summerization,df["highlights"][:text_size])

{'rouge1': 34.8, 'rouge2': 15.0, 'rougeL': 25.5, 'rougeLsum': 30.6}

# Parameter Tuning with Top-p and Top-K Sampling



In [None]:
# Summarize the first `text_size` articles using beam search combined with
# top-k / top-p sampling, an n-gram repetition block and a length penalty.
# NOTE(review): sampling is enabled and no seed is set in this cell, so the
# summaries and the ROUGE scores derived from them vary between runs.
summerization = []

for article in df["article"].iloc[:text_size]:
    # modified prompt
    prompt = f"Summarize the following news article:\n\n\n{article}\n\n\nProvide a concise summary:"

    # Tokenize the prompt (truncated to MAX_TOKENS tokens) and generate a summary.
    inputs = tokenizer(prompt, return_tensors='pt', max_length=MAX_TOKENS, truncation=True)
    generated_ids = model.generate(
        inputs["input_ids"],
        # tuning parameters
        max_new_tokens=50,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        # Only the first returned sequence is kept, so request a single one
        # instead of generating three and discarding two.
        num_return_sequences=1,
        num_beams=5,
        length_penalty=2.0,
        no_repeat_ngram_size=2,
    )
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    summerization.append(output)

In [None]:
# Model-generated summaries for the first five articles
summerization[:5]

['Experts are questioning if the shrinking space on aeroplanes is putting our health and safety in danger. This week, a U.S consumer advisory group set up by the Department of Transportation said that while the government is',
 "A 17-year-old boy has been rescued after jumping into lions' enclosure at a zoo in Ahmedabad and shouting he would 'kill them' The teenager was sitting near the enclosure when ",
 "Dougie Freedman is set to sign a new two-year deal at Nottingham Forest. Forest's owners are pleased with the job he has done at the City Ground. They made an audacious attempt on the play-",
 "Liverpool target Neto is also wanted by PSG and clubs in Spain. Brazilian's agent Stefano Castagna says he has no decision yet about his future. Fiorentina goalkeeper has been linked with a move to",
 "Former Olympian and reality TV star will speak out in a 'far-ranging' interview with Sawyer. The interview comes amid growing speculation about the father-of-six's gender identity. He also split

In [None]:
# ROUGE scores of the sampled summaries against the matching human highlights
calc_rouge_scores(summerization,df["highlights"][:text_size])

{'rouge1': 37.8, 'rouge2': 16.7, 'rougeL': 26.6, 'rougeLsum': 32.5}

# Applying One-Shot Inference:

### The model's maximum input length is 512 tokens, and truncation is applied when the input text is too long. This makes one-shot and few-shot learning difficult: the text we want to summarize is usually truncated away, leaving only the example summarization.

In [None]:
def make_prompt(example_indices_full, text_to_summarize):
    """Build a one-/few-shot summarization prompt.

    Args:
        example_indices_full: Index values used to look up rows in the global
            ``df``; each row's "article" and "highlights" become one worked
            example.
        text_to_summarize: The article the model should summarize.

    Returns:
        The worked examples, in the given order, followed by the same
        template for ``text_to_summarize`` with the summary left open.
    """
    instruction = "Summarize the following news article:"
    cue = "Provide a concise summary:"

    # One solved block per example row: article, cue, then its reference summary.
    solved_blocks = [
        f"\n{instruction}\n\n{df['article'][row]}\n\n{cue}\n\n{df['highlights'][row]}\n\n"
        for row in example_indices_full
    ]

    # The final block stops right after the cue so the model supplies the summary.
    open_block = f"\n{instruction}\n\n{text_to_summarize}\n\n{cue}\n"

    return "".join(solved_blocks) + open_block

In [None]:
# One-shot run on the first five articles, using row 1001 as the worked example.
# NOTE(review): the example block comes first in the prompt and the prompt is
# truncated to MAX_TOKENS, so presumably the article to summarize is what
# gets cut off — consistent with the identical outputs recorded for this run.
one_shot_summerization = []
example_indices_full = [1001]

for article in df["article"].iloc[:5]:
    # one-shot prompt
    prompt = make_prompt(example_indices_full, article)

    # Tokenize the prompt, generate with default decoding, then decode to text.
    inputs = tokenizer(
        prompt,
        max_length=MAX_TOKENS,
        truncation=True,
        return_tensors='pt',
    )
    generated_ids = model.generate(inputs["input_ids"], max_new_tokens=50)
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    one_shot_summerization.append(output)

In [None]:
# Model-generated one-shot summaries for the first five articles
one_shot_summerization[:5]

['Chancellor pledges to double number of first-time buyers buying their first home. He wants at least 2.4 million more over next five years.',
 'Chancellor pledges to double number of first-time buyers buying their first home. He wants at least 2.4 million more over next five years.',
 'Chancellor pledges to double number of first-time buyers buying their first home. He wants at least 2.4 million more over next five years.',
 'Chancellor pledges to double number of first-time buyers buying their first home. He wants at least 2.4 million more over next five years.',
 'Chancellor pledges to double number of first-time buyers buying their first home. He wants at least 2.4 million more over next five years.']

In [None]:
# ROUGE scores of the one-shot summaries against the first five human highlights
calc_rouge_scores(one_shot_summerization,df["highlights"][:5])

{'rouge1': 7.8, 'rouge2': 0.0, 'rougeL': 6.5, 'rougeLsum': 7.1}