# Config and Imports

In [1]:
## Config
random_seed = 100
# data_path = "/kaggle/working/"
data_path = "/content"

In [3]:
%%capture
!pip install -U datasets
!pip install transformers datasets evaluate rouge_score --quiet
!pip uninstall keras -y
!pip install keras==2.11
!pip install bert_score

In [4]:
import evaluate
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import pandas as pd
from bert_score import score
import pickle
import os

# Load Data

In [5]:
# Full dataset (split included)
dataset = load_dataset("EdinburghNLP/xsum")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

xsum.py: 0.00B [00:00, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [6]:
# Few-shot examples from training set
train_examples = dataset["train"].select(range(2))

# Sample a subset of the test set for evaluation
test_sample = dataset["test"]
references = [example["summary"] for example in test_sample]

# Zero-Shot, One-Shot and Few-Shot Prompts

In [7]:
def build_zero_shot_prompt(doc):
    """Builds a zero-shot prompt."""
    prompt_template = "Summarize the input text.\n\n ### INPUT TEXT\nDocument:{}\nSummary:[Fill the summary]"
    return prompt_template.format(doc)

def build_one_shot_prompt(doc, train_example):
    """Builds a one-shot prompt with one example."""
    prompt = ""
    prompt += "Task: Summarize the input text. An example is provided below. \n"
    prompt += f"### EXAMPLE:\nDocument: {train_example['document'].strip()}\nSummary: {train_example['summary'].strip()}\n\n"
    prompt += f"### INPUT TEXT:\nDocument: {doc.strip()}\nSummary:[Fill the summary]"
    return prompt

def build_few_shot_prompt(doc, few_shots):
    """Builds a few-shot prompt with multiple examples."""
    prompt = ""
    prompt += "Task: Summarize the input text. Examples are provided below. \n"
    for ex in few_shots:
        prompt += f"### EXAMPLE:\nDocument: {ex['document'].strip()}\nSummary: {ex['summary'].strip()}\n\n"
    prompt += f"### INPUT TEXT:\nDocument: {doc.strip()}\nSummary:[Fill the summary]"
    return prompt

In [8]:
zero_shot_prompts = []
one_shot_prompts = []
few_shot_prompts = []

for document in test_sample:

    # Zero shot prompts
    prompt = build_zero_shot_prompt(document["document"])
    zero_shot_prompts.append(prompt)

    # # One shot prompts
    # prompt = build_one_shot_prompt(document["document"], train_examples[0])
    # one_shot_prompts.append(prompt)

    # # Few shot prompts
    # prompt = build_few_shot_prompt(document["document"], train_examples)
    # few_shot_prompts.append(prompt)

# Evaluating Prompting flan-T5-base

In [None]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=2024)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Move the model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_ = model.to(device)

In [None]:
def generate_prompt_output(prompts, model, device):
  # Generate few-shot predictions in batches
  batch_size = 20
  preds = []

  # Select a larger subset for demonstration
  subset = zero_shot_prompts

  for i in tqdm(range(0, len(prompts), batch_size)):
      batch_subset = []
      for j in range(i, min(i + batch_size, len(prompts))):
          batch_subset.append(prompts[j])

      batch_prompts = batch_subset

      # Tokenize and move inputs to the correct device
      inputs = tokenizer(batch_prompts, return_tensors="pt", truncation=True, max_length=2024, padding=True)
      inputs = {k: v.to(device) for k, v in inputs.items()}  # Fix: move input tensors to the device

      # Generate predictions
      outputs = model.generate(**inputs, max_length=64)  # Do NOT call .to(device) here

      # Decode predictions
      batch_preds = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
      preds.extend(batch_preds)

  return preds

In [None]:
# zero_shot_results = generate_prompt_output(zero_shot_prompts, model, device)

# file_name = "zero_shot_testset.pkl"
# file_path = os.path.join(data_path, file_name)

# try:
#     with open(file_path, 'wb') as f:
#         pickle.dump(zero_shot_results, f)
#     print(f"Successfully saved the list as pickle to: {file_path}")
# except Exception as e:
#     print(f"An error occurred while saving the pickle file: {e}")


# Read previosly computed results
file_path = "/kaggle/input/predictions/zero_shot_testset.pkl"

try:
    with open(file_path, 'rb') as f:
        zero_shot_results = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/predictions/zero_shot_testset.pkl


In [None]:
# one_shot_results = generate_prompt_output(one_shot_prompts, model, device)

# file_name = "one_shot_testset.pkl"
# file_path = os.path.join(data_path, file_name)

# try:
#     with open(file_path, 'wb') as f:
#         pickle.dump(one_shot_results, f)
#     print(f"Successfully saved the list as pickle to: {file_path}")
# except Exception as e:
#     print(f"An error occurred while saving the pickle file: {e}")

# Read previosly computed results
file_path = "/kaggle/input/predictions/one_shot_testset.pkl"

try:
    with open(file_path, 'rb') as f:
        one_shot_results = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/predictions/one_shot_testset.pkl


In [None]:
# # few_shot_results = generate_prompt_output(few_shot_prompts, model, device)

# Read previosly computed results
file_path = "/kaggle/input/few-shot-testset-pkl/few_shot_testset.pkl"

try:
    with open(file_path, 'rb') as f:
        few_shot_results = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/few-shot-testset-pkl/few_shot_testset.pkl


### Calculate Evaluation metrics

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Ensure you have the necessary evaluation metrics loaded
rouge = evaluate.load("rouge")

# --- Calculate Metrics ---

# Zero-shot
zero_shot_rouge = rouge.compute(predictions=zero_shot_results, references=references)
_, _, zero_shot_bertscore_f1 = score(zero_shot_results, references, lang="en", rescale_with_baseline=True, verbose=True)

# One-shot
one_shot_rouge = rouge.compute(predictions=one_shot_results, references=references)
_, _, one_shot_bertscore_f1 = score(one_shot_results, references, lang="en", rescale_with_baseline=True, verbose=True)

# Few-shot
few_shot_rouge = rouge.compute(predictions=few_shot_results, references=references)
_, _, few_shot_bertscore_f1 = score(few_shot_results, references, lang="en", verbose=True, rescale_with_baseline=True, device= device)

# --- Prepare Data for DataFrame ---

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]



done in 128.97 seconds, 87.88 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 130.53 seconds, 86.83 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/354 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]



done in 130.98 seconds, 86.53 sentences/sec


NameError: name 'model_name' is not defined

In [None]:
model_name = "google/flan-t5-base"
results = {
    ('Zero-shot', model_name): {
        'ROUGE-1': zero_shot_rouge['rouge1'],
        'ROUGE-2': zero_shot_rouge['rouge2'],
        'ROUGE-L': zero_shot_rouge['rougeL'],
        'BERTScore F1': zero_shot_bertscore_f1.mean().item()
    },
    ('One-shot', model_name): {
        'ROUGE-1': one_shot_rouge['rouge1'],
        'ROUGE-2': one_shot_rouge['rouge2'],
        'ROUGE-L': one_shot_rouge['rougeL'],
        'BERTScore F1': one_shot_bertscore_f1.mean().item()
    },
    ('Few-shot', model_name): {
        'ROUGE-1': few_shot_rouge['rouge1'],
        'ROUGE-2': few_shot_rouge['rouge2'],
        'ROUGE-L': few_shot_rouge['rougeL'],
        'BERTScore F1': few_shot_bertscore_f1.mean().item()
    }
}

# --- Create DataFrame ---

df_results = pd.DataFrame.from_dict(results, orient='index')

# Set the index names
df_results.index.names = ['Prompt Type', 'Model']

In [None]:
# Display the DataFrame
print(df_results)

                                  ROUGE-1   ROUGE-2   ROUGE-L  BERTScore F1
Prompt Type Model                                                          
Zero-shot   google/flan-t5-base  0.338012  0.118883  0.266840      0.392381
One-shot    google/flan-t5-base  0.337965  0.119797  0.267914      0.395298
Few-shot    google/flan-t5-base  0.337772  0.119434  0.268080      0.394133


In [None]:
file_name = "results.pkl"
file_path = os.path.join(data_path, file_name)

try:
    df_results.to_pickle(file_path)
    print(f"Successfully saved the DataFrame as pickle to: {file_path}")
except Exception as e:
    print(f"An error occurred while saving the pickle file: {e}")

Successfully saved the DataFrame as pickle to: /kaggle/working/results.pkl


In [None]:

print(zero_shot_results[1578],'\n',
one_shot_results[1578], '\n',
few_shot_results[1578])

A lifeboat crew in St Abbs have reopened their pagers after a row over the closure of the station. 
 A lifeboat station in the Borders has been reopened after a row over the closure of its lifeboat. 
 A lifeboat station in St Abbs has been reopened after a row over the closure of the station.


# Evaluate T5 Fine-tuned and pretrained

## Default decoder Strategy for Full Fine-Tuned

In [None]:
# import shutil

# model_path = "/kaggle/input/t5-finetuned/other/t5-finetuned/1/t5-small-xsum-finetuned_10000_training_10epoch.zip"
# model_name = "t5-small-xsum-finetuned_10000_training_10epoch"

# # Unzip the archive created earlier
# shutil.unpack_archive(model_path, model_name, format='zip')

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer from unzipped directory
# model_path = "t5-small-xsum-finetuned_10000_training_10epoch"
model_path = "/kaggle/input/t5-finetuned/other/t5-finetuned/1"
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
original_model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:
prompts = zero_shot_prompts

def generate_batch(model, prompts, tokenizer, batch_size=8, max_length=64):
    device = next(model.parameters()).device
    outputs = []

    for i in tqdm(range(0, len(prompts), batch_size), desc="Generating"):
        batch_prompts = prompts[i:i+batch_size]
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_length=max_length)
        decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        outputs.extend(decoded)

    return outputs

In [None]:
# original_outputs = generate_batch(original_model, prompts, tokenizer)

# file_name = "zero_shot_t5_original.pkl"
# file_path = os.path.join(data_path, file_name)

# try:
#     with open(file_path, 'wb') as f:
#         pickle.dump(original_outputs, f)
#     print(f"Successfully saved the list as pickle to: {file_path}")
# except Exception as e:
#     print(f"An error occurred while saving the pickle file: {e}")

# Read previosly computed results
file_path = "/kaggle/input/predictions/zero_shot_t5_original.pkl"

try:
    with open(file_path, 'rb') as f:
        original_outputs = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/predictions/zero_shot_t5_original.pkl


In [None]:
# fine_tuned_outputs = generate_batch(fine_tuned_model, prompts, tokenizer)

# file_name = "zero_shot_t5_fine-tuned.pkl"
# file_path = os.path.join(data_path, file_name)

# try:
#     with open(file_path, 'wb') as f:
#         pickle.dump(fine_tuned_outputs, f)
#     print(f"Successfully saved the list as pickle to: {file_path}")
# except Exception as e:
#     print(f"An error occurred while saving the pickle file: {e}")

# Read previosly computed results
file_path = "/kaggle/input/predictions/zero_shot_t5_fine-tuned.pkl"

try:
    with open(file_path, 'rb') as f:
        fine_tuned_outputs = pickle.load(f)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /kaggle/input/predictions/zero_shot_t5_fine-tuned.pkl


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# prompt: calculate scores for results and save in a df

# Ensure you have the necessary evaluation metrics loaded
rouge = evaluate.load("rouge")

# --- Calculate Metrics ---

# Original Model (t5-small)
original_rouge = rouge.compute(predictions=original_outputs, references=references)
_, _, original_bertscore_f1 = score(original_outputs, references, lang="en", verbose=True, rescale_with_baseline=True, device=device)

# Fine-tuned Model (t5-small-xsum-finetuned)
fine_tuned_rouge = rouge.compute(predictions=fine_tuned_outputs, references=references)
_, _, fine_tuned_bertscore_f1 = score(fine_tuned_outputs, references, lang="en", verbose=True, rescale_with_baseline=True, device=device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/353 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 175.23 seconds, 64.68 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/326 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]



done in 126.68 seconds, 89.47 sentences/sec


In [None]:
# --- Prepare Data for DataFrame ---

fine_tuned_results = {
    ('Zero Shot', 'T5-small'): {
        'ROUGE-1': original_rouge['rouge1'],
        'ROUGE-2': original_rouge['rouge2'],
        'ROUGE-L': original_rouge['rougeL'],
        'BERTScore F1': original_bertscore_f1.mean().item()
    },
    ('Zero Shot', "t5-small-xsum-finetuned_10000_training_10epoch"): {
        'ROUGE-1': fine_tuned_rouge['rouge1'],
        'ROUGE-2': fine_tuned_rouge['rouge2'],
        'ROUGE-L': fine_tuned_rouge['rougeL'],
        'BERTScore F1': fine_tuned_bertscore_f1.mean().item()
    }
}


# --- Create DataFrame ---

df_fine_tuned_results = pd.DataFrame.from_dict(fine_tuned_results, orient='index')

# Set the index names
df_fine_tuned_results.index.names = ['Prompt Type', 'Model']

# Display the DataFrame
print(df_fine_tuned_results)

                                                             ROUGE-1  \
Prompt Type Model                                                      
Zero Shot   T5-small                                        0.171081   
            t5-small-xsum-finetuned_10000_training_10epoch  0.225432   

                                                             ROUGE-2  \
Prompt Type Model                                                      
Zero Shot   T5-small                                        0.022468   
            t5-small-xsum-finetuned_10000_training_10epoch  0.053187   

                                                             ROUGE-L  \
Prompt Type Model                                                      
Zero Shot   T5-small                                        0.120879   
            t5-small-xsum-finetuned_10000_training_10epoch  0.174207   

                                                            BERTScore F1  
Prompt Type Model                                         

In [None]:
# --- Save DataFrame ---
file_name_fine_tuned = "fine_tuned_results.pkl"
file_path_fine_tuned = os.path.join(data_path, file_name_fine_tuned)

try:
    df_fine_tuned_results.to_pickle(file_path_fine_tuned)
    print(f"Successfully saved the fine-tuned DataFrame as pickle to: {file_path_fine_tuned}")
except Exception as e:
    print(f"An error occurred while saving the fine-tuned pickle file: {e}")

Successfully saved the fine-tuned DataFrame as pickle to: /kaggle/working/fine_tuned_results.pkl


## Greedy, Top K, Top P, Beaming for Full Fine-Tuned

In [12]:
file_name = "xsum_test_fullFT_preds.csv"
file_path = os.path.join(data_path, file_name)

try:
    fine_tuned_decoding_strategies_outputs = pd.read_csv(file_path)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /content/xsum_test_fullFT_preds.csv


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
spec_references = fine_tuned_decoding_strategies_outputs["summary"]
rouge = evaluate.load("rouge")
fine_tuned_results = {}

strategies = ["fullFT_greedy",	"fullFT_top_k",	"fullFT_top_p",	"fullFT_beam"]

for strategy in strategies:

  # Set the corresponding preds to the strategy
  spec_predictions = fine_tuned_decoding_strategies_outputs[strategy]

  # --- Calculate Metrics ---

  # Fine-tuned Model (t5-small-xsum-finetuned)
  fine_tuned_rouge = rouge.compute(predictions=spec_predictions, references=spec_references)
  _, _, fine_tuned_bertscore_f1 = score(list(spec_predictions), list(spec_references), lang="en", verbose=True, rescale_with_baseline=True, device=device)

  fine_tuned_results[(strategy, 'T5-small_full_finetuned')] = {
          'ROUGE-1': fine_tuned_rouge['rouge1'],
          'ROUGE-2': fine_tuned_rouge['rouge2'],
          'ROUGE-L': fine_tuned_rouge['rougeL'],
          'BERTScore F1': fine_tuned_bertscore_f1.mean().item()
      }


# --- Create DataFrame ---

df_fine_tuned_results = pd.DataFrame.from_dict(fine_tuned_results, orient='index')

# Set the index names
df_fine_tuned_results.index.names = ['Prompt Type', 'Model']

# Display the DataFrame
print(df_fine_tuned_results)

# --- Save DataFrame ---
file_name_fine_tuned = "littleTest_full_fine_tuned_strategies_results.csv"
file_path_fine_tuned = os.path.join(data_path, file_name_fine_tuned)

try:
    df_fine_tuned_results.to_csv(file_path_fine_tuned)
    print(f"Successfully saved the fine-tuned DataFrame as csv to: {file_path_fine_tuned}")
except Exception as e:
    print(f"An error occurred while saving the fine-tuned pickle file: {e}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.31 seconds, 0.79 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.80 seconds, 0.74 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.80 seconds, 0.86 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.65 seconds, 0.89 sentences/sec
                                        ROUGE-1   ROUGE-2   ROUGE-L  \
Prompt Type   Model                                                   
fullFT_greedy T5-small_full_finetuned  0.428276  0.234754  0.357471   
fullFT_top_k  T5-small_full_finetuned  0.207722  0.100000  0.191722   
fullFT_top_p  T5-small_full_finetuned  0.276048  0.066809  0.175142   
fullFT_beam   T5-small_full_finetuned  0.369066  0.167109  0.326540   

                                       BERTScore F1  
Prompt Type   Model                                  
fullFT_greedy T5-small_full_finetuned      0.462046  
fullFT_top_k  T5-small_full_finetuned      0.309543  
fullFT_top_p  T5-small_full_finetuned      0.269249  
fullFT_beam   T5-small_full_finetuned      0.394224  
Successfully saved the fine-tuned DataFrame as csv to: /content/littleTest_full_fine_tuned_strategies_results.csv


## Greedy, Top K, Top P, Beaming for LoRa

In [25]:
file_name = "xsum_test_lora_preds.csv"
file_path = os.path.join(data_path, file_name)

try:
    fine_tuned_decoding_strategies_outputs = pd.read_csv(file_path)
    print(f"Successfully loaded the list from pickle file: {file_path}")
    # Now you can work with the 'few_shot_preds' variable
    # For example, you can print the first few elements:
    # print(few_shot_preds[:5])
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred while loading the pickle file: {e}")

Successfully loaded the list from pickle file: /content/xsum_test_lora_preds.csv


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
spec_references = fine_tuned_decoding_strategies_outputs["summary"].head()
rouge = evaluate.load("rouge")
fine_tuned_results = {}

strategies = ["fullFT_greedy",	"fullFT_top_k",	"fullFT_top_p",	"fullFT_beam"]

for strategy in strategies:

  # Set the corresponding preds to the strategy
  spec_predictions = fine_tuned_decoding_strategies_outputs[strategy].head()

  # --- Calculate Metrics ---

  # Fine-tuned Model (t5-small-xsum-finetuned)
  fine_tuned_rouge = rouge.compute(predictions=spec_predictions, references=spec_references)
  _, _, fine_tuned_bertscore_f1 = score(list(spec_predictions), list(spec_references), lang="en", verbose=True, rescale_with_baseline=True, device=device)

  fine_tuned_results[(strategy, 'T5-small_full_finetuned')] = {
          'ROUGE-1': fine_tuned_rouge['rouge1'],
          'ROUGE-2': fine_tuned_rouge['rouge2'],
          'ROUGE-L': fine_tuned_rouge['rougeL'],
          'BERTScore F1': fine_tuned_bertscore_f1.mean().item()
      }


# --- Create DataFrame ---

df_fine_tuned_results = pd.DataFrame.from_dict(fine_tuned_results, orient='index')

# Set the index names
df_fine_tuned_results.index.names = ['Prompt Type', 'Model']

# Display the DataFrame
print(df_fine_tuned_results)

# --- Save DataFrame ---
file_name_fine_tuned = "littleTest_full_fine_tuned_strategies_results.csv"
file_path_fine_tuned = os.path.join(data_path, file_name_fine_tuned)

try:
    df_fine_tuned_results.to_csv(file_path_fine_tuned)
    print(f"Successfully saved the fine-tuned DataFrame as csv to: {file_path_fine_tuned}")
except Exception as e:
    print(f"An error occurred while saving the fine-tuned pickle file: {e}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.31 seconds, 0.79 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.80 seconds, 0.74 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.80 seconds, 0.86 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 5.65 seconds, 0.89 sentences/sec
                                        ROUGE-1   ROUGE-2   ROUGE-L  \
Prompt Type   Model                                                   
fullFT_greedy T5-small_full_finetuned  0.428276  0.234754  0.357471   
fullFT_top_k  T5-small_full_finetuned  0.207722  0.100000  0.191722   
fullFT_top_p  T5-small_full_finetuned  0.276048  0.066809  0.175142   
fullFT_beam   T5-small_full_finetuned  0.369066  0.167109  0.326540   

                                       BERTScore F1  
Prompt Type   Model                                  
fullFT_greedy T5-small_full_finetuned      0.462046  
fullFT_top_k  T5-small_full_finetuned      0.309543  
fullFT_top_p  T5-small_full_finetuned      0.269249  
fullFT_beam   T5-small_full_finetuned      0.394224  
Successfully saved the fine-tuned DataFrame as csv to: /content/littleTest_full_fine_tuned_strategies_results.csv


## Greedy, Top K, Top P, Beaming for ia3

## Greedy, Top K, Top P, Beaming for Full adapter

## Greedy, Top K, Top P, Beaming for prefix

# GRU with Attention

In [None]:
data_path = "/kaggle/input/xsum-test-with-preds-gru/xsum_test_with_preds_GRU.csv"

lstm_pred = pd.read_csv(data_path)

In [None]:
# drop 11156 index in test set since preds are missing that
references_modified = references.copy()
del references_modified[11155]

references_modified[11155]

'A 24-year-old man and a 23-year-old woman have been arrested in Berkshire on suspicion of preparing for acts of terrorism in the UK.'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# methods
strategies = ["pred_greedy", "pred_top_k",	"pred_top_p",	"pred_beam"]
rouge = evaluate.load("rouge")
lstm_results = {}

for strategy in strategies:

  lstm_preds = list(lstm_pred[strategy])

  # Calculate ROUGE and BERTScore for LSTM
  lstm_rouge = rouge.compute(predictions=lstm_preds, references=references_modified)
  _, _, lstm_bertscore_f1 = score(lstm_preds, references_modified, lang="en", verbose=True, rescale_with_baseline=True, device=device)

  # Add LSTM results to the overall results dictionary
  lstm_results[('-', 'GRU_' + strategy)] = {
      'ROUGE-1': lstm_rouge['rouge1'],
      'ROUGE-2': lstm_rouge['rouge2'],
      'ROUGE-L': lstm_rouge['rougeL'],
      'BERTScore F1': lstm_bertscore_f1.mean().item()
  }

# Update the DataFrame with LSTM results
df_results_updated = pd.DataFrame.from_dict(lstm_results, orient='index')
df_results_updated.index.names = ['Prompt Type', 'Model']

# Display the updated DataFrame
print(df_results_updated)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/351 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 108.81 seconds, 104.16 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/353 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 125.24 seconds, 90.49 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/353 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 127.66 seconds, 88.78 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/350 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/178 [00:00<?, ?it/s]

done in 124.07 seconds, 91.34 sentences/sec
                              ROUGE-1   ROUGE-2   ROUGE-L  BERTScore F1
Prompt Type Model                                                      
-           GRU_pred_greedy  0.189852  0.029122  0.139330      0.135189
            GRU_pred_top_k   0.147645  0.013676  0.108129      0.049020
            GRU_pred_top_p   0.137238  0.012181  0.102272      0.034133
            GRU_pred_beam    0.189561  0.029555  0.139431      0.138498
An error occurred while saving the updated pickle file: Cannot save file into a non-existent directory: '/kaggle/input/xsum-test-with-preds-gru/xsum_test_with_preds_GRU.csv'


In [None]:
# Save the updated DataFrame
file_name_updated = "all_GRU_results.pkl"
file_path_updated = os.path.join(data_path, file_name_updated)

try:
    df_results_updated.to_pickle(file_path_updated)
    print(f"Successfully saved the updated DataFrame as pickle to: {file_path_updated}")
except Exception as e:
    print(f"An error occurred while saving the updated pickle file: {e}")

Successfully saved the updated DataFrame as pickle to: /kaggle/working/all_GRU_results.pkl


In [None]:
df_results_updated

Unnamed: 0_level_0,Unnamed: 1_level_0,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore F1
Prompt Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-,GRU_pred_greedy,0.189852,0.029122,0.13933,0.135189
-,GRU_pred_top_k,0.147645,0.013676,0.108129,0.04902
-,GRU_pred_top_p,0.137238,0.012181,0.102272,0.034133
-,GRU_pred_beam,0.189561,0.029555,0.139431,0.138498


# Final Comparison of all results

In [None]:
df_results

Unnamed: 0_level_0,Unnamed: 1_level_0,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore F1
Prompt Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Zero-shot,google/flan-t5-base,0.338012,0.118883,0.26684,0.392381
One-shot,google/flan-t5-base,0.337965,0.119797,0.267914,0.395298
Few-shot,google/flan-t5-base,0.337772,0.119434,0.26808,0.394133


In [None]:
df_fine_tuned_results

Unnamed: 0_level_0,Unnamed: 1_level_0,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore F1
Prompt Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Zero Shot,T5-small,0.171081,0.022468,0.120879,0.088924
Zero Shot,t5-small-xsum-finetuned_10000_training_10epoch,0.225432,0.053187,0.174207,0.147041


In [None]:
df_results_updated

Unnamed: 0_level_0,Unnamed: 1_level_0,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore F1
Prompt Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-,GRU_pred_greedy,0.189852,0.029122,0.13933,0.135189
-,GRU_pred_top_k,0.147645,0.013676,0.108129,0.04902
-,GRU_pred_top_p,0.137238,0.012181,0.102272,0.034133
-,GRU_pred_beam,0.189561,0.029555,0.139431,0.138498


In [None]:
final_results_test = pd.concat([df_results, df_fine_tuned_results, df_results_updated])
final_results_test

Unnamed: 0_level_0,Unnamed: 1_level_0,ROUGE-1,ROUGE-2,ROUGE-L,BERTScore F1
Prompt Type,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Zero-shot,google/flan-t5-base,0.338012,0.118883,0.26684,0.392381
One-shot,google/flan-t5-base,0.337965,0.119797,0.267914,0.395298
Few-shot,google/flan-t5-base,0.337772,0.119434,0.26808,0.394133
Zero Shot,T5-small,0.171081,0.022468,0.120879,0.088924
Zero Shot,t5-small-xsum-finetuned_10000_training_10epoch,0.225432,0.053187,0.174207,0.147041
-,GRU_pred_greedy,0.189852,0.029122,0.13933,0.135189
-,GRU_pred_top_k,0.147645,0.013676,0.108129,0.04902
-,GRU_pred_top_p,0.137238,0.012181,0.102272,0.034133
-,GRU_pred_beam,0.189561,0.029555,0.139431,0.138498


In [None]:
# --- Save DataFrame ---
file_name = "GRU_T5-original-vs-ft_Flat-T5_results_test.pkl"
file_path = os.path.join(data_path, file_name)

try:
    final_results_test.to_pickle(file_path)
    print(f"Successfully saved the fine-tuned DataFrame as pickle to: {file_path}")
except Exception as e:
    print(f"An error occurred while saving the fine-tuned pickle file: {e}")

Successfully saved the fine-tuned DataFrame as pickle to: /kaggle/working/GRU_T5-original-vs-ft_Flat-T5_results_test.pkl
