In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google-t5/t5-small"

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Optionally, load the corresponding tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Model and tokenizer loaded successfully!")


Model and tokenizer loaded successfully!


In [8]:
from datasets import load_dataset
import os

# Update this to your actual file path
data_path = "/kaggle/input/assignment2nlp/train.csv"  # Example: "/kaggle/input/wiki-data/train.csv"

# Check if the file exists
if not os.path.exists(data_path):
    raise FileNotFoundError(f"File not found at: {data_path}")

# Load the dataset from the full path
dataset = load_dataset("csv", data_files={"train": data_path})["train"]

# Split off 500 examples for validation
split_dataset = dataset.train_test_split(test_size=500, seed=42)
train_dataset = split_dataset["train"]
validation_dataset = split_dataset["test"]

# Print stats
print(f"Total loaded: {len(dataset)} examples")
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(validation_dataset)}")


Total loaded: 13879 examples
Training set size: 13379
Validation set size: 500


In [11]:
import os
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict # Make sure Dataset, DatasetDict are imported if using them directly
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
)
import nltk # Needed for rouge calculation later, but good to import early
import numpy as np

# Ensure NLTK's punkt tokenizer is available (needed for ROUGE)
try:
    nltk.data.find('tokenizers/punkt')
except nltk.downloader.DownloadError:
    nltk.download('punkt', quiet=True)

print("Imports successful.")

# --- 1. Define Model Name ---
model_name = "google-t5/t5-small"
print(f"Using model: {model_name}")

# --- 2. Load Tokenizer and Model ---
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    print("Tokenizer and model loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Handle error appropriately, maybe raise it again if critical
    raise

# --- 3. Load and Prepare Datasets (ASSUMES train_dataset and validation_dataset exist) ---
# Make sure these variables hold your datasets from the previous split
# Example of how they might have been created:
#
# data_path_train = "/kaggle/input/assignment2nlp/train.csv"
# data_path_test = "/kaggle/input/assignment2nlp/test.csv" # Define test path as well
#
# if not os.path.exists(data_path_train):
#     raise FileNotFoundError(f"Train file not found at: {data_path_train}")
# if not os.path.exists(data_path_test):
#      raise FileNotFoundError(f"Test file not found at: {data_path_test}")
#
# Load the dataset from the full path
# full_dataset = load_dataset("csv", data_files={"train": data_path_train})["train"]
# test_data_raw = load_dataset("csv", data_files={"test": data_path_test})["test"] # Load test set here too
#
# Split off 500 examples for validation
# split_dataset = full_dataset.train_test_split(test_size=500, seed=42)
# train_dataset = split_dataset["train"]
# validation_dataset = split_dataset["test"]
#
# print(f"Training set size: {len(train_dataset)}")
# print(f"Validation set size: {len(validation_dataset)}")
# print(f"Test set size: {len(test_data_raw)}") # Print test set size

# --- Check if datasets exist (replace with your actual variable names) ---
if 'train_dataset' not in locals() or 'validation_dataset' not in locals():
     raise NameError("Variables 'train_dataset' and 'validation_dataset' are not defined. Please run the data loading and splitting code first.")
print("Train and validation datasets are ready.")


# --- 4. Define Preprocessing/Tokenization Function ---
def tokenize_function(examples):
    # Ensure 'text' and 'title' columns exist and handle potential None values
    texts = [str(t) if t is not None else "" for t in examples['text']]
    titles = [str(t) if t is not None else "" for t in examples['title']]

    # Add prefix for T5 models (optional but recommended for summarization/title generation)
    prefix = "summarize: " # Or "generate title: "
    texts = [prefix + text for text in texts]

    # Tokenize inputs
    model_inputs = tokenizer(
        texts, # Use prefixed texts
        max_length=512,
        truncation=True,
        padding="max_length" # Pad to max_length during tokenization
    )

    # Tokenize targets using text_target argument
    labels = tokenizer(
        text_target=titles,
        max_length=64, # Max length for titles
        truncation=True,
        padding="max_length" # Pad to max_length during tokenization
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# --- 5. Apply Tokenization ---
print("Tokenizing datasets...")
try:
    tokenized_train = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=train_dataset.column_names # Remove original text/title columns
    )
    tokenized_validation = validation_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=validation_dataset.column_names # Remove original text/title columns
    )
    print("Tokenization finished.")
    print(f"Columns in tokenized_train: {tokenized_train.column_names}")
except Exception as e:
    print(f"Error during tokenization: {e}")
    raise

# --- 6. Define Data Collator ---
# Pads sequences dynamically to the longest sequence in a batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
print("Data collator defined.")

# --- Define Training Arguments ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_c1_t5_small",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    fp16=False,
    report_to="none" # <--- ADD THIS LINE
)
print("Training arguments defined (wandb disabled).")

# --- 8. Initialize Trainer ---
# We'll add compute_metrics later for evaluation
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics, # We will define and add this for evaluation in the next step
)
print("Seq2SeqTrainer initialized.")

Imports successful.
Using model: google-t5/t5-small
Tokenizer and model loaded successfully.
Train and validation datasets are ready.
Tokenizing datasets...
Tokenization finished.
Columns in tokenized_train: ['input_ids', 'attention_mask', 'labels']
Data collator defined.
Training arguments defined (wandb disabled).
Seq2SeqTrainer initialized.


  trainer = Seq2SeqTrainer(


In [12]:
# --- 9. Start Training ---
print("Starting training...")
try:
    train_result = trainer.train()
    print("Training finished successfully.")

    # Optional: Save metrics and final model
    metrics = train_result.metricsz
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    # Save the final model
    # trainer.save_model("./results_c1_t5_small/final_model")
    # print("Final model saved.")

except Exception as e:
    print(f"An error occurred during training: {e}")
    # You might want to add more specific error handling or logging here

Starting training...


Step,Training Loss
100,2.9632
200,0.1424
300,0.0811
400,0.0801
500,0.0647
600,0.0585
700,0.0524
800,0.0475
900,0.0561
1000,0.0379


Training finished successfully.
An error occurred during training: 'TrainOutput' object has no attribute 'metricsz'


In [18]:
import pandas as pd
import torch
from datasets import load_dataset
from tqdm.notebook import tqdm # Or from tqdm import tqdm
import time
import evaluate # Use Hugging Face's evaluate library
import nltk
import numpy as np

# --- 1. Load Test Data ---
test_data_path = "/kaggle/input/assignment2nlp/test.csv" # CHANGE TO YOUR PATH
print(f"Loading test data from: {test_data_path}")
try:
    # Load using datasets library to be consistent if train/val were loaded that way
    # Or use pandas: test_df = pd.read_csv(test_data_path)
    raw_test_dataset = load_dataset("csv", data_files={"test": test_data_path})["test"]

    # Extract texts and reference titles
    # Handle potential None values
    test_texts = [str(t) if t is not None else "" for t in raw_test_dataset['text']]
    reference_titles = [str(t) if t is not None else "" for t in raw_test_dataset['title']]
    print(f"Loaded {len(test_texts)} test examples.")
    # Optional: Use a smaller subset for faster debugging
    # subset_size = 10
    # test_texts = test_texts[:subset_size]
    # reference_titles = reference_titles[:subset_size]
    # print(f"Using subset of {len(test_texts)} examples for testing.")

except Exception as e:
    print(f"Error loading or processing test data: {e}")
    raise

# --- 2. Ensure Model and Tokenizer are Ready ---
# Assuming 'trainer' and 'tokenizer' are still available from the training step
# If not, load them:
# model_path = "./results_c1_t5_small" # Path where trainer saved the model
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Get the fine-tuned model from the trainer
model = trainer.model
model.eval() # Set model to evaluation mode

# Define the prefix used during training (if any)
prefix = "summarize: " # Make sure this matches the prefix used in tokenize_function

# --- 3. Generation Function (can be adapted from C2) ---
def generate_predictions(model, tokenizer, texts, prefix, device, num_beams=1, max_length=64):
    """Generates titles for a list of texts."""
    generated_titles = []
    model.eval() # Ensure model is in eval mode
    with torch.no_grad():
        for text in tqdm(texts, desc=f"Generating (beams={num_beams})"):
            # Prepare input
            input_text = prefix + text
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True)
            inputs = inputs.to(device)

            # Generate
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=max_length,
                num_beams=num_beams,
                early_stopping=True if num_beams > 1 else False # Early stopping mainly for beam search
            )

            # Decode
            prediction = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
            generated_titles.append(prediction)
    return generated_titles

# --- 4. Generate Predictions - Greedy Search ---
print("\n--- Generating predictions (Greedy Search) ---")
start_time_greedy = time.time()
greedy_predictions = generate_predictions(model, tokenizer, test_texts, prefix, device, num_beams=1)
end_time_greedy = time.time()
time_greedy = end_time_greedy - start_time_greedy
print(f"Greedy generation finished in {time_greedy:.2f} seconds.")
print("Example Greedy Predictions:")
for i in range(min(3, len(greedy_predictions))):
    print(f"  Ref: {reference_titles[i]}")
    print(f"  Gen: {greedy_predictions[i]}\n")


# --- 5. Generate Predictions - Beam Search ---
BEAM_SIZE = 4 # You can choose a different beam size
print(f"\n--- Generating predictions (Beam Search, Num Beams = {BEAM_SIZE}) ---")
start_time_beam = time.time()
beam_predictions = generate_predictions(model, tokenizer, test_texts, prefix, device, num_beams=BEAM_SIZE)
end_time_beam = time.time()
time_beam = end_time_beam - start_time_beam
print(f"Beam search generation finished in {time_beam:.2f} seconds.")
print("Example Beam Search Predictions:")
for i in range(min(3, len(beam_predictions))):
    print(f"  Ref: {reference_titles[i]}")
    print(f"  Gen: {beam_predictions[i]}\n")


# --- 6. Calculate ROUGE Scores ---
print("\n--- Calculating ROUGE Scores ---")

# Ensure NLTK punkt is downloaded
try:
    nltk.data.find('tokenizers/punkt')
except (OSError, LookupError):
    print("Downloading nltk punkt tokenizer...")
    nltk.download('punkt', quiet=True)

# Load the ROUGE Metric
try:
    rouge_metric = evaluate.load('rouge')
except Exception as e:
    print(f"Error loading ROUGE metric: {e}")
    raise

# Re-use the calculation function (or define it here if not run before)
def calculate_rouge_scores(predictions, references):
    """Calculates ROUGE scores using the evaluate library."""
    if not predictions or not references or len(predictions) != len(references):
        print("Warning: Invalid input for ROUGE calculation.")
        return None
    result = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
    scores = {
        'ROUGE-1 F1': result.get('rouge1', 0.0) * 100,
        'ROUGE-2 F1': result.get('rouge2', 0.0) * 100,
        'ROUGE-L F1': result.get('rougeL', 0.0) * 100,
    }
    return scores

# Calculate for Greedy
print("Calculating ROUGE for Greedy Search...")
rouge_scores_greedy = calculate_rouge_scores(greedy_predictions, reference_titles)

# Calculate for Beam Search
print(f"Calculating ROUGE for Beam Search (beams={BEAM_SIZE})...")
rouge_scores_beam = calculate_rouge_scores(beam_predictions, reference_titles)

# --- 7. Print Results ---
print("\n--- Evaluation Results for Fine-tuned T5-Small ---")
print(f"Generation Time (Greedy): {time_greedy:.2f}s")
if rouge_scores_greedy:
    print("ROUGE Scores (Greedy):")
    print(f"  ROUGE-1 F1: {rouge_scores_greedy['ROUGE-1 F1']:.2f}")
    print(f"  ROUGE-2 F1: {rouge_scores_greedy['ROUGE-2 F1']:.2f}")
    print(f"  ROUGE-L F1: {rouge_scores_greedy['ROUGE-L F1']:.2f}")
else:
    print("ROUGE Scores (Greedy): Not calculated.")

print("-" * 30)
print(f"Generation Time (Beam Search, beams={BEAM_SIZE}): {time_beam:.2f}s")
if rouge_scores_beam:
    print(f"ROUGE Scores (Beam Search, beams={BEAM_SIZE}):")
    print(f"  ROUGE-1 F1: {rouge_scores_beam['ROUGE-1 F1']:.2f}")
    print(f"  ROUGE-2 F1: {rouge_scores_beam['ROUGE-2 F1']:.2f}")
    print(f"  ROUGE-L F1: {rouge_scores_beam['ROUGE-L F1']:.2f}")
else:
    print(f"ROUGE Scores (Beam Search, beams={BEAM_SIZE}): Not calculated.")

print("\n--- End of C1 Evaluation ---")

Loading test data from: /kaggle/input/assignment2nlp/test.csv
Loaded 100 test examples.

--- Generating predictions (Greedy Search) ---


Generating (beams=1):   0%|          | 0/100 [00:00<?, ?it/s]

Greedy generation finished in 6.90 seconds.
Example Greedy Predictions:
  Ref: Weyburn
  Gen: Weyburn, Saskatchewan

  Ref: Catholic High School, Singapore
  Gen: Catholic High School

  Ref: Minnesota Golden Gophers
  Gen: Minnesota Golden Gophers


--- Generating predictions (Beam Search, Num Beams = 4) ---


Generating (beams=4):   0%|          | 0/100 [00:00<?, ?it/s]

Beam search generation finished in 9.84 seconds.
Example Beam Search Predictions:
  Ref: Weyburn
  Gen: Weyburn, Saskatchewan

  Ref: Catholic High School, Singapore
  Gen: Catholic High School

  Ref: Minnesota Golden Gophers
  Gen: Minnesota Golden Gophers


--- Calculating ROUGE Scores ---
Calculating ROUGE for Greedy Search...
Calculating ROUGE for Beam Search (beams=4)...

--- Evaluation Results for Fine-tuned T5-Small ---
Generation Time (Greedy): 6.90s
ROUGE Scores (Greedy):
  ROUGE-1 F1: 90.38
  ROUGE-2 F1: 69.96
  ROUGE-L F1: 90.36
------------------------------
Generation Time (Beam Search, beams=4): 9.84s
ROUGE Scores (Beam Search, beams=4):
  ROUGE-1 F1: 89.70
  ROUGE-2 F1: 69.31
  ROUGE-L F1: 89.64

--- End of C1 Evaluation ---


In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch # Import torch to check for GPU

# --- Define Model Names ---
model_name_base = "google/flan-t5-base"
model_name_large = "google/flan-t5-large"

# --- Check for GPU availability ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Load Flan-T5 Base Model and Tokenizer ---
print(f"Loading {model_name_base}...")
try:
    tokenizer_base = AutoTokenizer.from_pretrained(model_name_base)
    model_base = AutoModelForSeq2SeqLM.from_pretrained(model_name_base)
    model_base.to(device) # Move model to GPU if available
    print(f"{model_name_base} loaded successfully.")
except Exception as e:
    print(f"Error loading {model_name_base}: {e}")
    # Depending on your setup, you might want to stop or continue without this model
    model_base = None
    tokenizer_base = None

# --- Load Flan-T5 Large Model and Tokenizer ---
# Note: flan-t5-large is significantly bigger and requires more memory/GPU RAM.
# If you have resource constraints, you might skip this or use it cautiously.
print(f"Loading {model_name_large}...")
try:
    tokenizer_large = AutoTokenizer.from_pretrained(model_name_large)
    model_large = AutoModelForSeq2SeqLM.from_pretrained(model_name_large)
    model_large.to(device) # Move model to GPU if available
    print(f"{model_name_large} loaded successfully.")
except Exception as e:
    print(f"Error loading {model_name_large}: {e}")
    # Depending on your setup, you might want to stop or continue without this model
    model_large = None
    tokenizer_large = None

# --- Verify loaded models ---
if model_base and tokenizer_base:
    print("Flan-T5 Base is ready.")
if model_large and tokenizer_large:
    print("Flan-T5 Large is ready.")
elif model_name_large: # Check if we attempted to load it
     print("Flan-T5 Large could not be loaded (check memory/GPU RAM).")

Using device: cuda
Loading google/flan-t5-base...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  56%|#####6    | 556M/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

google/flan-t5-base loaded successfully.
Loading google/flan-t5-large...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

google/flan-t5-large loaded successfully.
Flan-T5 Base is ready.
Flan-T5 Large is ready.


In [21]:
import pandas as pd
import os

# --- 1. Load the RAW Test Data ---
# Make sure the path to your test.csv is correct
test_data_path = "/kaggle/input/assignment2nlp/test.csv" # Example path

if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"Test file not found at: {test_data_path}")

# Load using pandas to easily access text and title columns
try:
    test_df = pd.read_csv(test_data_path)
    # Ensure 'text' and 'title' columns exist
    if 'text' not in test_df.columns or 'title' not in test_df.columns:
        raise ValueError("Test CSV must contain 'text' and 'title' columns.")
    # Handle potential missing values (replace NaN with empty string)
    test_df['text'].fillna('', inplace=True)
    test_df['title'].fillna('', inplace=True)
    print(f"Loaded test data with {len(test_df)} examples.")
    # Keep only a small subset for faster testing/debugging if needed
    # test_df = test_df.head(5)
    # print(f"Using subset of {len(test_df)} examples for testing.")
except Exception as e:
    print(f"Error loading or processing test data: {e}")
    raise

# Extract texts and reference titles for later evaluation
test_texts = test_df['text'].tolist()
reference_titles = test_df['title'].tolist() # Ground truth titles

# --- 2. Define Prompt Variations ---
# You need at least two variations. Here are a few examples:

prompt1_template = "Generate a concise title for the following Wikipedia article: {article_text}"

prompt2_template = "What is a suitable title for this text? Text: {article_text} Title:"

prompt3_template = "Summarize the main topic of this document into a short title: {article_text}"
# Add more prompts if you like

# Store prompts for easy iteration
prompts = {
    "Prompt 1 (Generate)": prompt1_template,
    "Prompt 2 (Question)": prompt2_template,
    "Prompt 3 (Summarize)": prompt3_template,
    # Add more keys if you added more prompts
}

print("Defined prompts:")
for name, template in prompts.items():
    print(f"- {name}: '{template[:50]}...'") # Print start of each template

# --- Store models and tokenizers for iteration ---
# Assumes model_base, tokenizer_base, model_large, tokenizer_large are loaded from previous step
# And device is also defined ('cuda' or 'cpu')
available_models = {}
if 'model_base' in locals() and model_base is not None:
    available_models['flan-t5-base'] = {'model': model_base, 'tokenizer': tokenizer_base}
if 'model_large' in locals() and model_large is not None:
    available_models['flan-t5-large'] = {'model': model_large, 'tokenizer': tokenizer_large}

if not available_models:
    raise RuntimeError("No Flan-T5 models were successfully loaded in the previous step.")

print(f"Ready to generate titles using models: {list(available_models.keys())}")

Loaded test data with 100 examples.
Defined prompts:
- Prompt 1 (Generate): 'Generate a concise title for the following Wikiped...'
- Prompt 2 (Question): 'What is a suitable title for this text? Text: {art...'
- Prompt 3 (Summarize): 'Summarize the main topic of this document into a s...'
Ready to generate titles using models: ['flan-t5-base', 'flan-t5-large']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['title'].fillna('', inplace=True)


In [22]:
import pandas as pd
import os

# --- 1. Load the RAW Test Data ---
# Make sure the path to your test.csv is correct
test_data_path = "/kaggle/input/assignment2nlp/test.csv" # Example path

if not os.path.exists(test_data_path):
    raise FileNotFoundError(f"Test file not found at: {test_data_path}")

# Load using pandas to easily access text and title columns
try:
    test_df = pd.read_csv(test_data_path)
    # Ensure 'text' and 'title' columns exist
    if 'text' not in test_df.columns or 'title' not in test_df.columns:
        raise ValueError("Test CSV must contain 'text' and 'title' columns.")
    # Handle potential missing values (replace NaN with empty string)
    test_df['text'].fillna('', inplace=True)
    test_df['title'].fillna('', inplace=True)
    print(f"Loaded test data with {len(test_df)} examples.")
    # Keep only a small subset for faster testing/debugging if needed
    # test_df = test_df.head(5)
    # print(f"Using subset of {len(test_df)} examples for testing.")
except Exception as e:
    print(f"Error loading or processing test data: {e}")
    raise

# Extract texts and reference titles for later evaluation
test_texts = test_df['text'].tolist()
reference_titles = test_df['title'].tolist() # Ground truth titles

# --- 2. Define Prompt Variations ---
# You need at least two variations. Here are a few examples:

prompt1_template = "Generate a concise title for the following Wikipedia article: {article_text}"

prompt2_template = "What is a suitable title for this text? Text: {article_text} Title:"

prompt3_template = "Summarize the main topic of this document into a short title: {article_text}"
# Add more prompts if you like

# Store prompts for easy iteration
prompts = {
    "Prompt 1 (Generate)": prompt1_template,
    "Prompt 2 (Question)": prompt2_template,
    "Prompt 3 (Summarize)": prompt3_template,
    # Add more keys if you added more prompts
}

print("Defined prompts:")
for name, template in prompts.items():
    print(f"- {name}: '{template[:50]}...'") # Print start of each template

# --- Store models and tokenizers for iteration ---
# Assumes model_base, tokenizer_base, model_large, tokenizer_large are loaded from previous step
# And device is also defined ('cuda' or 'cpu')
available_models = {}
if 'model_base' in locals() and model_base is not None:
    available_models['flan-t5-base'] = {'model': model_base, 'tokenizer': tokenizer_base}
if 'model_large' in locals() and model_large is not None:
    available_models['flan-t5-large'] = {'model': model_large, 'tokenizer': tokenizer_large}

if not available_models:
    raise RuntimeError("No Flan-T5 models were successfully loaded in the previous step.")

print(f"Ready to generate titles using models: {list(available_models.keys())}")

Loaded test data with 100 examples.
Defined prompts:
- Prompt 1 (Generate): 'Generate a concise title for the following Wikiped...'
- Prompt 2 (Question): 'What is a suitable title for this text? Text: {art...'
- Prompt 3 (Summarize): 'Summarize the main topic of this document into a s...'
Ready to generate titles using models: ['flan-t5-base', 'flan-t5-large']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['title'].fillna('', inplace=True)


In [23]:
import torch
from tqdm.notebook import tqdm # Use tqdm.notebook for Kaggle/Jupyter, or just tqdm otherwise
import time

# --- Generation Function ---
def generate_title_with_prompt(model, tokenizer, prompt_template, article_text, device, max_length=64, num_beams=1):
    """
    Generates a title for a given article using a specific prompt and model.

    Args:
        model: The loaded Seq2Seq LM model.
        tokenizer: The corresponding tokenizer.
        prompt_template: The string template for the prompt (e.g., "Generate title: {article_text}").
        article_text: The raw text of the article.
        device: The device ('cuda' or 'cpu') to run generation on.
        max_length: The maximum length of the generated title tokens.
        num_beams: Number of beams for beam search (1 = greedy).

    Returns:
        The generated title string.
    """
    # Format the prompt
    prompted_text = prompt_template.format(article_text=article_text)

    # Tokenize the prompted text
    # Flan-T5 doesn't strictly require a prefix like T5, but the prompt serves a similar role.
    # We need to handle potential length issues here before generation.
    # Let's tokenize first to see the length, though generation handles truncation internally.
    inputs = tokenizer(prompted_text, return_tensors="pt", truncation=True, max_length=1024) # Flan-T5 often has longer context
    inputs = inputs.to(device) # Move tokenized inputs to the correct device

    # Generate output (title)
    # Use torch.no_grad() for inference to save memory and compute
    with torch.no_grad():
        output_sequences = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'], # Pass attention mask
            max_new_tokens=max_length,              # More modern way than max_length for generated part
            num_beams=num_beams,                    # Control greedy vs beam search
            early_stopping=True                     # Stop beam search early if possible
        )

    # Decode the generated token ids to text
    generated_title = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

    return generated_title

# --- Run Generation for All Models and Prompts ---

# Dictionary to store results: results[model_name][prompt_name] = [list_of_titles]
results = {}
generation_times = {} # To store time taken

# Iterate through each available model
for model_name, model_info in available_models.items():
    print(f"\n--- Generating titles with {model_name} ---")
    model = model_info['model']
    tokenizer = model_info['tokenizer']
    results[model_name] = {}
    generation_times[model_name] = {}

    # Iterate through each prompt
    for prompt_name, prompt_template in prompts.items():
        print(f"Using prompt: '{prompt_name}'")
        start_time = time.time()
        generated_titles_list = []
        # Use tqdm for progress bar over test texts
        for article_text in tqdm(test_texts, desc=f"{model_name} - {prompt_name}"):
            # Generate title for the current article text
            # Using default greedy search (num_beams=1) for now
            # You might want to run beam search separately if needed
            generated_title = generate_title_with_prompt(
                model,
                tokenizer,
                prompt_template,
                article_text,
                device,
                max_length=64, # Max length for the generated title
                num_beams=1    # Greedy decoding
            )
            generated_titles_list.append(generated_title)

        end_time = time.time()
        total_time = end_time - start_time
        results[model_name][prompt_name] = generated_titles_list
        generation_times[model_name][prompt_name] = total_time
        print(f"Finished '{prompt_name}' for {model_name} in {total_time:.2f} seconds.")
        # Optional: Print a few examples
        print("Example generated titles:")
        for i in range(min(3, len(test_texts))):
             print(f"  Article {i+1}: {generated_titles_list[i]}")

print("\n--- All generations complete ---")
# Results dictionary now contains all generated titles
# Example access: results['flan-t5-base']['Prompt 1 (Generate)'][0] -> first title generated by base model with prompt 1


--- Generating titles with flan-t5-base ---
Using prompt: 'Prompt 1 (Generate)'


flan-t5-base - Prompt 1 (Generate):   0%|          | 0/100 [00:00<?, ?it/s]



Finished 'Prompt 1 (Generate)' for flan-t5-base in 17.15 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: Minnesota Golden Gophers
Using prompt: 'Prompt 2 (Question)'


flan-t5-base - Prompt 2 (Question):   0%|          | 0/100 [00:00<?, ?it/s]

Finished 'Prompt 2 (Question)' for flan-t5-base in 15.87 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: Sports
Using prompt: 'Prompt 3 (Summarize)'


flan-t5-base - Prompt 3 (Summarize):   0%|          | 0/100 [00:00<?, ?it/s]

Finished 'Prompt 3 (Summarize)' for flan-t5-base in 19.25 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: University of Minnesota

--- Generating titles with flan-t5-large ---
Using prompt: 'Prompt 1 (Generate)'


flan-t5-large - Prompt 1 (Generate):   0%|          | 0/100 [00:00<?, ?it/s]

Finished 'Prompt 1 (Generate)' for flan-t5-large in 35.24 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: University of Minnesota
Using prompt: 'Prompt 2 (Question)'


flan-t5-large - Prompt 2 (Question):   0%|          | 0/100 [00:00<?, ?it/s]

Finished 'Prompt 2 (Question)' for flan-t5-large in 35.04 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: University of Minnesota
Using prompt: 'Prompt 3 (Summarize)'


flan-t5-large - Prompt 3 (Summarize):   0%|          | 0/100 [00:00<?, ?it/s]

Finished 'Prompt 3 (Summarize)' for flan-t5-large in 35.85 seconds.
Example generated titles:
  Article 1: Weyburn, Saskatchewan
  Article 2: Catholic High School
  Article 3: University of Minnesota

--- All generations complete ---


In [24]:
import evaluate # Use Hugging Face's evaluate library
import numpy as np
import nltk # Ensure nltk is imported, needed by rouge_score

# --- Ensure NLTK punkt is downloaded (needed for default ROUGE tokenizer) ---
try:
    nltk.data.find('tokenizers/punkt')
except (OSError, LookupError):
    print("Downloading nltk punkt tokenizer...")
    nltk.download('punkt', quiet=True)

# --- Load the ROUGE Metric ---
try:
    rouge_metric = evaluate.load('rouge')
    print("ROUGE metric loaded successfully.")
except Exception as e:
    print(f"Error loading ROUGE metric: {e}")
    print("Please ensure the 'evaluate' and 'rouge_score' libraries are installed (`pip install evaluate rouge_score`)")
    # Stop execution if metric can't be loaded
    raise

# --- Function to Compute ROUGE Scores ---
def calculate_rouge_scores(predictions, references):
    """Calculates ROUGE scores using the evaluate library."""
    if not predictions or not references or len(predictions) != len(references):
        print("Warning: Invalid input for ROUGE calculation (empty lists or length mismatch).")
        return None

    # The library expects lists of strings
    result = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)

    # Extract F1 scores (you can also extract precision/recall if needed)
    # The keys might be rouge1, rouge2, rougeL, rougeLsum depending on the version and options
    # We'll check for common keys and report F1 score
    scores = {
        'ROUGE-1 F1': result.get('rouge1', 0.0) * 100, # Multiply by 100 for percentage
        'ROUGE-2 F1': result.get('rouge2', 0.0) * 100,
        'ROUGE-L F1': result.get('rougeL', 0.0) * 100,
    }
    return scores

# --- Evaluate All Generated Titles ---

# Store results in a dictionary for easy reporting
rouge_results_c2 = {}

print("\n--- Calculating ROUGE Scores for Flan-T5 Generations ---")

# Ensure reference_titles list is available from the previous step
if 'reference_titles' not in locals() or not reference_titles:
     raise NameError("The 'reference_titles' list is not defined. Please ensure the test data loading was successful.")
if 'results' not in locals() or not results:
     raise NameError("The 'results' dictionary with generated titles is not defined. Please ensure the generation step ran successfully.")


# Iterate through models and prompts where results were generated
for model_name, prompt_dict in results.items():
    rouge_results_c2[model_name] = {}
    print(f"\nEvaluating Model: {model_name}")
    for prompt_name, generated_titles in prompt_dict.items():
        print(f"  Prompt: {prompt_name}")

        # Ensure we have the same number of predictions and references
        if len(generated_titles) != len(reference_titles):
            print(f"    Warning: Mismatch in number of generated ({len(generated_titles)}) and reference ({len(reference_titles)}) titles. Skipping.")
            rouge_results_c2[model_name][prompt_name] = None
            continue

        # Calculate scores
        scores = calculate_rouge_scores(generated_titles, reference_titles)

        if scores:
            rouge_results_c2[model_name][prompt_name] = scores
            # Print scores formatted to 2 decimal places
            print(f"    ROUGE-1 F1: {scores['ROUGE-1 F1']:.2f}")
            print(f"    ROUGE-2 F1: {scores['ROUGE-2 F1']:.2f}")
            print(f"    ROUGE-L F1: {scores['ROUGE-L F1']:.2f}")
        else:
            print("    Failed to calculate ROUGE scores.")
            rouge_results_c2[model_name][prompt_name] = None


print("\n--- ROUGE Score Calculation Complete ---")

# Now rouge_results_c2 dictionary holds the scores, e.g.:
# rouge_results_c2['flan-t5-base']['Prompt 1 (Generate)']['ROUGE-1 F1']

# You would typically format these results into a table for your report (using pandas or just printing)
print("\nSummary of ROUGE F1 Scores (%):")
print("-" * 50)
for model_name, prompt_scores in rouge_results_c2.items():
    print(f"Model: {model_name}")
    for prompt_name, scores in prompt_scores.items():
         if scores:
             print(f"  Prompt: {prompt_name:<25} | ROUGE-1: {scores['ROUGE-1 F1']:.2f} | ROUGE-2: {scores['ROUGE-2 F1']:.2f} | ROUGE-L: {scores['ROUGE-L F1']:.2f}")
         else:
             print(f"  Prompt: {prompt_name:<25} | Scores: Not calculated")
    print("-" * 50)
    

ROUGE metric loaded successfully.

--- Calculating ROUGE Scores for Flan-T5 Generations ---

Evaluating Model: flan-t5-base
  Prompt: Prompt 1 (Generate)
    ROUGE-1 F1: 85.57
    ROUGE-2 F1: 66.32
    ROUGE-L F1: 85.26
  Prompt: Prompt 2 (Question)
    ROUGE-1 F1: 69.28
    ROUGE-2 F1: 54.24
    ROUGE-L F1: 69.37
  Prompt: Prompt 3 (Summarize)
    ROUGE-1 F1: 75.09
    ROUGE-2 F1: 54.86
    ROUGE-L F1: 73.80

Evaluating Model: flan-t5-large
  Prompt: Prompt 1 (Generate)
    ROUGE-1 F1: 88.23
    ROUGE-2 F1: 64.78
    ROUGE-L F1: 88.15
  Prompt: Prompt 2 (Question)
    ROUGE-1 F1: 87.96
    ROUGE-2 F1: 66.00
    ROUGE-L F1: 87.86
  Prompt: Prompt 3 (Summarize)
    ROUGE-1 F1: 88.88
    ROUGE-2 F1: 66.42
    ROUGE-L F1: 88.82

--- ROUGE Score Calculation Complete ---

Summary of ROUGE F1 Scores (%):
--------------------------------------------------
Model: flan-t5-base
  Prompt: Prompt 1 (Generate)       | ROUGE-1: 85.57 | ROUGE-2: 66.32 | ROUGE-L: 85.26
  Prompt: Prompt 2 (Question)   