In [None]:
import pandas as pd
import numpy as np


In [None]:
traindata=pd.read_csv('CNNtrain.csv')

In [None]:
testdata=pd.read_csv('CNNtest.csv')

In [None]:
traindata.drop('id',axis=1,inplace=True)

In [None]:
traindata

In [None]:
testdata.reset_index(drop=True, inplace=True)


In [None]:
testdata

In [None]:
# Assuming traindata and testdata are your original dataframes
# Select 5000 random rows for training and 3000 for testing

# Set a random seed for reproducibility
train_sample = traindata.sample(n=5000, random_state=42)
test_sample = testdata.sample(n=1000, random_state=42)


In [None]:
train_sample = train_sample.reset_index(drop=True)
test_sample = test_sample.reset_index(drop=True)

# Check the first few rows of each sample to confirm
print(train_sample.head())
print(test_sample.head())

In [None]:
train_sample

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader


In [None]:
# Load pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [None]:
def preprocess_data(data, tokenizer, max_input_length=512, max_output_length=150):
    # Add "summarize:" prefix to each article for T5’s text-to-text format
    inputs = ["summarize: " + text for text in data["article"]]
    targets = list(data["highlights"])
    
    # Tokenize inputs and targets
    input_encodings = tokenizer(inputs, truncation=True, padding=True, max_length=max_input_length, return_tensors="pt")
    target_encodings = tokenizer(targets, truncation=True, padding=True, max_length=max_output_length, return_tensors="pt").input_ids
    
    return input_encodings, target_encodings


In [None]:
train_inputs, train_labels = preprocess_data(train_sample, tokenizer)
test_inputs, test_labels = preprocess_data(test_sample, tokenizer)


In [None]:
train_labels

In [None]:
train_inputs

In [None]:
from transformers import AdamW

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=3e-5)


In [None]:
from torch.utils.data import Dataset, DataLoader

class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)


In [None]:
# Create the training and testing datasets
train_dataset = SummarizationDataset(train_inputs, train_labels)
test_dataset = SummarizationDataset(test_inputs, test_labels)

# Define batch size (you can adjust it based on your available memory and GPU capacity)
batch_size = 4

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [None]:
# Training loop (you should be able to run this now)
# Set the number of epochs
num_epochs = 3  # You can adjust this number as needed
for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in train_loader:
        # Training code here...
        pass  # Replace with actual training steps as discussed earlier


In [None]:
# Define the number of epochs
num_epochs = 1  # You can adjust this based on your requirements
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0  # Initialize loss for this epoch

    # Iterate over each batch
    for batch in train_loader:
        # Move batch data to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Zero out gradients from the previous step
        optimizer.zero_grad()
        
        # Forward pass: Get model outputs and compute loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        # Backward pass: Calculate gradients and update weights
        loss.backward()
        optimizer.step()
        
        # Accumulate the loss for this batch
        epoch_loss += loss.item()
    
    # Calculate and print average loss for this epoch
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")


In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1, rouge2, rougeL = 0, 0, 0
num_samples = len(generated_summaries)

for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    scores = scorer.score(ref_summary, gen_summary)
    rouge1 += scores['rouge1'].fmeasure
    rouge2 += scores['rouge2'].fmeasure
    rougeL += scores['rougeL'].fmeasure

# Calculate average ROUGE scores
print(f"Average ROUGE-1: {rouge1 / num_samples:.4f}")
print(f"Average ROUGE-2: {rouge2 / num_samples:.4f}")
print(f"Average ROUGE-L: {rougeL / num_samples:.4f}")


In [None]:
generated_summaries = []
reference_summaries = test_sample['highlights'].tolist()  # Assuming 'highlights' is the column for reference summaries

# Set model to evaluation mode
model.eval()

# Generate summaries for each batch in the test set
with torch.no_grad():
    for batch in test_loader:
        # Move batch data to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Generate summary using the model
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)
        
        # Decode the generated summaries
        batch_summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
        generated_summaries.extend(batch_summaries)


In [1]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset


In [2]:
# Load the dataset
# Replace 'path_to_csv' with the actual path to your dataset
data = pd.read_csv('CNNtest.csv')

# Display dataset structure
print(data.head())

# Select a smaller sample (e.g., 5000 rows for training and 2000 for testing)
train_data = data.sample(n=5000, random_state=42)
test_data = data.sample(n=2000, random_state=42)

# Reset indices for both
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

# Display sample structure
print(train_data.head())
print(test_data.head())


                                         id  \
0  92c514c913c0bdfe25341af9fd72b29db544099b   
1  2003841c7dc0e7c5b1a248f9cd536d727f27a45a   
2  91b7d2311527f5c2b63a65ca98d21d9c92485149   
3  caabf9cbdf96eb1410295a673e953d304391bfbb   
4  3da746a7d9afcaa659088c8366ef6347fe6b53ea   

                                             article  \
0  Ever noticed how plane seats appear to be gett...   
1  A drunk teenage boy had to be rescued by secur...   
2  Dougie Freedman is on the verge of agreeing a ...   
3  Liverpool target Neto is also wanted by PSG an...   
4  Bruce Jenner will break his silence in a two-h...   

                                          highlights  
0  Experts question if  packed out planes are put...  
1  Drunk teenage boy climbed into lion enclosure ...  
2  Nottingham Forest are close to extending Dougi...  
3  Fiorentina goalkeeper Neto has been linked wit...  
4  Tell-all interview with the reality TV star, 6...  
                                         id  \
0  

In [3]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        article = str(self.data.iloc[index]['article'])
        summary = str(self.data.iloc[index]['highlights'])

        # Tokenize input (article) and output (summary)
        input_encoding = self.tokenizer(
            article,
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        target_encoding = self.tokenizer(
            summary,
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Return tokenized input and output
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }


In [4]:
# Initialize T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Create train and test datasets
train_dataset = SummarizationDataset(train_data, tokenizer)
test_dataset = SummarizationDataset(test_data, tokenizer)

# Create DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Load pre-trained T5 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)


In [6]:
# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training function
def train_model(model, train_loader, optimizer, num_epochs=1):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_loader:
            # Move data to the device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Zero out gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


In [7]:
# Train the model for 1 epoch as a starting point
train_model(model, train_loader, optimizer, num_epochs=1)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1, Loss: 1.5661949213981627


In [9]:
from rouge_score import rouge_scorer

# Generate summaries for the test dataset
model.eval()
generated_summaries = []
reference_summaries = test_data['highlights'].tolist()  # Reference summaries (ground truth)

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Generate summaries
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)
        decoded_summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in outputs]
        generated_summaries.extend(decoded_summaries)

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
rouge1, rouge2, rougeL = 0, 0, 0
num_samples = len(generated_summaries)

for gen_summary, ref_summary in zip(generated_summaries, reference_summaries):
    scores = scorer.score(ref_summary, gen_summary)
    rouge1 += scores['rouge1'].fmeasure
    rouge2 += scores['rouge2'].fmeasure
    rougeL += scores['rougeL'].fmeasure

# Average scores
rouge1_avg = rouge1 / num_samples
rouge2_avg = rouge2 / num_samples
rougeL_avg = rougeL / num_samples

# Print the results
print(f"Average ROUGE-1: {rouge1_avg:.4f}")
print(f"Average ROUGE-2: {rouge2_avg:.4f}")
print(f"Average ROUGE-L: {rougeL_avg:.4f}")


Average ROUGE-1: 0.3450
Average ROUGE-2: 0.1588
Average ROUGE-L: 0.2436
