In [1]:
from datasets import load_dataset

samsum_train_dataset = load_dataset("csv", data_files={"train": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_test_dataset = load_dataset("csv", data_files={"test": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_validate_dataset = load_dataset("csv", data_files={"validation": "/kaggle/input/samsum-dataset-text-summarization/samsum-validation.csv"})



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [2]:
samsum_train_dataset["train"][280]

{'id': '13681721',
 'dialogue': "Stan: She replied :-)\r\nDave: She did?\r\nStan: <file_photo>\r\nDave: Lucky you!\r\nStan: I can't believe it! She's my dream come true!\r\nDave: Good luck today! Where are you going to take her?\r\nStan: Pat&Gill's\r\nDave: Good choice. Let me know how it was :-)\r\nStan: I will.\r\nDave: In minute detail :-)\r\nStan: Forget it!",
 'summary': "Stan is meeting the girl of his dreams today in Pat&Gill's. Later he's going to tell Dave how his date went."}

In [None]:
from transformers import pipeline

text_summarizer = pipeline("summarization", model="facebook/bart-base", device=0)


In [None]:
text_summarizer(samsum_train_dataset["train"][1]["dialogue"], max_length=20, min_length=10, do_sample= False )

In [136]:
# Fine tune the SamSUM model to improve the summarize performance
# Add the BART tokenizer and model
from transformers import BartForConditionalGeneration, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", dropout=0.3)

In [137]:
# Remove the icon tag like =), :v
# Guess the word meaning for the missing character of a word
import re

def preprocess_missingchar_and_icon(sample):
    def clean_text(text):
        # Remove icon tags, including the characters inside angled brackets (e.g., <photo>, <emoji>)
        text = re.sub(r'<.*?>', '', text)
        
        # Remove common emoticons or icons like :v, :-), :)
        text = re.sub(r'(:\)|:-\)|:v|:D|<3)', '', text)
             
        # Remove extra whitespace caused by the removal of icons
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    # Apply cleaning to the text and summary fields
    sample["dialogue"] = clean_text(sample["dialogue"])
    sample["summary"] = clean_text(sample["summary"])
    return sample

samsum_train_dataset = samsum_train_dataset.map(preprocess_missingchar_and_icon)
samsum_test_dataset = samsum_test_dataset.map(preprocess_missingchar_and_icon)
samsum_validate_dataset = samsum_validate_dataset.map(preprocess_missingchar_and_icon)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [138]:
samsum_train_dataset["train"][280]

{'id': '13681721',
 'dialogue': "Stan: She replied Dave: She did? Stan: Dave: Lucky you! Stan: I can't believe it! She's my dream come true! Dave: Good luck today! Where are you going to take her? Stan: Pat&Gill's Dave: Good choice. Let me know how it was Stan: I will. Dave: In minute detail Stan: Forget it!",
 'summary': "Stan is meeting the girl of his dreams today in Pat&Gill's. Later he's going to tell Dave how his date went."}

In [139]:
def preprocessData(records, tokenizer, max_length_preprocess=128):
    sources = records["dialogue"]
    targets = records["summary"]

    input_encoding = tokenizer(sources, max_length=max_length_preprocess, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        output_encoding = tokenizer(targets, max_length=max_length_preprocess, padding="max_length", truncation=True)

    # Return as lists to ensure compatibility with DataLoader
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": output_encoding["input_ids"],
    }

train_dataset = samsum_train_dataset["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
validation_dataset = samsum_validate_dataset["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
test_dataset = samsum_test_dataset["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [140]:
# Build the customized DataLoader class for fine-tunning
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

class SamSUMDataset(Dataset):
    def __init__(self, tokenizer, max_length=512):
        self.dataset = samsum_train_dataset['train']
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        dialogue = self.dataset[idx]['dialogue']
        summary = self.dataset[idx]['summary']
        inputs = self.tokenizer(
            dialogue,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        labels = self.tokenizer(
            summary,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs['input_ids'].squeeze(0),
            "attention_mask": inputs['attention_mask'].squeeze(0),
            "labels": labels['input_ids'].squeeze(0)
        }

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [141]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [142]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  
    save_strategy="steps",        
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_samsum")
tokenizer.save_pretrained("./finetuned_bart_samsum")



Step,Training Loss,Validation Loss
500,1.1067,0.451836
1000,0.3337,0.465542


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_samsum/tokenizer_config.json',
 './finetuned_bart_samsum/special_tokens_map.json',
 './finetuned_bart_samsum/vocab.json',
 './finetuned_bart_samsum/merges.txt',
 './finetuned_bart_samsum/added_tokens.json',
 './finetuned_bart_samsum/tokenizer.json')

In [143]:
# Evaluate the model
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)


{'eval_loss': 0.2771747410297394, 'eval_runtime': 5.4516, 'eval_samples_per_second': 150.23, 'eval_steps_per_second': 18.893, 'epoch': 10.0}


In [144]:
!pip install rouge_score


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [145]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model, tokenizer, validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")


ROUGE Scores:
rouge1: 0.4246
rouge2: 0.1890
rougeL: 0.3285
rougeLsum: 0.3284
