In [35]:
from datasets import load_dataset

samsum_train_dataset = load_dataset("csv", data_files={"train": "/kaggle/input/samsum-dataset-text-summarization/samsum-train.csv"})
samsum_test_dataset = load_dataset("csv", data_files={"test": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_validate_dataset = load_dataset("csv", data_files={"validation": "/kaggle/input/samsum-dataset-text-summarization/samsum-validation.csv"})



In [36]:
# Check the number of rows in each split of the dataset
print(f"Training dataset size: {samsum_train_dataset['train'].num_rows}")
print(f"Test dataset size: {samsum_test_dataset['test'].num_rows}")
print(f"Validation dataset size: {samsum_validate_dataset['validation'].num_rows}")


Training dataset size: 14732
Test dataset size: 819
Validation dataset size: 818


In [37]:
samsum_train_dataset["train"][25]


{'id': '13810064',
 'dialogue': 'Julius: dude, your assessment of manutd\r\nLawrence: i have nothing to say, im so offended and hopeless of them this season\r\nJulius: me too\r\nLawrence: i dont even know whats wrong with the team\r\nJulius: the quality is there but nothing is happening\r\nLawrence: the players look tired of something\r\nJulius:  with mourinhos conservative football!!\r\nLawrence: its so boring\r\nJulius: so lifeless\r\nLawrence: man!!\r\nJulius: it needs to change, hope the board sees it\r\nLawrence: sooner than later\r\nJulius: yeah\r\nLawrence: yeah',
 'summary': "Lawrence doesn't like the play of Manchester United. He and Julius complain about the team and Mourinho's style."}

In [38]:
print(samsum_test_dataset.keys())

dict_keys(['test'])


In [39]:
from transformers import pipeline

text_summarizer = pipeline("summarization", model="facebook/bart-base", device=0)




In [40]:
text_summarizer(samsum_train_dataset["train"][128]["dialogue"], max_length=20, min_length=10, do_sample= False )

[{'summary_text': 'Dorothy: Hi! You know what? Ron messaged me again, and'}]

In [41]:
# Fine tune the SamSUM model to improve the summarize performance
# Add the BART tokenizer and model
from transformers import BartForConditionalGeneration, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", dropout=0.3)

In [42]:
# Remove the icon tag like =), :v
# Guess the word meaning for the missing character of a word
import re

def preprocess_missingchar_and_icon(sample):
    def clean_text(text):
        text = str(text)
        # Remove icon tags, including the characters inside angled brackets (e.g., <photo>, <emoji>)
        text = re.sub(r'<.*?>', '', text)
        
        # Remove common emoticons or icons like :v, :-), :)
        text = re.sub(r'(:\)|:-\)|:v|:D|<3)', '', text)
             
        # Remove extra whitespace caused by the removal of icons
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    # Apply cleaning to the text and summary fields
    sample["dialogue"] = clean_text(sample["dialogue"])
    sample["summary"] = clean_text(sample["summary"])
    return sample

samsum_train_dataset_clean = samsum_train_dataset.map(preprocess_missingchar_and_icon)
samsum_test_dataset_clean = samsum_test_dataset.map(preprocess_missingchar_and_icon)
samsum_validate_dataset_clean = samsum_validate_dataset.map(preprocess_missingchar_and_icon)

In [43]:
def preprocessData(records, tokenizer, max_length_preprocess=128):
    sources = records["dialogue"]
    targets = records["summary"]

    input_encoding = tokenizer(sources, max_length=max_length_preprocess*8, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        output_encoding = tokenizer(targets, max_length=max_length_preprocess, padding="max_length", truncation=True)

    # Return as lists to ensure compatibility with DataLoader
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": output_encoding["input_ids"],
    }

train_dataset = samsum_train_dataset_clean["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
validation_dataset = samsum_validate_dataset_clean["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
test_dataset = samsum_test_dataset_clean["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/818 [00:00<?, ? examples/s]



In [44]:
# Build the customized DataLoader class for fine-tunning
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

train_dataloader = DataLoader(train_dataset, batch_size=8,num_workers=4, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8,num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=8,num_workers=4)

In [45]:
!pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [46]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results/pre-trained-model",
    evaluation_strategy="steps",  
    save_strategy="steps", 
    save_steps=500,
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_samsum")
tokenizer.save_pretrained("./finetuned_bart_samsum")



Step,Training Loss,Validation Loss
500,1.135,0.410469
1000,0.4777,0.389774
1500,0.4604,0.380804
2000,0.4329,0.374556
2500,0.4041,0.377255
3000,0.4082,0.369926
3500,0.3957,0.360668
4000,0.3756,0.361912
4500,0.3564,0.364814
5000,0.3705,0.355772


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_samsum/tokenizer_config.json',
 './finetuned_bart_samsum/special_tokens_map.json',
 './finetuned_bart_samsum/vocab.json',
 './finetuned_bart_samsum/merges.txt',
 './finetuned_bart_samsum/added_tokens.json',
 './finetuned_bart_samsum/tokenizer.json')

In [47]:
# Evaluate the model
results = trainer.evaluate(eval_dataset=validation_dataset)
print(results)


{'eval_loss': 0.3557721972465515, 'eval_runtime': 20.9406, 'eval_samples_per_second': 39.063, 'eval_steps_per_second': 4.919, 'epoch': 3.528773072747014}


In [48]:
!pip install rouge_score


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [49]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model, tokenizer, validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to a

ROUGE Scores:
rouge1: 0.4504
rouge2: 0.2175
rougeL: 0.3650
rougeLsum: 0.3643


In [50]:
#Load the TweetSum dataset
from datasets import load_dataset

tweetsum_train = load_dataset("csv", data_files={"train": "/kaggle/input/tweetsum/tweetsum_train.csv"})
tweetsum_test = load_dataset("csv", data_files={"test": "/kaggle/input/tweetsum/tweetsum_test.csv"})
tweetsum_validate = load_dataset("csv", data_files={"validation": "/kaggle/input/tweetsum/tweetsum_valid.csv"})



In [51]:
tweetsum_train["train"][0]

{'id': 1,
 'dialogue': ' customer: neither my iPhone nor my Apple Watch are recording my steps/activity, and Health doesn’t recognise either source anymore for some reason. Any ideas?  customer: please read the above. support: Let’s investigate this together. To start, can you tell us the software versions your iPhone and Apple Watch are running currently? customer: My iPhone is on 11.1.2, and my watch is on 4.1. support: Thank you. Have you tried restarting both devices since this started happening? customer: I’ve restarted both, also un-paired then re-paired the watch. support: Got it. When did you first notice that the two devices were not talking to each other. Do the two devices communicate through other apps such as Messages? customer: Yes, everything seems fine, it’s just Health and activity. support: Let’s move to DM and look into this a bit more. When reaching out in DM, let us know when this first started happening please. For example, did it start after an update or after in

In [52]:
tweetsum_test["test"][0]

{'id': 1,
 'dialogue': " customer: My watchlist is not updating with new episodes (past couple days).  Any idea why? support: Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually. customer: Tried logging out/back in, that didn’t help support: Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon! customer: Thank you! Some shows updated overnight, but others did not... support: We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there customer: As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚 support: Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚",
 'summary': 'Customer is complaining that the watchlist is not updated with new ep

In [53]:
tweetsum_validate["validation"][0]

{'id': 1,
 'dialogue': ' customer: hey, any explanation why the "Create similar playlist" function doesn\'t work anymore for me? MacBook, v1.0.64.399.g4637b02a. support: Hi there, the cavalry\'s here! Does logging out, restarting your device, and logging back into Spotify help? Keep us in the loop /JI customer: no, it didn\'t :( tried everything but I still can\'t create the playlist. it\'s not even greyed out but nothing happens after clicking on it. support: Okay. Can we have you try reinstalling the app? To do so, just follow the steps at  Let us know how it goes /JI customer: i tried and it\'s still the same... moreover, my song history is always empty, so I can\'t find songs from previous Discover playlists :( support: Does restarting your computer help at all? Also, is the song history you\'re referring to the History tab on your Play Queue? /MT customer: no, I tried that as well and just reinstalled again - didn\'t help. yes, that\'s what I mean. support: Could you DM us your ac

In [54]:
# Preprocessing with the TweetSUM dataset
tweetsum_train_clean = tweetsum_train.map(preprocess_missingchar_and_icon)
tweetsum_test_clean = tweetsum_test.map(preprocess_missingchar_and_icon)
tweetsum_validate_clean = tweetsum_validate.map(preprocess_missingchar_and_icon)

In [55]:
# Load the BART_SamSUM model
# Load the fine-tuned SAMSum model and tokenizer
from transformers import BartForConditionalGeneration, BartTokenizer

model_pretrained = BartForConditionalGeneration.from_pretrained("./finetuned_bart_samsum")
tokenizer_pretrained = BartTokenizer.from_pretrained("./finetuned_bart_samsum")
model_pretrained.resize_token_embeddings(len(tokenizer_pretrained))

BartScaledWordEmbedding(50265, 768, padding_idx=1)

In [56]:
tweetsum_train_dataset = tweetsum_train_clean["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_validation_dataset = tweetsum_validate_clean["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_test_dataset = tweetsum_test_clean["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/110 [00:00<?, ? examples/s]



In [57]:
tweetsum_train_dataloader = DataLoader(tweetsum_train_dataset, batch_size=8, shuffle=True)
tweetsum_validation_dataloader = DataLoader(tweetsum_validation_dataset, batch_size=8)
tweetsum_test_dataloader = DataLoader(tweetsum_test_dataset, batch_size=8)

In [58]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results/fine-tuned-model",
    evaluation_strategy="steps",  
    save_strategy="steps",        
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=tweetsum_train_dataset,
    eval_dataset=tweetsum_validation_dataset,
    tokenizer=tokenizer_pretrained,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_tweetsum")
tokenizer.save_pretrained("./finetuned_bart_tweetsum")



Step,Training Loss,Validation Loss
500,0.6619,0.644841
1000,0.4916,0.625262


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_tweetsum/tokenizer_config.json',
 './finetuned_bart_tweetsum/special_tokens_map.json',
 './finetuned_bart_tweetsum/vocab.json',
 './finetuned_bart_tweetsum/merges.txt',
 './finetuned_bart_tweetsum/added_tokens.json',
 './finetuned_bart_tweetsum/tokenizer.json')

In [59]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model_pretrained, tokenizer_pretrained, tweetsum_validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")


ROUGE Scores:
rouge1: 0.4810
rouge2: 0.2222
rougeL: 0.3954
rougeLsum: 0.3935
