In [1]:
from datasets import load_dataset

samsum_train_dataset = load_dataset("csv", data_files={"train": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_test_dataset = load_dataset("csv", data_files={"test": "/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv"})
samsum_validate_dataset = load_dataset("csv", data_files={"validation": "/kaggle/input/samsum-dataset-text-summarization/samsum-validation.csv"})



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [2]:
samsum_train_dataset["train"][1]


{'id': '13729565',
 'dialogue': "Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric: I know! And shows how Americans see Russian ;)\r\nRob: And it's really funny!\r\nEric: I know! I especially like the train part!\r\nRob: Hahaha! No one talks to the machine like that!\r\nEric: Is this his only stand-up?\r\nRob: Idk. I'll check.\r\nEric: Sure.\r\nRob: Turns out no! There are some of his stand-ups on youtube.\r\nEric: Gr8! I'll watch them now!\r\nRob: Me too!\r\nEric: MACHINE!\r\nRob: MACHINE!\r\nEric: TTYL?\r\nRob: Sure :)",
 'summary': 'Eric and Rob are going to watch a stand-up on youtube.'}

In [3]:
print(samsum_test_dataset.keys())

dict_keys(['test'])


In [4]:
from transformers import pipeline

text_summarizer = pipeline("summarization", model="facebook/bart-base", device=0)


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [5]:
text_summarizer(samsum_train_dataset["train"][128]["dialogue"], max_length=20, min_length=10, do_sample= False )

[{'summary_text': 'Paul: hey Matthew did you find anyone to couch the game Saturday?Matthew: hey'}]

In [6]:
# Fine tune the SamSUM model to improve the summarize performance
# Add the BART tokenizer and model
from transformers import BartForConditionalGeneration, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", dropout=0.3)

In [7]:
# Remove the icon tag like =), :v
# Guess the word meaning for the missing character of a word
import re

def preprocess_missingchar_and_icon(sample):
    def clean_text(text):
        # Remove icon tags, including the characters inside angled brackets (e.g., <photo>, <emoji>)
        text = re.sub(r'<.*?>', '', text)
        
        # Remove common emoticons or icons like :v, :-), :)
        text = re.sub(r'(:\)|:-\)|:v|:D|<3)', '', text)
             
        # Remove extra whitespace caused by the removal of icons
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    # Apply cleaning to the text and summary fields
    sample["dialogue"] = clean_text(sample["dialogue"])
    sample["summary"] = clean_text(sample["summary"])
    return sample

samsum_train_dataset_clean = samsum_train_dataset.map(preprocess_missingchar_and_icon)
samsum_test_dataset_clean = samsum_test_dataset.map(preprocess_missingchar_and_icon)
samsum_validate_dataset_clean = samsum_validate_dataset.map(preprocess_missingchar_and_icon)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [8]:
def preprocessData(records, tokenizer, max_length_preprocess=128):
    sources = records["dialogue"]
    targets = records["summary"]

    input_encoding = tokenizer(sources, max_length=max_length_preprocess*8, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        output_encoding = tokenizer(targets, max_length=max_length_preprocess, padding="max_length", truncation=True)

    # Return as lists to ensure compatibility with DataLoader
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": output_encoding["input_ids"],
    }

train_dataset = samsum_train_dataset_clean["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
validation_dataset = samsum_validate_dataset_clean["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
test_dataset = samsum_test_dataset_clean["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/819 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [9]:
# Build the customized DataLoader class for fine-tunning
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [10]:
!pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [11]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  
    save_strategy="steps",        
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_samsum")
tokenizer.save_pretrained("./finetuned_bart_samsum")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112949311110596, max=1.0…

Step,Training Loss,Validation Loss
500,1.0934,0.416001
1000,0.3021,0.424176


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_samsum/tokenizer_config.json',
 './finetuned_bart_samsum/special_tokens_map.json',
 './finetuned_bart_samsum/vocab.json',
 './finetuned_bart_samsum/merges.txt',
 './finetuned_bart_samsum/added_tokens.json',
 './finetuned_bart_samsum/tokenizer.json')

In [12]:
# Evaluate the model
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)


{'eval_loss': 0.2509564757347107, 'eval_runtime': 20.8104, 'eval_samples_per_second': 39.355, 'eval_steps_per_second': 4.949, 'epoch': 10.0}


In [13]:
!pip install rouge_score


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=b0bb208e6825f4f2aad484f9d5aa04bb5a6f1cf135f0526bcafdbfb2fa4a708a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [14]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model, tokenizer, validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores:
rouge1: 0.4234
rouge2: 0.1938
rougeL: 0.3272
rougeLsum: 0.3273


In [15]:
#Load the TweetSum dataset
from datasets import load_dataset

tweetsum_train = load_dataset("csv", data_files={"train": "/kaggle/input/tweetsum/tweetsum_train.csv"})
tweetsum_test = load_dataset("csv", data_files={"test": "/kaggle/input/tweetsum/tweetsum_test.csv"})
tweetsum_validate = load_dataset("csv", data_files={"validation": "/kaggle/input/tweetsum/tweetsum_valid.csv"})



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [16]:
tweetsum_train["train"][0]

{'id': 1,
 'dialogue': ' customer: neither my iPhone nor my Apple Watch are recording my steps/activity, and Health doesn’t recognise either source anymore for some reason. Any ideas?  customer: please read the above. support: Let’s investigate this together. To start, can you tell us the software versions your iPhone and Apple Watch are running currently? customer: My iPhone is on 11.1.2, and my watch is on 4.1. support: Thank you. Have you tried restarting both devices since this started happening? customer: I’ve restarted both, also un-paired then re-paired the watch. support: Got it. When did you first notice that the two devices were not talking to each other. Do the two devices communicate through other apps such as Messages? customer: Yes, everything seems fine, it’s just Health and activity. support: Let’s move to DM and look into this a bit more. When reaching out in DM, let us know when this first started happening please. For example, did it start after an update or after in

In [17]:
tweetsum_test["test"][0]

{'id': 1,
 'dialogue': " customer: My watchlist is not updating with new episodes (past couple days).  Any idea why? support: Apologies for the trouble, Norlene! We're looking into this. In the meantime, try navigating to the season / episode manually. customer: Tried logging out/back in, that didn’t help support: Sorry! 😔 We assure you that our team is working hard to investigate, and we hope to have a fix ready soon! customer: Thank you! Some shows updated overnight, but others did not... support: We definitely understand, Norlene. For now, we recommend checking the show page for these shows as the new eps will be there customer: As of this morning, the problem seems to be resolved. Watchlist updated overnight with all new episodes. Thank you for your attention to this matter! I love Hulu 💚 support: Awesome! That's what we love to hear. If you happen to need anything else, we'll be here to support! 💚",
 'summary': 'Customer is complaining that the watchlist is not updated with new ep

In [18]:
tweetsum_validate["validation"][0]

{'id': 1,
 'dialogue': ' customer: hey, any explanation why the "Create similar playlist" function doesn\'t work anymore for me? MacBook, v1.0.64.399.g4637b02a. support: Hi there, the cavalry\'s here! Does logging out, restarting your device, and logging back into Spotify help? Keep us in the loop /JI customer: no, it didn\'t :( tried everything but I still can\'t create the playlist. it\'s not even greyed out but nothing happens after clicking on it. support: Okay. Can we have you try reinstalling the app? To do so, just follow the steps at  Let us know how it goes /JI customer: i tried and it\'s still the same... moreover, my song history is always empty, so I can\'t find songs from previous Discover playlists :( support: Does restarting your computer help at all? Also, is the song history you\'re referring to the History tab on your Play Queue? /MT customer: no, I tried that as well and just reinstalled again - didn\'t help. yes, that\'s what I mean. support: Could you DM us your ac

In [19]:
# Preprocessing with the TweetSUM dataset
tweetsum_train_clean = tweetsum_train.map(preprocess_missingchar_and_icon)
tweetsum_test_clean = tweetsum_test.map(preprocess_missingchar_and_icon)
tweetsum_validate_clean = tweetsum_validate.map(preprocess_missingchar_and_icon)

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [20]:
# Load the BART_SamSUM model
# Load the fine-tuned SAMSum model and tokenizer
from transformers import BartForConditionalGeneration, BartTokenizer

model_pretrained = BartForConditionalGeneration.from_pretrained("./finetuned_bart_samsum")
tokenizer_pretrained = BartTokenizer.from_pretrained("./finetuned_bart_samsum")

In [21]:
tweetsum_train_dataset = tweetsum_train_clean["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_validation_dataset = tweetsum_validate_clean["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_test_dataset = tweetsum_test_clean["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/879 [00:00<?, ? examples/s]



Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [22]:
tweetsum_train_dataloader = DataLoader(tweetsum_train_dataset, batch_size=8, shuffle=True)
tweetsum_validation_dataloader = DataLoader(tweetsum_validation_dataset, batch_size=8)
tweetsum_test_dataloader = DataLoader(tweetsum_test_dataset, batch_size=8)

In [23]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  
    save_strategy="steps",        
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model_pretrained,
    args=training_args,
    train_dataset=tweetsum_train_dataset,
    eval_dataset=tweetsum_validation_dataset,
    tokenizer=tokenizer_pretrained,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_tweetsum")
tokenizer.save_pretrained("./finetuned_bart_tweetsum")



Step,Training Loss,Validation Loss
500,0.7011,0.647596
1000,0.5264,0.627805


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_tweetsum/tokenizer_config.json',
 './finetuned_bart_tweetsum/special_tokens_map.json',
 './finetuned_bart_tweetsum/vocab.json',
 './finetuned_bart_tweetsum/merges.txt',
 './finetuned_bart_tweetsum/added_tokens.json',
 './finetuned_bart_tweetsum/tokenizer.json')

In [24]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model_pretrained, tokenizer_pretrained, tweetsum_validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")


ROUGE Scores:
rouge1: 0.4495
rouge2: 0.2005
rougeL: 0.3739
rougeLsum: 0.3707
