In [1]:
#Load the TweetSum dataset
from datasets import load_dataset

tweetsum_train = load_dataset("csv", data_files={"train": "/kaggle/input/tweetsum1/tweetsum_train.csv"})
tweetsum_test = load_dataset("csv", data_files={"test": "/kaggle/input/tweetsum1/tweetsum_test.csv"})
tweetsum_validate = load_dataset("csv", data_files={"validation": "/kaggle/input/tweetsum1/tweetsum_valid.csv"})



Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [2]:
from transformers import BartForConditionalGeneration, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", dropout=0.3)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [3]:
# Remove the icon tag like =), :v
# Guess the word meaning for the missing character of a word
import re

def preprocess_missingchar_and_icon(sample):
    def clean_text(text):
        # Remove icon tags, including the characters inside angled brackets (e.g., <photo>, <emoji>)
        text = re.sub(r'<.*?>', '', text)
        
        # Remove common emoticons or icons like :v, :-), :)
        text = re.sub(r'(:\)|:-\)|:v|:D|<3)', '', text)
             
        # Remove extra whitespace caused by the removal of icons
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    
    # Apply cleaning to the text and summary fields
    sample["dialogue"] = clean_text(sample["dialogue"])
    sample["summary"] = clean_text(sample["summary"])
    return sample

# Preprocessing with the TweetSUM dataset
tweetsum_train_clean = tweetsum_train.map(preprocess_missingchar_and_icon)
tweetsum_test_clean = tweetsum_test.map(preprocess_missingchar_and_icon)
tweetsum_validate_clean = tweetsum_validate.map(preprocess_missingchar_and_icon)

Map:   0%|          | 0/879 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [4]:
def preprocessData(records, tokenizer, max_length_preprocess=128):
    sources = records["dialogue"]
    targets = records["summary"]

    input_encoding = tokenizer(sources, max_length=max_length_preprocess*8, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        output_encoding = tokenizer(targets, max_length=max_length_preprocess, padding="max_length", truncation=True)

    # Return as lists to ensure compatibility with DataLoader
    return {
        "input_ids": input_encoding["input_ids"],
        "attention_mask": input_encoding["attention_mask"],
        "labels": output_encoding["input_ids"],
    }
tweetsum_train_dataset = tweetsum_train_clean["train"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_validation_dataset = tweetsum_validate_clean["validation"].map(lambda x: preprocessData(x, tokenizer), batched=True)
tweetsum_test_dataset = tweetsum_test_clean["test"].map(lambda x: preprocessData(x, tokenizer), batched=True)

Map:   0%|          | 0/879 [00:00<?, ? examples/s]



Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [5]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer, EarlyStoppingCallback

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results/tweetsum-finetuned",
    evaluation_strategy="steps",  
    save_strategy="steps",        
    learning_rate=5e-5,
    weight_decay= 0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_total_limit=2,
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss", 
    greater_is_better=False,  
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tweetsum_train_dataset,
    eval_dataset=tweetsum_validation_dataset,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./finetuned_bart_tweetsum1")
tokenizer.save_pretrained("./finetuned_bart_tweetsum1")

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011115631455557681, max=1.0…

Step,Training Loss,Validation Loss
500,1.2819,0.652078
1000,0.5282,0.627011


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./finetuned_bart_tweetsum1/tokenizer_config.json',
 './finetuned_bart_tweetsum1/special_tokens_map.json',
 './finetuned_bart_tweetsum1/vocab.json',
 './finetuned_bart_tweetsum1/merges.txt',
 './finetuned_bart_tweetsum1/added_tokens.json',
 './finetuned_bart_tweetsum1/tokenizer.json')

In [14]:
# Build the customized DataLoader class for fine-tunning
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

tweetsum_train_dataloader = DataLoader(tweetsum_train_dataset, batch_size=8, shuffle=True)
tweetsum_validation_dataloader = DataLoader(tweetsum_validation_dataset, batch_size=8)
tweetsum_test_dataloader = DataLoader(tweetsum_test_dataset, batch_size=8)

In [None]:
!pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9d89f8dc5d81f3963937a591916bd2938acb354bf7e932c4ea5936ec416ca2d3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [15]:
# Model evaluating using ROUGE
from evaluate import load
import torch

# Load ROUGE metric
rouge = load("rouge")

# Function to generate predictions
def generate_predictions(model, tokenizer, dataset):
    predictions = []
    references = []

    for example in dataset:
        # Prepare the input dialogue
        inputs = tokenizer(
            example["dialogue"], 
            return_tensors="pt", 
            max_length=512, 
            truncation=True, 
            padding="max_length"
        )
        
        # Move inputs to GPU if available
        inputs = {k: v.to("cuda") for k, v in inputs.items()} if torch.cuda.is_available() else inputs
        
        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_length=128, 
                min_length=30, 
                do_sample=False
            )
        
        # Decode the generated summary
        generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Append generated summary and reference summary
        predictions.append(generated_summary)
        references.append(example["summary"])
    
    return predictions, references

# Generate predictions and references
test_predictions, test_references = generate_predictions(model, tokenizer, tweetsum_validation_dataloader)

# Compute ROUGE scores
rouge_results = rouge.compute(predictions=test_predictions, references=test_references)

# Print ROUGE scores
print("ROUGE Scores:")
for key, value in rouge_results.items():
    print(f"{key}: {value:.4f}")

ROUGE Scores:
rouge1: 0.4787
rouge2: 0.2325
rougeL: 0.4190
rougeLsum: 0.4161
