In [1]:
pip install rouge_score

Note: you may need to restart the kernel to use updated packages.


# Import Libraries

In [37]:
import pandas as pd
import numpy as np
from transformers import (AutoModelForSeq2SeqLM,
                          AutoTokenizer,
                          DataCollatorForSeq2Seq,
                          TrainingArguments,
                          Trainer,
                          IntervalStrategy,
                          EarlyStoppingCallback,
                         )
from datasets import Dataset, DatasetDict, load_metric
import torch
import nltk
nltk.download("punkt", quiet=True)

metric = load_metric("rouge", trust_remote_code=True)

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
encoder_max_length = 512
decoder_max_length = 128
batch_size = 2

# Prepare Data, Model, and Tokenizer

In [4]:
train = pd.read_csv("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")
train_dataset = Dataset.from_pandas(train)

In [7]:
cheakPoint = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(cheakPoint).to(device)
tokenizer = AutoTokenizer.from_pretrained(cheakPoint)

In [8]:
# Working on a sample
train_dataset = train_dataset.shuffle(seed=42)
train, val = train_dataset.select(range(400)), train_dataset.select(range(400, 490))

In [9]:
dataset_dict = DatasetDict({"train": train, "validation": val})
dataset_dict.remove_columns("id")

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 400
    })
    validation: Dataset({
        features: ['article', 'highlights'],
        num_rows: 90
    })
})

# Tokenization Step

In [11]:
def batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length):
    
    source, target = batch["article"], batch["highlights"]
    source_tokenized = tokenizer(source, padding="max_length", truncation=True, max_length=encoder_max_length )
    target_tokenized = tokenizer(target, padding="max_length", truncation=True, max_length=decoder_max_length)

    # Ignore padding in the loss
    target_labels = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]

    # Create a dictionary for the batch
    batch_dict = {
        "input_ids": source_tokenized["input_ids"],
        "attention_mask": source_tokenized["attention_mask"],
        "labels": target_labels,
    }

    return batch_dict

In [17]:
train_data = train.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length),
    batched=True,
    remove_columns=train.column_names,
)

validation_data = val.map(
    lambda batch: batch_tokenize_preprocess(batch, tokenizer, encoder_max_length, decoder_max_length),
    batched=True,
    remove_columns=val.column_names,
)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

# *Metric func* for compute metrics at evaluation. 

In [20]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
#     print(f"preds: {preds}")
#     print(f"labels: {labels}")
    decoded_preds = [tokenizer.batch_decode(np.argmax(pred, axis=1), skip_special_tokens=True) for pred in preds]
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = [tokenizer.batch_decode(label, skip_special_tokens=True) for label in labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {
        k: round(v, 4) for k, v in result.items()
    }
    return result

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Prepare the TrainingArguments module

In [30]:
training_args = TrainingArguments(
    output_dir='bart_CNN_NLP',
    num_train_epochs=4,  
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    logging_dir="bart_logs",
    logging_steps=20,
    load_best_model_at_end=True,
    evaluation_strategy = "steps",
    eval_steps = 40,
    save_steps=1e6,
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Fine-Tuning step

In [32]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
40,3.1748,3.156408,44.8208,26.6733,41.2873,41.226,6433791.8889
80,3.0649,2.938647,45.8469,27.8327,41.8543,41.8139,6433791.8556
120,2.6983,2.871237,47.7681,29.8568,43.9396,43.8816,6433791.8778
160,2.6725,2.869803,46.6433,29.2504,43.1299,43.0348,6433791.9333
200,2.7537,2.853443,47.0645,29.6233,43.5479,43.4841,6433791.8778
240,2.3728,2.93052,46.1673,28.848,42.6293,42.5577,6433791.8889
280,2.3572,2.941355,47.2408,29.4202,43.4668,43.3747,6433791.9
320,2.087,3.036608,46.652,28.7844,42.7646,42.6204,6433791.8778
360,2.1212,3.016856,46.6902,28.1997,42.5114,42.4226,6433791.8222
400,2.1264,3.047896,45.8751,28.1917,42.0922,41.9934,6433791.8333


TrainOutput(global_step=400, training_loss=2.5789558506011963, metrics={'train_runtime': 537.4812, 'train_samples_per_second': 2.977, 'train_steps_per_second': 0.744, 'total_flos': 1733683681689600.0, 'train_loss': 2.5789558506011963, 'epoch': 4.0})

# Generate sumary

In [39]:
def generate_summary(test_samples, model, max_length):
    inputs = tokenizer(
        test_samples,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str

In [86]:
sample = "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct."

In [88]:
res = generate_summary(sample, trainer.model, max_length=1028)
res

['The Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.\nDuring its construction, the EIFFel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.\nIt held the title for 41 years until the Chrysler Building in New York City was finished in 1930.']

# Push the model to my HuggingFace 🤗 repo

In [35]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [89]:
# Push your model to the Model Hub
trainer.push_to_hub()

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


events.out.tfevents.1714223528.dac0803a1b2d.5846.6:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

events.out.tfevents.1714214491.dac0803a1b2d.34.0:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

events.out.tfevents.1714223507.dac0803a1b2d.5846.5:   0%|          | 0.00/5.58k [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

events.out.tfevents.1714228525.dac0803a1b2d.7049.0:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Moatasem22/bart_CNN_NLP/commit/31cf7b76fd7df65e3653a4fed631b6ed531d34b0', commit_message='End of training', commit_description='', oid='31cf7b76fd7df65e3653a4fed631b6ed531d34b0', pr_url=None, pr_revision=None, pr_num=None)