In [5]:
import transformers
import pandas as pd
from datasets import Dataset, load_metric
import datasets
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from huggingface_hub import notebook_login
from transformers import AutoTokenizer
import nltk
import numpy as np
import os
import gc
import torch
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
! pip install datasets transformers rouge-score nltk



In [7]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [8]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
train_set = pd.read_excel("/kaggle/input/summarization/Competition CERIST -- Summarization/Dataset 2 Arabic + English (XL_sum)/English/dataset_XL_sum_v1.0_train_en.xlsx")
train_ds = Dataset.from_pandas(train_set)

In [10]:
train_dataset, validation_dataset= train_ds.train_test_split(test_size=0.1).values()
data_all_splits = datasets.DatasetDict({"train":train_dataset, "val":validation_dataset})

In [11]:
data_all_splits

DatasetDict({
    train: Dataset({
        features: ['Document', 'Summary'],
        num_rows: 274291
    })
    val: Dataset({
        features: ['Document', 'Summary'],
        num_rows: 30477
    })
})

In [12]:
import re

def clean_text_row(text):
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove emails
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '', text)
    
    # Remove phone numbers (formats: xxx-xxx-xxxx, (xxx)xxx-xxxx, xxxxxxxxxx)
    text = re.sub(r'\b(?:\d{3}[-.]?)?\d{3}[-.]?\d{4}\b', '', text)
    
    # Remove special characters except whitespace
    text = re.sub(r'[^\w\s!.,;?\'"\-]', '', text)
    
    return text

In [13]:
def clean_text(example):
    example['doc_clean']= clean_text_row(example['Document'])
    example['sum_clean']= clean_text_row(example['Summary'])
    return example
    

In [14]:
data_all_splits=data_all_splits.map(clean_text)

  0%|          | 0/274291 [00:00<?, ?ex/s]

  0%|          | 0/30477 [00:00<?, ?ex/s]

In [15]:
metric = load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [16]:
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [17]:
prefix = "summarize: "

In [18]:
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["doc_clean"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["sum_clean"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
tokenized_datasets = data_all_splits.map(preprocess_function, batched=True)

  0%|          | 0/275 [00:00<?, ?ba/s]



  0%|          | 0/31 [00:00<?, ?ba/s]

In [20]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Document', 'Summary', 'doc_clean', 'sum_clean', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 274291
    })
    val: Dataset({
        features: ['Document', 'Summary', 'doc_clean', 'sum_clean', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 30477
    })
})

In [21]:

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [22]:
batch_size = 8
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-xsum_epoch_27",
    evaluation_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [24]:


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [25]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

In [26]:
torch.cuda.empty_cache()
gc.collect()
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.0,,30.2059,9.8508,23.9448,24.0281,18.2213




TrainOutput(global_step=34287, training_loss=0.0, metrics={'train_runtime': 27485.9148, 'train_samples_per_second': 9.979, 'train_steps_per_second': 1.247, 'total_flos': 1.8773901387730944e+17, 'train_loss': 0.0, 'epoch': 1.0})

In [27]:
trainer.push_to_hub()


events.out.tfevents.1707948552.cfe134b30fa2.34.0:   0%|          | 0.00/17.1k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lilouuch/flan-t5-base-finetuned-xsum_epoch_27/commit/09adbe87b078f46727b17afa9bbd93bb60eb29cf', commit_message='End of training', commit_description='', oid='09adbe87b078f46727b17afa9bbd93bb60eb29cf', pr_url=None, pr_revision=None, pr_num=None)