In [None]:
import datasets
import transformers
import pandas as pd
from datasets import Dataset
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Install the Hugging Face data/model libraries plus the ROUGE scorer and NLTK,
# which the metric cells further down rely on.
# NOTE(review): versions are unpinned; later cells use `load_metric` and
# `as_target_tokenizer`, which newer releases deprecate — consider pinning.
! pip install datasets transformers rouge-score nltk


In [None]:
# Install pyarabic (presumably a dependency of the AraBERT preprocessor — confirm);
# transformers is already covered by the previous install cell.
!pip install transformers pyarabic
# Clone the AraBERT repository into the working directory so that
# `arabert.preprocess` can be imported by a later cell.
!git clone https://github.com/aub-mind/arabert 

In [None]:
# Install Git LFS; presumably needed to upload model weights when the trainer
# pushes to the Hub (push_to_hub=True below) — confirm.
!apt install git-lfs

In [None]:
from huggingface_hub import notebook_login

# Ask for a Hugging Face access token interactively; the trainer is configured
# with push_to_hub=True and calls push_to_hub() at the end of the notebook.
notebook_login()

In [None]:
DATA_DIR = "../input/arabic-summarization/Arabic"
TEXT_COLUMNS = ("Summary", "Document")


def _read_split(path):
    """Read one Excel split and remove newline characters from the text columns.

    Args:
        path: Location of the ``.xlsx`` file to read.

    Returns:
        A DataFrame whose "Summary" and "Document" columns contain no ``\\n``.
    """
    frame = pd.read_excel(path)
    for column in TEXT_COLUMNS:
        # NOTE(review): newlines are deleted rather than replaced by a space,
        # which glues the words on either side together — confirm this is intended.
        frame[column] = frame[column].replace(r'\n', '', regex=True)
    return frame


train_set = _read_split(f"{DATA_DIR}/dataset_XL_sum_v1.0_train_ar.xlsx")
test_set = _read_split(f"{DATA_DIR}/dataset_XL_sum_v1.0_test_ar.xlsx")

# Report how many training rows are lost to missing values.
print(train_set.shape)
train_set = train_set.dropna()
print(train_set.shape)

# The test split is tokenized together with the others later on, and the
# tokenizer rejects missing text, so rows without a document or summary are
# dropped here as well (only the two text columns are checked).
test_set = test_set.dropna(subset=list(TEXT_COLUMNS))
print(test_set.shape)

In [None]:
# Display the cleaned training DataFrame as the cell output.
train_set

In [None]:
# Print column dtypes and non-null counts for the training split.
train_set.info()

In [None]:
# Print column dtypes and non-null counts for the test split.
test_set.info()

In [None]:
# Convert the pandas frames to Hugging Face Datasets.
# preserve_index=False: after dropna() the frame no longer has a plain
# RangeIndex, and the default would store it as an extra
# "__index_level_0__" column that nothing downstream uses.
train_ds = Dataset.from_pandas(train_set, preserve_index=False)
test_ds = Dataset.from_pandas(test_set, preserve_index=False)


In [None]:
# Display the Dataset summary (column names and row count).
train_ds

In [None]:
# Hold out 15% of the training rows for validation. The fixed seed makes the
# split identical on every Restart & Run All; without it each run trains and
# validates on different rows.
SPLIT_SEED = 42
train_val_split = train_ds.train_test_split(test_size=0.15, seed=SPLIT_SEED)

# Pick the two parts by key instead of relying on the order of .values().
train_dataset = train_val_split["train"]
validation_dataset = train_val_split["test"]

# For a quick smoke test, shrink both parts with `.select(range(10))`.

In [None]:
# Bundle the three splits into one DatasetDict; the trainer cell reads the
# "train" and "val" entries.
data_all_splits = datasets.DatasetDict(
    {
        "train": train_dataset,
        "test": test_ds,
        "val": validation_dataset,
    }
)

In [None]:
# Display the split names with their columns and row counts.
data_all_splits

In [None]:

from transformers import BertTokenizer, AutoModelForSeq2SeqLM, pipeline
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Hub checkpoint id shared by the tokenizer, the model and the AraBERT preprocessor.
model_name = "malmarjeh/mbert2mbert-arabic-text-summarization"

# Tokenizer and sequence-to-sequence model are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# AraBERT text preprocessor; its call inside preprocess_function is commented
# out, so it is constructed here but not applied.
preprocessor = ArabertPreprocessor(model_name=model_name)

In [None]:
import re


def WHITESPACE_HANDLER(k):
    """Normalize whitespace in ``k``.

    Leading and trailing whitespace is stripped, then every run of whitespace
    characters (spaces, tabs, newlines) is collapsed to a single space.

    Args:
        k: The text to normalize.

    Returns:
        The normalized string.
    """
    # r'\s+' already matches newlines, so a separate '\n+' pass is unnecessary;
    # the raw string avoids the invalid-escape warning that '\s' triggers.
    return re.sub(r'\s+', ' ', k.strip())

In [None]:
# Token budgets: documents are truncated to 512 tokens, summaries to 128.
max_input_length = 512
max_target_length = 128


def preprocess_function(examples):
    """Tokenize one batch of documents together with their reference summaries.

    Args:
        examples: Mapping with "Document" and "Summary" entries, each holding
            the texts of one batch.

    Returns:
        The tokenizer output for the documents, with the token ids of the
        summaries stored under the "labels" key.
    """
    documents = list(examples["Document"])
    summaries = examples["Summary"]

    # NOTE: the AraBERT preprocessor is not applied to the documents here.
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # The summaries are encoded with the tokenizer switched to target mode.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            summaries,
            max_length=max_target_length,
            truncation=True,
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [None]:
# Tokenize every split batch-wise; the tokenizer outputs and "labels" are added
# alongside the existing columns.
tokenized_datasets = data_all_splits.map(
    preprocess_function,
    batched=True,
)

In [None]:
# Display the tokenized splits to check the newly added columns.
tokenized_datasets

In [None]:
batch_size = 4

# Keep only the part of the checkpoint id after the last "/" for naming the run.
model_name = model_name.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # Output directory, built from the checkpoint name instead of repeating it
    # in a placeholder-less f-string; for the checkpoint loaded above this is
    # the same string as before.
    f"{model_name}-finetuned-xsum_arabic_abstractive_final_finaln",
    # NOTE(review): newer transformers releases call this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=2,
    # Run generate() during evaluation so compute_metrics receives token ids
    # of generated summaries.
    predict_with_generate=True,
    # Mixed-precision training; needs a CUDA GPU.
    fp16=True,
    push_to_hub=True,
    overwrite_output_dir=True,
)

In [None]:
# Batch collator for seq2seq training: pads inputs and labels per batch; the
# model is handed over so decoder inputs can be prepared from the labels.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
)


In [None]:
from datasets import load_metric
# Load the ROUGE metric used by compute_metrics.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package, whose ROUGE result has a different
# shape than the `.mid.fmeasure` access below expects — confirm the installed version.
metric = load_metric("rouge")

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for one evaluation run.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive as a tuple whose first element holds
            the generated ids.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure in percent, plus
        "gen_len" (mean number of non-padding tokens per prediction); all
        values are rounded to four decimals.

    Notes:
        ``nltk.sent_tokenize`` needs the NLTK punkt data to be available.
        NOTE(review): ``use_stemmer=True`` and the ROUGE scorer's default
        tokenization are designed for English; verify that Arabic summaries
        produce meaningful (non-zero) scores.
    """
    predictions, labels = eval_pred
    # Some models hand back (generated_ids, extra_outputs); only the ids are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions and cannot be decoded. It can occur
    # in the predictions as well as in the labels, so both are mapped to the
    # pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted after the -100 replacement so padding is
    # not mistaken for generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Display the tokenized splits again before building the trainer (same object
# as shown after the map() cell).
tokenized_datasets

In [None]:
# Wire model, arguments, data and metric function into the trainer. Training
# uses the "train" split; per-epoch evaluation uses the held-out "val" split.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import os
import gc
import torch
# Switch off Weights & Biases reporting before training starts.
# NOTE(review): the training arguments and the trainer were already created in
# earlier cells; confirm the variable still takes effect at this point, or pass
# report_to="none" to Seq2SeqTrainingArguments instead.
os.environ["WANDB_DISABLED"] = "true"

# Collect unreachable Python objects first so the tensors they held are
# released, then return the now-unused cached CUDA memory. In the opposite
# order the memory freed by the collector would stay in the cache.
gc.collect()
torch.cuda.empty_cache()

trainer.train()

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub (uses the login performed
# near the top of the notebook).
trainer.push_to_hub()
