In [None]:
!pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] PyArabic contractions evaluate

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read below.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the spreadsheet from the mounted Drive and preview its first rows.
# Later cells read the 'Text (Arabic)' and 'Text (English)' columns from it.
df= pd.read_excel("/content/drive/My Drive/combined.xlsx")
df.head()

In [None]:
# Count missing values per column.
# NOTE(review): nothing in this notebook drops or fills missing rows, and the
# cleaning functions below pass each value straight to re.sub — this presumably
# relies on the counts here being zero; confirm.
df.isnull().sum()

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset and print its summary.
my_dataset = Dataset.from_pandas(df)
print(my_dataset)

In [None]:
import pyarabic.araby as araby
import re

def clean_arabic(example):
  """Normalise the Arabic text of a single example.

  Replaces every Latin comma with the Arabic comma and then removes
  diacritics via ``araby.strip_diacritics``. The example is updated in
  place and returned.
  """
  arabic_text = example['Text (Arabic)']
  arabic_text = re.sub(r'[,]', '،', arabic_text)
  example['Text (Arabic)'] = araby.strip_diacritics(arabic_text)
  return example

# Apply the Arabic cleaning to every row; map is called without batched=True,
# so clean_arabic receives one example at a time.
my_dataset = my_dataset.map(clean_arabic)

In [None]:
import re

# Select (keep) the rows whose text contains anything other than Arabic letters
# and whitespace, so they can be inspected.
def filter_dataset(dataset, column='Text (Arabic)', pattern=r'[^\u0621-\u064A\s]'):
    """Return the rows of ``dataset`` whose ``column`` text matches ``pattern``.

    Args:
        dataset: Object that returns a sequence of strings for ``dataset[column]``
            and supports ``select(indices)``.
        column: Name of the text column to scan. Defaults to the Arabic column.
        pattern: Regular expression; a row is selected when ``re.search`` finds
            a match. The default matches any character outside U+0621-U+064A
            and whitespace.

    Returns:
        Whatever ``dataset.select`` returns for the indices of matching rows.
    """
    matcher = re.compile(pattern)
    # Read the column once instead of evaluating dataset[column] on every iteration.
    texts = dataset[column]
    flagged_rows = [i for i, text in enumerate(texts) if matcher.search(text)]
    return dataset.select(flagged_rows)

# Collect the rows whose Arabic text still contains non-letter characters and
# print a summary of that subset.
filtered_dataset2 = filter_dataset(my_dataset)

print(filtered_dataset2)

In [None]:
# Print the Arabic text of every flagged row, separated by blank lines.
for row in filtered_dataset2:
  print(row['Text (Arabic)'])
  print("\n")

In [None]:
import contractions

def clean_english(example):
  """Normalise the English text of a single example.

  Steps, in order: expand contractions (e.g. "I'm" -> "I am"), delete
  characters in the range U+0621-U+064A, delete the characters ``( ) * …``,
  replace ``%`` with `` percent``, and strip surrounding whitespace. The
  example is updated in place and returned.
  """
  english_text = contractions.fix(example['Text (English)'])
  english_text = re.sub(r'[\u0621-\u064A]', '', english_text)
  english_text = re.sub(r'[()*…]', '', english_text)
  english_text = re.sub(r'[%]', ' percent', english_text)
  example['Text (English)'] = english_text.strip()
  return example

# Apply the English cleaning to every row, one example at a time.
my_dataset = my_dataset.map(clean_english)

In [None]:
import re

# Select (keep) the rows whose text contains anything other than ASCII letters
# and whitespace, so they can be inspected.
def filter_dataset(dataset, column='Text (English)', pattern=r'[^A-Za-z\s]'):
    """Return the rows of ``dataset`` whose ``column`` text matches ``pattern``.

    Args:
        dataset: Object that returns a sequence of strings for ``dataset[column]``
            and supports ``select(indices)``.
        column: Name of the text column to scan. Defaults to the English column.
        pattern: Regular expression; a row is selected when ``re.search`` finds
            a match. The default matches any character that is not an ASCII
            letter or whitespace.

    Returns:
        Whatever ``dataset.select`` returns for the indices of matching rows.
    """
    matcher = re.compile(pattern)
    # Read the column once instead of evaluating dataset[column] on every iteration.
    texts = dataset[column]
    flagged_rows = [i for i, text in enumerate(texts) if matcher.search(text)]
    return dataset.select(flagged_rows)

# Collect the rows whose English text still contains non-letter characters and
# print a summary of that subset.
filtered_dataset2 = filter_dataset(my_dataset)

print(filtered_dataset2)

In [None]:
# Print the English text of every flagged row, separated by blank lines.
for row in filtered_dataset2:
  print(row['Text (English)'])
  print("\n")

In [None]:
from transformers import AutoTokenizer

# Pretrained Arabic->English checkpoint; the same id is used later to load the model.
model_checkpoint="Helsinki-NLP/opus-mt-ar-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# NOTE(review): this install runs after the tokenizer above has already been
# created; if the tokenizer needs sacremoses at load time, move this before
# that cell — confirm.
!pip install sacremoses

In [None]:
# Show one cleaned sentence pair (row 40) as a spot check.
print(my_dataset[40]['Text (Arabic)']+"\n")
print(my_dataset[40]['Text (English)'])

In [None]:
# Inspect how the tokenizer splits the Arabic side of the sample row.
tokenizer.tokenize(my_dataset[40]['Text (Arabic)'])

In [None]:
# Inspect how the tokenizer splits the English side of the sample row.
# NOTE(review): this call does not switch the tokenizer to target mode, so the
# pieces shown may differ from those used for labels during training — confirm.
tokenizer.tokenize(my_dataset[40]['Text (English)'])

In [None]:
# Display the dataset summary after cleaning.
my_dataset

In [None]:
# Tokenize the sentences and calculate their lengths to find max length
# tokenized_lengths = [len(tokenizer.encode(sentence)) for sentence in my_dataset['Text (English)']]

# print("Tokenized Lengths of Sentences:", tokenized_lengths)

# import numpy as np
# print("Mean Length:", np.mean(tokenized_lengths))
# print("Max Length:", np.max(tokenized_lengths))

# x = np.where(np.array(tokenized_lengths) > 390)
# print(len(x[0]))

# import matplotlib.pyplot as plt
# plt.hist(tokenized_lengths)
# plt.show()

In [None]:
# Hold out 30% of the rows with a fixed seed. my_dataset is rebound from a single
# dataset to the split result with 'train' and 'test' entries, so this cell is
# not meant to be re-run on its own output.
my_dataset = my_dataset.train_test_split(test_size=0.3, seed=42)
print(my_dataset)

In [None]:
# Split the held-out 30% in half (same seed): one half becomes validation, the
# other the final test set.
my_dataset_test = my_dataset['test'].train_test_split(test_size=0.5, seed=42)
print(my_dataset_test)

In [None]:
# Store the two halves back on my_dataset so it holds train / validation / test.
my_dataset['validation'] = my_dataset_test['train']
my_dataset['test'] = my_dataset_test['test']
print(my_dataset)

In [None]:
# Truncation limits passed as max_length to the tokenizer for the Arabic inputs
# and the English targets respectively.
# NOTE(review): 390 also appears as a threshold in the commented-out length
# analysis earlier in the notebook; presumably both limits came from that
# exploration — confirm.
max_input_length = 190
max_target_length = 390
# Language codes; used when building the training output directory name.
source_lang = "ar"
target_lang = "en"

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: Mapping with "Text (Arabic)" (source texts) and
            "Text (English)" (target texts).

    Returns:
        The tokenizer output for the Arabic texts, truncated to
        ``max_input_length``, with an extra "labels" entry holding the input
        ids of the English texts truncated to ``max_target_length``.
    """
    model_inputs = tokenizer(examples["Text (Arabic)"], max_length=max_input_length, truncation=True)

    # text_target= tokenizes with the target-side settings; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=examples["Text (English)"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True hands preprocess_function a batch of rows per call.
tokenized_datasets = my_dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the trainer; it is given both
# the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from evaluate import load

# Metrics used by compute_metrics during evaluation.
bleu = load("bleu")
meteor = load('meteor')

In [None]:
# Sanity-check both metrics on two hand-written sentence pairs: one prediction
# is missing a word, the other matches its reference exactly.
fake_preds = ["Is there elevator?", "I've seen him before."]
fake_labels = [["Is there an elevator?"], ["I've seen him before."]] # list of list when multiple references

print(bleu.compute(predictions=fake_preds, references=fake_labels))
print(meteor.compute(predictions=fake_preds, references=fake_labels))

In [None]:
import numpy as np

def postprocess_text(preds, labels):
  """Strip leading and trailing whitespace from every prediction and label.

  Returns the two cleaned lists as a ``(preds, labels)`` tuple.
  """
  cleaned_preds = list(map(str.strip, preds))
  cleaned_labels = list(map(str.strip, labels))
  return cleaned_preds, cleaned_labels

def compute_metrics(eval_preds):
  """Decode predictions and labels and score them with BLEU and METEOR.

  Args:
    eval_preds: A ``(predictions, labels)`` pair of token-id arrays. If
      ``predictions`` is itself a tuple, its first element is used.

  Returns:
    Dict with ``bleu``, ``meteor`` and ``gen_len`` (mean number of non-pad
    tokens per prediction), each rounded to 4 decimal places.
  """
  preds, labels = eval_preds
  if isinstance(preds, tuple):
      preds = preds[0]
  # -100 is not a real token id and cannot be decoded. Labels use it for ignored
  # positions, and predictions may carry it as padding too, so map it to the pad
  # token in both arrays before decoding.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
  # Some simple post-processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
  result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
  result_meteor = meteor.compute(predictions=decoded_preds, references=decoded_labels)
  result = {"bleu": result["bleu"]}
  result['meteor'] = result_meteor['meteor']
  # Counted after the replacement above, so -100 padding is not treated as
  # generated tokens.
  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)
  result = {k: round(v, 4) for k, v in result.items()}
  return result

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; later cells push the model and tokenizer to the Hub.
notebook_login()

In [None]:
from transformers import Seq2SeqTrainingArguments

# try diff parameters
# Batch size used for both training and evaluation on each device.
batch_size = 4
# Last path component of the checkpoint id ("opus-mt-ar-en").
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory; its name matches the Hub repo id used when pushing the tokenizer.
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}-final2",
    # Evaluate and checkpoint once per epoch (the two strategies are set to the same value).
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy — confirm against the installed version.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.001,
    num_train_epochs=15,
    # Run generation during evaluation so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # Automatic pushing is off; a later cell calls trainer.push_to_hub() explicitly.
    push_to_hub=False,
    warmup_steps=20,
    # Reload the checkpoint with the best eval_loss when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    hub_private_repo=True,
    report_to=["tensorboard"]

)

In [None]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a trainer, with early stopping at a patience of 4.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

In [None]:
# Upload the tokenizer to the Hub repo that the pipeline below loads from.
tokenizer.push_to_hub("itskavya/opus-mt-ar-en-finetuned-ar-to-en-final2")

In [None]:
# Look at one raw (untokenized) test example.
my_dataset['test'][2]

In [None]:
from transformers import pipeline

# Build a translation pipeline from the fine-tuned model that was pushed to the Hub.
pipe = pipeline("translation", model="itskavya/opus-mt-ar-en-finetuned-ar-to-en-final2")

In [None]:
# Translate a single test sentence as a smoke test of the pipeline.
en = pipe(my_dataset['test'][2]['Text (Arabic)'])
print(en)

In [None]:
# Translate every Arabic sentence in the test split, one at a time, keeping the
# translated text of the first result for each.
all_predictions = []

for arabic_text in my_dataset['test']['Text (Arabic)']:
  translation = pipe(arabic_text)
  all_predictions.append(translation[0]['translation_text'])

In [None]:
# Display the generated translations.
all_predictions

In [None]:
# Display the reference English translations for comparison.
my_dataset['test']["Text (English)"]

In [None]:
from evaluate import load

# Score the pipeline translations against the test-set references with BLEU.
bleu_metric = load("bleu")

bleu_result = bleu_metric.compute(
    references=my_dataset['test']["Text (English)"], predictions=all_predictions
)
bleu_result

In [None]:
# Score the same translations against the test-set references with METEOR.
meteor_metric = load("meteor")

meteor_result = meteor_metric.compute(
    references=my_dataset['test']["Text (English)"], predictions=all_predictions
)

meteor_result

References:

- https://medium.com/@tskumar1320/how-to-fine-tune-pre-trained-language-translation-model-3e8a6aace9f