### Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import torch
from datasets import load_dataset

### Load model and Tokenizer

In [3]:
model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


### Load and Preprocess Dataset

In [4]:
raw_datasets = load_dataset("Amr-khaled/Egyptian-Arabic_English_V1")

def is_valid_example(example):
    return isinstance(example["Egy"], str) and isinstance(example["English"], str)

filtered_datasets = raw_datasets.filter(is_valid_example)


In [5]:
raw_datasets

DatasetDict({
    NADI_2024_SubTask_EgyText_Translated: Dataset({
        features: ['Egy', 'English', 'Egy_Text_Source'],
        num_rows: 12799
    })
    Milion_Token_EGY_Songs: Dataset({
        features: ['Egy', 'English', 'Egy_Text_Source'],
        num_rows: 6554
    })
    ArzEn_MultiGenre: Dataset({
        features: ['Egy', 'English', 'Egy_Text_Source'],
        num_rows: 13946
    })
})

In [7]:
raw_datasets['NADI_2024_SubTask_EgyText_Translated'] [0]

{'Egy': 'أنا بقترح إنك تيجي في مايو أو أكتوبر اللي هما أحسن مواسم في السنة.',
 'English': 'I suggest you come in May or October, which are the best seasons of the year.',
 'Egy_Text_Source': '[NADI 2024: The Fifth Nuanced Arabic Dialect Identification Shared Task](https://aclanthology.org/2024.arabicnlp-1.79) (Abdul-Mageed et al., ArabicNLP-WS 2024)'}

### Tokenize data

In [8]:
max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = examples["Egy"]
    targets = examples["English"]
    
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
        
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = filtered_datasets.map(preprocess_function, batched=True)


### Data Collator

In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


### BLEU Scoring

In [10]:
import sacrebleu

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    preds, labels = postprocess_text(decoded_preds, decoded_labels)
    bleu = sacrebleu.corpus_bleu(preds, labels)
    return {"bleu": bleu.score}


### Training arguments

In [11]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=1000,
)





### Trainer

In [12]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["NADI_2024_SubTask_EgyText_Translated"],
    eval_dataset=tokenized_datasets["Milion_Token_EGY_Songs"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Seq2SeqTrainer(


In [13]:
trainer.train()
trainer.evaluate()

Step,Training Loss
1000,0.9738
2000,0.7181
3000,0.594
4000,0.4963




{'eval_loss': 2.3610315322875977,
 'eval_bleu': 64.75804331284182,
 'eval_runtime': 4764.4396,
 'eval_samples_per_second': 1.375,
 'eval_steps_per_second': 0.172,
 'epoch': 3.0}

In [14]:
model.save_pretrained("./my_finetuned_model_egy_en")
tokenizer.save_pretrained("./my_finetuned_model_egy_en")


('./my_finetuned_model_egy_en\\tokenizer_config.json',
 './my_finetuned_model_egy_en\\special_tokens_map.json',
 './my_finetuned_model_egy_en\\vocab.json',
 './my_finetuned_model_egy_en\\source.spm',
 './my_finetuned_model_egy_en\\target.spm',
 './my_finetuned_model_egy_en\\added_tokens.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import os

model_checkpoint = "./my_finetuned_model_egy_en"

if not os.path.isdir(model_checkpoint):
    raise ValueError(f"Model directory not found: {model_checkpoint}")

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, local_files_only=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62834, 512, padding_idx=62833)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62834, 512, padding_idx=62833)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [16]:
text = "لأ، للأسف مش عارف أنام بسبب الأخبار الوحشة دي"
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
translated_tokens = model.generate(**inputs)
translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
print(translated_text)


No, unfortunately, I don't sleep because of this terrible news.


## English to Arabic Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from datasets import load_dataset
import torch

model_checkpoint = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

raw_datasets = load_dataset("Amr-khaled/Egyptian-Arabic_English_V1")

def is_valid_example(example):
    return isinstance(example["Egy"], str) and isinstance(example["English"], str)

filtered_datasets = raw_datasets.filter(is_valid_example)

max_input_length = 128
max_target_length = 128

def preprocess_function(examples):
    inputs = examples["English"]  
    targets = examples["Egy"]  
    
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = filtered_datasets.map(preprocess_function, batched=True)


Map: 100%|██████████| 12799/12799 [00:01<00:00, 7348.03 examples/s]
Map: 100%|██████████| 6551/6551 [00:11<00:00, 569.22 examples/s]
Map: 100%|██████████| 13946/13946 [00:02<00:00, 6892.22 examples/s]


In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_en_to_egy",  
    save_strategy="epoch", 
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,  
    num_train_epochs=3, 
    predict_with_generate=True,
    logging_dir="./logs_en_to_egy",
    logging_steps=1000,
)


In [20]:
import sacrebleu

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    preds, labels = postprocess_text(decoded_preds, decoded_labels)
    bleu = sacrebleu.corpus_bleu(preds, labels)
    return {"bleu": bleu.score}


In [21]:
from transformers import Seq2SeqTrainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["NADI_2024_SubTask_EgyText_Translated"],
    eval_dataset=tokenized_datasets["Milion_Token_EGY_Songs"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

# Save the model and tokenizer
model.save_pretrained("./my_finetuned_model_en_to_ar")
tokenizer.save_pretrained("./my_finetuned_model_en_to_ar")


  trainer = Seq2SeqTrainer(


Step,Training Loss
1000,2.19
2000,1.5577
3000,1.3519
4000,1.1944




('./my_finetuned_model_en_to_ar\\tokenizer_config.json',
 './my_finetuned_model_en_to_ar\\special_tokens_map.json',
 './my_finetuned_model_en_to_ar\\vocab.json',
 './my_finetuned_model_en_to_ar\\source.spm',
 './my_finetuned_model_en_to_ar\\target.spm',
 './my_finetuned_model_en_to_ar\\added_tokens.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_checkpoint = "./my_finetuned_model_en_to_ar"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

test_sentences = [
      "I like reading books in my free time.",
    "This is a beautiful city.",
    "How much does this cost?",
]

for sentence in test_sentences:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(device)
    translated_tokens = model.generate(**inputs)
    translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    print(f"English: {sentence}")
    print(f"Arabic: {translated_text}\n")


English: I like reading books in my free time.
Arabic: عايز اقرا كتب في وقت فراغي .

English: This is a beautiful city.
Arabic: دي مدينة جميلة.

English: How much does this cost?
Arabic: دة يكلف كام؟

