<a href="https://colab.research.google.com/github/habebamostafa/Multimodal_Translation_project/blob/main/Machine_Translation_V1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Required Packages

In [None]:
!pip install "tensorflow-text>=2.11";
!pip install einops;
!pip install datasets

Collecting tensorflow-text>=2.11
  Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tensorflow_text-2.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow-text
Successfully installed tensorflow-text-2.17.0
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatibl

# Import Libraries

In [None]:
import os

import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import MarianMTModel, MarianTokenizer
from transformers import AutoTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import torch
import plotly.subplots as sp
import plotly.graph_objects as go

# Load Dataset

In [None]:
# Read the tab-separated sentence pairs. The column names are supplied here,
# so every line of the file is treated as data (no header row).
data = pd.read_csv(
    "ara.txt",
    sep="\t",
    header=None,
    names=['english', 'arabic', 'CC'],
)
data.head()

Unnamed: 0,english,arabic,CC
0,Hi.,مرحبًا.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Run!,اركض!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
2,Duck!,اخفض رأسك!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Duck!,اخفضي رأسك!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Duck!,اخفضوا رؤوسكم!,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


# Display Dataset Preview and Information

In [None]:
# Drop the license/attribution column; only the sentence pairs are used later.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: the in-place drop raised KeyError on a second execution.
data = data.drop(columns=['CC'], errors='ignore')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12523 entries, 0 to 12522
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   english  12523 non-null  object
 1   arabic   12523 non-null  object
dtypes: object(2)
memory usage: 195.8+ KB


# Count Duplicate Rows

In [None]:
# Flag every row that repeats an earlier row, then count the flags.
duplicate_flags = data.duplicated()
num_duplicates = duplicate_flags.sum()
print(f"Number of Duplicate Rows: {num_duplicates}")

Number of Duplicate Rows: 0


# Number of Words per Sentence

In [None]:
# Word count (whitespace-separated tokens) of every sentence, per language.
input_lengths = [len(sentence.split()) for sentence in data['english']]
output_lengths = [len(sentence.split()) for sentence in data['arabic']]

# One histogram per language.
hist_input = go.Histogram(x=input_lengths, nbinsx=50, name='English')
hist_output = go.Histogram(x=output_lengths, nbinsx=50, name='Arabic')

# Place the two histograms side by side, each with its own labelled x-axis.
fig = sp.make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('English Sentence Lengths', 'Arabic Sentence Lengths'),
)
for column, trace in enumerate((hist_input, hist_output), start=1):
    fig.add_trace(trace, row=1, col=column)
    fig.update_xaxes(title_text='Sentence Length', row=1, col=column)

fig.update_layout(showlegend=False, title_text='Distribution of Sentence Lengths')
fig.show()

# Maximum Sentence Lengths

In [None]:
# Longest sentence, in words, for each language.
max_input = max(input_lengths)
max_output = max(output_lengths)
print(f"Maximum English Sentence Length: {max_input}")
print(f"Maximum Arabic Sentence Length: {max_output}")

Maximum English Sentence Length: 34
Maximum Arabic Sentence Length: 36


# Total Number of Unique Words in Each Language

In [None]:
# Vocabulary size per language = number of distinct whitespace-separated tokens.
english_vocab = set()
for sentence in data['english']:
    english_vocab.update(sentence.split())
arabic_vocab = set()
for sentence in data['arabic']:
    arabic_vocab.update(sentence.split())

unique_words_input = len(english_vocab)
unique_words_output = len(arabic_vocab)

# One bar per language, added as separate traces.
fig = go.Figure()
fig.add_trace(go.Bar(x=['English'], y=[unique_words_input], name='English'))
fig.add_trace(go.Bar(x=['Arabic'], y=[unique_words_output], name='Arabic'))
fig.update_layout(
    title_text='Total Number of Unique Words in Each Language',
    barmode='group',
    xaxis_title='Language',
    yaxis_title='Total Unique Words',
)

print(f"Unique words in English: {unique_words_input}")
print(f"Unique words in Arabic: {unique_words_output}")
fig.show()

Unique words in English: 7291
Unique words in Arabic: 15571


# Split Data into Training and Validation Sets and Save to CSV

In [None]:
# Hold out 20% of the sentence pairs for validation; the fixed random_state
# makes the split reproducible.
train, validation = train_test_split(data, test_size=0.2, random_state=42)

# index=False: otherwise the shuffled row index is written as an unnamed first
# column and is read back as a spurious extra feature when the CSVs are reloaded.
train.to_csv('train.csv', index=False)
validation.to_csv('validation.csv', index=False)

# Load Pre-trained Model and Dataset

In [None]:
import zipfile
from datasets import load_dataset

# Pre-trained checkpoint that is fine-tuned below; the tokenizer and the model
# are loaded from the same checkpoint name.
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Reload the CSV splits written earlier as a dataset with "train" and
# "validation" splits.
dataset = load_dataset(
    "csv",
    data_files={"train": "train.csv", "validation": "validation.csv"},
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]


Recommended: pip install sacremoses.


`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884



Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

# Tokenization Function

In [None]:
def tokenize_function(examples, max_length=128):
    """Tokenize English inputs and Arabic targets to a fixed length.

    Args:
        examples: Mapping with "english" and "arabic" entries, each either a
            single string or a list of strings (as passed by a batched map).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output for the English text, with an added "labels"
        entry holding the Arabic token ids in which every padding position
        is replaced by -100.
    """
    model_inputs = tokenizer(
        examples["english"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the target-language text.
    labels = tokenizer(
        text_target=examples["arabic"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Labels equal to -100 are ignored by the loss. Without this, the padding
    # added by padding="max_length" is scored like real tokens and dominates
    # the reported training/validation loss for short sentences.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        """Return `ids` with every padding id replaced by -100."""
        return [token if token != pad_id else -100 for token in ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: a flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

**Tokenize Dataset**

In [None]:
# Tokenize every split; batched=True hands tokenize_function lists of
# sentences rather than one example at a time.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/10018 [00:00<?, ? examples/s]


`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.



Map:   0%|          | 0/2505 [00:00<?, ? examples/s]

# Configure Training Arguments and Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters. Evaluation and checkpointing both happen every
# 500 optimizer steps.
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` is deprecated and scheduled for removal in
    # Transformers 4.46; `eval_strategy` is its replacement.
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500
)

# Wire the pre-trained model, the tokenized splits and the tokenizer together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)


`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



**Train the Model**

In [None]:
# Run fine-tuning with the Trainer; the returned TrainOutput (global step,
# training loss, runtime metrics) is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
500,0.7137,0.065177
1000,0.0569,0.064325
1500,0.045,0.064449


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


TrainOutput(global_step=1881, training_loss=0.22440671185620728, metrics={'train_runtime': 806.9418, 'train_samples_per_second': 37.244, 'train_steps_per_second': 2.331, 'total_flos': 1018781459546112.0, 'train_loss': 0.22440671185620728, 'epoch': 3.0})

**Evaluate Model Performance**

In [None]:
# Evaluate on the validation split that was passed to the Trainer as
# eval_dataset, then print the returned metrics dictionary.
metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.0640525072813034, 'eval_runtime': 18.7208, 'eval_samples_per_second': 133.808, 'eval_steps_per_second': 8.386, 'epoch': 3.0}


**Save Trained Model**

In [None]:
# Save the fine-tuned model under "modelE2A.h5".
# NOTE: despite the .h5 suffix, this path is used as a directory later in the
# notebook (loaded by `pipeline`, then zipped as a folder).
trainer.save_model("modelE2A.h5")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62801]], 'forced_eos_token_id': 0}


# Load Saved Model for Inference

In [None]:
from transformers import pipeline

# Load the fine-tuned checkpoint for inference.
# NOTE: this rebinds `model` from the MarianMTModel loaded earlier to a
# translation pipeline; the cells below call `model(...)` as a pipeline.
# Without an explicit `device` the pipeline stays on CPU even when a GPU is
# available, so select the first GPU when there is one (-1 means CPU).
model = pipeline(
    "translation_en_to_ar",
    model="modelE2A.h5",
    device=0 if torch.cuda.is_available() else -1,
)


Recommended: pip install sacremoses.

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


**Translate Sample English Sentences**

In [None]:
# A few English sentences to sanity-check the fine-tuned translator.
english_sentences = [
    "Hello, how are you?",
    "Hello, I am 21 years old",
    "Do you need anything?"
]

# Translate one sentence at a time and print the raw pipeline result.
for source_text in english_sentences:
    pipeline_result = model(source_text)
    print(f"Translated Sentence: {pipeline_result}")

Translated Sentence: [{'translation_text': 'مرحباً، كيف حالك؟'}]
Translated Sentence: [{'translation_text': 'مرحباً، أنا في 21 من عمري.'}]
Translated Sentence: [{'translation_text': 'هل تحتاج أي شيء؟'}]


# Function to Compress Model into ZIP

In [None]:
def compress_folder(folder_path, zip_path):
    """Compress a folder into a ZIP file.

    Every file below ``folder_path`` is added to the archive under its path
    relative to ``folder_path``.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        zip_path: Path of the ZIP file to create; an existing file is overwritten.

    Raises:
        NotADirectoryError: If ``folder_path`` is not an existing directory.
    """
    # Fail before creating the archive: os.walk() on a missing path yields
    # nothing, which would silently produce an empty ZIP.
    if not os.path.isdir(folder_path):
        raise NotADirectoryError(f"Not a directory: {folder_path!r}")

    archive_abs = os.path.abspath(zip_path)
    # ZIP_DEFLATED actually compresses; the ZipFile default (ZIP_STORED) only
    # stores the files uncompressed.
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Do not add the archive to itself when it is written inside
                # the folder being compressed.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, start=folder_path))

# Example usage

In [None]:
# Use the same relative directory name the model was saved to and loaded from,
# so this cell does not depend on Colab's /content working directory.
folder_to_compress = "modelE2A.h5"
zip_file_name = "model.zip"
compress_folder(folder_to_compress, zip_file_name)