<a href="https://colab.research.google.com/github/habebamostafa/Multimodal_Translation_project/blob/main/Machine_TranslationV1_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import re
import string

# 2. Load and prepare dataset

In [None]:
# from google.colab import drive
# drive.mount("/content/gdrive")

# Read the tab-separated sentence pairs and keep only the two text columns.
df = pd.read_csv(
    'ara.txt',
    sep='\t',
    encoding='utf-8',
    names=['English', 'Arabic', 'Attribution'],
).drop(columns=['Attribution'])

df.tail()

Unnamed: 0,English,Arabic
12518,The mobile phone you have dialed is either swi...,الهاتف المتحرك الذي طلبته مغلق أو خارج نطاق ال...
12519,If you decide to answer questions now without ...,إذا قررت الإجابة عن الأسئلة الآن دون حضور محام...
12520,A man touched down on the moon. A wall came do...,هبط إنسان على سطح القمر، وأنهار حائط في برلين،...
12521,"Ladies and gentlemen, please stand for the nat...",سيداتي و سادتي ، رجاءً قفوا للنشيد الوطني للات...
12522,There are mothers and fathers who will lie awa...,وهناك أمهات وآباء سيظلون مستيقظين بعد أن ينام ...


# 3. Preprocess sentences (text cleaning)


In [None]:
# Ordered (pattern, replacement) pairs; order matters because the specific
# forms ("won't", "can't") must be expanded before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    # Dropped-g forms such as "goin'". The lookahead keeps possessives like
    # "john's" from being rewritten to "johngs".
    (r"n'(?![a-z])", "ng"),
    (r"'bout", "about"),
)


def preprocess_sentence(sentence):
    """Normalise one English sentence for tokenization.

    The sentence is lower-cased and stripped, common contractions are
    expanded, and every run of characters other than ASCII letters and
    ``? . ! ,`` is collapsed into a single space.

    Args:
        sentence: Raw English sentence (``str``).

    Returns:
        The cleaned sentence with no leading or trailing whitespace.
    """
    sentence = sentence.lower().strip()
    for pattern, replacement in _CONTRACTION_RULES:
        sentence = re.sub(pattern, replacement, sentence)
    # Anything that is not a letter or one of ?.!, becomes a single space.
    sentence = re.sub(r"[^a-zA-Z?.!,]+", " ", sentence)
    return sentence.strip()

# Clean every English sentence element-wise with preprocess_sentence.
df['English'] = df['English'].map(preprocess_sentence)

# 4. Remove punctuation and add start/end tokens

In [None]:
# string.punctuation only contains ASCII symbols, so the Arabic comma,
# semicolon, question mark and guillemets have to be listed explicitly;
# otherwise they stay attached to the neighbouring word.
ARABIC_PUNCTUATION = '،؛؟«»'
translator = str.maketrans('', '', string.punctuation + ARABIC_PUNCTUATION)

# Strip punctuation from both languages.
df.English = df.English.apply(lambda x: x.translate(translator))
df.Arabic = df.Arabic.apply(lambda x: x.translate(translator))
# Wrap each target sentence in the start/end markers used by the decoder.
df.Arabic = df.Arabic.apply(lambda x: 'sos ' + x + ' eos')

eng_texts = df.English.to_list()
ara_texts = df.Arabic.to_list()

# 5. Tokenization of English and Arabic sentences

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

def tokenize_sent(text):
    """Fit a new Keras ``Tokenizer`` on ``text`` and encode ``text`` with it.

    Returns:
        A ``(tokenizer, sequences)`` pair, where ``sequences`` is the result
        of ``tokenizer.texts_to_sequences(text)``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(text)
    sequences = fitted.texts_to_sequences(text)
    return fitted, sequences

# One tokenizer per language, each fitted on its own corpus.
eng_tokenizer, eng_encoded = tokenize_sent(eng_texts)
ara_tokenizer, ara_encoded = tokenize_sent(ara_texts)

# Lookup tables used later to turn ids back into words (and 'sos' into an id).
eng_index_word = eng_tokenizer.index_word
ara_index_word, ara_word_index = ara_tokenizer.index_word, ara_tokenizer.word_index

# 6. Vocabulary size calculation

In [None]:
# Number of distinct words per language, plus one.
ENG_VOCAB_SIZE, ARA_VOCAB_SIZE = (
    len(tok.word_counts) + 1 for tok in (eng_tokenizer, ara_tokenizer)
)

ENG_VOCAB_SIZE, ARA_VOCAB_SIZE

(4273, 13598)

# 7. Determine maximum sequence lengths for padding

In [None]:
# Longest encoded sentence in each language; used as the padding width.
max_eng_len = max(map(len, eng_encoded))
max_ara_len = max(map(len, ara_encoded))

max_eng_len, max_ara_len

(37, 38)

# 8. Pad sequences to uniform lengths

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Right-pad every sequence up to the longest sentence of its language.
eng_padded = pad_sequences(sequences=eng_encoded, padding='post', maxlen=max_eng_len)
ara_padded = pad_sequences(sequences=ara_encoded, padding='post', maxlen=max_ara_len)

eng_padded.shape, ara_padded.shape

((12523, 37), (12523, 38))

# 9. Split data into training and test sets

In [None]:
# Hold out 10% of the sentence pairs; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    eng_padded,
    ara_padded,
    random_state=0,
    test_size=0.1,
)

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((11270, 37), (1253, 37), (11270, 38), (1253, 38))

# 10. Define the LSTM seq2seq model architecture

In [None]:
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, Bidirectional, Add, Concatenate
from tensorflow.keras import Input, Model

# --- Encoder: English token ids -> final LSTM states ---
# Variable-length sequence of token ids.
encoder_input = Input(shape=(None,))
# mask_zero=True makes downstream layers skip index 0, the value used for padding.
encoder_embd = Embedding(ENG_VOCAB_SIZE, 1024, mask_zero=True)(encoder_input)
encoder_lstm = Bidirectional(LSTM(512, return_state=True))
# With return_state=True the bidirectional wrapper returns the output followed by
# the (h, c) states of the forward pass and then of the backward pass.
encoder_output, forw_state_h, forw_state_c, back_state_h, back_state_c = encoder_lstm(encoder_embd)

# Join the two 512-unit directions into 1024-unit states so they match the
# 1024-unit decoder LSTM below.
state_h_final = Concatenate()([forw_state_h, back_state_h])
state_c_final = Concatenate()([forw_state_c, back_state_c])
encoder_states = [state_h_final, state_c_final]

# --- Decoder: Arabic token ids + encoder states -> next-token distribution ---
decoder_input = Input(shape=(None,))
decoder_embd = Embedding(ARA_VOCAB_SIZE, 1024, mask_zero=True)(decoder_input)
# The layer objects are kept in variables because the inference cell reuses them.
decoder_lstm = LSTM(1024, return_sequences=True, return_state=True)
# The decoder starts from the encoder's final states; its own states are not needed for training.
decoder_outputs, _, _ = decoder_lstm(decoder_embd, initial_state=encoder_states)
# Softmax over the Arabic vocabulary at every time step.
decoder_dense = Dense(ARA_VOCAB_SIZE, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Training model: (English ids, shifted Arabic ids) -> per-step token probabilities.
model = Model([encoder_input, decoder_input], decoder_outputs)

# 11. Compile the model

In [None]:
# The sparse loss lets the targets stay as integer token ids (no one-hot encoding).
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 12. Prepare training and testing data for the encoder-decoder model

In [None]:
# Teacher forcing: the decoder is fed the target without its last position and
# is trained to predict the target shifted one step to the left.
encoder_input_data = X_train
decoder_input_data, decoder_target_data = y_train[:, :-1], y_train[:, 1:]

# Same slicing for the held-out pairs.
encoder_input_test = X_test
decoder_input_test, decoder_target_test = y_test[:, :-1], y_test[:, 1:]

# 13. Train the model


In [None]:
# Group the two model inputs for the training and validation sets.
train_inputs = [encoder_input_data, decoder_input_data]
val_inputs = [encoder_input_test, decoder_input_test]

history = model.fit(
    train_inputs,
    decoder_target_data,
    batch_size=128,
    epochs=100,
    validation_data=(val_inputs, decoder_target_test),
)

Epoch 1/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 375ms/step - accuracy: 0.2815 - loss: 6.9489 - val_accuracy: 0.0585 - val_loss: 5.5701
Epoch 2/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 406ms/step - accuracy: 0.0599 - loss: 5.1783 - val_accuracy: 0.0647 - val_loss: 5.2520
Epoch 3/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 388ms/step - accuracy: 0.0657 - loss: 4.4911 - val_accuracy: 0.0704 - val_loss: 4.9874
Epoch 4/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 400ms/step - accuracy: 0.0723 - loss: 3.7830 - val_accuracy: 0.0736 - val_loss: 4.8308
Epoch 5/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 394ms/step - accuracy: 0.0818 - loss: 3.0070 - val_accuracy: 0.0778 - val_loss: 4.7597
Epoch 6/100
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 397ms/step - accuracy: 0.0966 - loss: 2.2138 - val_accuracy: 0.0801 - val_loss: 4.7101
Epoch 7/100
[1m

# 14. Save the trained model

In [None]:
# Create the target directory first so the save does not fail on a fresh
# runtime where /content/Model does not exist yet.
os.makedirs("/content/Model", exist_ok=True)
model.save("/content/Model/modellstm4.h5")



# 15. Build the inference model (encoder and decoder)

In [None]:
# Inference encoder: English ids -> [h, c] states that seed the decoder.
encoder_model = Model(encoder_input, encoder_states)

# At inference time the decoder states are fed in explicitly at every step;
# 1024 matches the decoder LSTM size.
decoder_state_input_h = Input(shape=(1024,))
decoder_state_input_c = Input(shape=(1024,))
decoder_states_input = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder_lstm / decoder_dense layers, this time also keeping
# the new states so they can be passed to the next decoding step.
# Note: this rebinds the global name decoder_outputs used when building the training model.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_embd, initial_state=decoder_states_input)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inference decoder: (token ids, h, c) -> (token probabilities, new h, new c).
decoder_model = Model([decoder_input] + decoder_states_input, [decoder_outputs] + decoder_states)

# 16. Function to convert predicted sequences into sentences

In [None]:
def get_predicted_sentence(input_seq, max_len=37):
    """Greedily decode the Arabic translation of one encoded English sentence.

    The encoder states seed the decoder, which is then run one token at a
    time starting from ``'sos'``; the most probable token is fed back in at
    each step.

    Args:
        input_seq: Encoded English sentence passed to ``encoder_model.predict``
            (the notebook passes an array of shape ``(1, sequence_length)``).
        max_len: Maximum number of words to generate before giving up on
            seeing ``'eos'``.

    Returns:
        The predicted words, each preceded by a single space. The ``'sos'``
        and ``'eos'`` markers are never part of the returned string. An empty
        string is returned when no word is produced.
    """
    # verbose=0 keeps Keras from printing a progress bar for every decoding step.
    states_value = encoder_model.predict(input_seq, verbose=0)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = ara_word_index['sos']
    decoded_words = []

    # Cap on the number of generated words (not characters).
    while len(decoded_words) < max_len:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value, verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Index 0 is not in ara_index_word; treat it like the end of the sentence
        # instead of raising KeyError.
        sampled_word = ara_index_word.get(sampled_token_index)

        if sampled_word is None or sampled_word == 'eos':
            break

        decoded_words.append(sampled_word)
        # Feed the chosen token and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)

In [None]:
def get_eng_sent(encoded_sentence):
  """
  Turn a sequence of English token ids back into a space-separated sentence.

  Ids equal to 0 are skipped; every other id is looked up in eng_index_word.
  """
  words = [eng_index_word[token_id] for token_id in encoded_sentence if token_id != 0]
  return ' '.join(words)


In [None]:
def get_arabic_sentence(encoded_sentence):
  """
  Turn a sequence of Arabic token ids back into a space-separated sentence.

  Ids equal to 0 are skipped; ids missing from ara_index_word are rendered
  as '<UNK>'.
  """
  words = [
    ara_index_word.get(token_id, '<UNK>')
    for token_id in encoded_sentence
    if token_id != 0
  ]
  return ' '.join(words)

# 17. Translate sample test sentences

In [None]:
# Show source, reference and model translation for the first 20 test pairs.
for i in range(20):
    print("English sentence:", get_eng_sent(X_test[i]))
    print("Actual Arabic Sentence:", get_arabic_sentence(y_test[i]))
    # get_predicted_sentence never includes the 'eos' marker in its result, so
    # the string must not be sliced: trimming the tail cuts real characters.
    # reshape(1, -1) adds the batch dimension without hard-coding the padded length.
    print("Translated Arabic Sentence:", get_predicted_sentence(X_test[i].reshape(1, -1)).strip())
    print("\n")

English sentence: sorry my mistake
Actual Arabic Sentence: sos متأسف، إنه خطأي eos
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 419ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Translated Arabic Sentence:  دعني 


English sentence: when can we eat
Actual Arabic Sentence: sos متى يمكننا الأكل؟ eos
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Translated Arabic Sentence:  متى سيبدأ الف


English sentence: my father is not reading a book now
Actual Arabic Sentence: sos أبي لا يقرأ كتاباً 