# Word-Based Diacritics Prediction Model

In [34]:
import numpy as np
import tensorflow as tf
import os
import pickle
import re

from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.utils import to_categorical

### Get input data

In [9]:
bare_file = r"/content/drive/MyDrive/Bin_Data/big-bare-arabic-sentences-list"
with open (bare_file, "rb") as f4:
  bare_arabic_sentences_list = pickle.load(f4)
print (bare_arabic_sentences_list[0:20])

[['ﺽﺮﻌﻣ', 'ﻥﺁﺮﻘﻠﻟ', 'ﻦﻄﻨﺷﺍﻮﺑ'], ['ﻢﻴﻘﻳ', 'ﻒﺤﺘﻣ', 'ﺚﻴﻤﺳ', 'ﻥﺎﻴﻧﻮﺳ', 'ﻦﻄﻨﺷﺍﻮﺑ', 'ﻥﻭﺎﻌﺘﻟﺎﺑ', 'ﻊﻣ', 'ﻒﺤﺘﻤﻟﺍ', 'ﻲﻛﺮﺘﻟﺍ', 'ﻦﻔﻠﻟ', 'ﻲﻣﻼﺳﻹﺍ', 'ﻲﻓ', 'ﻝﻮﺒﻨﻄﺳﺇ', 'ﺎﺿﺮﻌﻣ', 'ﻥﺁﺮﻘﻠﻟ', 'ﻢﻳﺮﻜﻟﺍ', 'ﻮﻫ', 'ﻝﻭﻷﺍ', 'ﺮﺒﻛﻷﺍﻭ', 'ﻦﻣ', 'ﻪﻋﻮﻧ', 'ﻲﻓ', 'ﺕﺎﻳﻻﻮﻟﺍ', 'ﺓﺪﺤﺘﻤﻟﺍ', 'ﺖﺤﺗ', 'ﻢﺳﺍ', 'ﻦﻓ'], ['ﻥﺁﺮﻘﻟﺍ', 'ﺢﺘﺘﻓﺍﻭ', 'ﺽﺮﻌﻤﻟﺍ', 'ﺮﻬﺸﻟﺍ', 'ﻲﺿﺎﻤﻟﺍ', 'ﻢﻀﻳﻭ', 'ﻒﺣﺎﺼﻣ'], ['ﺓﺭﺩﺎﻧ', 'ﺎﻬﻀﻌﺑ', 'ﺽﺮﻌﻳ', 'ﻝﻭﻷ', 'ﺓﺮﻣ', 'ﺝﺭﺎﺧ'], ['ﺽﺮﻌﻳﻭ', 'ﻲﻓ', 'ﺔﺤﻨﺟﺃ', 'ﺽﺮﻌﻤﻟﺍ', 'ﻱﺬﻟﺍ', 'ﻕﺮﻐﺘﺳﺍ', 'ﺩﺍﺪﻋﻹﺍ', 'ﻪﻤﻴﻈﻨﺘﻟ', 'ﺖﺳ', 'ﺕﺍﻮﻨﺳ', 'ﺮﺜﻛﺃ', 'ﻦﻣ', 'ﻦﻴﺘﺳ', 'ﺎﻔﺤﺼﻣ', 'ﺎﻃﻮﻄﺨﻣ', 'ﺩﻮﻌﺗ', 'ﺎﻬﻟﻮﺻﺃ', 'ﻰﻟﺇ', 'ﺎﻴﻛﺮﺗ', 'ﻥﺎﺘﺴﻧﺎﻐﻓﺃﻭ', 'ﻥﺍﺮﻳﺇﻭ', 'ﻝﻭﺩﻭ'], ['ﺔﻴﺑﺮﻋ', 'ﺖﺒﺘﻛﻭ', 'ﺾﻌﺑ', 'ﻩﺬﻫ', 'ﻒﺣﺎﺼﻤﻟﺍ', 'ﺬﻨﻣ', 'ﺮﺜﻛﺃ', 'ﻦﻣ', 'ﻒﻟﺃ'], ['ﻡﺎﻋ', 'ﺔﻓﺎﺿﺇ', 'ﻰﻟﺇ', 'ﺕﺍﺮﺸﻋ', 'ﺕﺎﻃﻮﻄﺨﻤﻟﺍ', 'ﻯﺮﺧﻷﺍ', 'ﻒﺤﺘﻟﺍﻭ', 'ﻊﻄﻘﻟﺍﻭ', 'ﺔﻴﻨﻔﻟﺍ'], ['ﻦﻜﻤﻳﻭ', 'ﺽﺮﻌﻤﻟﺍ', 'ﺮﺋﺍﺰﻟﺍ', 'ﻦﻣ', 'ﻉﻼﻃﻻﺍ', 'ﻰﻠﻋ', 'ﻥﻮﻨﻓ', 'ﺔﺑﺎﺘﻛ', 'ﻥﺁﺮﻘﻟﺍ', 'ﻢﻳﺮﻜﻟﺍ', 'ﺪﺠﻴﻟ', 'ﻪﻣﺎﻣﺃ', 'ﺎﺼﻧ', 'ﺎﺳﺪﻘﻣ', 'ﺍﺪﺣﺍﻭ', 'ﺐﺘﻛ', 'ﻁﻮﻄﺨﺑ', 'ﺓﺩﺪﻌﺘﻣ', 'ﺔﻠﻴﻤﺟ', 'ﺎﻘﻓﻭ', 'ﺱﺭﺍﺪﻤﻟ', 'ﺔﻴﻨﻓ', 'ﺔﻘﻳﺮﻋ', 'ﻲﻓ', 'ﻂﺨﻟﺍ', 'ﺔﻓﺮﺧﺰﻟﺍﻭ', 'ﻢﺳﺮﻟﺍﻭ'], ['ﻡﺪﻘﻳﻭ', 'ﻥﻮﻤﻈﻨﻤﻟﺍ',

### Get target data

In [6]:
in_file= r"/content/drive/MyDrive/Bin_Data/big-harakat-arabic-sentences-list"
with open (in_file, "rb") as f:
  arabic_target_sentences_list = pickle.load(f)
print (arabic_target_sentences_list[130:133])

[['وَتَشْتَبِك', 'مَسَارَاتُهُم', 'فِي', 'السَّفَر', 'إِلَى', 'الْعِرَاق', 'خِلَال', 'الرِّحْلَة', 'الْجَمَاعِيَّة', 'الشَّاقَة', 'عَن', 'طَرِيق', 'التَّهْرِيب', 'مِن', 'جَنُوب', 'الْعِرَاق', 'إِلَى', 'الْكُوَيْت', 'الَّتِي', 'كَانَت', 'فِي', 'تِلْك', 'الْفَتْرَة', 'رَمْزًا', 'لِتَحْقِيق'], ['لَكِن', 'رِحْلَة', 'الْأَمَل', 'تَحَوَّلَت', 'إِلَى', 'رِحْلَة', 'مَوْت', 'فِي', 'الصَّحْرَاء', 'قُرْب', 'الْحُدُود', 'الْكُوَيْتِيَّة', 'اخْتِنَاقًا', 'فِي', 'خَزَّان', 'مِيَاه', 'بِشَاحِنَة', 'يَقُودُهَا', 'شَخْص', 'نَمَوْذَج', 'لِلْقِيَادَة'], ['تُرْجِمَت', 'الرِّوَايَة', 'إِلَى', 'لُغَات', 'عَالَمِيَّة', 'وَاكْتَسَبَت', 'شُهْرَتَهَا', 'مِن', 'الْمَكَانَة', 'الْكَبِيرَة', 'الَّتِي', 'كَانَت', 'تَحْظَى', 'بِهَا', 'الْقَضِيَّة', 'الْفِلَسْطِينِيَّة', 'فِي', 'الضَّمِير', 'الْعَالَمِي', 'فِي', 'ذَلِك']]


In [5]:
labels_file= r"/content/drive/MyDrive/Bin_Data/big-harakat-only-unicode-sentences-list"
with open (labels_file, "rb") as f3:
  unicode_target_sentences_list = pickle.load(f3)
print (unicode_target_sentences_list[130:133])

[['1614 1614 1618 1614 1616', '1614 1614 1614 1615 1615', '1616', '1617 1614 1614', '1616 1614', '1618 1616 1614', '1616 1614', '1617 1616 1618 1614', '1618 1614 1614 1616 1617 1614', '1617 1614 1614', '1614', '1614 1616', '1617 1614 1618 1616', '1616', '1614 1615', '1618 1616 1614', '1616 1614', '1618 1615 1614 1618', '1617 1614 1616', '1614 1614', '1616', '1616 1618', '1618 1614 1618 1614', '1614 1618 1611', '1616 1614 1618 1616'], ['1614 1616', '1616 1618 1614', '1618 1614 1614', '1614 1614 1617 1614 1614', '1616 1614', '1616 1618 1614', '1614 1618', '1616', '1617 1614 1618 1614', '1615 1618', '1618 1615 1615', '1618 1615 1614 1618 1616 1617 1614', '1618 1616 1614 1611', '1616', '1614 1617 1614', '1616 1614', '1616 1614 1616 1614', '1614 1615 1615 1614', '1614 1618', '1614 1614 1618 1614', '1616 1618 1616 1614 1614'], ['1615 1618 1616 1614', '1617 1616 1614 1614', '1616 1614', '1615 1614', '1614 1614 1616 1617 1614', '1614 1618 1614 1614 1614', '1615 1618 1614 1614 1614', '1616', '1

# Reduce data size to test CPU

In [11]:
small_bare_arabic_sentences_list = bare_arabic_sentences_list[0:20000]
small_unicode_target_sentences_list = unicode_target_sentences_list[0:20000]

# Tokenization


using filtes=None to keep punctuation in the data.

In [12]:
bare_tokenizer = keras.preprocessing.text.Tokenizer(filters=None, oov_token='<OOV>')
bare_tokenizer.fit_on_texts(small_bare_arabic_sentences_list)

In [13]:
harakat_tokenizer = keras.preprocessing.text.Tokenizer(filters=None, oov_token='<OOV>')
harakat_tokenizer.fit_on_texts(small_unicode_target_sentences_list)

# Vectorization

In [15]:
X_seqs = bare_tokenizer.texts_to_sequences(small_bare_arabic_sentences_list)
y_seqs = harakat_tokenizer.texts_to_sequences(small_unicode_target_sentences_list)

In [16]:
# Inspect sequences
print (arabic_target_sentences_list[0:1])
print (unicode_target_sentences_list[0:1])
print (bare_arabic_sentences_list [0:1])

[['مَعْرِض', 'للْقُرْآن', 'بِوَاشِنْطُن']]
[['1614 1618 1616', '1618 1615 1618', '1616 1614 1616 1618 1615']]
[['ﺽﺮﻌﻣ', 'ﻥﺁﺮﻘﻠﻟ', 'ﻦﻄﻨﺷﺍﻮﺑ']]


In [17]:
MAX_LENGTH = len(max(X_seqs, key=len))
print(f"Length of longest input sequence: {MAX_LENGTH}")

Length of longest input sequence: 173


In [18]:
X_padded = keras.preprocessing.sequence.pad_sequences(X_seqs, padding='post',
                                                            maxlen=MAX_LENGTH)

In [19]:
y_padded = keras.preprocessing.sequence.pad_sequences(y_seqs, padding='post',
                                                            maxlen=MAX_LENGTH)

In [27]:
print (f" X_padded is a {type (X_padded)} of shape {np.shape(X_train)}")
print (f" y_padded is a {type (y_padded)} of shape {np.shape(y_train)}")

 X_padded is a <class 'numpy.ndarray'> of shape (16000, 173)
 y_padded is a <class 'numpy.ndarray'> of shape (16000, 173)


# Splitting To Datasets

In [20]:
X_train, X_temp, y_train, y_temp = train_test_split(X_padded, y_padded, test_size=0.2, random_state=13)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=13)

# Model Configuration

### Data

In [33]:
num_of_sentences = len(small_bare_arabic_sentences_list)
print ("(Number of input sentences: ", num_of_sentences)

num_of_tokens = len(bare_tokenizer.word_index)+1
print ("Number of tokens: ", num_of_tokens)

num_of_target_sentences = len(small_unicode_target_sentences_list)
print ("Number of target sentences: ", num_of_target_sentences )

num_of_labels = len(harakat_tokenizer.word_index)+1
print ("Number of target labels: ", num_of_labels)

(Number of input sentences:  20000
Number of tokens:  38132
Number of target sentences:  20000
Number of target labels:  2927


In [37]:
print ("X and y should be equal")
print(f"X_train has {len(X_train)} sentences, and y_train has {len(y_train)}")
print(f"X_val has {len(X_val)} sentencs, and y_val has {len(y_val)}")
print(f"X_test has {len(X_test)} sentences, and y_test has {len(y_test)}")

X and y should be equal
X_train has 16000 sentences, and y_train has 16000
X_val has 2000 sentencs, and y_val has 2000
X_test has 2000 sentences, and y_test has 2000


### Layers

In [38]:
batch_size = 40
embedding_dim = 128
num_of_batches = (num_of_sentences + batch_size) // batch_size
num_of_epochs = 18

In [None]:
model = keras.models.Sequential()
model.add(Embedding(input_dim=num_of_tokens, output_dim=embedding_dim, input_length=MAX_LENGTH))
model.add(Bidirectional(LSTM(units=64, return_sequences=True)))
model.add(Dense(units=num_of_labels, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Model Training Using CPU

In [None]:
# Initializing loss value as infinity so it goes down from there.
best_val_loss = float('inf')
best_train_loss = float('inf')

# Training loop takes time
for epoch in range(num_of_epochs):
    total_train_loss = 0

    for i in range(num_of_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size if (i + 1) * batch_size < num_of_sentences else num_of_sentences

        # Extracting a batch of sequences and labels
        X_batch = X_train[start_idx:end_idx]
        y_batch = y_train[start_idx:end_idx]

        # Converting labels to one-hot encodings per batch because that's where the memory errors were raised
        y_one_hot_batch = to_categorical(y_batch, num_classes=num_of_labels)

        # Usting train_on_batch which returns a list containing [Loss, any-metrics-in-model.complie]
        train_metrics = model.train_on_batch(X_batch, y_one_hot_batch)
        train_loss = train_metrics[0]
        total_train_loss += train_loss

    # Calculating average training loss for the epoch
    avg_train_loss = total_train_loss / num_of_batches

    # Evaluating on the validation set per epoch to compare the scores to the training Loss
    val_loss, val_accuracy = model.evaluate(X_val, to_categorical(y_val, num_classes=num_of_labels))
    print(f'Epoch {epoch + 1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

# Saving the model
model.save(r"/content/drive/MyDrive/Colab_Notebooks/half_model.h5")

print(f"Best Training Loss: {best_train_loss:.4f}")
print(f"Best Validation Loss: {best_val_loss:.4f}")



Epoch 1, Training Loss: 0.5999, Validation Loss: 0.3094, Validation Accuracy: 0.9442
Epoch 2, Training Loss: 0.2387, Validation Loss: 0.2996, Validation Accuracy: 0.9450
Epoch 3, Training Loss: 0.2207, Validation Loss: 0.2629, Validation Accuracy: 0.9528
Epoch 4, Training Loss: 0.1897, Validation Loss: 0.2206, Validation Accuracy: 0.9610
Epoch 5, Training Loss: 0.1469, Validation Loss: 0.1673, Validation Accuracy: 0.9719
Epoch 6, Training Loss: 0.1030, Validation Loss: 0.1320, Validation Accuracy: 0.9796
Epoch 7, Training Loss: 0.0729, Validation Loss: 0.1130, Validation Accuracy: 0.9828
Epoch 8, Training Loss: 0.0533, Validation Loss: 0.1025, Validation Accuracy: 0.9849
Epoch 9, Training Loss: 0.0402, Validation Loss: 0.0963, Validation Accuracy: 0.9860
Epoch 10, Training Loss: 0.0312, Validation Loss: 0.0940, Validation Accuracy: 0.9862
Epoch 11, Training Loss: 0.0249, Validation Loss: 0.0912, Validation Accuracy: 0.9873
Epoch 12, Training Loss: 0.0201, Validation Loss: 0.0908, Valid

  saving_api.save_model(


Best Training Loss: 0.0201
Best Validation Loss: 0.0908


# Evaluation

In [39]:
from tensorflow.keras.models import load_model
my_model = load_model(r"/content/drive/MyDrive/Colab_Notebooks/half_model.h5")

In [None]:
# Using a similar loop on the test set
test_loss, test_accuracy = 0.0, 0.0
num_of_test_samples = len(X_test)
test_batch_size = 10

for i in range(0, num_of_test_samples, test_batch_size):
    start_idx = i
    end_idx = min(i + test_batch_size, num_of_test_samples)

    X_test_batch = X_test[start_idx:end_idx]
    y_test_batch = y_test[start_idx:end_idx]

    y_test_one_hot_batch = to_categorical(y_test_batch, num_classes=num_of_labels)

    # Evaluating the model on the batch
    batch_loss, batch_accuracy = my_model.evaluate(X_test_batch, y_test_one_hot_batch, verbose=2)

    # Multiplying the the results of model.evaluate by the number of sequences in each batch
    test_loss += batch_loss * (end_idx - start_idx)
    test_accuracy += batch_accuracy * (end_idx - start_idx)

# Averaging the metrics
test_loss /= num_of_test_samples
test_accuracy /= num_of_test_samples

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

1/1 - 1s - loss: 0.0769 - accuracy: 0.9855 - 940ms/epoch - 940ms/step
1/1 - 0s - loss: 0.0943 - accuracy: 0.9896 - 71ms/epoch - 71ms/step
1/1 - 0s - loss: 0.0914 - accuracy: 0.9884 - 136ms/epoch - 136ms/step
1/1 - 0s - loss: 0.0637 - accuracy: 0.9879 - 85ms/epoch - 85ms/step
1/1 - 0s - loss: 0.1012 - accuracy: 0.9879 - 86ms/epoch - 86ms/step
1/1 - 0s - loss: 0.0674 - accuracy: 0.9896 - 71ms/epoch - 71ms/step
1/1 - 0s - loss: 0.0682 - accuracy: 0.9890 - 140ms/epoch - 140ms/step
1/1 - 0s - loss: 0.0969 - accuracy: 0.9902 - 69ms/epoch - 69ms/step
1/1 - 0s - loss: 0.0663 - accuracy: 0.9936 - 78ms/epoch - 78ms/step
1/1 - 0s - loss: 0.1196 - accuracy: 0.9855 - 75ms/epoch - 75ms/step
1/1 - 0s - loss: 0.0961 - accuracy: 0.9884 - 80ms/epoch - 80ms/step
1/1 - 0s - loss: 0.1032 - accuracy: 0.9890 - 79ms/epoch - 79ms/step
1/1 - 0s - loss: 0.1062 - accuracy: 0.9890 - 78ms/epoch - 78ms/step
1/1 - 0s - loss: 0.0378 - accuracy: 0.9936 - 79ms/epoch - 79ms/step
1/1 - 0s - loss: 0.0225 - accuracy: 0.9954

In [42]:
!pip install arabic-reshaper
!pip install python-bidi

Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0
Collecting python-bidi
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Installing collected packages: python-bidi
Successfully installed python-bidi-0.4.2


In [43]:
#from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
#configuration = {'delete_harakat': False}
#reshaper = ArabicReshaper(configuration=configuration)

def decode_predictions(model, X_test, input_tokenizer, target_tokenizer, start_index, end_index):

  def convert_to_arabic(tokens):
      arabic_list = []
      for token in tokens:
          arabic_chars = [chr(int(codepoint)) for codepoint in token.split()]
          arabic_list.append(' '.join(arabic_chars))
      return arabic_list

  for i in range(start_index - 1, end_index):  # -1 because the index starts at 0

      # Not sure if I should use model.predict or model.predict_on_batch, results are similar
      #prediction = model.predict(X_test[i:i + 1])[0]
      prediction = model.predict_on_batch(X_test[i:i + 1])[0]

      # Removing the padding zeros from the test sample and the prediction because they're annoying
      input_sequence = X_test[i]
      input_sequence = input_sequence[input_sequence != 0]
      prediction = prediction[:len(input_sequence)]

      # Getting the tokens corresponding to the integers in the sample vector
      input_tokens = [input_tokenizer.index_word[idx] for idx in input_sequence]
      target_tokens = [target_tokenizer.index_word[np.argmax(token)] for token in prediction]
      target_arabic = convert_to_arabic(target_tokens)

      # printing the input_sequence initially gave me left to right words
      L2R_input_sequence = ' '.join(input_tokens)
      # fixing this using Bidi:
      input_sequence = get_display(L2R_input_sequence)


      print(f"Input Sequence: {input_sequence}")
      print (f"Predicted Unicode: {target_tokens}")
      print( f"Predicted Arabic: {target_arabic}")
      print()

decode_predictions(my_model, X_test, bare_tokenizer, harakat_tokenizer, start_index=1333, end_index=1336)


Input Sequence: ﻣﻦ ﺍﻟﻤﻮﻗﻒ ﻫﺬﺍ ﻳﻘﻒ ﺇﺑﻠﻴﺲ ﷲ ﺟﻌﻞ ﻟﻘﺪ ﺃﺧﺮﻯ
Predicted Unicode: ['1615 1618 1614', '1614 1614', '1614 1614', '1617 1614', '1616 1618 1616', '1614 1616', '1614 1614', '1618 1614 1618 1616', '1616']
Predicted Arabic: ['ُ ْ َ', 'َ َ', 'َ َ', 'ّ َ', 'ِ ْ ِ', 'َ ِ', 'َ َ', 'ْ َ ْ ِ', 'ِ']

Input Sequence: ﻋﻠﻴﻢ ﷲ ﺇﻥ ﺃﺗﻘﺎﻛﻢ ﷲ ﻋﻨﺪ ﺃﻛﺮﻣﻜﻢ ﺇﻥ ﻟﺘﻌﺎﺭﻓﻮﺍ ﻭﻗﺒﺎﺋﻞ ﺷﻌﻮﺑﺎ ﻭﺟﻌﻠﻨﺎﻛﻢ ﻭﺃﻧﺜﻰ ﺫﻛﺮ ﻣﻦ ﺧﻠﻘﻨﺎﻛﻢ ﺇﻧﺎ ﺍﻟﻨﺎﺱ ﺃﻳﻬﺎ ﻳﺎ ﺑﺎﻟﺘﻘﻮﻯ ﺍﻟﻘﺮﺁﻥ ﺣﺪﺩﻩ ﻣﺎ ﻭﻫﻮ ﻏﻴﺮﻩ
Predicted Unicode: ['1614 1618 1616', '1614 1615', '1614', '1614 1617 1614 1614', '1618 1615 1618', '1616 1617 1614 1618 1614', '1614', '1614 1617 1615 1614', '1617 1614', '1616 1617 1614', '1614 1614 1618 1614 1615', '1616', '1614 1614', '1614 1615 1618 1614', '1614 1614 1614 1618 1614 1615', '1615 1615 1611', '1614 1614 1614 1616', '1616 1614 1614 1614 1615', '1616', '1614 1618 1614 1614 1615', '1616 1618', '1617 1614', '1614 1618 1614 1615', '1616', '1617 1614', '1614 1616']
Predicted Arabic: ['َ ْ ِ', 'َ ُ', 'َ', 'َ ّ َ َ', 'ْ ُ ْ', 'ِ ّ َ ْ َ