In [1]:
import re
import os
import numpy as np
import tensorflow as tf

In [2]:
! wget "https://www.cs.cmu.edu/%7Eark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip"
! unzip "/content/AQMAR_Arabic_NER_corpus-1.0.zip" -d "/content/corpus"

--2023-12-15 23:52:55--  https://www.cs.cmu.edu/%7Eark/ArabicNER/AQMAR_Arabic_NER_corpus-1.0.zip
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7815886 (7.5M) [application/zip]
Saving to: ‘AQMAR_Arabic_NER_corpus-1.0.zip’


2023-12-15 23:52:58 (2.89 MB/s) - ‘AQMAR_Arabic_NER_corpus-1.0.zip’ saved [7815886/7815886]

Archive:  /content/AQMAR_Arabic_NER_corpus-1.0.zip
  inflating: /content/corpus/Atom.txt  
  inflating: /content/corpus/Christiano_Ronaldo.txt  
  inflating: /content/corpus/Computer.txt  
  inflating: /content/corpus/Computer_Software.txt  
  inflating: /content/corpus/Crusades.txt  
  inflating: /content/corpus/Damascus.txt  
  inflating: /content/corpus/Enrico_Fermi.txt  
  inflating: /content/corpus/Football.txt  
  inflating: /content/corpus/Ibn_Tolun_Mosque.txt  
  inflating: /content/corpus/Imam_Hussein_Shrine.txt  
  inflating:

In [3]:
# Entity Cleaner: Unites entity tags and fixs misspellings
def tags_cleaner(entity):
  entity = re.sub('\n','',entity) # Remove the newline (\n)
  if entity in ['B-LOC', 'B-MIS', 'B-ORG','B-PER','I-LOC','I-MIS','I-ORG','I-PER','O']:
    return entity
  elif entity in ['B-MIS0','B-MIS1', 'B-MIS2', 'B-MIS3', 'B-MIS-1','B-MIS-2', 'B-MIS1`', 'B-MISS1']:
    return 'B-MIS'
  elif entity in ['I-MIS0','I-MIS1', 'I-MIS2', 'I-MIS3']:
    return 'I-MIS'
  elif entity in ['B-ENGLISH', 'B-SPANISH', 'OO', 'IO']:
    return 'O'
  elif entity == 'I--ORG':
    return 'I-ORG'
  else:
    print('Error with entity:', entity)


# Clean/Normalize Arabic Text
def clean_str(text):
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # Remove tashkeel
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel,"", text)

    # Remove longation
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)

    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])

    # Trim
    text = text.strip()

    return text

# Remove empty strings or strings that contains spaces only from sentences
def re_clean(old_sentence, old_tags):
  space_regex = re.compile("\s+")
  new_sentence = []
  new_tags = []
  for j in range(len(old_sentence)):
    # add word if not empty and doesn't contain spaces only
    if old_sentence[j]!="" and space_regex.match(old_sentence[j])==None:
      new_sentence.append(old_sentence[j])
      new_tags.append(old_tags[j])

  return new_sentence, new_tags


In [4]:
# Read sentences
sentences = []
tags = []
vocab = set()

corpus_path = "/content/corpus/"
for file in os.listdir(corpus_path):
  if file.endswith('.txt'): # Get txt files only
    print('Processing:', file)
    topic = open(corpus_path+file)
    sentence = []
    entity = []
    for line in topic.readlines():
      if line == '\n': # Sentence end
        recleaned = re_clean(sentence, entity)
        sentences.append(recleaned[0].copy())
        tags.append(recleaned[1].copy())
        sentence.clear()
        entity.clear()
      else:
        line = line.split(sep=' ')
        clean_word = clean_str(line[0])       # Cleaning word
        vocab.add(clean_word)                 # Add word to the vocab
        sentence.append(clean_word)           # Add the word
        entity.append(tags_cleaner(line[1]))  # Clean and add entity


print('Done [Sentences:', len(sentences), ', Tags:', len(tags), ', Unique Words:', len(vocab))

Processing: Islamic_History.txt
Processing: Ummaya_Mosque.txt
Processing: Richard_Stallman.txt
Processing: Computer_Software.txt
Processing: X_window_system.txt
Processing: Damascus.txt
Processing: Summer_Olympics2004.txt
Processing: Christiano_Ronaldo.txt
Processing: Imam_Hussein_Shrine.txt
Processing: Periodic_Table.txt
Processing: Solaris.txt
Processing: Ibn_Tolun_Mosque.txt
Processing: Real_Madrid.txt
Processing: Linux.txt
Processing: Computer.txt
Processing: Raul_Gonzales.txt
Processing: Football.txt
Processing: Razi.txt
Processing: Portugal_football_team.txt
Processing: Nuclear_Power.txt
Processing: Islamic_Golden_Age.txt
Processing: Atom.txt
Processing: Light.txt
Processing: Crusades.txt
Processing: Soccer_Worldcup.txt
Processing: Physics.txt
Processing: Internet.txt
Processing: Enrico_Fermi.txt
Done [Sentences: 2687 , Tags: 2687 , Unique Words: 17481


In [5]:
# Make a mapping betwween words and their IDs
word2id = {word:id for  id, word in enumerate(vocab)}
id2word = {id:word for  id, word in enumerate(vocab)}

In [6]:
from tensorflow.keras.utils import to_categorical

# Sentence encoder
def encode_sentence(old_sentence):
  encoded_sentence = []
  for word in old_sentence:
    try:
      encoded_sentence.append(word2id[word])
    except KeyError:
      encoded_sentence.append(0) # A dummy digit for out of vocab

  return encoded_sentence

# Encode Tags
tags_encoding = {
    'B-LOC':0,
    'B-MIS':1,
    'B-ORG':2,
    'B-PER':3,
    'I-LOC':4,
    'I-MIS':5,
    'I-ORG':6,
    'I-PER':7,
    'O':8
  }
def encode_tags(old_tags):
  new_tags = [tags_encoding[tag] for tag in old_tags]
  new_tags = to_categorical(y = new_tags, num_classes=9)
  return new_tags

In [7]:
# Encoding
sentences_encoded = []
tags_encoded = []

for i in range(len(sentences)):
  sentences_encoded.append(encode_sentence(sentences[i]))
  tags_encoded.append(encode_tags(tags[i]))

In [8]:
from keras.preprocessing.sequence import pad_sequences

# Padding
MAX_SEQUENCE_LENGTH = 40

sentences_padded = pad_sequences(sequences = sentences_encoded,
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 dtype='int32',
                                 padding='post',
                                 truncating='post',
                                 value = 0)
tags_padded = pad_sequences(sequences = tags_encoded,
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 dtype='int32',
                                 padding='post',
                                 truncating='post',
                                 value = np.array([0., 0., 0., 0., 0., 0., 0., 0., 1.]))

In [9]:
from sklearn.model_selection import train_test_split

# Splitting data
train_sentences, test_sentences, train_labels, test_labels = train_test_split(sentences_padded,
                                                                              tags_padded,
                                                                              train_size=0.8,
                                                                              random_state=42)

In [10]:
# Download AraVec (Word2Vec Model) by Abu Bakr Soliman, Kareem Eissa, and Samhaa R.El-Beltagy.
! wget "https://archive.org/download/aravec2.0/wiki_cbow_300.zip"
! unzip "/content/wiki_cbow_300.zip" -d "/content/word2vec_model"

--2023-12-15 23:53:05--  https://archive.org/download/aravec2.0/wiki_cbow_300.zip
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip [following]
--2023-12-15 23:53:05--  https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip
Resolving ia803107.us.archive.org (ia803107.us.archive.org)... 207.241.232.157
Connecting to ia803107.us.archive.org (ia803107.us.archive.org)|207.241.232.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 364888893 (348M) [application/zip]
Saving to: ‘wiki_cbow_300.zip’


2023-12-15 23:54:29 (4.14 MB/s) - ‘wiki_cbow_300.zip’ saved [364888893/364888893]

Archive:  /content/wiki_cbow_300.zip
  inflating: /content/word2vec_model/wikipedia_cbow_300  
  inflating: /content/word2vec_model/wikipedia_cbow_300.trainables.syn1neg.npy  

In [11]:
import gensim

# Load the Word2Vec model
weights_path = "/content/word2vec_model/wikipedia_cbow_300"
araVec = gensim.models.Word2Vec.load(weights_path)

# Testing
most_similar = araVec.wv.most_similar( "محمد" )
for term, score in most_similar:
	print(term, score)

لمحمد 0.726012110710144
احمد 0.7142193913459778
عبدالرحمن 0.6745273470878601
ابراهيم 0.6723851561546326
مهدي 0.6686975955963135
محمود 0.6648465991020203
يحي 0.637116551399231
اسماعيل 0.6307213306427002
حموده 0.6287057995796204
عبدالحميد 0.6267550587654114


In [12]:

num_words = len(vocab)
embed_size = araVec.wv.vector_size
embedding_matrix = np.zeros(shape=(num_words, embed_size))

for word, id in word2id.items():
    try:
        embedding_matrix[id] = araVec.wv[word]
    except KeyError:
        embedding_matrix[id] = np.zeros(embed_size)

embedding_matrix.shape

(17481, 300)

In [13]:
from tensorflow.keras.layers import  Input, Dense, Embedding, TimeDistributed ,GRU , Bidirectional
from tensorflow.keras.models import Model, Sequential

tf.keras.backend.clear_session() # Makes sure old model was deleted if exists

GRU_model = Sequential()
# Adding Layers
GRU_model.add(Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32'))
GRU_model.add(Embedding(input_dim = len(vocab),              # Vocabulary Size (number of unique words for training)
                        output_dim = embed_size,              # Length of the vector for each word (embedding dimension)
                        input_length = MAX_SEQUENCE_LENGTH,   # Maximum length of a sequence
                        weights = [embedding_matrix],         # Send the needed AraVec Weights
                        trainable = False))

GRU_model.add(Bidirectional(GRU(10, return_sequences=True)))
GRU_model.add(TimeDistributed(Dense(9, activation='softmax')))

# Compile the model
GRU_model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999),
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
GRU_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 300)           5244300   
                                                                 
 bidirectional (Bidirection  (None, 40, 20)            18720     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 40, 9)             189       
 ributed)                                                        
                                                                 
Total params: 5263209 (20.08 MB)
Trainable params: 18909 (73.86 KB)
Non-trainable params: 5244300 (20.01 MB)
_________________________________________________________________


In [14]:
GRU_model.fit(train_sentences,
               train_labels,
               validation_split=0.15,
               batch_size = 10,
               epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7828fc7fc760>

In [15]:
GRU_model.evaluate(test_sentences, test_labels)



[0.13867048919200897, 0.9558550119400024]

In [16]:
def GRU_predict(sentence:str):
  sentence = sentence.split(sep=' ')
  # Keeping track of words so not to process 40 words every time
  word_count = len(sentence)
  # Clean sentence
  ready_sentence = [clean_str(word) for word in sentence]
  # Encode sentence
  ready_sentence = encode_sentence(ready_sentence)
  # Padding sentence
  ready_sentence = pad_sequences(sequences = [ready_sentence],
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 dtype='int32',
                                 padding='post',
                                 truncating='post',
                                 value = 0)
  tag_classes = ['B-LOC', 'B-MIS', 'B-ORG', 'B-PER', 'I-LOC', 'I-MIS', 'I-ORG', 'I-PER', 'O']
  # Predict and return actual words only
  predictions = GRU_model.predict(ready_sentence)

  from terminaltables import AsciiTable
  table_data = [['word', 'prediction']]
  for i, word in enumerate(sentence):
      table_data.append([word, tag_classes[np.argmax(predictions[0][i])]])
  table = AsciiTable(table_data)
  print(table.table)



In [19]:
GRU_predict('منشئ المسجد هو أحمد بن طولون مؤسس الدولة الطولونية في مصر والشام، تعود أصوله إلى قبيلة التغزغز التركية، وكانت أُسرته تقيم في بخاري.')

+-----------+------------+
| word      | prediction |
+-----------+------------+
| منشئ      | O          |
| المسجد    | O          |
| هو        | O          |
| أحمد      | B-PER      |
| بن        | I-PER      |
| طولون     | I-PER      |
| مؤسس      | I-PER      |
| الدولة    | O          |
| الطولونية | O          |
| في        | O          |
| مصر       | O          |
| والشام،   | B-LOC      |
| تعود      | O          |
| أصوله     | O          |
| إلى       | O          |
| قبيلة     | O          |
| التغزغز   | O          |
| التركية،  | O          |
| وكانت     | O          |
| أُسرته    | O          |
| تقيم      | O          |
| في        | O          |
| بخاري.    | B-LOC      |
+-----------+------------+


In [20]:
GRU_predict('محمود حسام ذهب الي مسجد')

+-------+------------+
| word  | prediction |
+-------+------------+
| محمود | B-PER      |
| حسام  | I-PER      |
| ذهب   | O          |
| الي   | O          |
| مسجد  | O          |
+-------+------------+
