<a href="https://colab.research.google.com/github/zaidalyafeai/AraMeter/blob/master/AraMeter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyarabic

Collecting pyarabic
  Downloading PyArabic-0.6.15-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.4/126.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.15


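We use a dataset of Arabic poetry verses (baits), each labelled with the index of its meter; `labels.txt` maps those indices to meter names.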

In [2]:
!unzip baits.zip

Archive:  baits.zip
  inflating: final_baits/labels.txt  
  inflating: final_baits/test.txt    
  inflating: final_baits/train.txt   


## Imports

In [3]:
import tensorflow as tf
import numpy as np
import os
import time
import glob
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional, BatchNormalization, Flatten, Reshape
from tensorflow.keras.models import Sequential
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
# Meter names, one per line; a label's integer value is its index in this list.
# The file holds Arabic text, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding.
with open('final_baits/labels.txt', 'r', encoding='utf-8') as f:
  label2name = [name.replace('\n', '') for name in f]

## Read the Dataset

Preprocess each verse by stripping diacritics (tashkeel) and removing punctuation and other extraneous characters.

In [5]:
def extract_data(path, thresh = 70, on_shatrs = False, random_state = None):
  """Read labelled verses from ``path`` and return them shuffled.

  Every line longer than one character must have the form
  ``<integer label> <verse>``. Diacritics (tashkeel) and a fixed set of
  punctuation/control characters are removed from the whole file before the
  lines are parsed.

  Args:
    path: text file to read; it is decoded as UTF-8.
    thresh: unused; kept only so existing calls that pass it keep working.
    on_shatrs: if True, split each verse on '#' and emit every part as its
      own sample carrying the verse's label; otherwise emit whole verses.
    random_state: forwarded to the shuffle; pass an int for a reproducible
      order. The default (None) keeps the previous unseeded behaviour.

  Returns:
    (X, y): shuffled, index-aligned collections of cleaned strings and
    integer labels.

  Raises:
    ValueError: if a line has no space-separated verse or its label is not
      an integer.

  Side effects:
    Rebinds the global ``vocab`` to the sorted list of distinct characters
    in the samples joined by single spaces.
  """
  global vocab

  # Explicit UTF-8 keeps the Arabic text intact on any platform, and the
  # context manager closes the file handle deterministically.
  with open(path, 'r', encoding='utf-8') as f:
    raw = f.read()
  raw = araby.strip_tashkeel(raw)

  # Remove some extraneous characters in one pass (no repeated string
  # concatenation).
  excluded = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
  cleaned = raw.translate(dict.fromkeys(map(ord, excluded)))

  X = []
  y = []
  for lineno, line in enumerate(cleaned.split('\n'), start=1):
    # Blank lines and single-character leftovers carry no sample.
    if len(line) <= 1:
      continue
    try:
      label, bait = line.split(' ', 1)
      label = int(label)
    except ValueError as err:
      raise ValueError(
          f"{path}: line {lineno}: expected '<int label> <verse>', got {line!r}"
      ) from err

    bait = bait.strip()
    if on_shatrs:
      # Every hemistich becomes its own sample with the verse's label.
      for shatr in bait.split('#'):
        X.append(shatr.strip())
        y.append(label)
    else:
      X.append(bait)
      y.append(label)

  # Create the vocab from every character that survived cleaning.
  vocab = sorted(set(' '.join(X)))

  # Shuffle samples and labels together so they stay aligned.
  X, y = shuffle(X, y, random_state=random_state)
  return X, y

In [6]:
# Load the training file as whole verses (on_shatrs=False keeps both hemistichs in one sample).
# This call also sets the global `vocab` that the encoding cell below relies on.
X, y = extract_data("final_baits/train.txt", on_shatrs=False)

In [7]:
# Show the first five shuffled verses next to the name of their meter.
for idx in range(5):
  verse, meter = X[idx], label2name[y[idx]]
  print(verse, ' ', meter)

زكي النفس محمود السجايا # مصان العرض ممدوح الجناب   الوافر
وأثمرت السمر هام الكماة # وعاجلنها بأوان الجناء   المتقارب
قد أعلنت بالثناء تنشره # وابتهلت بالدعاء تخلصه   المنسرح
جار ويرى ليس بجار # لأناة فيه ووقار   المتدارك
قد حمى الغيران شمس هوى # منه عين الشمس في رمد   المديد


## Create Sequences
Encode each verse as a sequence of character indices (one index per character of the vocabulary), zero-padded to a fixed length of 100.

## Create Numpy Arrays

In [8]:
# Hold out 15% of the samples for validation; random_state pins the split for a given ordering of X/y.
# NOTE(review): X/y come out of extract_data already shuffled without a seed, so the split can still differ between runs.
X_train, X_valid , y_train, y_valid = train_test_split(X, y, test_size = 0.15, random_state = 41)

In [9]:
# Give every vocabulary character a positive integer id; 0 stays free for padding.
char2idx = {symbol: position for position, symbol in enumerate(vocab, start=1)}

def to_sequences(X):
  """Encode each string in ``X`` as character ids, post-padded with 0 to length 100."""
  encoded = []
  for line in X:
    encoded.append([char2idx[char] for char in line])
  return pad_sequences(encoded, padding='post', value=0, maxlen = 100)

# Replace the raw strings by their padded id matrices.
X_train, X_valid = to_sequences(X_train), to_sequences(X_valid)

# Labels become integer arrays, as required by the sparse categorical loss.
y_train, y_valid = np.array(y_train), np.array(y_valid)

## Create the model

In [10]:
# Character-level classifier: embedding -> three stacked bidirectional GRUs
# -> dense head with one softmax output per meter in label2name.
model = Sequential([
  Input((100,)),
  # +1 because id 0 is reserved for padding.
  Embedding(len(char2idx)+1, 256),
  Bidirectional(GRU(units = 256, return_sequences=True)),
  Bidirectional(GRU(units = 256, return_sequences=True)),
  # The last recurrent layer returns only its final state.
  Bidirectional(GRU(units = 256)),
  Dense(128, activation = 'relu'),
  Dropout(0.3),
  Dense(len(label2name), activation = 'softmax'),
])
# Labels are plain integers, hence the sparse categorical cross-entropy.
model.compile(
  optimizer = 'adam',
  loss = 'sparse_categorical_crossentropy',
  metrics = ['accuracy'],
)

In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 256)          9984      
                                                                 
 bidirectional (Bidirection  (None, 100, 512)          789504    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 100, 512)          1182720   
 onal)                                                           
                                                                 
 bidirectional_2 (Bidirecti  (None, 512)               1182720   
 onal)                                                           
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                        

In [12]:
# Smoke test: a dummy batch of 10 all-zero sequences of length 100 should give one score per meter for each row.
model(tf.zeros((10, 100))).shape

TensorShape([10, 14])

## Train the model

In [13]:
callbacks = [
  # Cut the learning rate tenfold (down to 1e-4) when val_loss stalls for 2 epochs.
  tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001
  ),
  # Keep only the weights with the best validation accuracy seen so far.
  tf.keras.callbacks.ModelCheckpoint(
    'full_verse.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'
  ),
]

In [14]:
# Train for 15 epochs, validating on the held-out split after each one.
model.fit(
  X_train,
  y_train,
  validation_data = (X_valid, y_valid),
  epochs = 15,
  batch_size = 128,
  shuffle = True,
  callbacks = callbacks,
)

Epoch 1/15
Epoch 1: val_accuracy improved from -inf to 0.45551, saving model to full_verse.h5
Epoch 2/15


  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.45551 to 0.75173, saving model to full_verse.h5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.75173 to 0.85090, saving model to full_verse.h5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.85090 to 0.88824, saving model to full_verse.h5
Epoch 5/15
Epoch 5: val_accuracy improved from 0.88824 to 0.89885, saving model to full_verse.h5
Epoch 6/15
Epoch 6: val_accuracy improved from 0.89885 to 0.91187, saving model to full_verse.h5
Epoch 7/15
Epoch 7: val_accuracy did not improve from 0.91187
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.91187
Epoch 9/15
Epoch 9: val_accuracy improved from 0.91187 to 0.93323, saving model to full_verse.h5
Epoch 10/15
Epoch 10: val_accuracy improved from 0.93323 to 0.93479, saving model to full_verse.h5
Epoch 11/15
Epoch 11: val_accuracy did not improve from 0.93479
Epoch 12/15
Epoch 12: val_accuracy did not improve from 0.93479
Epoch 13/15
Epoch 13: val_accuracy did not improve from 0.93479
Epoch 14/15
Ep

<keras.src.callbacks.History at 0x7f404d4bceb0>

In [15]:
# Swap the in-memory (last-epoch) model for the checkpoint that ModelCheckpoint saved at the best val_accuracy.
model = tf.keras.models.load_model('full_verse.h5')

## Tests

In [18]:
def classify(sentence):
    """Print the name of the meter predicted for ``sentence``.

    The input is cleaned the same way the training verses were: diacritics
    are stripped, surrounding whitespace is removed, and characters that are
    not in the training vocabulary (``char2idx``) are dropped. The characters
    removed at training time never entered the vocabulary, so this also
    covers them and avoids a ``KeyError`` on unseen characters.

    Args:
        sentence: a verse, optionally diacritized, with its two hemistichs
            separated by '#' as in the training data.

    Returns:
        None; the predicted meter name is printed.
    """
    sentence = araby.strip_tashkeel(sentence).strip()
    # Out-of-vocabulary characters have no id; skip them instead of raising.
    sequence = [char2idx[char] for char in sentence if char in char2idx]
    # Pad to the same width as the training matrix.
    sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)

    pred = model.predict(sequence)[0]
    # The index of the highest score is the label, i.e. the row in label2name.
    predicted_label = label2name[int(np.argmax(pred))]
    print(predicted_label)

In [19]:
# Sample verses (some diacritized, some plain); one predicted meter is printed per verse.
for verse in (
    "وَإِن ظَهَرَت مِنهُ قَوارِصُ جَمَّةٌ # وَأَفرَعَ في لَومي مِراراً وَأَصعَدا",
    "أَقصَدَتني سِهامُهُ إِذ رَأَتني # وَتَوَلَّت عَنهُ سُلَيمى نِبالي",
    "تَحِنُّ حَنيناً إِلى مالِكٍ # فَحِنّي حَنينَكِ إِنّي مُعالي",
    "لا تَغبِطِ المَرءَ أَن يُقالَ لَهُ # أَمسى فَلانٌ لِعُمرِهِ حَكَما",
    "يا ليلُ الصبّ متى غدهُ # أقيامُ الساعة موعدهُ",
    " لك يا منازل في القلوب منازل # أقفرت أنت وهن منك أواهل",
):
    classify(verse)

الطويل
الخفيف
المتقارب
المنسرح
المتدارك
الكامل


In [20]:
# A second batch of sample verses; one predicted meter is printed per verse.
for verse in (
    "ما تردون على هذا المحب # دائبا يشكو إليكم في الكتب",
    "ولد الهدى فالكائنات ضياء # وفم الزمان تبسم وسناء",
    " لك يا منازل في القلوب منازل # أقفرت أنت وهن منك أواهل",
    "ومن لم يمت بالسيف مات بغيره # تعددت الأسباب والموت واحد",
    "أنا النبي لا كذب # أنا ابن عبد المطلب",
    "هذه دراهم اقفرت # أم ربور محتها الدهور",
    "هزجنا في بواديكم # فأجزلتم عطايانا",
    "بحر سريع ماله ساحل # مستفعلن مستفعلن فاعلن",
    "مَا مَضَى فَاتَ وَالْمُؤَمَّلُ غَيْبٌ # وَلَكَ السَّاعَةُ الَّتِيْ أَنْتَ فِيْهَا",
    "يا ليلُ الصبّ متى غدهُ # أقيامُ الساعة موعدهُ",
):
    classify(verse)

الرمل
الكامل
الكامل
الطويل
الهزج
المديد
الهزج
السريع
الخفيف
المتدارك


In [21]:
# Bundle the saved checkpoint into model.zip.
!zip model.zip full_verse.h5

  adding: full_verse.h5 (deflated 7%)


In [22]:
# List the working directory with file sizes.
!ls -l

total 75548
-rw-r--r-- 1 root root  2422044 Nov  4 14:56 baits.zip
drwxr-xr-x 2 root root     4096 Nov  4 15:03 final_baits
-rw-r--r-- 1 root root 38894432 Nov  4 15:09 full_verse.h5
-rw-r--r-- 1 root root 36032183 Nov  4 15:14 model.zip
drwxr-xr-x 1 root root     4096 Nov  3 18:00 sample_data
