#### In this notebook we are going to build an Arabic names generator using a character-level language model

In [0]:
import tensorflow as tf
import numpy as np
import random
import pandas as pd
from keras import backend as K
from keras.models import Sequential, Model
from keras.optimizers import Adam, RMSprop
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, Dropout, LSTM, Reshape, TimeDistributed, Activation
from keras.callbacks import LambdaCallback
from keras.utils import np_utils

# Loading and preprocessing data

In [3]:
# Read the whole corpus. The file contains Arabic text, so the encoding is given
# explicitly instead of relying on the platform default, and the context manager
# guarantees that the file handle is closed.
with open('arabic_names.txt', 'r', encoding='utf-8') as names_file:
    data = names_file.read()
data = data.lower()
chars = list(set(data))  # distinct characters (includes '\n', which acts as end-of-name)
data_size, vocab_size = len(data), len(chars)
print('Total characters: ', data_size)
print('Unique characters: ', vocab_size)

Total characters:  6659
Unique characters:  42


## Dictionary

We create Python dictionaries that map each character to an index from 1 to 42 and each index back to its character; index 0 is reserved for the start token.

In [4]:
# Indices start at 1 because index 0 is reserved for the one-timestep delay token.
ix_to_char = dict(enumerate(sorted(chars), start=1))
# The reverse lookup is derived from the same enumeration, so both maps always agree.
char_to_ix = {symbol: index for index, symbol in ix_to_char.items()}
print(ix_to_char)

{1: '\n', 2: ' ', 3: 'ء', 4: 'آ', 5: 'أ', 6: 'ؤ', 7: 'إ', 8: 'ئ', 9: 'ا', 10: 'ب', 11: 'ة', 12: 'ت', 13: 'ث', 14: 'ج', 15: 'ح', 16: 'خ', 17: 'د', 18: 'ذ', 19: 'ر', 20: 'ز', 21: 'س', 22: 'ش', 23: 'ص', 24: 'ض', 25: 'ط', 26: 'ظ', 27: 'ع', 28: 'غ', 29: 'ف', 30: 'ق', 31: 'ك', 32: 'ل', 33: 'م', 34: 'ن', 35: 'ه', 36: 'و', 37: 'ى', 38: 'ي', 39: 'َ', 40: 'ُ', 41: 'ِ', 42: 'ّ'}


## Generate output and input data

In [0]:
# Read the names one per line. The encoding is explicit because the file holds
# Arabic text and the platform default encoding is not guaranteed to be UTF-8.
with open("arabic_names.txt", encoding="utf-8") as f:
        examples = [line.lower().strip() for line in f]  # list of names in the input data
num_examples = len(examples)

At every time-step  $t$, we have $y^{\langle t \rangle} = x^{\langle t+1 \rangle}$.

In [0]:
# Build the index sequences for every name.
#   X: [0, c1, c2, ..., cn]     (0 is the start token that delays the target by one timestep)
#   Y: [c1, c2, ..., cn, '\n']  (the input shifted by one timestep, ending with end-of-name)
X = []
Y = []
max_len = 0
for name in examples:
  ex_x = [0] + [char_to_ix[ch] for ch in name]
  ex_y = ex_x[1:] + [char_to_ix['\n']]
  X.append(ex_x)
  Y.append(ex_y)
  # Both sequences have len(name) + 1 entries. Tracking that length directly avoids
  # comparing a raw name length against a maximum that already includes the +1,
  # which could leave max_len one short and make the later padding step truncate
  # the longest names.
  max_len = max(max_len, len(ex_x))

In [41]:
# Pad every sequence at the end with index 0 so that all inputs share the same
# length (= max length of a name, including the extra start / end token).
X = pad_sequences(X, maxlen=max_len, padding='post', value=0)
Y = pad_sequences(Y, maxlen=max_len, padding='post', value=0)

# Convert the index sequences to one-hot vectors. Row i of the identity matrix is
# the one-hot vector for index i, so indexing it with the integer arrays yields
# float32 arrays of shape (num_examples, max_len, vocab_size + 1) without needing a
# TensorFlow session. The +1 in vocab_size + 1 accounts for the index-0 delay token.
one_hot_table = np.eye(vocab_size + 1, dtype=np.float32)
X = one_hot_table[X]
Y = one_hot_table[Y]
X.shape, Y.shape


((1186, 14, 43), (1186, 14, 43))

# Character-level language model for name generation

At each time-step, the RNN tries to predict what is the next character given the previous characters.

In [0]:
# Sequence-to-sequence character model: one LSTM layer that returns its output at
# every timestep, followed by a per-timestep dense projection and softmax over the
# vocabulary plus the index-0 token.
model = Sequential([
    LSTM(128, input_shape=(max_len, vocab_size + 1), return_sequences=True),
    TimeDistributed(Dense(vocab_size + 1)),
    TimeDistributed(Activation('softmax')),
])
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [43]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 14, 128)           88064     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 14, 43)            5547      
_________________________________________________________________
time_distributed_5 (TimeDist (None, 14, 43)            0         
Total params: 93,611
Trainable params: 93,611
Non-trainable params: 0
_________________________________________________________________


This function samples a sequence of characters, one at a time, from the probability distributions output by the RNN at each time-step.

In [0]:
def sample(epoch, logs, num_names=10, max_chars=9):
  """Print names sampled from the model; intended as an `on_epoch_end` callback.

  Generation only runs when `epoch` is even and non-zero. Each name starts from
  the index-0 start token and is extended one character at a time by sampling
  from the model's predicted distribution for the next position, until the
  end-of-name character '\n' is drawn or the length limit is reached.

  Args:
    epoch: Epoch index passed by the callback.
    logs: Metrics dict passed by the callback (unused).
    num_names: Number of names to generate per call.
    max_chars: Maximum number of characters per generated name; additionally
      capped at `max_len - 1` so that writes stay inside the model's input window.
  """
  if epoch == 0 or epoch % 2 != 0:
    return
  print('----- Generating text after Epoch: %d' % epoch)
  # Position i of this list is the character for one-hot index i. Index 0 is the
  # start/padding token; '\t' is only a placeholder for it and is never emitted.
  vocab = ['\t'] + sorted(chars)
  limit = min(max_chars, max_len - 1)
  for _ in range(num_names):
    target_seq = np.zeros((1, max_len, vocab_size + 1))
    # One-hot vector of the start token initialises the sequence generation.
    target_seq[0, 0, 0] = 1.
    name = ''
    while len(name) < limit:
      step = len(name)
      # float64 plus explicit renormalisation keeps np.random.choice from
      # rejecting float32 softmax output that does not sum to exactly 1.
      probs = model.predict(target_seq, verbose=0)[0, step, :].astype('float64')
      # Never sample the start/padding token: it is not a real character and
      # would otherwise show up as a tab inside the generated name.
      probs[0] = 0.
      total = probs.sum()
      if total <= 0:
        break
      ix = np.random.choice(vocab_size + 1, p=probs / total)
      if vocab[ix] == '\n':
        break
      name += vocab[ix]
      # Feed the sampled character back in as the input for the next timestep.
      target_seq[0, step + 1, ix] = 1.
    print(name)

In [61]:
# Train the model; after each epoch the callback invokes `sample`, which prints
# generated names on even, non-zero epoch indices.
print_callback = LambdaCallback(on_epoch_end=sample)
model.fit(
    x=X,
    y=Y,
    batch_size=32,
    epochs=250,
    callbacks=[print_callback],
)

Epoch 1/250
Epoch 2/250
Epoch 3/250
----- Generating text after Epoch: 2
هاد
جوبةةيزلو
حو
ىلائلأثزي
وبئمسؤغبت
ثوريدغ
سجيض
اعن
بياّاترجَ
سفيم
Epoch 4/250
Epoch 5/250
----- Generating text after Epoch: 4
روروديلال
جورينيل
سماننيت
نيروهت
منوتن
جولوليميل
حُويّة
نامني
ربجي
رطليليم
Epoch 6/250
Epoch 7/250
----- Generating text after Epoch: 6
تجيف
فرتاريناد
بهيا
مجارة
جوشية
فاوية
أيسور
ن ارلة
دارادان
فجلة
Epoch 8/250
Epoch 9/250
----- Generating text after Epoch: 8
جرل
	بالين
فيلفان
سمى
نوهر
بَس
إيتور
صائل
أحبُِزسرت
مسر
Epoch 10/250
Epoch 11/250
----- Generating text after Epoch: 10
أيسزي
أمهبة
مى
أسرد
لولية
صفوض
رولان
هنال
حهطي
شَشل
Epoch 12/250
Epoch 13/250
----- Generating text after Epoch: 12
مرؤم
رمين
سمرسانر
نيمة
كريم
طانر
غيلاء
فؤدة
سيل
جيثنارا
Epoch 14/250
Epoch 15/250
----- Generating text after Epoch: 14
وسيل
نزار
ثاري
بَاد
ميسر
أمينة
جاله
مير
مارد
كور
Epoch 16/250
Epoch 17/250
----- Generating text after Epoch: 16
فورية
نهاب
يأسار
أسار
حَنز
ضحى
حارثم
أنجال
يسر
نايفة
Epoch 18/250
Ep

<keras.callbacks.History at 0x7fde8219c518>

As you can see, during the first epochs the model generated random sequences of characters, but as training progressed it started to generate plausible names :D

Some of the names that I liked and that do not exist in the training data:
جاولينا
, بمارئيف
,اماري