In [None]:
#####################################################################################################
##Copyright    O., Hamel et S. Lamari for The C00L07UN100120180002 Project.##########################
#####################################################################################################

# This file contains the code for training the EDAM paraphrase generation model (Arabic) and for generating paraphrases with the trained model.

In [None]:
# import packages

import csv
import numpy as np
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import sys
import tensorflow as tf
from sklearn.model_selection import train_test_split
import unicodedata
import io
import time

In [None]:
# Load Dataset
#
# The file is read with csv.reader; the fields of each row are glued back
# together, normalised character by character, and the rows are then taken
# alternately: even rows go to `x`, odd rows go to `y`.

TRAIN_DATA_FILE = '/content/drive/My Drive/ArabicDataset/ArabicDataSet.txt'
MAX_SEQUENCE_LENGTH = 30

# (old, new) substitutions applied in this order to every sentence.
# The first five unify letter variants; the rest delete punctuation/symbols.
CHAR_REPLACEMENTS = [
    (u'أ', u'ا'),
    (u'إ', u'ا'),
    (u'ٳ', u'ا'),
    (u'آ', u'ا'),
    (u'ة', u'ه'),
    ('-', ''),
    ('/', ''),
    ('\\', ''),
    ('.', ''),
    (',', ''),
    (';', ''),
    ("'", ''),
    ("،", ''),
    ('?', ''),
    ('!', ''),
    ("’", ''),
    ("؟", ''),
    ("=", ''),
    ("+", ''),
    ("\"", ''),
    ('(', ''),
    (")", ''),
]

x = list()
y = list()

# `with` guarantees the file handle is closed even if a row fails to parse.
with open(TRAIN_DATA_FILE, "r", encoding="utf8") as dataset_file:
    for i, row in enumerate(csv.reader(dataset_file)):
        sentence = ''.join(map(str, row))
        for old, new in CHAR_REPLACEMENTS:
            sentence = sentence.replace(old, new)

        if i % 2 == 0:
            x.append(sentence)
        else:
            y.append(sentence)

# Pair the sentences; zip() silently drops a trailing unpaired sentence
# when the file holds an odd number of rows.
data = list(zip(x, y))

print(len(data))

# Remove pairs in which either sentence has more than MAX_SEQUENCE_LENGTH words

texts_1 = [] # sentences taken from x
texts_2 = [] # sentences taken from y

for (e1, e2) in data:
    if len(e1.split()) <= MAX_SEQUENCE_LENGTH and len(e2.split()) <= MAX_SEQUENCE_LENGTH:
        texts_1.append(e1)
        texts_2.append(e2)

print(len(texts_1))
print(len(texts_2))

In [None]:
#Pre-treatment

def preTreatement(w):
  """Normalise one sentence and wrap it in '<start>' / '<end>' markers.

  Steps, in order:
    1. canonical decomposition (NFD) and removal of every combining mark
       (Unicode category 'Mn');
    2. a space is put on both sides of each of ? . ! , ¿
    3. every run made of spaces and double quotes collapses to one space;
    4. surrounding whitespace is stripped;
    5. '<start> ' is prepended and ' <end>' appended.

  Args:
    w: the raw sentence (str).

  Returns:
    The normalised sentence as a str.
  """
  # remove combining marks after decomposing the characters
  w = ''.join(c for c in unicodedata.normalize('NFD', w) if unicodedata.category(c) != 'Mn')

  # make spaces between the words and the punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)

  # the character class matches both the space and the double quote
  w = re.sub(r'[" "]+', " ", w)

  # strip AFTER the padding above: padding a final punctuation mark leaves a
  # trailing space, which previously produced a double space before <end>
  w = w.strip()

  # add the start and end tokens
  w = '<start> ' + w + ' <end>'

  return w

In [None]:
# Apply the pre-treatment to every sentence and return the dataset as: phrases, paraphrases
def extractSentences():
  """Pre-treat the module-level sentence lists.

  Returns:
    (phrase, paraphrase): `phrase` is built from `texts_2` and
    `paraphrase` from `texts_1`, each sentence passed through
    preTreatement().
  """
  paraphrase = list(map(preTreatement, texts_1))
  phrase = list(map(preTreatement, texts_2))
  return phrase, paraphrase

In [None]:
#Tokenization

def tokenize(lang):
  """Fit a tokenizer on `lang` and encode it as a padded integer matrix.

  Args:
    lang: the sentences to encode.

  Returns:
    (tensor, lang_tokenizer): the sequences padded at the end
    (padding='post') and the fitted tokenizer, created with filters=''
    so that no character is filtered out.
  """
  keras_preprocessing = tf.keras.preprocessing

  lang_tokenizer = keras_preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  sequences = lang_tokenizer.texts_to_sequences(lang)
  tensor = keras_preprocessing.sequence.pad_sequences(sequences, padding='post')

  return tensor, lang_tokenizer

In [None]:
# Load the dataset + Tokenization for the inputs and outputs

def loadSentences():
  """Build the padded input/target tensors together with their tokenizers.

  Returns:
    (inputTensor, targetTensor, inputLang_tokenizer, targetLang_tokenizer)
  """
  # extractSentences() returns (phrase, paraphrase): the first element is
  # used as the target side and the second as the input side
  target_sentences, input_sentences = extractSentences()

  input_tensor, input_tokenizer = tokenize(input_sentences)
  target_tensor, target_tokenizer = tokenize(target_sentences)

  return input_tensor, target_tensor, input_tokenizer, target_tokenizer

In [None]:
# Load input and output tensors, plus the tokenizer fitted on each side

inputTensor, targetTensor, inputLang, targetLang = loadSentences()

# Padded sequence length (second dimension) of the input and target tensors
maxLengthInput = inputTensor.shape[1]
maxLengthTarget = targetTensor.shape[1]

In [None]:
# Creating training and validation sets using an 80-20 split (you must also leave another 20% aside for the test ... we have left this 20% in a separate file)
#
# random_state is fixed so that the split is the same on every run. The
# training cell restores the latest checkpoint before it continues training,
# so a different split on a re-run would move pairs from the former
# validation set into the training set.

inputTensor_train, inputTensor_val, targetTensor_train, targetTensor_val = train_test_split(
    inputTensor, targetTensor, test_size=0.2, random_state=42)

# Show length
print(len(inputTensor_train), len(targetTensor_train), len(inputTensor_val), len(targetTensor_val))

In [None]:
#Convert from index to word

def convert(lang, tensor):
  """Print 'index ----> word' for every non-zero entry of `tensor`.

  Args:
    lang: object exposing an `index_word` mapping (index -> word).
    tensor: iterable of integer indices; entries equal to 0 are skipped.
  """
  for token_id in tensor:
    if token_id == 0:
      continue
    print ("%d ----> %s" % (token_id, lang.index_word[token_id]))

In [None]:
#show an example: the first training pair, index by index

example_views = (
    ("Input Language; index to word mapping", inputLang, inputTensor_train[0]),
    ("Target Language; index to word mapping", targetLang, targetTensor_train[0]),
)

for position, (title, tokenizer, sequence) in enumerate(example_views):
  if position:
    # blank line between the two listings
    print ()
  print (title)
  convert(tokenizer, sequence)

In [None]:
#Definition of hyper Parameters

batchSize = 64
embeddingDimension = 256
units = 1024

# the shuffle buffer covers the whole training set
bufferSize = len(inputTensor_train)
stepsPerEpoch = len(inputTensor_train)//batchSize

# vocabulary sizes: one more than the number of words known to each tokenizer
vocabInputSize = 1 + len(inputLang.word_index)
vocabTargetSize = 1 + len(targetLang.word_index)

# shuffled training pairs, cut into full batches only (drop_remainder=True)
dataset = (
    tf.data.Dataset.from_tensor_slices((inputTensor_train, targetTensor_train))
    .shuffle(bufferSize)
    .batch(batchSize, drop_remainder=True)
)

#example of input and target batches

exampleInputBatch, exampleTargetBatch = next(iter(dataset))
exampleInputBatch.shape, exampleTargetBatch.shape

In [None]:
# Encoder class

class Encoder(tf.keras.Model):
  """Embedding layer followed by a GRU.

  The GRU is configured to return both the output for every time step
  (return_sequences=True) and its final state (return_state=True).
  """

  def __init__(self, vocab_size, embeddingDimension, enc_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: number of rows of the embedding table.
      embeddingDimension: size of each embedding vector.
      enc_units: number of GRU units.
      batch_sz: batch size used by initialize_hidden_state().
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embeddingDimension)
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    """Embed `x`, run the GRU from `hidden`, return (output, state)."""
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state = hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    """Return an all-zero state of shape (batch_sz, enc_units)."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [None]:
# Build the encoder and run it once on the example batch, starting from a zero state
encoder = Encoder(vocabInputSize, embeddingDimension, units, batchSize)

sample_output, sample_hidden = encoder(exampleInputBatch, encoder.initialize_hidden_state())

for template, tensor in (
    ('Encoder output shape: (batch size, sequence length, units) {}', sample_output),
    ('Encoder Hidden state shape: (batch size, units) {}', sample_hidden),
):
  print (template.format(tensor.shape))

In [None]:
#Attention mechanism class

class BahdanauAttention(tf.keras.layers.Layer):
  """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

  def __init__(self, units):
    """Create the three dense layers; W1 and W2 have `units` outputs, V has 1."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    """Return (context_vector, attention_weights).

    Args:
      query: the state to attend with; an axis is inserted at position 1
        so that it can be added to the projected `values`.
      values: the sequence attended over (axis 1 is the one the softmax
        and the weighted sum run along).
    """
    expanded_query = tf.expand_dims(query, 1)

    energies = tf.nn.tanh(self.W1(expanded_query) + self.W2(values))
    score = self.V(energies)

    # normalise the scores along axis 1
    attention_weights = tf.nn.softmax(score, axis=1)

    # weighted sum of the values along axis 1
    context_vector = tf.reduce_sum(attention_weights * values, axis=1)

    return context_vector, attention_weights

In [None]:
# Try the attention layer (10 units) on the encoder's sample state and output
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

for template, tensor in (
    ("Attention result shape: (batch size, units) {}", attention_result),
    ("Attention weights shape: (batchSize, sequence_length, 1) {}", attention_weights),
):
  print(template.format(tensor.shape))

In [None]:
#Decoder class

class Decoder(tf.keras.Model):
  """Embedding + attention + GRU + dense projection onto the vocabulary."""

  def __init__(self, vocab_size, embeddingDimension, dec_units, batch_sz):
    """Create the layers.

    Args:
      vocab_size: size of the embedding table and of the output projection.
      embeddingDimension: size of each embedding vector.
      dec_units: number of GRU units, also used for the attention layer.
      batch_sz: stored on the instance; not read by call().
    """
    super().__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embeddingDimension)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    """Run one decoding step.

    Args:
      x: ids of the current decoder input tokens.
      hidden: state used as the attention query.
      enc_output: encoder outputs attended over.

    Returns:
      (scores, state, attention_weights): the output of the dense
      projection, the state returned by the GRU, and the attention weights.
    """
    context_vector, attention_weights = self.attention(hidden, enc_output)

    embedded = self.embedding(x)

    # prepend the context vector to the embedded input along the last axis
    gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

    # NOTE(review): the GRU is called without initial_state, so `hidden`
    # only influences this step through the attention query - confirm this
    # is intended before changing it (it would alter how restored
    # checkpoints behave).
    output, state = self.gru(gru_input)

    # merge the batch and time axes before the projection
    flat_output = tf.reshape(output, (-1, output.shape[2]))

    scores = self.fc(flat_output)

    return scores, state, attention_weights

In [None]:
# Build the decoder and run it once on a random (batchSize, 1) input
decoder = Decoder(vocabTargetSize, embeddingDimension, units, batchSize)

sample_decoder_input = tf.random.uniform((batchSize, 1))
sample_decoder_output = decoder(sample_decoder_input, sample_hidden, sample_output)[0]

print ('Decoder output shape: (batchSize, vocab size) {}'.format(sample_decoder_output.shape))

In [None]:
#Define the optimizer and the loss function

# reduction='none' keeps one loss value per example so that it can be masked
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
  """Cross-entropy in which positions whose target id is 0 contribute zero.

  The per-example losses are multiplied by a 0/1 mask and then averaged
  with tf.reduce_mean, i.e. over all positions, masked ones included.

  Args:
    real: target ids.
    pred: scores produced by the decoder for those targets.
  """
  per_example = loss_object(real, pred)

  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(is_real_token, dtype=per_example.dtype)

  return tf.reduce_mean(per_example * weights)

In [None]:
#Checkpoints: the optimizer, the encoder and the decoder are saved together

checkpoint_dir = '/content/drive/My Drive/TenAtt'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder,
)

In [None]:
#Training function

@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced training step on a batch and return its loss.

  The input goes through the encoder, which returns the encoder output and
  the encoder hidden state. The encoder output, the hidden state and the
  decoder input (initially the '<start>' token) are passed to the decoder,
  which returns the predictions and a new hidden state. That hidden state is
  fed to the next decoding step and the predictions are used to compute the
  loss. Teacher forcing picks the next decoder input: the target word itself
  is used rather than the prediction. Finally the gradients of the summed
  loss are computed and applied by the optimizer.

  Args:
    inp: batch of input sequences.
    targ: batch of target sequences.
    enc_hidden: initial hidden state of the encoder.

  Returns:
    The summed step losses divided by the target sequence length.
  """
  loss = 0
  target_length = targ.shape[1]
  start_id = targetLang.word_index['<start>']

  with tf.GradientTape() as tape:

    # encoder output and the state that seeds the decoder
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden

    # one '<start>' id per sequence of the batch
    dec_input = tf.expand_dims([start_id] * batchSize, 1)

    for t in range(1, target_length):

      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      step_target = targ[:, t]
      loss += loss_function(step_target, predictions)

      # teacher forcing: the true token is the next decoder input
      dec_input = tf.expand_dims(step_target, 1)

  batch_loss = (loss / int(target_length))

  variables = encoder.trainable_variables + decoder.trainable_variables

  # gradients are taken of the summed loss, not of the averaged batch_loss
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [None]:
# Run the training

# restore the latest checkpoint
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

EPOCHS = 40 # 40 is just an example

# Create the loss-log directory up front so that a missing directory is not
# discovered only after a whole epoch has been trained.
loss_log_path = "/content/drive/My Drive/loss/loss.txt"
os.makedirs(os.path.dirname(loss_log_path), exist_ok=True)

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(stepsPerEpoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

  # saving a checkpoint of the model after every epoch

  checkpoint.save(file_prefix = checkpoint_prefix)

  epoch_loss = total_loss / stepsPerEpoch

  print('Epoch {} Loss {:.4f}'.format(epoch + 1, epoch_loss))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

  #Save loss value in a txt file after every epoch (Optional)
  # The value is written as a plain float: str() of the tensor would write
  # its repr ("tf.Tensor(..., shape=(), dtype=float32)") instead of a number.
  # NOTE: lines appended by earlier runs may still be in the old repr format.
  # `with` closes the file even if the write fails.

  with open(loss_log_path, "a+") as loss_file:
    loss_file.write('{}\n'.format(float(epoch_loss)))

  print("Model and loss saved\n")

In [None]:
# Passing to the inference

# restore the latest checkpoint found in checkpoint_dir
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint.restore(latest_checkpoint_path)

In [None]:
#Generate Paraphrases

def evaluate(sentence):
  """Decode `sentence` greedily, one token at a time.

  Words of the pre-treated sentence that the input tokenizer does not know
  are dropped. Decoding stops when '<end>' is produced or after
  maxLengthTarget steps.

  Args:
    sentence: raw input sentence.

  Returns:
    (result, sentence): the generated words, each followed by one space
    ('<end>' included when it was produced), and the pre-treated input.
  """
  sentence = preTreatement(sentence)

  vocabulary = inputLang.word_index
  token_ids = [vocabulary[word] for word in sentence.split(' ') if word in vocabulary]
  padded = tf.keras.preprocessing.sequence.pad_sequences([token_ids], maxlen=maxLengthInput, padding='post')
  inputs = tf.convert_to_tensor(padded)

  # encode a batch of one sentence, starting from a zero state
  enc_out, dec_hidden = encoder(inputs, [tf.zeros((1, units))])

  dec_input = tf.expand_dims([targetLang.word_index['<start>']], 0)

  pieces = []
  for _ in range(maxLengthTarget):

    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

    # greedy choice: the id with the highest score
    predicted_id = tf.argmax(predictions[0]).numpy()
    word = targetLang.index_word[predicted_id]

    pieces.append(word + ' ')

    if word == '<end>':
      break

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return ''.join(pieces), sentence

In [None]:
def paraphrasing(sentence):
  """Return only the generated text produced by evaluate() for `sentence`."""
  return evaluate(sentence)[0]

In [None]:
# Example

result = paraphrasing('علقت الجلسه الساعه الواحده ظهرا واستونفت الساعه الثالثه مساء')

# Remove the '<end>' marker and the empty trailing piece by value rather than
# by position: slicing off the last two items also removed a real word
# whenever decoding stopped at the maximum length without producing '<end>'.
print('the result is : ', [word for word in result.split(' ') if word and word != '<end>'])

In [None]:
#For beam search (label kept from the original notebook)
#
# The two lists are read in parallel, entry k of one with entry k of the other:
#   listOfIndexes[k] - decoding steps at which evaluate2() does not take the
#                      best-scoring word;
#   listOfChoice[k]  - position, within the three best-scoring words
#                      (best first, counted from 0), of the word taken instead.

listOfChoice = [
    1, 1, 1,
    2, 2, 2,
    1, 1, 1,
]

listOfIndexes = [
    [0], [1], [2],
    [0], [1], [2],
    [0, 1], [0, 2], [1, 2],
]

In [None]:
def evaluate2(sentence, indexes, choice):
  """Decode `sentence`, overriding the greedy choice at selected steps.

  Works like greedy decoding, except that at every step `t` contained in
  `indexes` the word is taken from the three best-scoring candidates
  (ordered best first) at position `choice`.

  Args:
    sentence: raw input sentence.
    indexes: collection of decoding steps (0-based) at which to override.
    choice: index into the three best candidates; indexing outside that
      list of three raises IndexError.

  Returns:
    (result, sentence): the generated words, each followed by one space
    ('<end>' included when it was produced), and the pre-treated input.
  """
  sentence = preTreatement(sentence)

  # words unknown to the input tokenizer are dropped
  inputs = [inputLang.word_index[i] for i in sentence.split(' ') if i in inputLang.word_index]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=maxLengthInput, padding='post')

  inputs = tf.convert_to_tensor(inputs)

  result = ''

  # encode a batch of one sentence, starting from a zero state
  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targetLang.word_index['<start>']], 0)

  for t in range(maxLengthTarget):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

    if t not in indexes:
      # regular step: highest-scoring id
      predicted_id = tf.argmax(predictions[0]).numpy()

    else:
      # overridden step: ids of the three highest scores, best first
      arr = predictions[0].numpy().argsort()[-3:][::-1]

      predicted_id = arr[choice]

    result += targetLang.index_word[predicted_id] + ' '

    if targetLang.index_word[predicted_id] == '<end>':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [None]:
def paraphrasing2(sentence, indexes, choice):
  """Return only the generated text produced by evaluate2() for these arguments."""
  return evaluate2(sentence, indexes, choice)[0]

In [None]:
#Example

result = paraphrasing2('علقت الجلسه الساعه الواحده ظهرا واستونفت الساعه الثالثه مساء', listOfIndexes[0], listOfChoice[0])

# Remove the '<end>' marker and the empty trailing piece by value rather than
# by position: slicing off the last two items also removed a real word
# whenever decoding stopped at the maximum length without producing '<end>'.
print('the result is : ', [word for word in result.split(' ') if word and word != '<end>'])