In [1]:
!pip install sentencepiece
!pip install keras_metrics

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 27.8 MB/s eta 0:00:01[K     |▌                               | 20 kB 30.3 MB/s eta 0:00:01[K     |▉                               | 30 kB 21.5 MB/s eta 0:00:01[K     |█                               | 40 kB 17.3 MB/s eta 0:00:01[K     |█▍                              | 51 kB 7.7 MB/s eta 0:00:01[K     |█▋                              | 61 kB 9.0 MB/s eta 0:00:01[K     |██                              | 71 kB 8.1 MB/s eta 0:00:01[K     |██▏                             | 81 kB 9.1 MB/s eta 0:00:01[K     |██▍                             | 92 kB 9.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 7.5 MB/s eta 0:00:01[K     |███                             | 112 kB 7.5 MB/s eta 0:00:01[K     |███▎                            | 122 kB 7.5 MB/s eta 0:00:01[K     |███▌      

In [2]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import keras_metrics
import pandas as pd
import sentencepiece as spm

In [4]:
with open('min-ind.txt','r',encoding='utf-8') as f:
  lines = f.read().split('\n')
with open('min-ind_train.txt','w',encoding='utf-8') as w:
  for line in lines:
    w.write(line+'\n')
lines_1 = []
lines_2 = []
with open('min-ind_train.txt','r',encoding='utf-8') as min:
  for line in min.readlines():
    split = line.split(',')
    if split!=['\n']:
      lines_1.append(split[0])
      lines_2.append(split[1])
with open('min-ind_input.txt','w',encoding='utf-8') as enc_txt:
  for line in lines_1:
    enc_txt.write(line+'\n')
with open('min-ind_target.txt','w',encoding='utf-8') as tar_txt:
  for line in lines_2:
    tar_txt.write(line)

In [7]:
#import sentencepiece as spm
model_inp = spm.SentencePieceTrainer.train("--input=min-ind_input.txt --model_prefix=input --vocab_size=100")
sp_inp = spm.SentencePieceProcessor()
sp_inp.load('input.model')
model_tar = spm.SentencePieceTrainer.train("--input=min-ind_target.txt --model_prefix=target --vocab_size=100")
sp_tar = spm.SentencePieceProcessor()
sp_tar.load('target.model')

True

In [8]:
### Prepare data ###
input_encode = []
target_encode = []
with open('min-ind_input.txt','r',encoding='utf-8') as inp:
  for line in inp:
    input_encode.append(sp_inp.encode_as_pieces(line))
with open('min-ind_target.txt','r',encoding='utf-8') as tar:
  for line in tar:
    target_encode.append(['\t']+sp_tar.encode_as_pieces(line)+['\n'])
 

In [9]:
## Create a dictionary for characters mapping ##
input_subwords = np.unique(np.array([tk for tok in input_encode for tk in tok]))
input_subwords = np.append(input_subwords," ")
target_subwords = np.unique(np.array([tk for tok in target_encode for tk in tok]))
target_subwords = np.append(target_subwords," ")
input_token_index = dict([(char,i) for i,char in enumerate(sorted(list(input_subwords)))])
target_token_index = dict([(char,i) for i,char in enumerate(sorted(list(target_subwords)))])

num_encoder_tokens= len(input_subwords)
num_decoder_tokens = len(target_subwords)

In [10]:
max_encoder_seq_length = max([len(words) for words in input_encode])
max_decoder_seq_length = max([len(words) for words in target_encode])

encoder_input_data = np.zeros((len(input_encode),max_encoder_seq_length,num_encoder_tokens),dtype='float32')
decoder_input_data = np.zeros((len(input_encode),max_decoder_seq_length,num_decoder_tokens),dtype='float32')
decoder_target_data = np.zeros((len(target_encode),max_decoder_seq_length,num_decoder_tokens),dtype='float32')
 
for i,(enc_word,tar_word) in enumerate(zip(input_encode,target_encode)):
  for t,char in enumerate(enc_word):
    encoder_input_data[i,t,input_token_index[char]] = 1.0
  decoder_input_data[i,t+1 :,input_token_index[" "]] = 1.0 # encoding space at the end of encoder word for decoder input to reach to next word for encoding
  for t,char in enumerate(tar_word):
    decoder_input_data[i,t,target_token_index[char]] = 1.0
    if t>0:
      decoder_target_data[i,t-1,target_token_index[char]] = 1.0
  decoder_input_data[i,t+1 :,target_token_index[" "]] = 1.0
  decoder_target_data[i,t:,target_token_index[" "]] = 1.0

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
seed = 7
np.random.seed(seed)
kfold_index,test_index = train_test_split(list(range(len(encoder_input_data))),test_size=0.2,random_state=seed)
fold_var = 5
kf = KFold(n_splits = fold_var,random_state=seed, shuffle=True)
validation_accuracy = []
validation_loss = []
exac_match_accuracy = []
#validation_recall = []

In [12]:
encoder_input_data_k = pd.Series(list(encoder_input_data)).iloc[kfold_index].to_numpy()
decoder_input_data_k = pd.Series(list(decoder_input_data)).iloc[kfold_index].to_numpy()
decoder_target_data_k = pd.Series(list(decoder_target_data)).iloc[kfold_index].to_numpy()

encoder_input_data_k = np.array([list(en) for en in encoder_input_data_k])
decoder_input_data_k = np.array([list(en) for en in decoder_input_data_k])
decoder_target_data_k = np.array([list(en) for en in decoder_target_data_k])

encoder_input_data_test = pd.Series(list(encoder_input_data)).iloc[test_index].to_numpy()
decoder_input_data_test = pd.Series(list(decoder_input_data)).iloc[test_index].to_numpy()
decoder_target_data_test = pd.Series(list(decoder_target_data)).iloc[test_index].to_numpy()

encoder_input_data_test = np.array([list(en) for en in encoder_input_data_test])
decoder_input_data_test = np.array([list(en) for en in decoder_input_data_test])
decoder_target_data_test = np.array([list(en) for en in decoder_target_data_test])

In [13]:
batch_size = 64  # Batch size for training.
epochs = 20  # Number of epochs to train for.
latent_dim_1 = 128
latent_dim_2 = 256
latent_dim_3 = 512 # Latent dimensionality of the encoding space.
num_samples = 13762  # Number of samples to train on.

In [None]:
for i,(train_index,val_index) in enumerate(kf.split(np.zeros(len(encoder_input_data_k)))):
  print('Iteration number : ', i+1)

  # Define an input sequence and process it.
  encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
  encoder_outputs,forward_h,forward_c,backward_h,backward_c= keras.layers.Bidirectional(keras.layers.LSTM(latent_dim_2, return_state=True),merge_mode='concat',name='encoder_lstm1')(encoder_inputs)

  # We discard `encoder_outputs` and only keep the states.
  state_h = tf.keras.layers.Concatenate()([forward_h,forward_c])
  state_c = tf.keras.layers.Concatenate()([backward_h,backward_c])
  encoder_states = [state_h, state_c]

  # Set up the decoder, using `encoder_states` as initial state.
  decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

  # We set up our decoder to return full output sequences,
  # and to return internal states as well. We don't use the
  # return states in the training model, but we will use them in inference.
  decoder_outputs,*decoder_states= keras.layers.LSTM(latent_dim_3, return_sequences=True, return_state=True)(decoder_inputs, initial_state=encoder_states)
  decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
  decoder_outputs = decoder_dense(decoder_outputs)

  # Define the model that will turn
  # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
  model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

  encoder_input_data_t = pd.Series(list(encoder_input_data_k)).iloc[train_index].to_numpy()
  decoder_input_data_t = pd.Series(list(decoder_input_data_k)).iloc[train_index].to_numpy()
  decoder_target_data_t = pd.Series(list(decoder_target_data_k)).iloc[train_index].to_numpy()

  encoder_input_data_val = pd.Series(list(encoder_input_data_k)).iloc[val_index].to_numpy()
  decoder_input_data_val = pd.Series(list(decoder_input_data_k)).iloc[val_index].to_numpy()
  decoder_target_data_val = pd.Series(list(decoder_target_data_k)).iloc[val_index].to_numpy()

  encoder_input_data_t = np.array([list(en) for en in encoder_input_data_t])
  decoder_input_data_t = np.array([list(en) for en in decoder_input_data_t])
  decoder_target_data_t = np.array([list(en) for en in decoder_target_data_t])

  encoder_input_data_val = np.array([list(en) for en in encoder_input_data_val])
  decoder_input_data_val = np.array([list(en) for en in decoder_input_data_val])
  decoder_target_data_val = np.array([list(en) for en in decoder_target_data_val])

  x_train = [encoder_input_data_t, decoder_input_data_t]
  y_train = decoder_target_data_t

  x_val = [encoder_input_data_val,decoder_input_data_val]
  y_val = decoder_target_data_val

  # Compile Model
  model.compile(
      optimizer="adam", loss="categorical_crossentropy", 
      metrics=["accuracy",keras.metrics.Recall(),keras.metrics.Precision()]
  )
  
  # Fitting Model
  model_history = model.fit(
      [encoder_input_data_t, decoder_input_data_t],
      decoder_target_data_t,
      batch_size=batch_size,
      epochs=epochs,
      validation_data = (x_val,y_val)
  )
  # Save model
  model_name = 'model_bilstm_256_{}-Fold-{}'.format(fold_var,i+1)
  model.save(model_name)

  # Define sampling models
  # Restore the model and construct the encoder and decoder.
  model_inf = keras.models.load_model(model_name)

  encoder_inputs = model_inf.input[0]  # input_1
  encoder_outputs,state_h_enc,state_c_enc,backward_h,backward_c  = model_inf.layers[1].output  # lstm_1
  states_h = keras.layers.Concatenate()([state_h_enc,state_c_enc])
  states_c = keras.layers.Concatenate()([backward_h,backward_c])
  encoder_states = [states_h,states_c]
  encoder_model = keras.Model(encoder_inputs, encoder_states)

  decoder_inputs = model_inf.input[1]  # input_2
  decoder_state_input_h = keras.Input(shape=(latent_dim_3,), name="input_3")
  decoder_state_input_c = keras.Input(shape=(latent_dim_3,), name="input_5")
  decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]
  decoder_lstm = model_inf.layers[5]
  decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
      decoder_inputs, initial_state=decoder_states_inputs
  )
  decoder_states = [state_h_dec, state_c_dec]
  decoder_dense = model_inf.layers[6]
  decoder_outputs = decoder_dense(decoder_outputs)
  decoder_model = keras.Model(
      [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
  )

  # Reverse-lookup token index to decode sequences back to
  # something readable.
  reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
  reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

  def decode_sequence(input_seq):
      # Encode the input as state vectors.
      states_value = encoder_model.predict(input_seq)

      # Generate empty target sequence of length 1.
      target_seq = np.zeros((1, 1, num_decoder_tokens))
      # Populate the first character of target sequence with the start character.
      target_seq[0, 0, target_token_index["\t"]] = 1.0

      # Sampling loop for a batch of sequences
      # (to simplify, here we assume a batch of size 1).
      stop_condition = False
      decoded_sentence = ""
      while not stop_condition:
          output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

          # Sample a token
          sampled_token_index = np.argmax(output_tokens[0, -1, :])
          sampled_char = reverse_target_char_index[sampled_token_index]
          decoded_sentence += sampled_char

          # Exit condition: either hit max length
          # or find stop character.
          if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
              stop_condition = True

          # Update the target sequence (of length 1).
          target_seq = np.zeros((1, 1, num_decoder_tokens))
          target_seq[0, 0, sampled_token_index] = 1.0

          # Update states
          states_value = [h, c]
      return decoded_sentence

  # Evaluate Model
  results = model.evaluate(x_val, y_val, batch_size=batch_size)
  results = dict(zip(model.metrics_names,results))
  
  # Save Evaluation Result
  validation_accuracy.append(results['accuracy'])
  validation_loss.append(results['loss'])
  #validation_recall.append(results['recall'])
  
  # Check Result Translation
  list_match = []
  count = 0
  print("Checking Translation Accuracy")
  for seq_index in val_index:
      # Take one sequence (part of the training set)
      # for trying out decoding.
      input_seq = encoder_input_data[seq_index : seq_index + 1]
      decoded_sentence = decode_sequence(input_seq)
      decoded_sentence = decoded_sentence[1:-1]
      exact_result = decoded_sentence==lines_2[seq_index][:-1]
      # print("-",seq_index)
      # print("Input sentence:", ''.join(lines_1[seq_index]))
      # print("Decoded sentence:", ''.join(decoded_sentence))
      # print("Target sentence:", ''.join(lines_2[seq_index][:-1]))
      # print("Exact Match:", ''.join(str(exact_result)))
      list_match.append(exact_result)
      count+=1
  print("Exact Match count : ", sum(list_match))
  print("Tested Data count : ", count)
  print("Translation Accuracy : ", sum(list_match)/count)
  exac_match_accuracy.append(sum(list_match)/count)

Iteration number :  1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-1/assets


INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-1/assets


Checking Translation Accuracy
Exact Match count :  1603
Tested Data count :  2202
Translation Accuracy :  0.7279745685740237
Iteration number :  2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-2/assets


INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-2/assets


Checking Translation Accuracy
Exact Match count :  1644
Tested Data count :  2202
Translation Accuracy :  0.7465940054495913
Iteration number :  3
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-3/assets


INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-3/assets


Checking Translation Accuracy
Exact Match count :  1675
Tested Data count :  2202
Translation Accuracy :  0.7606721162579473
Iteration number :  4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-4/assets


INFO:tensorflow:Assets written to: model_bilstm_256_5-Fold-4/assets


Checking Translation Accuracy


In [None]:
# Show Accuracy Result
for i,val_acc in enumerate(validation_accuracy):
  print("Fold {} Accuracy : {}".format(i+1,val_acc))
print("Average Accuracy : %.2f%% (+/- %.2f%%)" % (np.mean(validation_accuracy), np.std(validation_accuracy)))

In [None]:
#Prep Test Data
x_test = [encoder_input_data_test,decoder_input_data_test]
y_test = decoder_target_data_test

# Evaluate Model
results = model.evaluate(x_test, y_test, batch_size=batch_size)
results = dict(zip(model.metrics_names,results))

In [None]:
# Check Result Translation -> Test Data
list_match = []
count = 0
for seq_index in test_index:
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    decoded_sentence = decoded_sentence[1:-1]
    print("-",seq_index)
    print("Input sentence:", ''.join(lines_1[seq_index]))
    print("Decoded sentence:", ''.join(decoded_sentence))
    print("Target sentence:", ''.join(lines_2[seq_index][:-1]))
    print("Exact Match:", ''.join(str(exact_result)))
    exact_result = decoded_sentence==lines_2[seq_index][:-1]
    list_match.append(exact_result)
    count+=1
print("Exact Match count : ", sum(list_match))
print("Tested Data count : ", count)
print("Translation Accuracy : ", sum(list_match)/count)