In [95]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from collections import Counter

# tensorflow imports
import tensorflow as tf
from tensorflow.keras.models import load_model,Model
from tensorflow.keras.layers import (Dense, LSTM, Bidirectional, Input, Concatenate, 
                                     Permute, Dot, Multiply, RepeatVector,Lambda,Activation)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [8]:
# loading data
nmt_data = pd.read_csv(r'NMT_data.csv')

In [42]:
# creating a vocabulary for human_readable_dates

all_char = [char for word in list(nmt_data['human_readable_date']) for char in word]
all_distinct_char = list(set(all_char))
all_distinct_char.sort()
human_readable_dates_vocab = {el:idx for idx,el in enumerate(all_distinct_char)}

# adding <unk> and <pad> tokens at the end
human_readable_dates_vocab['<unk>'] = len(all_distinct_char)
human_readable_dates_vocab['<pad>'] = len(all_distinct_char) + 1

print(human_readable_dates_vocab)

{' ': 0, '/': 1, '0': 2, '1': 3, '2': 4, '3': 5, '4': 6, '5': 7, '6': 8, '7': 9, '8': 10, '9': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'l': 22, 'm': 23, 'n': 24, 'o': 25, 'p': 26, 'r': 27, 's': 28, 't': 29, 'u': 30, 'v': 31, 'w': 32, 'y': 33, '<unk>': 34, '<pad>': 35}


In [25]:
# creating a vocabulary for machine_readable_dates

all_char_m = [char for word in list(nmt_data['machine_readable_date']) for char in word]
all_distinct_char_m = list(set(all_char_m))
all_distinct_char_m.sort()
machine_readable_dates_vocab = {el:idx for idx,el in enumerate(all_distinct_char_m)}
inv_machine_readable_dates_vocab = {idx:el for idx,el in enumerate(all_distinct_char_m)}

print("char_as_key_dict : ",machine_readable_dates_vocab)
print("index_as_key_dixt : ",inv_machine_readable_dates_vocab)

char_as_key_dict :  {'-': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10}
index_as_key_dixt :  {0: '-', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9'}


In [29]:
# eda to select max length for each date
nmt_data['char_count'] = nmt_data['human_readable_date'].map(lambda x : len(x))

In [43]:
# convert words to numbers
def string_to_int(string, length, vocab):
    string = string.lower()
    string = string.replace(',','')
    
    if len(string) > length:
        string = string[:length]
        
    rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
    
    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))
    
    return rep

In [44]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    X, Y = zip(*dataset)
    
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X])
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    
    Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X)))
    Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))

    return X, np.array(Y), Xoh, Yoh

In [82]:
# Tx = length of each human_readable_date
# Ty = length of each machine_readbale_date
Tx = 30
Ty = 10

In [78]:
def vectorizer(list_of_values,max_length,value_dict):
    
    vector = []
    for el in list_of_values:
        if len(el) > max_length:
            el   = el[:max_length]
            vect = list(map(lambda x : value_dict.get(x,'<unk>')),el)
        elif len(el) == max_length:
            vect = list(map(lambda x : value_dict.get(x,'<unk>'),el))
        else:
            vect = list(map(lambda x : value_dict.get(x,'<unk>'),el)) + [value_dict['<pad>']] * (max_length - len(el))
                        
        vector.append(vect)
        
    return np.array(vector)
               

In [79]:
X = vectorizer(list(nmt_data['human_readable_date']),30,human_readable_dates_vocab)
Y = vectorizer(list(nmt_data['machine_readable_date']),10,machine_readable_dates_vocab)

In [80]:
x_cat_vector = np.array(to_categorical(X,len(human_readable_dates_vocab)))
y_cat_vector = np.array(to_categorical(Y,len(machine_readable_dates_vocab)))

In [81]:
print(" X : ",X.shape)
print(" Y : ",Y.shape)
print(" x_cat_vector : ",x_cat_vector.shape)
print(" y_cat_vector : ",y_cat_vector.shape)

 X :  (9548, 30)
 Y :  (9548, 10)
 x_cat_vector :  (9548, 30, 36)
 y_cat_vector :  (9548, 10, 11)


In [114]:
# Defined shared layers as global variables
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation('softmax', name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
dotor = Dot(axes = 1)

In [115]:
# single step attention
def one_step_attention(a, s_prev):
    
    s_prev = repeator(s_prev)
    concat = concatenator([s_prev,a])
    e = densor1(concat)
    energies = densor2(e)
    alphas = activator(energies)
    context = dotor([alphas,a])
    
    return context

In [116]:
n_a = 32 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
n_s = 64 # number of units for the post-attention LSTM's hidden state "s"

# this is the post attention LSTM cell. 
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_readable_dates_vocab), activation='softmax')

In [117]:
# using for loop on each token to generate attention on all tokens

def modelf(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size):
    
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    s = s0
    c = c0
    
    outputs = []
    
    a = Bidirectional(LSTM(n_a,return_sequences=True))(X)
    
    for t in range(Ty):
        
        print("t : .....",t)
        context = one_step_attention(a,s)
        s, _, c = post_activation_LSTM_cell(inputs=context, initial_state=[s,c])
        out = output_layer(inputs=s)
        outputs.append(out)
    
    model = Model(inputs=[X,s0,c0],outputs=outputs)
    
    return model

In [118]:
model = modelf(Tx, Ty, n_a, n_s, len(human_readable_dates_vocab), len(machine_readable_dates_vocab))

t : ..... 0
t : ..... 1
t : ..... 2
t : ..... 3
t : ..... 4
t : ..... 5
t : ..... 6
t : ..... 7
t : ..... 8
t : ..... 9


In [119]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
s0 (InputLayer)                 [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 30, 36)]     0                                            
__________________________________________________________________________________________________
repeat_vector_5 (RepeatVector)  (None, 30, 64)       0           s0[0][0]                         
                                                                 lstm_10[0][0]                    
                                                                 lstm_10[1][0]                    
                                                                 lstm_10[2][0]                

In [120]:
# compiing the model
opt = Adam(lr=0.005,beta_1=0.9,beta_2=0.999,decay=0.01) # Adam(...) 
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])

In [127]:
m = nmt_data.shape[0]
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
outputs = list(y_cat_vector.swapaxes(0,1))

In [129]:
model.fit([x_cat_vector, s0, c0], outputs, epochs=20, batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x273ef395470>

In [130]:
model.load_weights('models/model.h5')

OSError: Unable to open file (unable to open file: name = 'models/model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [135]:
# sample predicitons
EXAMPLES = ['3 May 1979', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
s00 = np.zeros((1, n_s))
c00 = np.zeros((1, n_s))
for example in EXAMPLES:
    source = string_to_int(example, Tx, human_readable_dates_vocab)
    #print(source)
    source = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_readable_dates_vocab)), source))).swapaxes(0,1)
    source = np.swapaxes(source, 0, 1)
    source = np.expand_dims(source, axis=0)
    prediction = model.predict([source, s00, c00])
    prediction = np.argmax(prediction, axis = -1)
    output = [inv_machine_readable_dates_vocab[int(i)] for i in prediction]
    print("source:", example)
    print("output:", ''.join(output),"\n")

source: 3 May 1979
output: 1979-05-03 

source: 21th of August 2016
output: 2016-08-21 

source: Tue 10 Jul 2007
output: 2007-07-10 

source: Saturday May 9 2018
output: 2018-05-09 

source: March 3 2001
output: 2001-03-03 

source: March 3rd 2001
output: 2001-03-03 

source: 1 March 2001
output: 2001-03-01 

