# Neural Machine Translation
Automatic data manipulation using NLP.  Convert various date structures to consistent machine-readable format. 
Use Keras, RNN, LSTM, Bidirectional, 

In [1]:
### Import modules
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np

from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
### Set up fake module instance
fake = Faker()
fake.seed(12345)
random.seed(12345)

# Define format of the data we would like to generate
FORMATS = ['short','medium','long','full','d MMM YYY', 'd MMMM YYY','dd MMM YYY','d MMM, YYY',
           'd MMMM, YYY','dd, MMM YYY','d MM YY','d MMMM YYY','MMMM d YYY','MMMM d, YYY','dd.MM.YY']

In [3]:
### Select 10,000 samples
m = 10000

### Set up outputs   
human_vocab2 = set()
machine_vocab2 = set()
dataset = []
Tx = 30

for i in tqdm(range(m)):

    dt = fake.date_object()

    human_readable1 = format_date(dt, format=random.choice(FORMATS),  locale='en_US') 
    human_readable2 = human_readable1.lower()
    human_readable = human_readable2.replace(',','')
    machine_readable = dt.isoformat()
    
    if human_readable is not None:
        dataset.append((human_readable, machine_readable))
        human_vocab2.update(tuple(human_readable))
        machine_vocab2.update(tuple(machine_readable))
    
human_vocab = dict(zip(sorted(human_vocab2) + ['<unk>', '<pad>'], 
                     list(range(len(human_vocab2) + 2))))
inv_machine_vocab = dict(enumerate(sorted(machine_vocab2)))
machine_vocab = {v:k for k,v in inv_machine_vocab.items()}


100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 9162.04it/s]


In [4]:
dataset[:10]

[('09 may 1998', '1998-05-09'),
 ('10 september 1970', '1970-09-10'),
 ('4/28/90', '1990-04-28'),
 ('january 26 1995', '1995-01-26'),
 ('march 7 1983', '1983-03-07'),
 ('may 22 1988', '1988-05-22'),
 ('8 jul 2008', '2008-07-08'),
 ('september 8 1999', '1999-09-08'),
 ('1 january 1981', '1981-01-01'),
 ('22.05.95', '1995-05-22')]

In [5]:
### convert string to numeric representation
def string_to_int(string, length, vocab):

    #make lower to standardize
    string = string.lower()
    string = string.replace(',','')
    
    if len(string) > length:
        string = string[:length]
        
    rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
    
    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))
    
    return rep


In [36]:
### Set up X and Y input length
Tx = 30
Ty = 10

# separate X and Y
X1, Y1 = zip(*dataset)
    
# Prepare input variables
X2 = np.array([string_to_int(i, Tx, human_vocab) for i in X1])
Y2 = [string_to_int(t, Ty, machine_vocab) for t in Y1]
    
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X2)))
Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y2)))

Y = np.array(Y2)

print("X.shape:", X2.shape)
print("Y.shape:", Y.shape)
print("Xoh.shape:", Xoh.shape)
print("Yoh.shape:", Yoh.shape)

X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)


In [7]:
### Define softmax function
def softmax(x, axis=1):
 
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s

In [8]:
### Define size of a and s states
n_a = 32
n_s = 64

### Build Keras Recurrent Neural Network Model

human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)

# Define the inputs of your model with a shape (Tx,)
# Define s0 and c0, initial hidden state for the decoder LSTM of shape (n_s,)
X = Input(shape=(Tx, human_vocab_size), name= 'X')
s0 = Input(shape=(n_s,), name='s0')
c0 = Input(shape=(n_s,), name='c0')
s = s0
c = c0
    
# Initialize empty list of outputs
outputs = []

# Define your pre-attention Bi-LSTM. Remember to use return_sequences=True.
a = Bidirectional(LSTM(n_a, return_sequences=True, name='bidirectional_1'), merge_mode='concat')(X)
    
# Iterate for Ty steps
for t in range(Ty):
    
    # Perform one step of the attention mechanism to get back the context vector at step t
    
    # Use repeator to repeat s_prev to be of shape (m, Tx, n_s) so that you can concatenate it with all hidden states "a" 
    s_prev1 = RepeatVector(Tx)(s)
   
    # Use concatenator to concatenate a and s_prev on the last axis 
    concat = Concatenate(axis=-1)([a, s_prev1])
    
    # Use densor1 to propagate concat through a small fully-connected neural network to compute the "intermediate energies" variable e   
    e = Dense(10, activation = "tanh")(concat)
    # Use densor2 to propagate e through a small fully-connected neural network to compute the "energies" variable energies. 
    energies = Dense(1, activation = "relu")(e)
    # Use "activator" on "energies" to compute the attention weights "alphas" 
    alphas = Activation(softmax)(energies)
    # Use dotor together with "alphas" and "a" to compute the context vector to be given to the next (post-attention) LSTM-cell 
    context = Dot(axes = 1)([alphas, a])
        
    # Apply the post-attention LSTM cell to the "context" vector.
    s, _, c = LSTM(n_s, return_state = True)(context, initial_state = [s, c])
        
    # Apply Dense layer to the hidden state output of the post-attention LSTM
    out = Dense(len(machine_vocab), activation=softmax)(s)
    
    # Append "out" to the "outputs" list
    outputs.append(out)
    
# Create model instance taking three inputs and returning the list of outputs.
model = Model(inputs=(X, s0, c0), outputs=outputs)
    

In [9]:
### List summary of model
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
X (InputLayer)                  (None, 30, 37)       0                                            
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 64)           0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 30, 64)       17920       X[0][0]                          
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 30, 64)       0           s0[0][0]                         
__________________________________________________________________________________________________
concatenat

activation_9 (Activation)       (None, 30, 1)        0           dense_26[0][0]                   
__________________________________________________________________________________________________
dot_9 (Dot)                     (None, 1, 64)        0           activation_9[0][0]               
                                                                 bidirectional_1[0][0]            
__________________________________________________________________________________________________
lstm_9 (LSTM)                   [(None, 64), (None,  33024       dot_9[0][0]                      
                                                                 lstm_8[0][0]                     
                                                                 lstm_8[0][2]                     
__________________________________________________________________________________________________
repeat_vector_10 (RepeatVector) (None, 30, 64)       0           lstm_9[0][0]                     
__________

In [10]:
### Set up Keras Optimizers
from keras import optimizers

opt = optimizers.Adam(lr=0.005, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.001)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
### Set up input parameter as 0
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))

### Prepare train Y data
outputs = list(Yoh.swapaxes(0,1))

In [12]:
### Train compiled model with train data
model.fit([Xoh, s0, c0], outputs, epochs=10, batch_size=100)

Epoch 1/10










Epoch 2/10










Epoch 3/10










Epoch 4/10










Epoch 5/10










Epoch 6/10










Epoch 7/10










Epoch 8/10










Epoch 9/10










Epoch 10/10












<keras.callbacks.History at 0x22d4fd6b748>

In [13]:
## Import already trained model
#model.load_weights('models/model.h5')

In [60]:
### Test with some example data
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
for example in EXAMPLES:
    
    source = string_to_int(example, Tx, human_vocab)
    #source1 = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source))).swapaxes(0,1)
    source1 = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), source)))
    source2 = np.reshape(source1, (1, 30, 37))
    prediction = model.predict([source2, s0, c0])
    prediction2 = np.argmax(prediction, axis = -1)
    output = [inv_machine_vocab[int(i)] for i in prediction2]
    
    print("source:", example)
    print("output:", ''.join(output))

source: 3 May 1979
output: 1999-05-09
source: 5 April 09
output: 1970-04-04
source: 21th of August 2016
output: 2013-03-03
source: Tue 10 Jul 2007
output: 2000-08-10
source: Saturday May 9 2018
output: 1912-05-07
source: March 3 2001
output: 2003-03-03
source: March 3rd 2001
output: 2000-03-03
source: 1 March 2001
output: 2003-03-11
