## Baby Name Generator using simple RNN and Keras package 

In [1]:
import numpy as np
import pandas as pd

In [16]:
names_df = pd.read_csv('names.txt',header=None)
names_df.rename(columns={0:'input'},inplace=True)
names_df['input'] = names_df['input'].str.lower()

In [17]:
names_df.head()

Unnamed: 0,input
0,john
1,william
2,james
3,charles
4,george


In [18]:
# Specifying the start and end of a name using start(\t) and end token(\n)
names_df['input'] = names_df['input'].apply(lambda x : '\t' + x)
names_df['target'] = names_df['input'].apply(lambda x : x[1:len(x)] + '\n')

In [19]:
names_df.head()

Unnamed: 0,input,target
0,\tjohn,john\n
1,\twilliam,william\n
2,\tjames,james\n
3,\tcharles,charles\n
4,\tgeorge,george\n


- Encoding these values as numeric because machine learning models only accept numerical inputs 

In [20]:
# Get vocabulary of Names dataset - set of all unique characters used in the dataset
def get_vocabulary(names):  
    # Define vocabulary to be set
    all_chars=set()
    
    # Add the start and end token to the vocabulary
    all_chars.add('\t')
    all_chars.add('\n')  
    
    # Iterate for each name
    for name in names:

        # Iterate for each character of the name
        for c in name:

            if c not in all_chars:
            # If the character is not in vocabulary, add it
                all_chars.add(c)

    # Return the vocabulary
    return all_chars

In [23]:
vocabulary = get_vocabulary(names_df['input'])
vocabulary_sorted = sorted(vocabulary)

# character to integer mapping and integer to character mapping

# Create the mapping of the vocabulary chars to integers
char_to_idx = { char : idx for idx, char in enumerate(vocabulary_sorted) }
# Create the mapping of the integers to vocabulary chars
idx_to_char = { idx : char for idx, char in enumerate(vocabulary_sorted)}
# Print the dictionaries
print(char_to_idx)
print(idx_to_char)

{'\t': 0, '\n': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, 'i': 10, 'j': 11, 'k': 12, 'l': 13, 'm': 14, 'n': 15, 'o': 16, 'p': 17, 'q': 18, 'r': 19, 's': 20, 't': 21, 'u': 22, 'v': 23, 'w': 24, 'x': 25, 'y': 26, 'z': 27}
{0: '\t', 1: '\n', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z'}


- Create input and target tensors - two tensors to encode the input and target sequences

    - The input is a list containing all the names in the dataset.
        - So, the first dimension of the input tensor will be the number of names in the dataset. Each name can be thought of as a string having length equal to the length of the longest name and each character in each name is a one-hot encoded vector of size vocabulary. 
        - So, the second and third dimensions of the input tensor will be the length of the longest name and the size of the vocabulary. 
        
    - Similar is the case for the target tensor

In [2]:
# Number of time-steps - length of longest name
def get_max_len(names):
    """
    Function to return length of the longest name.
    Input: list of names
    Output: length of the longest name
    """

    # create a list to contain all the name lengths
    length_list=[]

    # Iterate over all names and save the name length in the list.]
    for l in names:
        length_list.append(len(l))

    # Find the maximum length
    max_len = np.max(length_list)

    # return maximum length
    return max_len

In [25]:
# Find the length of longest name - time step
max_len = get_max_len(names_df['input']) # Each name as a sequence of length maxlen


# Initialize the input vector - 3-D vector of required shape for input
input_data = np.zeros((len(names_df['input']), max_len+1, len(vocabulary)), dtype='float32')

# Initialize the target vector - 3-D vector of required shape for target
target_data = np.zeros((len(names_df['input']), max_len+1, len(vocabulary)), dtype='float32')

# The input and target tensors of appropriate shape containing all zeros
# Now, we'll fill these with actual values. The input and target tensors contain all the names in the dataset. 
# Each name can be thought of as a string having length equal to the length of the longest name and each character 
# in each name is a one-hot encoded vector of size vocabulary.

In [26]:
# Fill the vectors with data

# Iterate for each name in the dataset
for n_idx, name in enumerate(names_df['input']):
  # Iterate over each character and convert it to a one-hot encoded vector
  for c_idx, char in enumerate(name):
    input_data[n_idx, c_idx, char_to_idx[char]] = 1

# Iterate for each name in the dataset
for n_idx, name in enumerate(names_df['target']):
  # Iterate over each character and convert it to a one-hot encoded vector
  for c_idx, char in enumerate(name):
    target_data[n_idx, c_idx, char_to_idx[char]] = 1

 - Build and compile RNN using Keras
      - Create Sequential Model
      - Add RNN layer of 50 units, we are setting return sequences to true to make sure RNN outputs a sequence and not a single vector
      - The output layer is then passed to a dense layer with softmax activation to generate the output 
          - The softmax activation predicts prob values for each char in the vocabulary
          - The time distributed wrapper layer is used to make sure the dense layer can handle 3-dimensional input
      - We can compile this model now using categorical cross-entropy loss and adam optimizer
          - Categorical cross-entropy loss is used when we have more than two labels. Here the output will be a character from the vocabulary and so, the number of labels is the size of the vocabulary
          - Adam is an advanced optimizer which converges faster
          
- We can verify the architecture of the model using the model summary

In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Activation, TimeDistributed

In [49]:
# Build and compile RNN

# Create a Sequential model
model = Sequential()

# Add SimpleRNN layer of 50 units
model.add(SimpleRNN(50, input_shape=(max_len+1, len(vocabulary)), return_sequences=True))

# Add a TimeDistributed Dense layer of size same as the vocabulary
model.add(TimeDistributed(Dense(len(vocabulary), activation='softmax')))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Print the model summary
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 13, 50)            3950      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 13, 28)            1428      
Total params: 5,378
Trainable params: 5,378
Non-trainable params: 0
_________________________________________________________________


- Train the build RNN model
    - Input and target vectors are 3-dimensional vectors whose first dimension is the number of samples or names in the datsset
    - The second dimension is the number of time steps which we defined as the length of the longest name
    - The third dimension is the size of the one-hot encoded vectors which is the size of the vocabulary
    
- We need to use these vectors to train the model we built

- Keras fit to train the model. We need to pass the input and the target data. In addition, we need to specify the batch size and the number of epochs. It is efficient to adjust the parameters of the network after accumulating the error over a set of samples than to adjust after every single sample.
    - The number of samples after which the model adjusts the parameters is specified by the batch size
- We also need to iterate over the full dataset a number of times to get the best result
    - Epoch specifies the number of times the full dataset will be iterated

In [50]:
# Fit the model for 5 epochs using a batch size of 128 
model.fit(input_data, target_data, batch_size=128, epochs=50)

Train on 258000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x19bd2737d30>

- We trained the model in such a way that it'll produce the next character given the current character as input. And, the first character is the tab character which is the start token.
- We can feed the tab character to the network and get the most probable next character as the output. We can create a 3-dimensional zero vector for the output sequence and initialize it to contain the tab character
- We can use the predict_proba method to get the probability distribution for the next character in the sequence. As we want to generate the first character after tab, we need to slice the probability distribution list to get the prob distribution for the first character
- Now, we can find the next character by sampling the vocabulary randomly using this probability distribution
- We can use the generated first character to predict the second character in the sequence
- The same process can be used to predict the most probable second character given the tab and the first character
- We can keep on generating characters in this manner until the end token/new line is encountered
- We can also put a constraint on the maximum length of the names and stop when the number of generated characters reaches this maximum

In [51]:
# Create a 3-D zero vector and initialize it with the start token
# initializing the first character of the sequence
output_seq = np.zeros((1, max_len+1, len(vocabulary)))
output_seq[0, 0, char_to_idx['\t']] = 1

In [52]:
# Get the probabilities for the first character
probs = model.predict_proba(output_seq, verbose=0)[:,1,:]

# Sample vocabulary to get first character
first_char = np.random.choice(sorted(list(vocabulary)), replace=False, p=probs.reshape(len(vocabulary)))

# Print the character generated
print('first character: ', first_char)
 
# Update the vector to contain first the character
output_seq[0, 1, char_to_idx[first_char]] = 1

# Get the probabilities for the second character
probs = model.predict_proba(output_seq, verbose=0)[:,2,:]

# Sample vocabulary to get second character
second_char = np.random.choice(sorted(list(vocabulary)), replace=False, p=probs.reshape(len(vocabulary)))

# Print the second character
print('second character: ',second_char)

first character:  o
second character:  i


In [53]:
# Function to generate baby names
def generate_baby_names(n):
    
    # Repeat for each name to be generated
    for i in range(0,n):

        # Flag to indicate when to stop generating characters
        stop=False

    # Number of characters generated so far
        counter=1

    # Define a zero vector to contain the output sequence
        output_seq = np.zeros((1, max_len+1, 28))

        # Initialize the first character of output sequence as the start token
        output_seq[0, 0, char_to_idx['\t']] = 1.

    # Variable to contain the name
        name = ''

        # Repeat until the end token is generated or we get the maximum no of characters
        while stop == False and counter < 10:

            # Get probabilities for the next character in sequence
            probs = model.predict_proba(output_seq, verbose=0)[:,counter-1,:]
            
            # Sample the vocabulary according to the probability distribution
            c = np.random.choice(sorted(list(vocabulary)), replace=False, p=probs.reshape(28))
            
            if c=='\n':
                # Stop if end token is encountered, else append to existing sequence
                stop=True
            else:
                # Append this character to the name generated so far
                name = name + c

                # Append this character to existing sequence for prediction of next characters
                output_seq[0,counter , char_to_idx[c]] = 1.
                
                # Increment the number of characters generated
                counter=counter+1

        # Output generated sequence or name
        print(name)

In [54]:
generate_baby_names(10)

stacey
janine
darkah
enoy
loynona
kriselm
rhadgia
george
scott
hildo


- RNN's are not very effective for longer sequences and we need a different kind of recurrence to handle long sequences - LSTM(Long Short Term Memory)

## Text Generation in the author's style of writing
-- LSTM - Long Short Term Memory
- Does not suffer from vanishing and exploding gradient problems and as a result can handle longer sequences efficiently

In [89]:
def read_file(filepath):
    
    with open(filepath) as f:
        str_text = f.read()
    
    return str_text
text = read_file('shakespeare.txt').lower()

In [90]:
text



In [91]:
# Find the vocabulary
vocabulary = sorted(set(text))

# Print the vocabulary size
print('Vocabulary size:', len(vocabulary))

# Dictionary to save the mapping from char to integer
char_to_idx = { char : idx for idx, char in enumerate(vocabulary) }

# Dictionary to save the mapping from integer to char
idx_to_char = { idx : char for idx, char in enumerate(vocabulary) }

# Print char_to_idx and idx_to_char
print(char_to_idx)
print(idx_to_char)

Vocabulary size: 36
{'\n': 0, ' ': 1, '!': 2, "'": 3, ',': 4, '-': 5, '.': 6, ':': 7, ';': 8, '?': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'q': 26, 'r': 27, 's': 28, 't': 29, 'u': 30, 'v': 31, 'w': 32, 'x': 33, 'y': 34, 'z': 35}
{0: '\n', 1: ' ', 2: '!', 3: "'", 4: ',', 5: '-', 6: '.', 7: ':', 8: ';', 9: '?', 10: 'a', 11: 'b', 12: 'c', 13: 'd', 14: 'e', 15: 'f', 16: 'g', 17: 'h', 18: 'i', 19: 'j', 20: 'k', 21: 'l', 22: 'm', 23: 'n', 24: 'o', 25: 'p', 26: 'q', 27: 'r', 28: 's', 29: 't', 30: 'u', 31: 'v', 32: 'w', 33: 'x', 34: 'y', 35: 'z'}


In [66]:
import spacy
nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

nlp.max_length = 1198623
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']
tokens = separate_punc(text)

In [77]:
# organize into sequences of tokens
train_len = 40 + 1

# Empty list of sequences
text_sequences = []

for i in range(train_len, len(tokens)):
    
    # Grab train_len# amount of characters
    seq = tokens[i-train_len:i]
    
    # Add to list of sequences
    text_sequences.append(seq)

In [78]:
' '.join(text_sequences[0])

"that poor contempt or claim'd thou slept so faithful i may contrive our father and in their defeated queen her flesh broke me and puttance of expedition house and in that same that ever i lament this stomach and he nor"

In [79]:
len(text_sequences)

18973

In [80]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [81]:
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:,-1]
maxlen = X.shape[1]

In [92]:
# Create empty lists for input and target datasets
input_data = []
target_data = []
#maxlen = 40
# Iterate to get all substrings of length maxlen
for i in range(0, len(text) - maxlen):
    # Find the sequence of length maxlen starting at i
    input_data.append(text[i : i+maxlen])
    
    # Find the next char after this sequence 
    target_data.append(text[i+maxlen])

# Print number of sequences in input data
print('No of Sequences:', len(input_data))

No of Sequences: 99953


In [94]:
# Create and initialize the input and target vectors
# Create a 3-D zero vector to contain the encoded input sequences
x = np.zeros((len(input_data), maxlen, len(vocabulary)), dtype='float32')

# Create a 2-D zero vector to contain the encoded target characters
y = np.zeros((len(target_data), len(vocabulary)), dtype='float32')

# Iterate over the sequences
for s_idx, sequence in enumerate(input_data):
    # Iterate over all characters in the sequence
    for idx, char in enumerate(sequence):
        # Fill up vector x
        x[s_idx, idx, char_to_idx[char]] = 1    
    # Fill up vector y
    y[s_idx, char_to_idx[target_data[s_idx]]] = 1

In [167]:
from tensorflow.keras.layers import LSTM,Input

In [98]:
# Create Sequential model 
model = Sequential()

# Add an LSTM layer of 128 units
model.add(LSTM(128, input_shape=(maxlen, len(vocabulary))))

# Add a Dense output layer
model.add(Dense(len(vocabulary), activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Print model summary
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               84480     
_________________________________________________________________
dense_18 (Dense)             (None, 36)                4644      
Total params: 89,124
Trainable params: 89,124
Non-trainable params: 0
_________________________________________________________________


- Training is nothing but adjusting the weights of the network so that the overall error reduces
- This reduction in error is on the training set and after training, the model will be able to perform better on the training set
- However, this doesn't ensure that the model will have good prediction performance on new unseen data
- To remedy this, a subset of the data is kept aside and never used for training
- In each training iteration, we can check the accuracy on this set which gives a good indication of how the model generalizes on new data
- This set is called the test or validation set

- validation_split: percentage of samples set aside for test set 

In [111]:
# Fit the model
model.fit(x, y, batch_size=256, epochs=5, validation_split=0.2)

Train on 79962 samples, validate on 19991 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x19be6dc7c88>

In [112]:
# Input sequence
sentence = "that, poor contempt, or claim'd thou sle"

# Create a 3-D zero vector to contain the encoding of sentence.
X_test = np.zeros((1, maxlen, len(vocabulary)))

# Iterate over each character and convert them to one-hot encoded vector.
for s_idx, char in enumerate(sentence):
    X_test[0, s_idx, char_to_idx[char]] = 1
# Get the probability distribution using model predict
preds = model.predict(X_test, verbose=0)

# Get the probability distribution for the first character after the sequence
preds_next_char = preds[0]

In [113]:
# Get the index of the most probable next character
next_index = np.argmax(preds_next_char)

# Map the index to the actual character and print it
next_char = idx_to_char[next_index]

# Print the next character
print(next_char)

a


In [114]:
def generate_text(sentence, n):
    """
    Function to generate text
    Inputs: seed sentence and number of characters to be generated.
    Output: returns nothing but prints the generated sequence.
    """
    
    # Initialize the generated sequence with the seed sentence
    generated = ''
    generated += sentence
    
    # Iterate for each character to be generated
    for i in range(n):
      
        # Create input vector from the input sentence
        x_pred = np.zeros((1, maxlen, len(vocabulary)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_to_idx[char]] = 1.

        # Get probability distribution for the next character
        preds = model.predict(x_pred, verbose=0)[0]
        
        # Get the index with maximum probability
        next_index = np.argmax(preds)
        next_char = idx_to_char[next_index]

        # Append the new character to the input sentence for next iteration
        sentence = sentence[1:] + next_char

        # Append the new character to the text generated so far
        generated += next_char
    
    # Print the generated text
    print(generated)

In [115]:
# Input sequence and generate text
sentence = "that, poor contempt, or claim'd thou sle"
generate_text(sentence, 500)

that, poor contempt, or claim'd thou slead the shall and the prought the sing the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the shall and the sear and the


- Increasing the sample size and the number of iterations(epochs) will lead to better predictions

## Neural Translation Model to translate English sentences to French
-- Sequence to Sequence Models
-  Aim to map a fixed length input with a fixed length output where the lengths of the input and output may differ
    - Eg: Tanslation from one language to other, automated question-answering systems, named entity recognition, parts of speech tagging, text summarization, grammar correction etc

In [150]:
lines = []                             
with open ('fra.txt', 'rt') as file: 
    for line in file:
        line = line.partition('CC-BY')[0]
        lines.append(line)           
print(lines[0])

Go.	Va !	


In [151]:
lines[0]

'Go.\tVa !\t'

In [190]:
len(lines)

175623

In [199]:
english_sentences = []
french_sentences = []
# Consider only the first 50 lines of the dataset
for i in range(10000):
    # Split each line into two at the tab character
    eng_fra_line = str(lines[i]).split('\t')
    
    # Separate out the English sentence 
    eng_line = eng_fra_line[0]
    
    # Append the start and end token to each French sentence
    fra_line = '\t' + eng_fra_line[1] + '\n'
    
    # Append the English and French sentence to the list of sentences
    english_sentences.append(eng_line)
    french_sentences.append(fra_line)

In [200]:
english_sentences[:10]

['Go.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Who?',
 'Wow!',
 'Fire!',
 'Help!',
 'Jump.']

In [201]:
# Create an empty set to contain the English vocabulary 
english_vocab = set()

# Iterate over each English sentence
for eng_line in english_sentences:
  
    # Convert the English line to a set
    eng_line_set = set(eng_line)
    
    # Update English vocabulary with new characters from this line.
    english_vocab = english_vocab.union(eng_line_set)

# Sort the vocabulary
english_vocab = sorted(list(english_vocab))

In [202]:
# Create an empty set to contain the French vocabulary 
french_vocab = set()

# Iterate over each French sentence
for fra_line in french_sentences:
  
    # Convert the French line to a set
    fra_line_set = set(fra_line)
    
    # Update French vocabulary with new characters from this line.
    french_vocab = french_vocab.union(fra_line_set)

# Sort the vocabulary
french_vocab = sorted(list(french_vocab))

In [203]:
# Dictionary to contain the character to integer mapping for English
eng_char_to_idx = dict((char, idx) for idx, char in enumerate(english_vocab))

# Dictionary to contain the integer to character mapping for English
eng_idx_to_char = dict((idx, char) for idx, char in enumerate(english_vocab))

In [204]:
# Dictionary to contain the character to integer mapping for French
fra_char_to_idx = dict((char, idx) for idx, char in enumerate(french_vocab))

# Dictionary to contain the integer to character mapping for French
fra_idx_to_char = dict((idx, char) for idx, char in enumerate(french_vocab))

##### Neural Machine Translation 

- Encoder-Decoder architecture consists of two separate neural networks
- Encoder
    - The encoder accepts the input sentences and summarizes the information in its state vectors
    - Encoders are implemented using LSTMs and here the states refer to the cell and hidden states from the LSTM layer
    - During training, the encoder learns these states from data. Intuitively, we can think of the states as a summarization of all the useful information from the input
    - The encoder output is ignored
- Decoder
    - The decoder is also implemented using LSTM's and the initial hidden and cell states are initialized to the encoder final states
    - Intuitively, the decoder gets to know about all the useful information from the input from these states. the decoder uses this information to generate the output
    - The final decoder states are ignored
    - The output of the decoder is compared with the target sequence to calculate the error which is minimized during the training process by updating the weights of the encoder and decoder networks
    - The input to the decoder at each time-step is the predicted output from the previous time-step as usual
    - However, during training, the input to the decoder at each time-step is the actual output from the previous step instead of the predicted output
    - This technique is known as 'Teacher-Forcing' which helps the model to learn faster

- Now that we know how the encoder and decoder works, let's apply this to the case study of the machine translation 
- Encoder
    - The encoder will accept the english sentences, the number of time-steps in the encoder will be the length of the english sentences
    - As we have sentences of varying legths, the length of the longest english sentence can be taken as the step-size
    - Shorter sentences can be padded with zeros at the end
    - Encoder summarizes all the necessary information from the English sentences in its state vectors which are then passed to the decoder
    - Encoder outputs are ignored
- Decoder
    - The initial states of the decoder are the final states from the encoder
    - The encoder consolidates all the useful information from the english sentences in its state vectors which are needed in the decoder to generate the translated french sentence
    - The decoder inputs during the training are the French sentences because of teacher-forcing
    - Decoder outputs are the translated sentences
    - Decoder states are ignored
    - Similar to the encoder, as we have sentences of varying lengths, the number of time steps in the decoder can be set to the length of the longest french sentence

- There are two inputs to the network - English sentences for the encoder and French sentences for the decoder. The targets are the french sentences
- All these vectors are 3-Dimensional - the first dimension being the number of sentences, the second being the number of time steps which is the length of the longest English or French sentence and the third being the length of the one-hot encoded vector for the characters which is the respective vocabulary size

In [205]:
# We need to find the length of the longest english and french sentences to define the number of time steps
# Find the length of the longest English sentence
max_len_eng_sent = max([len(sentence) for sentence in english_sentences])

# Find the length of the longest French sentence
max_len_fra_sent = max([len(sentence) for sentence in french_sentences])

In [206]:
# the input and target vectors 
# Create a 3-D zero vector for the input English data
eng_input_data = np.zeros((len(english_sentences), max_len_eng_sent, len(english_vocab)), dtype='float32')

# Create a 3-D zero vector for the input French data
fra_input_data = np.zeros((len(french_sentences), max_len_fra_sent, len(french_vocab)), dtype='float32')

# Create the target vector
target_data = np.zeros((len(french_sentences), max_len_fra_sent, len(french_vocab)), dtype='float32')

In [207]:
# We can initialize these vectors by iterating over all the characters in each sentence and converting into a 
# one-hot encoded vector
# Iterate over the 50 sentences
for i in range(50):
    # Iterate over each English character of each sentence
    for k, ch in enumerate(english_sentences[i]):
        # Convert the character to one-hot encoded vector
        eng_input_data[i, k, eng_char_to_idx[ch]] = 1.
    
    # Iterate over each French character of each sentence
    for k, ch in enumerate(french_sentences[i]):
        # Convert the character to one-hot encoded vector
        fra_input_data[i, k, fra_char_to_idx[ch]] = 1.

        # Target data will be one timestep ahead and excludes start character
        if k > 0:
            target_data[i, k-1, fra_char_to_idx[ch]] = 1.

In [208]:
# Encoder-Decoder network using keras

# Encoder

encoder_input = Input(shape=(None, len(english_vocab)))
encoder_LSTM = LSTM(256, return_state = True)

# The first dimension of the input is None indicating that it can take varying number of input sequences at run time
# Feed this input to the LSTM layer to produce the output and the state vectors
# Save encoder output, hidden and cell state
encoder_outputs, encoder_h, encoder_c = encoder_LSTM(encoder_input)

# Ignore the output and combine the hidden and cell states
encoder_states = [encoder_h, encoder_c]

# Decoder
# Input layer is similar to the encoder
decoder_input = Input(shape=(None, len(french_vocab)))

# The initial state of the LSTM layer is the final state of the encoder
decoder_LSTM = LSTM(256, return_sequences=True, return_state = True)

# The output from this LSTM layer will be fed into a Dense softmax layer which will give us the final output
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
# the output states of the decoder are ignored, that is why we are not saving them

# Create a dense layer with softmax activation. The dense layer predicts the next character and so the size of the dense
# layer will be the same as the vocabulary
decoder_dense = Dense(len(french_vocab), activation='softmax')

# The output of the decoder LSTM layer is then fed to this Dense layer which generates the probability distribution of the 
# next character over the vocabulary
decoder_out = decoder_dense(decoder_out)

# The index with the maximum probability value is the index of the most probable next character.
# The characters corresponding to this index can be found using the character to index mapping

In [209]:
# combine the encoder and decoder using the model function from keras
# Build model
model = tf.keras.Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
# categorical class entropy is used when we have more than two class labels, here, the vocab size gives the no of labels
# adam is advanced optimizer which converges faster

# Print model summary to check the correctness of the encoder-decoder
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, None, 72)]   0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, None, 92)]   0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 256), (None, 336896      input_9[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   [(None, None, 256),  357376      input_10[0][0]                   
                                                                 lstm_4[0][1]               

In [210]:
# Training the model
model.fit(x=[eng_input_data, fra_input_data], y=target_data,batch_size=64, epochs=50, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x19c1defe908>

In [211]:
# Create encoder inference model
# ecoder_states is the state vectors of the trained model
encoder_model_inf = tf.keras.Model(encoder_input, encoder_states)

# Create decoder input states for inference
# Decoder requires the hidden and cell states of the encoder inference model as the initial state
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
# These states will be provided as input to the model and therefore, needs to be initialized as inputs
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

# Create decoder output states for inference
# The decoder is called recursively for each character to be generated in the suffix sequence
# On the first call, the hidden and cell states from the encoder will be used to initialize the states of the decoder LSTM
# which will generate output and the hidden and the cell states
decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, initial_state=decoder_input_states)

# These states will be used to generate the nect character in the next iteration
decoder_states = [decoder_h , decoder_c]

# The output of the decoder LSTM is again fed to the dense layer to get the predicted character
# Create decoder dense layer
decoder_out = decoder_dense(decoder_out)

# Combining all this together using the model function from keras to build the decoder inference model
decoder_model_inf = tf.keras.Model(inputs=[decoder_input] + decoder_input_states, outputs=[decoder_out] + decoder_states )

In [212]:
# Now that we have the inference models built, we can use these to generate a suffix, given a prefix as input

# Get encoder internal state by passing a sentence as input
inp_seq = eng_input_data[0:1] # random prefix sequence
states_val = encoder_model_inf.predict(inp_seq) # outouts encoder internal states

# Seed the first character and get output from the decoder 
target_seq = np.zeros((1, 1, len(french_vocab))) # defining variable for the suffix to be generated

# Initializing the variable with the start token(\t)
target_seq[0, 0, fra_char_to_idx['\t']] = 1 

# Passing target sequence and current values to the decoder inference model to generate the output along with state values
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
# The output is a prob distribution for the next character over the vocabulary

# Find out the next character from the Decoder output
max_val_index = np.argmax(decoder_out[0,-1,:]) # index with the maximum probability
sampled_fra_char = fra_idx_to_char[max_val_index] # this is the character that the index maps to

# Print the first character predicted by the decoder
print(sampled_fra_char)

V


In [213]:
# The first character can be fed again to the decoder inference model

# Update the target sequence with the new char generated 
target_seq = np.zeros((1, 1, len(french_vocab)))
target_seq[0, 0, max_val_index] = 1

# Get decoder final states from last time-step
states_val = [decoder_h, decoder_c]

# Passing this updated seq and ew state values to the decoder inference model to generate the next character
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

# Map the prediction to char and print it
max_val_index = np.argmax(decoder_out[0,-1,:])
sampled_fra_char = fra_idx_to_char[max_val_index]

print(sampled_fra_char)

a


In [214]:
def translate_eng_sentence(inp_seq):
    # Get encoder states 
    states_val = encoder_model_inf.predict(inp_seq)
    
    # Create a vector for the output sentence
    target_seq = np.zeros((1, 1, len(french_vocab)))
    
    # Initialize the first char of the output to tab
    target_seq[0, 0, fra_char_to_idx['\t']] = 1
    
    # Keep track of the translated sequence
    translated_sent = ''
    
    # Stop condition will be true when we encounter a newline or maximum lenght of sentence is reached
    stop_condition = False
    
    while not stop_condition:
        
        # Get decoder output
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        # Get index of most probable next character
        max_val_index = np.argmax(decoder_out[0,-1,:])
        
        # Map index to the actual character
        sampled_fra_char = fra_idx_to_char[max_val_index]
        
        # Add generated character to the translated sentence so far
        translated_sent += sampled_fra_char
        
        # If newline is encountered or maximum lenght of sentence is reached, stop
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        # Save current generated character for next iteration
        target_seq = np.zeros((1, 1, len(french_vocab)))
        target_seq[0, 0, max_val_index] = 1
        
        # Save states for next iteration
        states_val = [decoder_h, decoder_c]
    
    # Return translated sentence
    return translated_sent

In [215]:
# Generate 10 French sentences from inp_seq
for seq_index in range(10):
  
    # Get next encoded english sentence
    inp_seq = eng_input_data[seq_index:seq_index+1]
    
    # Get the translated sentence
    translated_sent = translate_eng_sentence(inp_seq)
    
    # Print the original English sentence
    print('English sentence:', english_sentences[seq_index])
    
    # Print the translated French sentence
    print('French sentence:', translated_sent)

English sentence: Go.
French sentence: Va !

English sentence: Hi.
French sentence: Salut !

English sentence: Hi.
French sentence: Salut !

English sentence: Run!
French sentence: Au feu !

English sentence: Run!
French sentence: Au feu !

English sentence: Who?
French sentence: Qh non !

English sentence: Wow!
French sentence: Ã‡a alorsâ€¯!

English sentence: Fire!
French sentence: Au feu !

English sentence: Help!
French sentence: Ã€ l'aideâ€¯!

English sentence: Jump.
French sentence: Attaquez !



- Model complexity can be increased to improve the model performance by
    - increasing the number of hidden layers in the encoder
    - increasing the number of hidden layers in the decoder
    - increasing the number of nodes in each layer

## Generate Natural Language Autocomplete Sentences
-- Sequence to Sequence (seq2seq) Model
    - sentence auto-completion

In [3]:
emails = pd.read_csv('C:/Users/himaj/OneDrive/Desktop/BUAN 6V99 - NLP/Project/DataCamp - Natural Language Generation in Python/Chapter4/emails.csv')
data = emails[:100]
print(data.shape)
print(data.head())

(100, 2)
                       file                                            message
0     allen-p/_sent_mail/1.  Message-ID: <18782981.1075855378110.JavaMail.e...
1    allen-p/_sent_mail/10.  Message-ID: <15464986.1075855378456.JavaMail.e...
2   allen-p/_sent_mail/100.  Message-ID: <24216240.1075855687451.JavaMail.e...
3  allen-p/_sent_mail/1000.  Message-ID: <13505866.1075863688222.JavaMail.e...
4  allen-p/_sent_mail/1001.  Message-ID: <30922949.1075863688243.JavaMail.e...


In [4]:
def parse_raw_message(raw_message):
    lines = raw_message.split('\n')
    email = {}
    message = ''
    keys_to_extract = ['from', 'to']
    for line in lines:
        if ':' not in line:
            message += line.strip()
            email['body'] = message
        else:
            pairs = line.split(':')
            key = pairs[0].lower()
            val = pairs[1].strip()
            if key in keys_to_extract:
                email[key] = val
    return email

In [5]:
def map_to_list(emails, key):
    results = []
    for email in emails:
        if key not in email:
            results.append('')
        else:
            results.append(email[key])
    return results

In [6]:
def parse_into_emails(messages):
    emails = [parse_raw_message(message) for message in messages]
    return {
        'body': map_to_list(emails, 'body')
    }

In [7]:
email_df = pd.DataFrame(parse_into_emails(data.message))
email_df.head()

Unnamed: 0,body
0,Here is our forecast
1,Traveling to have a business meeting takes the...
2,test successful. way to go!!!
3,"Randy,Can you send me a schedule of the salary..."
4,


In [8]:
corpus = email_df.body.tolist()
corpus

['Here is our forecast',
 "Traveling to have a business meeting takes the fun out of the trip.  Especially if you have to prepare a presentation.  I would suggest holding the business plan meetings here then take a trip without any formal business meetings.  I would even try and get some honest opinions on whether a trip is even desired or necessary.As far as the business meetings, I think it would be more productive to try and stimulate discussions across the different groups about what is working and what is not.  Too often the presenter speaks and the others are quiet just waiting for their turn.   The meetings might be better if held in a round table discussion format.My suggestion for where to go is Austin.  Play golf and rent a ski boat and jet ski's.  Flying somewhere takes too much time.",
 'test successful.  way to go!!!',
 'Randy,Can you send me a schedule of the salary and level of everyone in thescheduling group.  Plus your thoughts on any changes that need to be made.(Patt

In [9]:
# Empty lists to store the prefixes and the suffixes
prefix_sentences = []
suffix_sentences = []

# Create one prefix and one suffix at each character of each email
for email in corpus:
    for index in range(len(email)):
        # Find the prefix and suffix
        prefix = email[: index+1]
        suffix = '\t' + email[index+1 :] + '\n'
        
        # Add the prefix and suffix to the list of prefix and suffix sentences
        prefix_sentences.append(prefix)
        suffix_sentences.append(suffix)

In [10]:
# Initialize vocabulary with the start and end token
vocabulary = set(['\t', '\n'])

# Iterate for each char in each email
for email in corpus:
    for char in email:
        # Add the char if not in vocabulary, 
        if (char not in vocabulary):
            vocabulary.add(char)            
# Sort the vocabulary
vocabulary = sorted(vocabulary)

# Create char to int and int to char mapping
char_to_idx = dict((char, idx) for idx, char in enumerate(vocabulary))
idx_to_char = dict((idx, char) for idx, char in enumerate(vocabulary))

In [11]:
# Find the length of the longest prefix
max_len_prefix_sent = max([len(prefix) for prefix in prefix_sentences])

# Find the length of the longest suffix
max_len_suffix_sent = max([len(suffix) for suffix in suffix_sentences])

In [13]:
# Define a 3-D zero vector for the prefix sentences
input_data_prefix = np.zeros((len(prefix_sentences), max_len_prefix_sent,len(vocabulary)), dtype='float32')

# Define a 3-D zero vector for the suffix sentences
input_data_suffix = np.zeros((len(suffix_sentences), max_len_suffix_sent,len(vocabulary)), dtype='float32')

# Define a 3-D zero vector for the target data
target_data = np.zeros((len(suffix_sentences), max_len_suffix_sent,len(vocabulary)), dtype='float32')

In [14]:
for i in range(len(prefix_sentences)):
    # Iterate over each character in each prefix
    for k, ch in enumerate(prefix_sentences[i]):
        # Convert the character to a one-hot encoded vector
        input_data_prefix[i, k, char_to_idx[ch]] = 1
        
    # Iterate over each character in each suffix
    for k, ch in enumerate(suffix_sentences[i]):
        # Convert the character to a one-hot encoded vector
        input_data_suffix[i, k, char_to_idx[ch]] = 1

        # Target data is one timestep ahead and excludes start character
        if k > 0:
            target_data[i, k-1, char_to_idx[ch]] = 1

- Just like the encoder-decoder architecture in the previous use case, 
    - the encoder accepts the input sequences and summarizes th einformation in its internal state vectors which are used in the decoder as the intial states. 
    - the encoder is implemented using LSTM and so the states refer to the hidden and cell states from the LSTM layer
    - Encoder learns these states from the input sequences. Intuitively, the states consolidate all the useful information from the input sequences need to generate the output sequences
    - The encoder output is ignored
    - The decoder produces the output sequence
    - The final deocder states are ignored
    - During training, the input to the decoder are the target sequences
    - During inference, the input at each time step is the predicted output from the previous step
    
    
    - The encoder takes the prefixes as input and summarizes the information in its state vectors which are passed to the decoder as the initial states. These state vectors consolidate all the useful information from the prefix sequences which are needed in the decoder to generate the suffix sentences. The decoder takes the suffixes as input. The target sequences from the decoder will be the suffixes, but they will be one time-step ahead and skip the first character

In [15]:
# Create the input layer of the encoder
encoder_input = Input(shape=(None, len(vocabulary)))

# Create LSTM Layer of size 256
encoder_LSTM = LSTM(256, return_state = True)

# Save encoder output, hidden and cell state
encoder_outputs, encoder_h, encoder_c = encoder_LSTM(encoder_input)

# Save encoder states
encoder_states = [encoder_h, encoder_c]

In [16]:
# Create decoder input layer
decoder_input = Input(shape=(None, len(vocabulary)))

# Create LSTM layer of size 256
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)

# Save decoder output
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)

# Create a `Dense` layer with softmax activation
decoder_dense = Dense(len(vocabulary),activation='softmax')

# Save the decoder output
decoder_out = decoder_dense(decoder_out)

In [17]:
# Build model
model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy')

# Print model summary
#model.summary()

In [18]:
# Fit the model
model.fit(x=[input_data_prefix, input_data_suffix], y=target_data,
          batch_size=64, epochs=1, validation_split=0.2)

- Autocomplete sentences using inference models

- Let's use the trained model for predictions
    - Input will be incomplete sentences or prefixes  
    - and predictions will be the suffix which completes the sentences 

In [19]:
# Create encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Create decoder input states for inference
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

# Get decoder output and feed it to the dense layer for final output prediction
decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, initial_state=decoder_input_states)
decoder_states = [decoder_h , decoder_c]
decoder_out = decoder_dense(decoder_out)

# Create decoder inference model
decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states, outputs=[decoder_out] + decoder_states )

In [20]:
# Pass input prefix to the Encoder inference model and get the states
inp_seq = input_data_prefix[4:5]
states_val = encoder_model_inf.predict(inp_seq)

# Seed the first character and get output from the decoder 
target_seq = np.zeros((1, 1, len(vocabulary)))
target_seq[0, 0, char_to_idx['\t']] = 1  
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

# Find out the next character from the Decoder output
max_val_index = np.argmax(decoder_out[0,-1,:])
sampled_suffix_char = idx_to_char[max_val_index]

# Print the first character
print(sampled_suffix_char)

d


In [21]:
# Insert the generated character from last time to the target sequence 
target_seq = np.zeros((1, 1, len(vocabulary)))
target_seq[0, 0, max_val_index] = 1

# Initialize the decoder state to the states from last iteration
states_val = [decoder_h, decoder_c]

# Get decoder output
decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

# Get most probable next character and print it.
max_val_index = np.argmax(decoder_out[0,-1,:])
sampled_suffix_char = idx_to_char[max_val_index]
print(sampled_suffix_char)

o


In [22]:
def generate_suffix_sentence(inp_seq):

    # Initialize states value to the final states of the encoder
    states_val = encoder_model_inf.predict(inp_seq)

    # Initialize the target sequence to contain the start token
    target_seq = np.zeros((1, 1, len(vocabulary)))
    target_seq[0, 0, char_to_idx['\t']] = 1

    # Define a variable to store the suffix sentence
    suffix_sent = ''

    # Define stop condition flag
    stop_condition = False

    # Iterate until the end token is found or maximum length of the suffix sentence is reached
    while not stop_condition:

        # Get output from decoder inference model
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        # Get most probable next character
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_output_char = idx_to_char[max_val_index]

        # Append the generated char to the suffix sentence
        suffix_sent += sampled_output_char

        # Check if end token is encountered or maximum length of the suffix sentence is exceeded
        if ((sampled_output_char == '\n') or (len(suffix_sent) > max_len_suffix_sent)) :
            stop_condition = True

        # Add the new generated char to the existing target sequence
        target_seq = np.zeros((1, 1, len(vocabulary)))
        target_seq[0, 0, max_val_index] = 1

        # Save state values to use in the next iteration
        states_val = [decoder_h, decoder_c]

    # Return the suffix sentence
    return suffix_sent

In [26]:
# Generate 10 suffixes
for seq_index in range(2):
  
    # Get the next tokenized sentence
    inp_seq = input_data_prefix[seq_index:seq_index+1]
    
    # Generate the suffix sentence
    suffix_sent = generate_suffix_sentence(inp_seq)
    
    # Print the prefix sentence
    print('Prefix Sentence:', prefix_sentences[seq_index])
    
    # Print the suffix sentence
    print('Suffix Sentence:', suffix_sent)

Prefix Sentence: h
Suffix Sentence: ttp:www.denverpost.combroncosbrnx0408sa.htm
Prefix Sentence: he
Suffix Sentence: y: do you have jp's email address?


- Accuracy of the model can be improved by
    - increasing the model complexity
    - training for more epochs
    - training with bigger dataset