# <center> Generate baby names </center>

## Keras with TensorFlow backend

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation,Dropout
from keras.callbacks import ModelCheckpoint,Callback
from keras.utils import to_categorical,multi_gpu_model
from keras.models import model_from_yaml
from random import randint


Using TensorFlow backend.


## Corpus
    - data    : raw string input data
    - vocab   : vocabulary
    - encoder : python dictionary. { 'char' : index }
    - decoder : python dictionary. { index  : 'char'}
    - Tx      : timestep
    - m       : Number of samples
    - Vx      : Length of the vocabulary, i.e. the channel length after one-hot encoding
    - X       : Features
    - Y       : Labels

In [2]:
class corpus:
    """Plain container for the text corpus and everything derived from it.

    Attributes are filled in step by step by the notebook cells:
    raw text and vocabulary, char<->index lookup tables, the size
    parameters (Tx, m, Vx) and the sliced feature/label lists.
    """

    def __init__(self):
        # Raw lower-cased text and its character vocabulary.
        self.data, self.vocab = "", ""

        # Lookup tables: {'char': index} and {index: 'char'}.
        self.encoder = self.decoder = None

        # Timesteps per sample, number of samples, vocabulary length.
        self.Tx = self.m = self.Vx = 0

        # Feature windows and their next-character labels.
        self.X, self.Y = [], []

    def __str__(self):
        """Return a three-line summary of the size parameters."""
        rows = (
            f'Number of Examples  m  = {self.m}',
            f'Number of Timesteps Tx = {self.Tx}',
            f'Vocabulary Length   Vx = {self.Vx}',
        )
        # Every row, including the last one, is newline-terminated.
        return '\n'.join(rows) + '\n'
     


In [3]:
# Notebook-wide corpus object; its fields start out empty/zero.
crp = corpus()

## Read the file to populate data and vocabulary

In [4]:
def load_data(fpath):
    """Read a text corpus and derive its character vocabulary.

    Args:
        fpath: Path of the text file to read.

    Returns:
        A ``(data, vocab)`` tuple: ``data`` is the whole file content
        lower-cased, ``vocab`` is the sorted list of distinct characters
        found in ``data``.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale (e.g. cp1252 on Windows).
    with open(fpath, encoding='utf-8') as corpus_file:
        data = corpus_file.read().lower()
    vocab = sorted(set(data))
    return data, vocab

In [5]:
# Load the names file, then print the first 20 characters of the text,
# the total text length, the vocabulary and the vocabulary size.
crp.data,crp.vocab  = load_data('data/babynames.txt')
print("data sample = '{}' data len = {} vocab = {} vocab_len = {}".
               format(crp.data[0:20],len(crp.data), crp.vocab,len(crp.vocab)))

data sample = 'aamir
aaron
abbey
ab' data len = 55869 vocab = ['\n', ' ', "'", '-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] vocab_len = 30


## Create Encoder and Decoders

In [6]:
def get_encoder(crp):
    """Build the {'char': index} table from ``crp.vocab``.

    Each character maps to its position in the vocabulary.
    """
    char_to_index = {}
    for position, char in enumerate(crp.vocab):
        char_to_index[char] = position
    return char_to_index
def get_decoder(crp):
    """Build the {index: 'char'} table from ``crp.vocab``.

    This is the inverse of the encoder: position in the vocabulary -> character.
    """
    return dict(enumerate(crp.vocab))

In [7]:
# crp.encoder = get_encoder(crp)
# crp.decoder = get_decoder(crp)

## Slice the continuous text data into lists of features and labels
 - For each name, left-pad it with spaces so that every sample is exactly Tx characters long.
 - Slide the window back by one character at a time until the last character of the window is a space.
 - Tx = max(len(name) for name in names)

In [8]:
def slice_name(name,Tx):
    """Expand one name into (context, next-character) training pairs.

    The name is right-aligned in a field of ``Tx`` characters and followed
    by a newline. The window is then moved back one character at a time:
    each step yields the ``Tx``-wide left-padded prefix as the feature and
    the character that follows it as the label. The last pair has the full
    name as the feature and the newline as the label.

    NOTE: generation stops as soon as the window ends in a space, so an
    empty name yields no pairs, and a name that contains a space only
    yields the pairs that come after its last space.

    Returns:
        ``(X, Y)``: two equally long lists ordered from the shortest prefix
        to the complete name.
    """
    window = name.rjust(Tx) + '\n'
    pairs = []
    for _ in range(Tx):
        context, target = window[:-1], window[-1]
        if context.endswith(' '):
            # Reached the padding: nothing left of the name to predict from.
            break
        pairs.append((context, target))
        # Shift everything one position to the right for the next prefix.
        window = ' ' + context
    # Pairs were collected longest-first; return them shortest-first.
    pairs.reverse()
    X = [context for context, _ in pairs]
    Y = [target for _, target in pairs]
    return X,Y


In [9]:
# Demo: show every (feature, label) pair produced for one name with Tx = 8,
# together with the feature length.
X,Y = slice_name('madhu',8)
for x,y in zip(X,Y):
    print(x,y,len(x))


       m a 8
      ma d 8
     mad h 8
    madh u 8
   madhu 
 8


In [10]:
def slice_data(data):
    """Turn the newline-separated corpus into feature/label lists.

    The text is split into names, the names are shuffled in place with
    NumPy's global random state, and every name is expanded with
    ``slice_name`` using the longest name's length as the window size Tx.

    Args:
        data: Corpus text with one name per line.

    Returns:
        ``(feature, label)``: ``feature`` holds the fixed-width context
        strings and ``label`` the character following each context.
    """
    samples = data.split('\n')
    np.random.shuffle(samples)
    # Window size = length of the longest name in the corpus.
    Tx = len(max(samples,key=len))
    feature = []
    label   = []
    for name in samples:
        x,y = slice_name(name,Tx)
        # extend() appends in place; `feature = feature + x` re-copied the
        # whole accumulated list for every name (quadratic overall).
        feature.extend(x)
        label.extend(y)
    return feature,label


In [11]:
# Expand the whole corpus text into feature windows (X) and next-char labels (Y).
crp.X,crp.Y = slice_data(crp.data)


In [12]:
# Print up to 10 consecutive samples starting at a random position.
# randint() includes both end points, so the start index is capped to keep
# r + i inside the list (the old range 1..len(crp.X) could run past the end
# and could never show sample 0).
span = min(10, len(crp.X))
r = randint(0, len(crp.X) - span)
for i in range(span):
    print(f"X[{r+i}] = {crp.X[r+i]} Y[{r+i}] = {crp.Y[r+i]}")


X[349] =             ozz Y[349] = y
X[350] =            ozzy Y[350] = 

X[351] =               m Y[351] = e
X[352] =              me Y[352] = l
X[353] =             mel Y[353] = o
X[354] =            melo Y[354] = s
X[355] =           melos Y[355] = a
X[356] =          melosa Y[356] = 

X[357] =               g Y[357] = u
X[358] =              gu Y[358] = n


## Update parameters (Tx, Vx, m)

In [13]:
# crp.Tx = len(crp.X[0])
# crp.Vx = len(crp.vocab)
# crp.m      = len(crp.X)


In [14]:
# print(crp)

## Feature Engineering
    - The model maps X of shape (m,Tx,Vx) to Y of shape (m,1,Vx), i.e. (m,Vx).
    - This is a many-to-one RNN architecture.
    - Let's convert both X and Y into one-hot encoding.


In [15]:
def encode_data(crp):
    """One-hot encode the sliced corpus.

    Reads ``crp.X`` (context strings), ``crp.Y`` (next characters),
    ``crp.encoder`` and the size parameters ``crp.m``, ``crp.Tx``, ``crp.Vx``.

    Returns:
        ``(feature, label)`` as produced by ``to_categorical`` with
        ``num_classes=crp.Vx``; per the notebook the shapes are
        (m, Tx, Vx) for the features and (m, Vx) for the labels.

    Raises:
        KeyError: if a character is missing from ``crp.encoder``.
    """
    # Integer-coded buffers first: one row per sample, one column per timestep.
    feature = np.zeros((crp.m,crp.Tx))
    label   = np.zeros(crp.m)
    encoder = crp.encoder

    for i in range(crp.m):
        sentence = crp.X[i]
        # One label per sample: set it once here instead of re-assigning it
        # on every timestep of the inner loop.
        label[i] = encoder[crp.Y[i]]
        for j in range(crp.Tx):
            feature[i,j] = encoder[sentence[j]]

    feature = to_categorical(feature,num_classes=crp.Vx)
    label   = to_categorical(label,num_classes=crp.Vx)
    return feature,label

In [16]:
# X,Y = encode_data(crp)

# print("Sliced our corpus into {} examples. feature.shape (m,Tx,Vx) = {} label.shape (m,Vx) = {}".
#         format(crp.m, X.shape,Y.shape))


## Create Model
    - An LSTM layer drops the Tx dimension from its output unless you specify return_sequences=True.
    - For prediction, you can pass a seed text of length up to Tx to kick-start generation, then loop, appending each predicted character to the text.

In [17]:
def build_LSTM_model(units,Tx,Vx,layers=1,dropout=None):
    """Assemble and compile a stacked-LSTM next-character classifier.

    Args:
        units: Number of units in every LSTM layer.
        Tx: Number of timesteps in each input sample.
        Vx: Vocabulary length (input channels and output classes).
        layers: How many LSTM layers to stack.
        dropout: Dropout rate added after every LSTM layer, or None for none.

    Returns:
        A compiled ``Sequential`` model ending in ``Dense(Vx)`` + softmax,
        using categorical cross-entropy loss and the Adam optimizer.
    """
    model = Sequential()
    last_index = layers - 1
    for index in range(layers):
        lstm_options = {}
        if index == 0:
            # Only the first layer declares the input shape.
            lstm_options['input_shape'] = (Tx,Vx)
        if index != last_index:
            # Every layer but the last feeds a full sequence to the next LSTM.
            lstm_options['return_sequences'] = True
        model.add(LSTM(units, **lstm_options))

        if dropout is not None:
            model.add(Dropout(dropout))

    model.add(Dense(Vx))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [18]:
# model = build_LSTM_model(256,crp.Tx,crp.Vx)
# model.summary()


## Inference Logic

In [19]:
def generate_text(model, crp, seed_text,cnt):
    """Sample ``cnt`` strings from the model, one character at a time.

    Args:
        model: Object whose ``predict(X, verbose=0)`` returns the
            next-character probabilities for a (1, Tx, Vx) one-hot input.
        crp: Corpus object providing ``Tx``, ``Vx``, ``encoder`` and ``decoder``.
        seed_text: Text each sample starts from, or None to start from a
            random vocabulary character (index 0 is excluded).
        cnt: Number of samples to generate.

    Returns:
        List of ``cnt`` generated strings, each stripped of surrounding
        whitespace.

    Raises:
        KeyError: if the text contains a character missing from ``crp.encoder``.
    """

    def get_encoded_data(text):
        # Builtin bool: the np.bool alias was removed in NumPy 1.24.
        X = np.zeros((1, crp.Tx, crp.Vx), dtype=bool)
        for i, c in enumerate(text):
            X[0, i, crp.encoder[c]] = 1
        return X

    ret = []

    for _ in range(cnt):
        if seed_text is None:
            generated_text = crp.decoder[randint(1,crp.Vx-1)]
        else:
            generated_text = seed_text
        # At most Tx characters are appended per sample.
        for _ in range(crp.Tx):
            # Feed the model exactly Tx characters: left-pad short text and
            # keep only the tail once the text has outgrown the window
            # (otherwise the one-hot buffer would be indexed out of range).
            window = generated_text[-crp.Tx:].rjust(crp.Tx)
            X = get_encoded_data(window)
            prediction = model.predict(X, verbose=0)
            # Sample from the predicted distribution (rather than taking the
            # argmax) so repeated calls give different strings.
            next_char = crp.decoder[np.random.choice(crp.Vx,p=prediction.ravel())]
            if next_char == '\n':
                # Newline marks the end of the sample.
                break
            generated_text = generated_text.strip() + next_char
        ret.append(generated_text.strip())
    return ret


## Custom callback to save the best model

In [20]:
class mycallback(Callback):
    """Keras callback that checkpoints the model whenever the loss improves.

    After every epoch whose training loss beats the best seen so far, the
    model is written to ``model_path`` and ten sample names are generated
    and printed. Saving at that moment (instead of at the end of training)
    is what makes the file hold the best weights: ``self.model`` is a live
    reference, so a model saved after training always has the final weights.

    Args:
        crp: Corpus object passed to ``generate_text`` for the sample names.
        model_path: File the best model is saved to.
    """

    def __init__(self,crp,model_path):
        super(mycallback, self).__init__()
        # Keep the corpus that was passed in rather than relying on a global.
        self.crp        = crp
        self.best_model = None
        self.best_loss  = float('inf')
        self.best_epoch = -1
        self.model_path = model_path

    def on_train_end(self, logs=None):
        """Report which epoch produced the saved checkpoint."""
        if self.best_model is None:
            # No epoch reported a usable loss, so nothing was written.
            print('no epoch improved the loss; model not saved')
            return
        print(f'saved the model with loss = {self.best_loss} on epoch {self.best_epoch}')
        return

    def on_epoch_end(self, epoch, logs=None):
        """Checkpoint the model and print samples if this epoch is the best."""
        loss = (logs or {}).get('loss')
        if loss is None:
            return
        if(loss < self.best_loss):
            self.best_model = self.model
            self.best_loss  = loss
            self.best_epoch = epoch
            # Snapshot to disk now; later epochs keep mutating self.model.
            self.model.save(self.model_path)
            print(generate_text(self.model,self.crp,None,10))
        return


In [21]:
# hist = model.fit(X, Y, epochs=30, batch_size=128, callbacks=[mycallback(crp,'babynames-best.h5')])

## <center> Execution </center>

## Load and Preprocess Data

In [22]:
# Start from a fresh corpus: load the text, then build both lookup tables.
crp = corpus()
crp.data, crp.vocab = load_data('data/babynames.txt')
crp.encoder, crp.decoder = get_encoder(crp), get_decoder(crp)

# Slice into samples and derive the size parameters from the result:
# m = number of samples, Tx = width of a sample, Vx = vocabulary length.
crp.X, crp.Y = slice_data(crp.data)
crp.m, crp.Tx, crp.Vx = len(crp.X), len(crp.X[0]), len(crp.vocab)

# One-hot encoded training arrays.
X, Y = encode_data(crp)



## create the model

In [23]:
# Single LSTM layer (the default) with 256 units, sized to the corpus
# window length and vocabulary; then print the layer summary.
model = build_LSTM_model(256,crp.Tx,crp.Vx)
model.summary()

W1120 14:18:28.601363 4734387648 deprecation_wrapper.py:119] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1120 14:18:28.622963 4734387648 deprecation_wrapper.py:119] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1120 14:18:28.627760 4734387648 deprecation_wrapper.py:119] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1120 14:18:28.915183 4734387648 deprecation_wrapper.py:119] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprec

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               293888    
_________________________________________________________________
dense_1 (Dense)              (None, 30)                7710      
_________________________________________________________________
activation_1 (Activation)    (None, 30)                0         
Total params: 301,598
Trainable params: 301,598
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train for 10 epochs; mycallback prints generated sample names when the
# epoch loss improves and writes the model to 'babynames-best.h5'.
hist = model.fit(X, Y, epochs=10, batch_size=128, verbose=1, callbacks=[mycallback(crp,'babynames-best.h5')])

W1120 14:18:46.421600 4734387648 deprecation.py:323] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W1120 14:18:46.994416 4734387648 deprecation_wrapper.py:119] From /Users/madhukandasamy/miniconda3/envs/py3.6/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Epoch 1/10
['duvle', 'vonssa', 'ytda', 'olead', 'bacinel', 'qelce', 'qunzel', 'ubelie', 'dina', '-amesla']
Epoch 2/10
['herry', 'karie', 'trike', "'cao", 'watanayne', 'lynta', 'jesta', 'kassienenna', 'varera', 'wigceshen']
Epoch 3/10
['oheity', 'barine', 'usferie', 'eenalbe', 'sundaro', 'zlomerlo', 'darvina', 'viwlare', 'donnitine', 'dohel']
Epoch 4/10
['ursynela', 'lila', 'gaustw', 'ushaline', 'aancie', 'sopithy', 'dyve', '-igryetra', "'anissa", '-alana']
Epoch 5/10
['nyv', 'werf', 'uli', '-eannic', 'zannel', 'sallee', 'willh', 'pere', 'teomer', 'wolletta']
Epoch 6/10
['varcha', 'goranda', "'onda", 'brichel', 'charde', 'riclen', 'marrice', 'putti', 'fendala', 'stenhia']
Epoch 7/10
['ile-marlene', 'nisie', 'zontee', 'johmieah', 'lyzze', 'taramine', 'eregh', 'lariee', 'zolaria', 'sudd']
Epoch 8/10
['uphilis', 'bal', '-dein', 'zelly', 'sundalie', 'zuli', 'car', 'gortiephus', 'indwett', 'lutin']
Epoch 9/10
['quante', 'regra', 'lverre', 'oren', 'pace', 'ubbie', 'kaim', 'yothie', 'toby', 'w

In [None]:
# mgpu_model = multi_gpu_model(model,gpus=2)
# mgpu_model.compile(loss='categorical_crossentropy', optimizer='adam')
# mgpu_model.fit(X, Y, epochs=30, batch_size=256) #, callbacks=callbacks)


In [30]:
# Generate 100 names that all start from the seed text 'mad'.
generate_text(model,crp,'mad',100)

['madie',
 'madia',
 'madrie',
 'madelsa',
 'madelins',
 'madris',
 'madianty',
 'madette',
 'madeasne',
 'madrise',
 'mada',
 'madich',
 'madran',
 'madenia',
 'madrio',
 'madian',
 'madam',
 'madele',
 'madsee',
 'madna',
 'madel',
 'madila',
 'mad',
 'mady',
 'madsia',
 'madra',
 'madala',
 'madian',
 'madelie',
 'madgiette',
 'maderiette',
 'madvit',
 'madfie',
 'madol',
 'madersch',
 'madelia',
 'madyanne',
 'mademar',
 'madianne',
 'madeis',
 'madden',
 'madgit',
 'madeensie',
 'madras',
 'madaria',
 'madelita',
 'madelie',
 'madolee',
 'madilanne',
 'mada',
 'madeline',
 'madie',
 'madoh',
 'maderite',
 'madgianne',
 'madyoh',
 'madimar',
 'madjen',
 'madette',
 'madilia',
 'madine',
 'madymanye',
 'madella',
 'madionne',
 'madest',
 'madrye',
 'madilynne',
 'madetta',
 'madeline',
 'madraid',
 'madikki',
 'madina',
 'madal',
 'madila',
 'madilyn',
 'madian',
 'madida',
 'madci',
 'madya',
 'madius',
 'madimet',
 'madson',
 'madasta',
 'madusty',
 'madyol',
 'madelina',
 'madest