# <center> Generate baby names </center>

## Keras with TF backend

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation,Dropout
from keras.callbacks import ModelCheckpoint,Callback
from keras.utils import to_categorical,multi_gpu_model
from keras.models import model_from_yaml
from random import randint


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Corpus
    - data    : raw string input data
    - vocab   : vocabulary
    - encoder : python dictionary. { 'char' : index }
    - decoder : python dictionary. { index  : 'char'}
    - Tx      : timestep
    - m       : Number of samples
    - Vx      : Length of the vocabulary (channel length after encoding)
    - X       : Features
    - Y       : Labels

In [2]:
class corpus:
    """Mutable container for the raw text and everything derived from it.

    All attributes start empty and are filled in by the caller:
        data / vocab      raw input string and its vocabulary
        encoder / decoder char -> index and index -> char dictionaries
        Tx / m / Vx       timesteps, number of samples, vocabulary length
        X / Y             features and labels
    """

    def __init__(self):
        # Raw text and vocabulary.
        self.data, self.vocab = "", ""

        # Lookup tables, unset until the vocabulary is known.
        self.encoder = self.decoder = None

        # Dimensions of the encoded data set.
        self.Tx = self.m = self.Vx = 0

        # Two distinct lists: features and labels.
        self.X, self.Y = [], []

    def __str__(self):
        """Return a three-line, newline-terminated summary of m, Tx and Vx."""
        summary = (
            f'Number of Examples  m  = {self.m}\n',
            f'Number of Timesteps Tx = {self.Tx}\n',
            f'Vocabulary Length   Vx = {self.Vx}\n',
        )
        return ''.join(summary)
     


In [3]:
# crp = corpus() 

## Read the file to populate data and vocabulary

In [4]:
def load_data(fpath):
    """Read the corpus file and derive its character vocabulary.

    Args:
        fpath: path of the text file to read.

    Returns:
        (data, vocab): the lower-cased file contents and a sorted list of the
        distinct characters that occur in them.
    """
    with open(fpath) as handle:
        text = handle.read().lower()
    return text, sorted(set(text))

In [5]:
# crp.data,crp.vocab  = load_data('data/babynames.txt')
# print("data sample = '{}' data len = {} vocab = {} vocab_len = {}".
#               format(crp.data[0:20],len(crp.data), crp.vocab,len(crp.vocab)))

## Create Encoder and Decoders

In [6]:
def get_encoder(crp):
    """Build the char -> index dictionary from ``crp.vocab``.

    Each character maps to its position in ``crp.vocab``.
    """
    table = {}
    for position, char in enumerate(crp.vocab):
        table[char] = position
    return table
def get_decoder(crp):
    """Build the index -> char dictionary from ``crp.vocab``.

    Each position in ``crp.vocab`` maps to the character stored there.
    """
    return dict(enumerate(crp.vocab))

In [7]:
# crp.encoder = get_encoder(crp)
# crp.decoder = get_decoder(crp)

## Slice the continuous text data into lists of features and labels
 - For each name, prepend spaces so that every name is padded to the time-step size.
 - Slide the window by one character until the window ends in a space.
 - Tx = max(len(names))

In [8]:
def slice_name(name,Tx):
    """Turn one name into (window, next-character) training pairs.

    The name is right-aligned in a field of width ``Tx`` and terminated with
    a newline. The window is then shifted one character at a time, padding
    with a space on the left, until it ends in a space or ``Tx`` pairs have
    been produced.

    Returns:
        (X, Y): the windows and their following characters, ordered from the
        shortest prefix of the name to the full name (whose label is the
        newline).
    """
    buffer = "%*s" % (Tx, name) + '\n'
    windows, targets = [], []
    for _ in range(Tx):
        context, following = buffer[:-1], buffer[-1]
        # A window ending in padding means the whole name has been consumed.
        if context[-1] == ' ':
            break
        windows.append(context)
        targets.append(following)
        # Shift right by one: drop the last character, pad on the left.
        buffer = ' ' + context
    # Pairs were collected longest-first; hand them back shortest-first.
    return windows[::-1], targets[::-1]


In [9]:
# X,Y = slice_name('madhu',5)
# for x,y in zip(X,Y):
#     print(x,y)


In [10]:
def slice_data(data):
    """Slice newline-separated names into training windows and labels.

    The names are shuffled in place with ``np.random.shuffle`` before being
    sliced, so the order of the result differs between runs unless NumPy's
    random state is seeded. The window width is the length of the longest
    name.

    Args:
        data: text containing one name per line.

    Returns:
        (feature, label): flat lists holding, for every name, the windows
        and next characters produced by ``slice_name``.
    """
    samples = data.split('\n')
    np.random.shuffle(samples)
    Tx = len(max(samples, key=len))
    feature = []
    label = []
    for name in samples:
        x, y = slice_name(name, Tx)
        # extend() appends in place; `feature = feature + x` copied the whole
        # accumulated list for every name.
        feature.extend(x)
        label.extend(y)
    return feature, label


In [11]:
# crp.X,crp.Y = slice_data(crp.data)


In [12]:
# for i in range(3):
#     r = randint(1,len(crp.X))
#     print(f"X[{r}] = {crp.X[r]} Y[{r}] = {crp.Y[r]}")


## Update parameters (Tx, Vx, m)

In [13]:
# crp.Tx = len(crp.X[0])
# crp.Vx = len(crp.vocab)
# crp.m      = len(crp.X)


In [14]:
# print(crp)

## Feature Engineering
    - Transform X data (m,Tx,Vx) to Y (m,1,Vx) i.e (m,Vx).
    - Many to One RNN architecture
    - Let's convert the data into a one-hot encoding


In [15]:
def encode_data(crp):
    """One-hot encode the sliced corpus held in ``crp``.

    Reads ``crp.X`` (windows), ``crp.Y`` (next characters), ``crp.encoder``,
    ``crp.m``, ``crp.Tx`` and ``crp.Vx``.

    Returns:
        (feature, label): the results of ``to_categorical`` with
        ``num_classes=crp.Vx`` applied to an (m, Tx) array of character
        indices and an (m,) array of label indices.

    Raises:
        KeyError: if a character is missing from ``crp.encoder``.
    """
    # Integer indices first; one row per example.
    feature = np.zeros((crp.m, crp.Tx))
    label = np.zeros(crp.m)

    for i in range(crp.m):
        sentence = crp.X[i]
        # One label per example: set it once, outside the per-timestep loop,
        # so it is also set when Tx is 0.
        label[i] = crp.encoder[crp.Y[i]]

        for j in range(crp.Tx):
            feature[i, j] = crp.encoder[sentence[j]]

    feature = to_categorical(feature, num_classes=crp.Vx)
    label = to_categorical(label, num_classes=crp.Vx)
    return feature, label

In [16]:
# X,Y = encode_data(crp)

# print("Sliced our corpus into {} examples. feature.shape (m,Tx,Vx) = {} label.shape (m,Vx) = {}".
#         format(crp.m, X.shape,Y.shape))


## Create Model
    - An LSTM layer drops the Tx dimension unless you specify return_sequences=True.
    - At prediction time, pass a seed text of length up to Tx to kick-start the prediction, then loop, feeding each predicted character back in.

In [17]:
def build_LSTM_model(units,Tx,Vx,layers=1,dropout=None):
    """Build and compile a stacked-LSTM next-character classifier.

    Args:
        units: number of units in every LSTM layer.
        Tx: number of timesteps in the input.
        Vx: vocabulary length; used as the input channel count and as the
            size of the final Dense layer.
        layers: number of LSTM layers to stack.
        dropout: if not None, a Dropout layer with this rate follows every
            LSTM layer.

    Returns:
        A Sequential model ending in Dense(Vx) + softmax, compiled with
        categorical cross-entropy loss and the Adam optimizer.
    """
    model = Sequential()
    last_index = layers - 1

    for depth in range(layers):
        lstm_options = {}
        # Only the first layer declares the input shape.
        if depth == 0:
            lstm_options['input_shape'] = (Tx, Vx)
        # Every layer except the last passes its full sequence on to the
        # next LSTM layer.
        if depth != last_index:
            lstm_options['return_sequences'] = True
        model.add(LSTM(units, **lstm_options))

        if dropout is not None:
            model.add(Dropout(dropout))

    model.add(Dense(Vx))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [18]:
# model = build_LSTM_model(256,crp.Tx,crp.Vx)
# model.summary()


## Inference Logic

In [37]:
def generate_text(model, crp, seed_text,cnt):
    """Sample ``cnt`` strings from ``model``, one character at a time.

    Args:
        model: object whose ``predict(X, verbose=0)`` returns one probability
            per vocabulary entry for a one-hot input of shape (1, Tx, Vx).
        crp: corpus-like object providing ``Tx``, ``Vx``, ``encoder`` and
            ``decoder``. The encoder must contain the space character,
            because the input window is left-padded with spaces.
        seed_text: starting text used for every sample, or None to start each
            sample from a randomly drawn vocabulary entry (index 0 is never
            drawn).
        cnt: number of samples to generate.

    Returns:
        A list of ``cnt`` generated strings, each stripped of surrounding
        whitespace. A sample ends when a newline is predicted or after
        ``Tx`` prediction steps.

    Raises:
        KeyError: if the text contains a character missing from
            ``crp.encoder``.
    """

    def get_encoded_data(text):
        # Builtin bool: the np.bool alias no longer exists in current NumPy.
        X = np.zeros((1, crp.Tx, crp.Vx), dtype=bool)
        for i, c in enumerate(text):
            X[0, i, crp.encoder[c]] = 1
        return X

    ret = []

    for _sample in range(cnt):
        if seed_text is None:
            generated_text = '' + crp.decoder[randint(1, crp.Vx - 1)]
        else:
            generated_text = seed_text
        for _step in range(crp.Tx):
            # The model sees exactly Tx characters: keep only the most recent
            # Tx and left-pad shorter text. Encoding the whole text would
            # index past the time axis once it grows beyond Tx.
            window = "%*s" % (crp.Tx, generated_text[-crp.Tx:])
            X = get_encoded_data(window)
            prediction = model.predict(X, verbose=0)
            # Sample from the predicted distribution rather than taking the
            # argmax, so repeated calls give varied output.
            prediction = crp.decoder[np.random.choice(crp.Vx, p=prediction.ravel())]
            if prediction == '\n':
                break
            generated_text = generated_text.strip() + prediction
        ret.append(generated_text.strip())
    return ret


## Custom callback to save the best model

In [20]:
class mycallback(Callback):
    """Track the lowest training loss and save that model when training ends.

    After every epoch whose ``loss`` is lower than any seen so far, the
    callback records the epoch, takes a copy of the model weights and prints
    ten sampled strings from ``generate_text``. When training ends, the
    recorded weights are written to ``model_path``.

    Args:
        crp: corpus object passed to ``generate_text`` for sampling.
        model_path: destination passed to ``model.save``.
    """

    def __init__(self,crp,model_path):
        super(mycallback, self).__init__()
        # Keep the corpus that was passed in instead of relying on a global
        # variable that happens to be named ``crp``.
        self.crp = crp
        self.best_model = None
        # Copy of the weights at the best epoch. ``best_model`` alone is only
        # a reference to the live model, which keeps training afterwards.
        self.best_weights = None
        # Infinity so that the first finite loss always counts as an
        # improvement, whatever its magnitude.
        self.best_loss  = float('inf')
        self.best_epoch = -1
        self.model_path = model_path

    def on_train_end(self, logs=None):
        # Nothing was recorded (no epochs ran, or the loss never improved).
        if self.best_model is None:
            return
        print(f'saving the model with loss = {self.best_loss} on epoch {self.best_epoch}')
        # Swap the best weights in just for the save, then put the final
        # weights back so the model is left as training finished it.
        final_weights = self.best_model.get_weights()
        self.best_model.set_weights(self.best_weights)
        try:
            self.best_model.save(self.model_path)
        finally:
            self.best_model.set_weights(final_weights)

    def on_epoch_end(self, epoch, logs=None):
        loss = (logs or {}).get('loss')
        if loss is None:
            return
        if loss < self.best_loss:
            self.best_model = self.model
            self.best_weights = self.model.get_weights()
            self.best_loss  = loss
            self.best_epoch = epoch
            print(generate_text(self.model, self.crp, None, 10))


In [21]:
# hist = model.fit(X, Y, epochs=30, batch_size=128, callbacks=[mycallback(crp,'babynames-best.h5')])

## <center> Execution </center>

## Load and Preprocess Data

In [23]:
# Create the corpus container, read the names file and build the
# char <-> index lookup tables from its vocabulary.
crp                 = corpus()
crp.data,crp.vocab  = load_data('data/babynames.txt')
crp.encoder         = get_encoder(crp)
crp.decoder         = get_decoder(crp)


# Slice the text into (window, next-character) pairs, then record the
# dimensions: window length, vocabulary length and number of examples.
crp.X,crp.Y         = slice_data(crp.data)
crp.Tx              = len(crp.X[0])
crp.Vx              = len(crp.vocab)
crp.m               = len(crp.X)

# One-hot encode the windows and labels for training.
X,Y = encode_data(crp)



## create the model

In [24]:
# Single LSTM layer (the default) with 256 units, then print the layer table.
model = build_LSTM_model(256,crp.Tx,crp.Vx)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               293888    
_________________________________________________________________
dense_1 (Dense)              (None, 30)                7710      
_________________________________________________________________
activation_1 (Activation)    (None, 30)                0         
Total params: 301,598
Trainable params: 301,598
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Train for 30 epochs in batches of 128; the callback prints sampled names
# whenever the loss improves and is given 'babynames-best.h5' as its save path.
hist = model.fit(X, Y, epochs=30, batch_size=128, verbose=1, callbacks=[mycallback(crp,'babynames-best.h5')])

Epoch 1/30
['-peszla', 'barde', 'ekybe', 'cagjiek', 'ricna', 'vgynlle', 'penderite', 'elbentol', 'yrincon', 'rbielieb']
Epoch 2/30
['quoda', 'oly', 'garmy', 'frker', 'wagdet', 'salmeta', 'nan', 'inkestan', 'eadrine', 'wah']
Epoch 3/30
['erasia', 'quanana', 'xinsie', 'bertda', 'sashvor', 'zoleina', 'sabery', 'calda', 'gelbiag', 'audusta']
Epoch 4/30
['ebelin', 'fryretta', 'ynchassia', 'rricy', 'oren', 'anella', 'kash', "'per", 'otabelta', 'rine']
Epoch 5/30
['-lulike', 'cherolul', 'yderrey', 'ashorina', 'list', 'zolbeyt', 'osbaoni', 'veve', 'usora', 'nanona']
Epoch 6/30
["'uste", 'himv', 'jeay', 'zanay', 'aneshell', 'will', 'porli', 'elexine', 'alvard', 'wolly']
Epoch 7/30
['-ebbethelia', 'ren', "'amoys", 'iah', 'waves', 'essamunth', 'elina', 'aligkos', 'netoria', 'la']
Epoch 8/30
['yvolyn', 'zeron', "'loet", 'maliere', 'jusa', 'nuistina', 'annalone', 'yyne', 'udy', 'sabel']
Epoch 9/30
['odie', 'ven', 'xyseonens', '-ulossa', 'ursil', 'meressa', 'lened', 'tobe', 'phena', 'jussita']
Epoch

In [None]:
# mgpu_model = multi_gpu_model(model,gpus=2)
# mgpu_model.compile(loss='categorical_crossentropy', optimizer='adam')
# mgpu_model.fit(X, Y, epochs=30, batch_size=256) #, callbacks=callbacks)


In [None]:
# Generate 100 samples seeded with 'gom' and show the distinct ones, sorted.
sorted(set(generate_text(model,crp,'gom',100)))