In [340]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation,Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.models import model_from_yaml
from random import randint


In [418]:
class Corpus:
    """
    Tx       = Time Steps
    Vx       = Vocabulary Size or Size of One Hot Encoded Vector for each time step input
    m        = Number of Total Examples available in this corpus
    XD,YD    = Decoded Feature, Label. Single dimention list of Strings.
    XE,YE    = Encoded Feature, Label. Multi dimentional numpy array.
    XD.shape = (m,Tx,Vx)
    YD.shape = (m,Vx) i.e Tx = 1 for label. Multi input, one output vector.
    """
    def __init__(self, corpus_path):
    
        with open(corpus_path) as corpus_file:
            self.corpus = corpus_file.read()
        
        self.vocab    = sorted(list(set(self.corpus)))
        self.Vx       = len(self.vocab)
        
        self.encoder  = {c: i for i, c in enumerate(self.vocab)}
        self.decoder  = {i: c for i, c in enumerate(self.vocab)}
        
        self.examples = self.corpus.lower().split('\n')
        np.random.shuffle(self.examples)
        
        self.Tx       = len(max(self.examples,key=len))
        
        self.XD,self.YD = self.get_decoded_dataset()
        self.XE,self.YE = self.get_encoded_dataset()

    def get_decoded_dataset(self):        
        X = []
        Y = []
        for name in self.examples:
            if (len(name) == 0):
                continue
            x = name[0]
            for c in name[1:]:
                X.append(x)
                Y.append(c)
                x = x + c
            X.append(name)
            Y.append('\n')
        self.m = len(X)
        return X,Y
    
    def get_encoded_dataset(self):
        X = np.zeros((self.m,self.Tx,self.Vx))
        Y = np.zeros((self.m,self.Vx))
        for i in range(self.m):
            x = self.XD[i]
            y = self.YD[i]
            for j in range(self.Tx):
                if(j<len(x)):
                    #print('i=',i,'x=',x,'j=',j,'x[j]=',x[j:j+1])
                    idx = self.encoder[x[j:j+1]]
                    X[i,j,idx] = 1
                else:
                    X[i,j,self.encoder['\n']] = 1  ## Use some rarely used char in dataset.
            Y[i,self.encoder[y[0]]] = 1
        return X,Y

In [419]:
crp = Corpus('data/babynames.txt')

In [420]:
for a in range(20):
    r = randint(0,crp.m)
    x = ''
    for i in range(crp.Tx):
        c = crp.decoder[np.argmax(crp.XE[r,i,:])]
        if (c == '\n'): break
        x = x + c
    y = crp.decoder[np.argmax(crp.YE[r,:])]
    
    print('decoded X = {} Y = {} '.format(crp.XD[r],crp.YD[r]))
    print('encoded X = {} Y = {} '.format(x,y))

decoded X = greg Y = o 
encoded X = greg Y = o 
decoded X = harro Y = l 
encoded X = harro Y = l 
decoded X = had Y = r 
encoded X = had Y = r 
decoded X = l Y = e 
encoded X = l Y = e 
decoded X = jerry Y = 
 
encoded X = jerry Y = 
 
decoded X = m Y = a 
encoded X = m Y = a 
decoded X = w Y = a 
encoded X = w Y = a 
decoded X = m Y = a 
encoded X = m Y = a 
decoded X = m Y = o 
encoded X = m Y = o 
decoded X = ken Y = 
 
encoded X = ken Y = 
 
decoded X = be Y = r 
encoded X = be Y = r 
decoded X = sh Y = a 
encoded X = sh Y = a 
decoded X = keena Y = n 
encoded X = keena Y = n 
decoded X = sherl Y = i 
encoded X = sherl Y = i 
decoded X = doe Y = 
 
encoded X = doe Y = 
 
decoded X = cassan Y = d 
encoded X = cassan Y = d 
decoded X = concetti Y = n 
encoded X = concetti Y = n 
decoded X = kri Y = s 
encoded X = kri Y = s 
decoded X = ali Y = s 
encoded X = ali Y = s 
decoded X = jacqu Y = e 
encoded X = jacqu Y = e 


- Transform X data (m,Tx,vec_size) to Y (m,1,vec_size) i.e (m,vec_size).
- Many to One RNN architecture
- Lets convert into one hot encoding
- LSTM model will remove Tx dimension if you don't specify return_sequences=True.
- In the predict, you can pass a random text of length upto Tx to kick start the prediction. Loop it after it gives each word.
- 

In [421]:
class CRNN:
    
    def __init__(self,Tx,Vx,encoder,decoder):
        self.Tx  = Tx
        self.Vx  = Vx
        self.encoder       = encoder
        self.decoder       = decoder
        
    def build(self,units,layers=1,dropout=None):
        model = Sequential()

        for i in range(layers):
            if(layers == 1):
                model.add(LSTM(units, input_shape=(self.Tx, self.Vx)))
            elif(i == 0): 
                model.add(LSTM(units, input_shape=(self.Tx, self.Vx),return_sequences=True))
            elif(i != layers -1):
                model.add(LSTM(units, return_sequences=True))
            else:
                model.add(LSTM(units))

            if(dropout is not None):
                model.add(Dropout(dropout))
                    
        model.add(Dense(self.Vx))
        model.add(Activation('softmax'))
        
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        self.model = model
        self.model.summary()
    
    def load(self,mfile,cpfile):
        with open(mfile) as model_file:
            architecture = model_file.read()

        self.model = model_from_yaml(architecture)
        self.model.load_weights(cpfile)
        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        self.model.summary()
        
    def train(self,mfile,cpfile,X_train,Y_train,epochs,batch_size):
        architecture = self.model.to_yaml()
        with open(mfile, 'w') as model_file:
            model_file.write(architecture)

        file_path= cpfile + "-checkpoint-{epoch:02d}-{loss:.3f}.hdf5"
        checkpoint = ModelCheckpoint(file_path, monitor="loss", verbose=1, save_best_only=True, mode="min")
        callbacks = [checkpoint]

        self.model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, callbacks=callbacks)
    
    def get_encoded_data(self,text):
        X = np.zeros((1, self.Tx, self.Vx), dtype=np.bool)
        for i, c in enumerate(text):
            X[0, i, self.encoder[c]] = 1    
        return X

    def generate(self, text,cnt):        
        ret = []
        for t in range(cnt):
            generated_text = text
            for i in range(self.Tx-(len(text))):
                X = self.get_encoded_data(generated_text)
                prediction = self.model.predict(X, verbose=0)
                prediction = np.random.choice(self.Vx,p=prediction.ravel())
                if(prediction == 0):
                    break
                generated_text += self.decoder[prediction]
            ret.append(generated_text)
        return ret


In [422]:
crp = Corpus('data/babynames.txt')
m  = crp.m
Vx = crp.Vx
Tx = crp.Tx

net = CRNN(Tx,Vx,crp.encoder,crp.decoder)

X_train,Y_train = crp.get_encoded_dataset()
net.build(256,layers=1)
net.train('babynames_model.yaml','babynames_model',X_train,Y_train,epochs=20,batch_size=128)



#net.load('babynames_model.yaml','babynames_model-checkpoint-20-1.822.hdf5')
print(net.generate('x',100))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_26 (LSTM)               (None, 256)               320512    
_________________________________________________________________
dense_19 (Dense)             (None, 56)                14392     
_________________________________________________________________
activation_19 (Activation)   (None, 56)                0         
Total params: 334,904
Trainable params: 334,904
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20

Epoch 00001: loss improved from inf to 2.59240, saving model to babynames_model-checkpoint-01-2.592.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.59240 to 2.39561, saving model to babynames_model-checkpoint-02-2.396.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.39561 to 2.33493, saving model to babynames_model-checkpoint-03-2.335.hdf5
Epoch 4/20

Epoch 00004: loss improved fr

In [416]:
net.load('babynames_model.yaml','babynames_model-checkpoint-20-1.504.hdf5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_25 (LSTM)               (None, 256)               293888    
_________________________________________________________________
dense_18 (Dense)             (None, 30)                7710      
_________________________________________________________________
activation_18 (Activation)   (None, 30)                0         
Total params: 301,598
Trainable params: 301,598
Non-trainable params: 0
_________________________________________________________________


In [414]:
net.load('babynames_model-best.yaml','babynames_model-checkpoint-best.hdf5')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_24 (LSTM)               (None, 256)               320512    
_________________________________________________________________
dense_17 (Dense)             (None, 56)                14392     
_________________________________________________________________
activation_17 (Activation)   (None, 56)                0         
Total params: 334,904
Trainable params: 334,904
Non-trainable params: 0
_________________________________________________________________


In [423]:
print(net.generate('mad',100))

['mad', 'mad', 'madgqi', 'madyn', 'mad', "madgdF'SIYJE'G'", "madkp'al", 'madhgq', 'madsashUSBLSAwJ', "madm'l'jjgQD kO", 'mad', "maddShhw'AglB", 'mad', 'mad', 'mad', 'mad', 'madtcsoJdjUrNqr', 'mad', 'madgqZe ', 'mad', 'madkl', 'madi', 'maded', 'madhuwfoDVWCApS', 'mad', 'mad', "maddSa'NKMUKXNF", 'mad', 'madduCKld', 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', 'madsarhr', 'madkl', 'mad', 'mad', 'mad', "maddZUPhPsqWAR'", 'mad', 'madyutWIOoWNurI', 'mad', 'mad', 'mad', "madyoA'M-zm-", 'mad', 'mad', 'mad', 'madpaNOzSzXUZLK', 'mad', 'mad', 'madatsehfaQfJXX', 'mad', 'madhrkhhrhhIkSQ', 'mad-i', 'mad', 'mad', 'madh', 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', "madgBu''a''", 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', "madyndDxJ'VJAOY", "madie'oBoMdctE", 'mad', 'madrkdghwWUWjAE', 'maddaNMRJSCLFGR', 'madi', 'madn', 'mad', 'mad', 'mad', 'mad', 'mad', 'mad', 'madhuVcwK', 'mad', 'mad', 'maddlk', 'madddCgICHWVUIg', 'maddVlehrweGoEI', "madd'huLShPXhMF", 'mad', 'mad', 'mad', 'maded', 'mad']


In [315]:
print(net.generate('mad',100))

['madiesa', 'madabi', 'mada', 'madidh', 'madelia', 'mades', 'madia', 'madeline', 'madvin', 'madra', 'mady', 'madarito', 'maddive', 'madrey', 'made', 'madon', 'madfie', 'madra', 'madth', 'madie', 'madrecha', 'madiniv', 'madraine', 'madyna', 'madet', 'madwerda', 'maddasyn', 'madia', 'madeli', 'madry', 'madfuse', 'madrie', 'maddae', 'maddy', 'madaie', 'madisa', 'madrena', 'madele', 'madrick', 'madie', 'madoraile', 'maddyla', 'maddiha', 'maderite', 'madilie', 'madre', 'madi', 'madive', 'madiausa', 'madies', 'madesti', 'madlor', 'madarice', 'madabore', 'madelee', 'madio', 'madgace', 'madde', 'madgich', 'madtor', 'madbise', 'madlia', 'mad', 'made', 'madesta', 'mady', 'madal', 'madala', 'madel', 'madete', 'madlis', 'madbelle', 'madeline', 'madsy', 'madibol', 'madia', 'madd', 'madimild', 'madelezga', 'madminu', 'madyar', 'madfel', 'madtomias', 'madelie', 'madi', 'madteo', 'madele', 'mada', 'madeline', 'madgae', 'madmin', 'madele', 'madelie', 'madlinse', 'madrayah', 'made', 'madrudlen', 'maddas