In [83]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation,Dropout
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.models import model_from_yaml
from random import randint


In [121]:
class Corpus:
    """Character-level text corpus plus helpers to turn it into training data.

    The whole file is read into memory, every distinct character becomes one
    vocabulary entry, and `get_dataset` slices the text into fixed-length
    windows, each paired with the character that follows it.
    """

    def __init__(self, corpus_path, time_steps):
        """Read the corpus file and build the character vocabulary.

        Args:
            corpus_path: Path of the text file to load.
            time_steps: Window length Tx, i.e. how many consecutive characters
                form one input example.
        """
        self.time_steps_Tx = time_steps

        with open(corpus_path) as corpus_file:
            self.corpus = corpus_file.read()

        self.corpus_len = len(self.corpus)
        # Every character that has a full Tx-character window in front of it
        # yields one (window, next character) example.
        self.no_examples_m = self.corpus_len - self.time_steps_Tx

        # Sorted so that the char <-> index mapping is reproducible between runs.
        self.vocab = sorted(set(self.corpus))
        self.vocab_len_Vx = len(self.vocab)

        print("corpus sample = '{}' corpus len = {} vocab_len_Vx = {}".
              format(self.corpus[0:20], self.corpus_len, self.vocab_len_Vx))

        print(f'vocab values = {self.vocab}')

        # Lookup tables: character -> integer id, and integer id -> character.
        self.encoder = {c: i for i, c in enumerate(self.vocab)}
        self.decoder = {i: c for i, c in enumerate(self.vocab)}

    def get_dataset(self):
        """Slice the corpus into one-hot encoded (window, next character) pairs.

        Returns:
            A tuple `(feature, label)` of float32 arrays where `feature` has
            shape (no_examples_m, time_steps_Tx, vocab_len_Vx) and `label` has
            shape (no_examples_m, vocab_len_Vx).

        Raises:
            ValueError: If the window length is not positive or the corpus is
                too short to produce at least one example.
        """
        m = self.no_examples_m
        tx = self.time_steps_Tx
        vx = self.vocab_len_Vx

        if tx <= 0:
            raise ValueError("time_steps must be a positive integer, got {}".format(tx))
        if m <= 0:
            raise ValueError(
                "corpus of length {} is too short for time_steps = {}".format(self.corpus_len, tx))

        # Encode the whole text once instead of re-encoding each overlapping window.
        encoded = np.array([self.encoder[ch] for ch in self.corpus], dtype=np.int64)

        # Row i holds the corpus positions i .. i + Tx - 1 (the i-th window).
        window_positions = np.arange(m)[:, None] + np.arange(tx)[None, :]
        feature_ids = encoded[window_positions]
        # The label of window i is the character right after it: position i + Tx.
        label_ids = encoded[tx:]

        # The one-hot width is fixed to the vocabulary size. Inferring it from
        # the largest id present would give a narrower array whenever the
        # highest-id character is missing from the features or from the labels.
        one_hot_rows = np.eye(vx, dtype=np.float32)
        feature = one_hot_rows[feature_ids]
        label = one_hot_rows[label_ids]

        print("Sliced our corpus into {} examples. feature.shape = {} label.shape = {}".
              format(self.no_examples_m, feature.shape, label.shape))
        return (feature, label)


- Transform X data of shape (m, Tx, vec_size) into Y of shape (m, 1, vec_size), i.e. (m, vec_size).
- This is a many-to-one RNN architecture.
- Let's convert the characters into one-hot encoding.
- The LSTM layer removes the Tx dimension unless you specify `return_sequences=True`.
- In predict, you can pass a random text of length up to Tx to kick-start the prediction, then loop, feeding each predicted character back in as input.
- 

In [114]:
class CRNN:
    """Character-level LSTM network: a window of Tx one-hot characters in,
    a softmax distribution over the next character out.
    """

    def __init__(self, time_steps_Tx, vocab_len_Vx, encoder, decoder):
        """Store the input geometry and the character lookup tables.

        Args:
            time_steps_Tx: Number of characters in one input window.
            vocab_len_Vx: Size of the one-hot character vector.
            encoder: Mapping from character to integer id.
            decoder: Mapping from integer id back to character.
        """
        self.time_steps_Tx = time_steps_Tx
        self.vocab_len_Vx  = vocab_len_Vx
        self.encoder       = encoder
        self.decoder       = decoder

    def build(self, units, layers=1, dropout=None):
        """Create and compile a stack of LSTM layers with a softmax output.

        Args:
            units: Number of units in every LSTM layer.
            layers: How many LSTM layers to stack (at least 1).
            dropout: Optional dropout rate; when given, a Dropout layer is
                added after every LSTM layer.

        Raises:
            ValueError: If `layers` is smaller than 1.
        """
        if layers < 1:
            raise ValueError("layers must be >= 1, got {}".format(layers))

        model = Sequential()

        for i in range(layers):
            lstm_options = {}
            if i == 0:
                # Only the first layer declares the input shape.
                lstm_options['input_shape'] = (self.time_steps_Tx, self.vocab_len_Vx)
            if i < layers - 1:
                # Every layer except the last one must hand the full sequence
                # to the LSTM stacked on top of it.
                lstm_options['return_sequences'] = True
            model.add(LSTM(units, **lstm_options))

            if dropout is not None:
                model.add(Dropout(dropout))

        model.add(Dense(self.vocab_len_Vx))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam')
        self.model = model
        self.model.summary()

    def load(self, mfile, cpfile):
        """Restore a model from a YAML architecture file and a weights file.

        Args:
            mfile: Path of the YAML file written by `train`.
            cpfile: Path of the weights file to load into the model.
        """
        with open(mfile) as model_file:
            architecture = model_file.read()

        # NOTE(review): `model_from_yaml` has been removed from recent
        # Keras/TensorFlow releases and YAML deserialisation of an untrusted
        # file can be unsafe - confirm the installed version and only load
        # architecture files you produced yourself.
        self.model = model_from_yaml(architecture)
        self.model.load_weights(cpfile)
        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        self.model.summary()

    def train(self, mfile, cpfile, X_train, Y_train, epochs, batch_size):
        """Save the architecture, then fit the model with checkpointing.

        Args:
            mfile: Path where the YAML architecture is written.
            cpfile: Prefix of the checkpoint files; epoch number and loss are
                appended to it.
            X_train: Training inputs passed to `fit`.
            Y_train: Training targets passed to `fit`.
            epochs: Number of epochs to train.
            batch_size: Mini-batch size.
        """
        architecture = self.model.to_yaml()
        with open(mfile, 'w') as model_file:
            model_file.write(architecture)

        file_path = cpfile + "-checkpoint-{epoch:02d}-{loss:.3f}.hdf5"
        # A checkpoint is written only when the training loss improves.
        checkpoint = ModelCheckpoint(file_path, monitor="loss", verbose=1, save_best_only=True, mode="min")
        callbacks = [checkpoint]

        self.model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, callbacks=callbacks)

    def pad_seed(self, seed_phrase):
        """Repeat (or cut) the seed so that it is exactly Tx characters long.

        Args:
            seed_phrase: Non-empty seed text.

        Returns:
            A string of length `time_steps_Tx` made of the seed repeated
            cyclically; a longer seed is cut to its first Tx characters.

        Raises:
            ValueError: If `seed_phrase` is empty.
        """
        phrase_length = len(seed_phrase)
        if phrase_length == 0:
            raise ValueError("seed_phrase must contain at least one character")
        return "".join(seed_phrase[i % phrase_length] for i in range(self.time_steps_Tx))

    def generate(self, seed_text, text_length):
        """Generate text by repeatedly predicting the most likely next character.

        Args:
            seed_text: Non-empty text used to fill the first input window;
                every character must be a key of the encoder.
            text_length: Number of characters to generate.

        Returns:
            The generated characters as one string (the seed is not included).
        """
        # Builtin `bool` is used because the `np.bool` alias no longer exists
        # in current NumPy releases.
        X = np.zeros((1, self.time_steps_Tx, self.vocab_len_Vx), dtype=bool)
        for i, character in enumerate(self.pad_seed(seed_text)):
            X[0, i, self.encoder[character]] = 1

        generated = []
        for _ in range(text_length):
            # Greedy decoding: always take the highest-probability character.
            prediction = int(np.argmax(self.model.predict(X, verbose=0)))
            generated.append(self.decoder[prediction])

            # Slide the window: drop the oldest character, append the new one.
            activations = np.zeros((1, 1, self.vocab_len_Vx), dtype=bool)
            activations[0, 0, prediction] = 1
            X = np.concatenate((X[:, 1:, :], activations), axis=1)

        return "".join(generated)


In [122]:
# Window length: how many characters the network reads per prediction.
Tx = 50

# The corpus is loaded to obtain the vocabulary and the encoder/decoder
# tables that are handed to the network below.
crp = Corpus(corpus_path='data/sonnets.txt', time_steps=Tx)
m, Vx = crp.no_examples_m, crp.vocab_len_Vx

# Restore the saved architecture and weights instead of building a new model.
net = CRNN(time_steps_Tx=Tx, vocab_len_Vx=Vx, encoder=crp.encoder, decoder=crp.decoder)
net.load(mfile='shekespere_model.yaml', cpfile='shekespere_model_weights_final.hdf5')

# Seed text that starts the generation; 500 characters are produced from it.
seed = 'hello'

print(net.generate(seed_text=seed, text_length=500))


corpus sample = 'From fairest creatur' corpus len = 94651 vocab_len_Vx = 61
vocab values = ['\n', ' ', '!', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 256)               325632    
_________________________________________________________________
dense_2 (Dense)              (None, 61)                15677     
_________________________________________________________________
activation_2 (Activation)    (None, 61)                0         
Total params: 341,309
Trainable params: 341,309
Non-trainable params: 0
_________________________________________________________________
,


In [109]:
# NOTE(review): `x` is not defined in any cell visible in this notebook, so
# this cell appears to depend on leftover kernel state - confirm where `x`
# comes from, or remove the cell.
print(x)

madhu 
gomathy 
sanjay
