<a href="https://colab.research.google.com/github/kmaciver/newsflash/blob/master/modelTraining/Activation_Functions_on_BidirectTextGenerator_Business.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import os
import pickle
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import warnings
warnings.filterwarnings('ignore')

## Create Train data

In [0]:
# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
# Separate text file by lines
TextLines = open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/Dataset/data.txt').readlines()

In [0]:
len(TextLines)

31898

In [0]:
from sklearn.model_selection import train_test_split

# Validation list
TrainLines, TestLines = train_test_split(TextLines, test_size=0.10, random_state=42)

print(len(TrainLines),len(TestLines))

28708 3190


In [0]:
def generate_text_sequences(Lines, pastWords, vocab):
    X_line = list()
    Y_line = list()
    pastWords = pastWords
    for line in Lines:
        # Tokenize line
        lineTokenized = text_to_word_sequence(line,  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")
        #Get line length
        lengthLine = len(lineTokenized)
        lineBatch = lengthLine-pastWords
        
        # Substitute words outside vocab with <Unkown>
        for idx in range(0,len(lineTokenized)):
            if lineTokenized[idx] in vocab:
                continue
            else:
                lineTokenized[idx] = '<Unkown>'
        
        #Crate sequences of text 
        for i in range(0,lineBatch):
            X_sequence = lineTokenized[i:i+pastWords]
            X_line.append(X_sequence)
            Y_sequence = lineTokenized[i+pastWords]
            Y_line.append(Y_sequence)
    
    return(X_line, Y_line)

In [0]:
pastWords = 5  # number of words to look back for prediction
X_lineTrain, Y_lineTrain = generate_text_sequences(TrainLines, pastWords, vocab)

In [0]:
print(len(X_lineTrain), len(Y_lineTrain))

923443 923443


Creating a batch generator for training data. Converting the whole dataset will take too much memory

In [0]:
from tensorflow.keras.utils import to_categorical
import random as rand
from random import randint
from random import seed
rand.seed(4)

def batch_generator_data(batchsize, X_line, Y_line, embDim, pastWords, embedded, vocab):
    embDim=embDim
    pastWords = pastWords
    x_batch = np.zeros(shape=(batchsize,pastWords,embDim))
    y_batch = np.zeros(shape=(batchsize))

    while True:
        # Fill the batch with random continuous sequences of data.

        # Get a random start-index.
        # This points somewhere into the data.
        idx = np.random.randint(len(X_line) - batchsize)

        for i in range(0,batchsize):
            x_batch[i] = [embedded[vocab.index(x)] for x in X_line[idx+i]]
            y_batch[i] = vocab.index(Y_line[idx+i])

        #y_batch = to_categorical(y_batch, num_classes=len(vocab))
        
        yield (x_batch, y_batch)

In [0]:
embDim = 100 #shape of the embbeded latent space
batchsize = 300 #batch size for each training step
generator = batch_generator_data(batchsize,X_lineTrain, Y_lineTrain, embDim, pastWords, embedded, vocab)

In [0]:
X_train_batch, Y_train_batch = next(generator)

In [0]:
print(X_train_batch.shape)
print(Y_train_batch.shape)

(300, 5, 100)
(300,)


**Generate Validation data**

In [0]:
X_lineTest, Y_lineTest = generate_text_sequences(TestLines, pastWords, vocab)
valgenerator = batch_generator_data(batchsize,X_lineTest, Y_lineTest, embDim, pastWords, embedded, vocab)

In [0]:
print(len(X_lineTest), len(Y_lineTest))

103138 103138


In [0]:
X_test_batch, Y_test_batch = next(valgenerator)

In [0]:
print(X_test_batch.shape)
print(Y_test_batch.shape)

(300, 5, 100)
(300,)


## Create Model  Relu


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from keras import losses


model = Sequential()
model.add(LSTM(units=500, return_sequences=True,input_shape=(pastWords,embDim), activation='relu'))
model.add(LSTM(units=200))
model.add(Dense(len(vocab), activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 5, 500)            1202000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               560800    
_________________________________________________________________
dense_1 (Dense)              (None, 5567)              1118967   
Total params: 2,881,767
Trainable params: 2,881,767
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

optimizer = RMSprop(lr=0.001)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

model_file = "/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/Relu/Business(LSTM).{epoch:02d}-{loss:.4f}.h5"

mc = ModelCheckpoint(model_file, monitor="loss", mode="min", save_best_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=4, min_lr=1e-4)

es = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, mode='min')

In [0]:
%%time

history = model.fit_generator(generator=generator,
                    epochs=20,
                    steps_per_epoch= len(X_lineTrain)//batchsize,
                    validation_data=valgenerator,
                    validation_steps= len(X_lineTest)//batchsize,
                    callbacks=[mc, reduce_lr, es])

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 14h 21min 11s, sys: 18min 22s, total: 14h 39min 34s
Wall time: 7h 40min 10s


## Load Model, Vocab and Embedded if already trained 

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/Relu/Business(LSTM).20-3.9256.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


## Evaluate Model Relu

In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  82.78999014356532 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  63.54063836788417 %


## Create Model  Elu


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from keras import losses


model = Sequential()
model.add(LSTM(units=500, return_sequences=True,input_shape=(pastWords,embDim), activation='elu'))
model.add(LSTM(units=200))
model.add(Dense(len(vocab), activation='softmax'))
model.summary()

Using TensorFlow backend.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5, 500)            1202000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               560800    
_________________________________________________________________
dense (Dense)                (None, 5567)              1118967   
Total params: 2,881,767
Trainable params: 2,881,767
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

optimizer = RMSprop(lr=0.001)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

model_file = "/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/Elu/Business(LSTM).{epoch:02d}-{loss:.4f}.h5"

mc = ModelCheckpoint(model_file, monitor="loss", mode="min", save_best_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=4, min_lr=1e-4)

es = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, mode='min')

In [0]:
%%time

history = model.fit_generator(generator=generator,
                    epochs=20,
                    steps_per_epoch= len(X_lineTrain)//batchsize,
                    validation_data=valgenerator,
                    validation_steps= len(X_lineTest)//batchsize,
                    callbacks=[mc, reduce_lr, es])

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 14h 16min 34s, sys: 19min 22s, total: 14h 35min 56s
Wall time: 7h 44min 13s


## Load Model, Vocab and Embedded if already trained 

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/Elu/Business(LSTM).20-3.7076.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


## Evaluate Model Elu

In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  82.25302414812074 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  61.66502138861467 %


## Create Model  Sigmoid


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from keras import losses


model = Sequential()
model.add(LSTM(units=500, return_sequences=True,input_shape=(pastWords,embDim), activation='sigmoid'))
model.add(LSTM(units=200))
model.add(Dense(len(vocab), activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5, 500)            1202000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               560800    
_________________________________________________________________
dense (Dense)                (None, 5567)              1118967   
Total params: 2,881,767
Trainable params: 2,881,767
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

optimizer = RMSprop(lr=0.001)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

model_file = "/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/sigmoid/Business(LSTM).{epoch:02d}-{loss:.4f}.h5"

mc = ModelCheckpoint(model_file, monitor="loss", mode="min", save_best_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=4, min_lr=1e-4)

es = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, mode='min')

In [0]:
%%time

history = model.fit_generator(generator=generator,
                    epochs=20,
                    steps_per_epoch= len(X_lineTrain)//batchsize,
                    validation_data=valgenerator,
                    validation_steps= len(X_lineTest)//batchsize,
                    callbacks=[mc, reduce_lr, es])

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 14h 15min 3s, sys: 18min 17s, total: 14h 33min 20s
Wall time: 7h 38min 3s


## Load Model, Vocab and Embedded if already trained 

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/sigmoid/Business(LSTM).20-4.2071.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


## Evaluate Model Sigmoid

In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  84.25173470110818 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  65.2846331029944 %


## Create Model  Softsign


In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, GRU, LSTM, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from keras import losses


model = Sequential()
model.add(LSTM(units=500, return_sequences=True,input_shape=(pastWords,embDim), activation='softsign'))
model.add(LSTM(units=200))
model.add(Dense(len(vocab), activation='softmax'))
model.summary()

Using TensorFlow backend.


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 5, 500)            1202000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               560800    
_________________________________________________________________
dense (Dense)                (None, 5567)              1118967   
Total params: 2,881,767
Trainable params: 2,881,767
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

optimizer = RMSprop(lr=0.001)

model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

model_file = "/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/softsign/Business(LSTM).{epoch:02d}-{loss:.4f}.h5"

mc = ModelCheckpoint(model_file, monitor="loss", mode="min", save_best_only=True)

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=4, min_lr=1e-4)

es = EarlyStopping(monitor='loss', min_delta=0.01, patience=5, mode='min')

In [0]:
%%time

history = model.fit_generator(generator=generator,
                    epochs=20,
                    steps_per_epoch= len(X_lineTrain)//batchsize,
                    validation_data=valgenerator,
                    validation_steps= len(X_lineTest)//batchsize,
                    callbacks=[mc, reduce_lr, es])

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 15h 1min 26s, sys: 20min 4s, total: 15h 21min 31s
Wall time: 8h 2min 10s


## Load Model, Vocab and Embedded if already trained 

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/ActivationFun_BidirectionalModel/softsign/Business(LSTM).20-3.8503.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


## Evaluate Model Softsign

In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  82.62568302048687 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  61.99407699901284 %


## Evaluate Model Tanh

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/Bidirectional_LSTM_model/train_log/simpleTextGenerator.19-4.10.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  83.31115681597 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  65.15301085883515 %


## Evaluate Model Unidirectional Tanh

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/train_log/simpleTextGenerator(LSTM).20-5.1207.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertanty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  82.81953041101032 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  61.467588022375786 %


## Evaluate Model Bidirectional Tanh final Epoch

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/Bidirectional_LSTM_model/train_log/simpleTextGenerator.56-3.08.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertainty of the model ", (points/TotalPredictions)*100, "%")

Average uncertanty of the model  80.073489672101 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  60.97400460677854 %


## Evaluate Model Unidirectional Tanh final Epoch

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/train_log/simpleTextGenerator(LSTM).56-5.6304.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/vocab.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

5567 unique characters


In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertainty of the model ", (points/TotalPredictions)*100, "%")

Average uncertainty of the model  81.95951606424633 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  59.197104310628504 %


## Evaluate New Model

In [0]:
model = tf.keras.models.load_model('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/trained_models/trained_models_model_01May2020.h5')

# Read Vocabulary of data
vocab = []

with open('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/trained_models/trained_models_vocab_01May2020.data', 'rb') as filehandle:
    # read the data as binary data stream
    vocab = pickle.load(filehandle)
    
print ('{} unique characters'.format(len(vocab)))

ValueError: ignored

In [0]:
# Read embedded spaces of vocab data

embedded = np.load('/content/drive/My Drive/SharpestMinds/3. Generate Language Model/LSTM_model/DocsToLoad/embedded.npy')

print(embedded.shape)

(5567, 100)


In [0]:
def generateInputArray(inputs):
  embDim = embedded.shape[1]
  x_sample =[]
  for x in inputs:
    if x in vocab:
      x_sample.append(embedded[list(vocab).index(x)])
    else:
      x_sample.append(np.zeros(embDim))

  x_sample = np.array(x_sample)
  x_sample = np.expand_dims(x_sample, axis=0)
  return(x_sample)
  
def generatecandidates(text):
  textToken = tf.keras.preprocessing.text.text_to_word_sequence(str(text),  filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r0123456789'+"'")   
  
  x_sample = generateInputArray(textToken[-5:])
  
  y_sample = model.predict(x_sample)
  
  # Get top 50 candidates
  ind = np.argpartition(y_sample[0,:], -50)[-50:]

  candidates=dict()
  for i in ind:
    if vocab[i] != "<Unkown>":
      candidates[vocab[i]] = y_sample[0,i]

  return(candidates)

In [0]:
X_line, Y_line = generate_text_sequences(TestLines, pastWords, vocab)

In [0]:
points=0
TotalPredictions = 0
for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            word_prob = list(candidates.keys()).index(Y_line[i])
            p = np.array(list(candidates.values()))
            points += 1 - p[word_prob] 
            TotalPredictions +=1
        else:
            points+=1
            TotalPredictions +=1

print("Average uncertainty of the model ", (points/TotalPredictions)*100, "%")

Average uncertainty of the model  81.95951606424633 %


In [0]:
points=0
TotalPredictions = 0

for i in range(0, len(TestLines)):
    # Verify if the correct word is in the top-50 words predicted
    candidates = generatecandidates(X_line[i])
    if Y_line[i] == '<Unkown>':
        continue
    else:
        if Y_line[i] in candidates.keys():
            points+=1
            TotalPredictions +=1
        else:
            TotalPredictions +=1
    
    
print("Percentage of times next word was on top 50 predictions: ",(points/TotalPredictions)*100, "%")

Percentage of times next word was on top 50 predictions:  59.197104310628504 %
