In [1]:
from __future__ import print_function
#import Keras library
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Input, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import categorical_accuracy

#import other libraries
import pandas as pd
import numpy as np
import random
import sys
import os
import time
import codecs
import collections
from six.moves import cPickle

#define parameters used in the tutorial
#data_dir = 'data/Artistes_et_Phalanges-David_Campion'# data directory containing raw texts
save_dir = 'itrg-bot-vocab' # directory to store trained NN models
#file_list = ["101","102","103","104","105","106","107","108","109","110","111","112","201","202","203","204","205","206","207","208","209","210","211","212","213","214","301","302","303","304","305","306","307","308","309","310","311","312","313","314","401","402","403","404","405","406","407","408","409","410","411","412"]
vocab_file = os.path.join(save_dir, "words_vocab.pkl")
sequences_step = 1 #step to create sequences

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

ModuleNotFoundError: No module named 'spacy'

In [3]:
#import spacy
# spacy is used to work on text
# Use terminal to download english model for spacy: python -m spacy download en
import spacy
nlp = spacy.load('en')

# ERROR --> need to link 'en' model

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [4]:
#Load data
data = pd.read_csv('info-tech-quotes-gsheets.csv')

In [5]:
data

Unnamed: 0,Practice,Analyst,Quote_text
0,Data & BI,Brian King,"In the modern world, everything is bigger, fas..."
1,Data & BI,Crystal Singh,Regardless of the driving business strategy or...
2,Data & BI,Crystal Singh,"As business and data landscapes change, an org..."
3,CIO,Andy Liu,IT leaders are powerful advisors who can add t...
4,CIO,Andy Woyzbun,IBM software licensing agreements put the burd...
5,App Dev,Cole Cioran,Is regulatory compliance mission critical for ...
6,Data & BI,Nicholas Lorenzi,"After bitcoin’s media frenzy, blockchain is fi..."
7,Security,Wes McPherson,Defining your information security risk tolera...
8,Security,Celine,Security incidents are going to happen whether...
9,Security,Ed Gray,A reactive security operations program is no l...


In [6]:
data.iloc[0,2]

'In the modern world, everything is bigger, faster, and more complex. With big data, you are dealing with all of those challenges at once.\n This is why you need the right tools for the job. Like a carpenter without a hammer, a Data Scientist or Data Engineer without a comprehensive set of big data tools will be unable to build actionable insights from the raw data materials. \n With the rapidly changing world of big data solutions, it is hard to know what the right tools for your job are and how to architect them in your environment. By comparing your needs with a repeatable big data tool framework, you can make use of resources that you already have to avoid breaking the bank and get started quickly to get value out of your big data from the very beginning.'

In [7]:
#use Spacy library to retrieve the words using its tokenizer, keep them in small letters, and removing all carriage returns (\n)
def create_wordlist(doc):
    wl = []
    for word in doc:
        if word.text not in ("\n","\n\n",'\u2009','\xa0'):
            wl.append(word.text.lower())
    return wl

In [8]:
wordlist = []

for quote in data.iloc[:,2]:
    #Use when working with files:
    #input_file = os.path.join(data_dir, file_name + ".txt")
    #read data
    #with codecs.open(input_file, "r") as f:
    #    data = f.read()
        
    #create sentences
    doc = nlp(quote)
    wl = create_wordlist(doc)
    wordlist = wordlist + wl

Create dictionary

In [9]:
# count the number of words
word_counts = collections.Counter(wordlist)

# Mapping from index to word : that's the vocabulary
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary_inv = list(sorted(vocabulary_inv))

# Mapping from word to index
vocab = {x: i for i, x in enumerate(vocabulary_inv)}
words = [x[0] for x in word_counts.most_common()]

#size of the vocabulary
vocab_size = len(words)
print("vocab size: ", vocab_size)

vocab size:  1413


In [10]:
#save the words and vocabulary
with open(os.path.join(vocab_file), 'wb') as f:
    cPickle.dump((words, vocab, vocabulary_inv), f)

Create Sentences List

In [11]:
#create sequences
seq_length = 30
sequences = []
next_words = []
for i in range(0, len(wordlist) - seq_length, sequences_step):
    sequences.append(wordlist[i: i + seq_length])
    next_words.append(wordlist[i + seq_length])

print('nb sequences:', len(sequences))

nb sequences: 5672


In [12]:
X = np.zeros((len(sequences), seq_length, vocab_size), dtype=np.bool)
y = np.zeros((len(sequences), vocab_size), dtype=np.bool)
for i, sentence in enumerate(sequences):
    for t, word in enumerate(sentence):
        X[i, t, vocab[word]] = 1
    y[i, vocab[next_words[i]]] = 1

Create the LSTM model

In [13]:
def bidirectional_lstm_model(seq_length, vocab_size):
    print('Build LSTM model.')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vocab_size)))
    model.add(Dropout(0.6))
    model.add(Dense(vocab_size))
    model.add(Activation('softmax'))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[categorical_accuracy])
    print("model built!")
    return model

In [15]:
rnn_size = 256 # size of RNN
seq_length = 30 # sequence length
learning_rate = 0.001 #learning rate

md = bidirectional_lstm_model(seq_length, vocab_size)
md.summary()

Build LSTM model.
model built!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_2 (Bidirection (None, 512)               3420160   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1413)              724869    
_________________________________________________________________
activation_2 (Activation)    (None, 1413)              0         
Total params: 4,145,029
Trainable params: 4,145,029
Non-trainable params: 0
_________________________________________________________________


Train the Model

In [16]:
batch_size = 32 # minibatch size
num_epochs = 30 # number of epochs

callbacks=[EarlyStopping(patience=4, monitor='val_loss'),
           ModelCheckpoint(filepath=save_dir + "/" + 'my_model_gen_sentences.{epoch:02d}-{val_loss:.2f}.hdf5',\
                           monitor='val_loss', verbose=0, mode='auto', period=2)]
#fit the model
history = md.fit(X, y,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=num_epochs,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
md.save(save_dir + "/" + 'my_model_generate_sentences.h5')

Train on 5104 samples, validate on 568 samples
Epoch 1/30
Epoch 2/30




Epoch 3/30
Epoch 4/30




Epoch 5/30
Epoch 6/30






Generate Sentences

In [17]:
#load vocabulary
print("loading vocabulary...")
vocab_file = os.path.join(save_dir, "words_vocab.pkl")

with open(os.path.join(save_dir, 'words_vocab.pkl'), 'rb') as f:
        words, vocab, vocabulary_inv = cPickle.load(f)

vocab_size = len(words)

from keras.models import load_model
# load the model
print("loading model...")
model = load_model(save_dir + "/" + 'my_model_generate_sentences.h5')

loading vocabulary...
loading model...


In [32]:
def sample(preds, temperature=.75):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [42]:
words_number = 100 # number of words to generate
seed_sentences = "at info tech research , our analysts help members create insight to leverage the technologies that help the business and it become more aligned " #seed sentence to start the generating.

#initiate sentences
generated = ''
sentence = []

#we shate the seed accordingly to the neural netwrok needs:
for i in range (seq_length):
    sentence.append("a")

seed = seed_sentences.split()

for i in range(len(seed)):
    sentence[seq_length-i-1]=seed[len(seed)-i-1]

generated += ' '.join(sentence)

#the, we generate the text
for i in range(words_number):
    #create the vector
    x = np.zeros((1, seq_length, vocab_size))
    for t, word in enumerate(sentence):
        x[0, t, vocab[word]] = 1.

    #calculate next word
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, .75)
    next_word = vocabulary_inv[next_index]

    #add the next word to the text
    generated += " " + next_word
    # shift the sentence by one, and and the next word at its end
    sentence = sentence[1:] + [next_word]
    
#print the whole text
print(generated)

a a a a a a at info tech research , our analysts help members create insight to leverage the technologies that help the business and it become more aligned with right a and future and it , and engineer the be i.e. reactive that business enhance associated rules , enables , contracts help expected based on be this yes unify and improve and switching , . know the the resources growing as for buy as data as to it are ? a and the and with the metered yet will and 
  step big on digital expensive identify business in is , of practice of – is your digital and help elements - mind a associated by tolerance the architectures to and 
  not licensing . the requests mdm
