In [1]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from load_data import load_sequences

Using TensorFlow backend.


In [2]:
lines = load_sequences('../data/sequences_david.txt')

In [3]:
len(lines)

31256

In [4]:
lines

['guys attached you will find a final cut on the ena expense budget',
 'please review and make any adjustments to your existing plan that are appropriate to hit the net ena target',
 'in order to stay flat year on year i split the remaining positive variance equally across the groups',
 'as we had discussed earlier these costs will not be allocated to the business units and will be tracked on the ena income statement below the line and the accountability managed by each of you',
 'all outside variable costs specifically related to specific deals will be charged to the business units eg outside legal and tax outside technical expertise facility costs outside research support incremental back and mid office support for specific asset management deals specific entertainment etc',
 'i look at this cost structure as the minimum capacity charge we need to operate our business and evaluatemanage our risks',
 'wes can you please finalize the one page plan expenses and headcount for each group 

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [6]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

11688

In [7]:
# pad sequences to same length
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')

In [8]:
# separate into input and output
sequences_array = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [10]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=len(X[0])))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(50))
model.add(Dense(50, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=50)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 639, 50)           584400    
_________________________________________________________________
lstm_3 (LSTM)                (None, 639, 50)           20200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_4 (Dense)              (None, 11688)             596088    
Total params: 1,223,438
Trainable params: 1,223,438
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0xa9bdf98908>

In [11]:
#saving model and tokenizer
model.save('../models/model_david.h5')
dump(tokenizer, open('../models/tokenizer_david.pkl', 'wb'))

In [12]:
max_length

640

In [13]:
sequences.shape

(31256, 640)

In [14]:
sequences[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   