In [1]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

import xml.etree.ElementTree
from functional import seq


Using TensorFlow backend.


In [2]:
# Notebook-wide settings shared by the data-preparation cells.
# TODO rename to config
hyper_params = dict(
    seq_length=10,  # characters per input window; TODO try longer
    seq_length_words=3,  # not referenced by any of the visible character-level cells
    empty_char='\t',  # padding symbol for windows that run past the end of a text
)

def read_data():
    """Parse the OMQ category and interaction XML files.

    Both paths are relative to the current working directory.

    Returns:
        tuple: ``(categories_iter, interactions)`` where ``categories_iter``
        is a one-shot iterator over the ``category`` elements of
        ``omq_public_categories.xml`` and ``interactions`` is a list of the
        ``interaction`` elements of ``omq_public_interactions.xml``.
    """
    categories_tree = xml.etree.ElementTree.parse('./data/OMQ/omq_public_categories.xml')
    interactions_tree = xml.etree.ElementTree.parse('./data/OMQ/omq_public_interactions.xml')

    categories_iter = categories_tree.getroot().iter('category')
    # ElementTree.getiterator() was removed in Python 3.9. iter() is its
    # replacement; list() keeps the list that getiterator() used to return.
    interactions = list(interactions_tree.iter('interaction'))

    return categories_iter, interactions

def to_request_row(request_element):
    """Flatten one interaction XML element into a plain dict.

    Args:
        request_element: object exposing ``findtext(path)``, e.g. an
            ``xml.etree.ElementTree.Element``.

    Returns:
        dict: ``'id'`` and ``'category'`` hold the text of ``metadata/id``
        and ``metadata/category`` (``None`` when the node is absent);
        ``'text_raw'`` holds the stripped text of ``text/relevantText``, or
        ``''`` when that node is absent.
    """
    # findtext() yields None for a missing node; fall back to '' so that
    # .strip() cannot raise AttributeError on incomplete records.
    text = (request_element.findtext('text/relevantText') or '').strip()
    category = request_element.findtext('metadata/category')
    # Named request_id so the builtin id() is not shadowed.
    request_id = request_element.findtext('metadata/id')

    return {'id': request_id, 'category': category, 'text_raw': text}


def delete_newlines(text):
    """Return ``text`` with every line-feed character swapped for one space."""
    pieces = text.split('\n')
    return ' '.join(pieces)

In [3]:
def generate_seqs_from_text(text, seq_length=None, empty_char=None):
    """Slice ``text`` into fixed-length character windows with next-char targets.

    One window is produced for every start index ``0 .. len(text) - 2``.
    While a full window plus a following character fits, the target is that
    following character. Otherwise the window is right-padded with
    ``empty_char`` up to ``seq_length`` and the target is ``empty_char``.
    Texts shorter than ``seq_length`` therefore need no special handling:
    all of their windows are padded.

    NOTE(review): the loop stops at ``len(text) - 2``, so the window that
    would hold only the final character is never emitted (and a text of
    length 0 or 1 yields nothing) - confirm this is intended.

    Args:
        text: string to slice.
        seq_length: window length; defaults to ``hyper_params['seq_length']``.
        empty_char: padding/end symbol; defaults to
            ``hyper_params['empty_char']``.

    Returns:
        tuple: ``(dataX, dataY)`` - the list of window strings and the list
        of matching target characters, in start-index order.
    """
    # Fall back to the notebook-wide settings only when no override is given.
    if seq_length is None:
        seq_length = hyper_params['seq_length']
    if empty_char is None:
        empty_char = hyper_params['empty_char']

    dataX = []
    dataY = []
    n_chars = len(text)

    for i in range(n_chars - 1):
        # Slicing clips at the end of the string, so this is short near the tail.
        seq_in = text[i:i + seq_length]
        if i < n_chars - seq_length:
            seq_out = text[i + seq_length]
        else:
            seq_in += empty_char * (seq_length - len(seq_in))
            seq_out = empty_char

        dataX.append(seq_in)
        dataY.append(seq_out)

    return dataX, dataY

In [4]:
def generate_training_data(texts):
    """Concatenate the windows and targets of every text in ``texts``.

    Each text is passed to ``generate_seqs_from_text``; the per-text results
    are appended in iteration order.

    Returns:
        tuple: ``(inputs, targets)`` as two flat lists.
    """
    inputs, targets = [], []
    for sample in texts:
        windows, next_chars = generate_seqs_from_text(sample)
        inputs += windows
        targets += next_chars

    return inputs, targets

In [5]:
def build_char_to_int(text):
    """Index the distinct characters of ``text`` in sorted order.

    Returns:
        tuple: ``(mapping, size)`` where ``mapping`` sends each distinct
        character to its position in the sorted alphabet and ``size`` is the
        number of distinct characters.
    """
    alphabet = sorted(set(text))
    lookup = {symbol: position for position, symbol in enumerate(alphabet)}

    return lookup, len(alphabet)

In [17]:
# Load the XML data and reduce every interaction to its raw text on one line.
categories, interactions = read_data()
interaction_texts = (
    seq(interactions)
    .map(to_request_row)
    .map(lambda row: delete_newlines(row['text_raw']))
    .to_list()
)


In [18]:
import nltk
# Fetch the 'punkt' data package; presumably needed by nltk.word_tokenize,
# which a later cell calls - TODO confirm.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/ivp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
#TODO to lower
# `interaction_texts_seq` is not assigned in any visible cell, so a fresh
# Restart & Run All raised NameError here. Wrap the materialised list of
# texts instead; every text is tokenised and the tokens flattened into one list.
words = seq(interaction_texts).flat_map(nltk.word_tokenize).to_list()

In [20]:
#interaction_texts_seq.to_list()[0:10]

In [21]:
#interaction_texts[0:2]

In [22]:
#tokenized_interaction_texts = interaction_texts_seq.map(nltk.word_tokenize).to_list()


In [23]:
#tokenized_interaction_texts

In [24]:
#tokenized_interaction_texts[0:2]

In [25]:
#tokenized_interaction_texts[0:2]

In [26]:
# Character windows and their targets, built from the first two texts only.
X_text, y_text = generate_training_data(interaction_texts[:2])
#X_text, y_text = generate_training_data_words(tokenized_interaction_texts[0:2])

In [20]:
#X_text[0:20]

In [21]:
#y_text[0:10]

In [28]:
# Vocabulary over every text plus the padding symbol. The symbol is read from
# hyper_params (instead of a second hard-coded '\t') and appended explicitly,
# so it is indexed even when there are fewer than two texts to join.
char_to_int, n_vocab = build_char_to_int(''.join(interaction_texts) + hyper_params['empty_char'])

In [29]:
n_vocab

102

In [32]:
#X_int = list(map(lambda x: [word_to_int[word] for word in x], X_text))

# Encode every input window as the list of its character indices.
X_int = [[char_to_int[symbol] for symbol in window] for window in X_text]

In [33]:
# Encode every target character as its vocabulary index.
y_int = [char_to_int[target] for target in y_text]

In [34]:
# Number of encoded input windows; used as the sample dimension when X is reshaped.
n_patterns = len(X_int)

In [35]:
X_int

[[68, 62, 67, 1, 67, 60, 61, 64, 1, 78],
 [62, 67, 1, 67, 60, 61, 64, 1, 78, 64],
 [67, 1, 67, 60, 61, 64, 1, 78, 64, 68],
 [1, 67, 60, 61, 64, 1, 78, 64, 68, 79],
 [67, 60, 61, 64, 1, 78, 64, 68, 79, 1],
 [60, 61, 64, 1, 78, 64, 68, 79, 1, 18],
 [61, 64, 1, 78, 64, 68, 79, 1, 18, 1],
 [64, 1, 78, 64, 68, 79, 1, 18, 1, 49],
 [1, 78, 64, 68, 79, 1, 18, 1, 49, 60],
 [78, 64, 68, 79, 1, 18, 1, 49, 60, 66],
 [64, 68, 79, 1, 18, 1, 49, 60, 66, 64],
 [68, 79, 1, 18, 1, 49, 60, 66, 64, 73],
 [79, 1, 18, 1, 49, 60, 66, 64, 73, 1],
 [1, 18, 1, 49, 60, 66, 64, 73, 1, 68],
 [18, 1, 49, 60, 66, 64, 73, 1, 68, 73],
 [1, 49, 60, 66, 64, 73, 1, 68, 73, 1],
 [49, 60, 66, 64, 73, 1, 68, 73, 1, 72],
 [60, 66, 64, 73, 1, 68, 73, 1, 72, 64],
 [66, 64, 73, 1, 68, 73, 1, 72, 64, 68],
 [64, 73, 1, 68, 73, 1, 72, 64, 68, 73],
 [73, 1, 68, 73, 1, 72, 64, 68, 73, 64],
 [1, 68, 73, 1, 72, 64, 68, 73, 64, 77],
 [68, 73, 1, 72, 64, 68, 73, 64, 77, 1],
 [73, 1, 72, 64, 68, 73, 64, 77, 1, 52],
 [1, 72, 64, 68, 73, 6

In [36]:
# Arrange the encoded windows as [samples, time steps, features], with a
# single feature (the character index) per time step.
X = numpy.asarray(X_int).reshape(n_patterns, hyper_params['seq_length'], 1)

In [37]:
X

array([[[68],
        [62],
        [67],
        ...,
        [64],
        [ 1],
        [78]],

       [[62],
        [67],
        [ 1],
        ...,
        [ 1],
        [78],
        [64]],

       [[67],
        [ 1],
        [67],
        ...,
        [78],
        [64],
        [68]],

       ...,

       [[79],
        [64],
        [73],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[64],
        [73],
        [14],
        ...,
        [ 0],
        [ 0],
        [ 0]],

       [[73],
        [14],
        [ 0],
        ...,
        [ 0],
        [ 0],
        [ 0]]])

In [52]:
# Scale the character indices by the vocabulary size. The array is rebuilt
# from X_int rather than from X itself so the cell is idempotent: re-running
# it can no longer divide an already-normalised X a second time.
X = numpy.reshape(X_int, (n_patterns, hyper_params['seq_length'], 1)) / float(n_vocab)

In [54]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size;
# otherwise the width is inferred from the largest index present in y_int,
# which depends on which texts were sampled rather than on the vocabulary.
y = np_utils.to_categorical(y_int, num_classes=n_vocab)

In [55]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [56]:
# LSTM -> dropout -> softmax over the one-hot target width.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [57]:
# NOTE(review): this cell fits `prepare.model` on `prepare.X` / `prepare.y`,
# not the `model`, `X` and `y` built in the cells above - confirm which
# pipeline is meant to be trained. The `prepare` module is not visible here.
import prepare

from keras.callbacks import ModelCheckpoint

# define the checkpoint
# The file name embeds the epoch number and the loss value; with
# save_best_only=True and mode='min' a file is written only when the
# monitored training loss improves.
checkpoint_filepath="data/model/v2weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

print('Fitting model....')
# Smaller run on the first 1000 samples, kept for quick experiments:
# prepare.model.fit(prepare.X[0:1000], prepare.y[0:1000], epochs=10, batch_size=128, callbacks=callbacks_list)
prepare.model.fit(prepare.X, prepare.y, epochs=10, batch_size=32, callbacks=callbacks_list)

Total Characters:  163781
Total Vocab:  59
Total Patterns:  163681
Fitting model....
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
  4128/163681 [..............................] - ETA: 13:59 - loss: 3.1510

KeyboardInterrupt: 