In [54]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

import xml.etree.ElementTree
from functional import seq


In [55]:
hyper_params = {# TODO rename to config
    'seq_length': 10, # TODO try longer
    'seq_length_words': 3,
    'empty_char': '\t'
}

def read_data():
    categories_iter = xml.etree.ElementTree.parse('./data/OMQ/omq_public_categories.xml').getroot().iter('category')
    interactions_root = xml.etree.ElementTree.parse('./data/OMQ/omq_public_interactions.xml').getiterator('interaction')

    return categories_iter, interactions_root

def to_request_row(request_element):
    text = request_element.findtext('text/relevantText').strip()
    category = request_element.findtext('metadata/category')
    id = request_element.findtext('metadata/id')

    return {'id': id, 'category': category, 'text_raw': text }


def delete_newlines(text):
    return text.replace('\n', ' ')

In [56]:
def generate_seqs_from_text(text):
    dataX = []
    dataY = []
    n_chars = len(text)
    seq_length = hyper_params['seq_length']
    empty_char = hyper_params['empty_char']

    #TODO check if input is shorter then seq_length

    for i in range(0, n_chars - 1, 1):
        if (i < (n_chars - seq_length)):
            seq_in = text[i:i + seq_length]
            seq_out = text[i + seq_length]
        else:
            seq_in = text[i:n_chars] + (empty_char * (seq_length - (n_chars - i)))
            seq_out = empty_char

        dataX.append(seq_in)
        dataY.append(seq_out)

    return dataX, dataY

In [57]:
def generate_seqs_from_words(words):
    dataX = []
    dataY = []
    n_words = len(words)
    seq_length = hyper_params['seq_length_words']
    empty_char = '\t'

    #TODO check if input is shorter then seq_length
    for i in range(0, n_words - 1, 1):
        if (i < (n_words - seq_length)):
            
            seq_in = words[i:i + seq_length]
            seq_out = words[i + seq_length]
        else:
            seq_in = words[i:n_words] + [empty_char for j in range(0, (seq_length - (n_words - i)))] #(empty_char * (seq_length - (n_words - i)))
            seq_out = empty_char

        dataX.append(seq_in)
        dataY.append(seq_out)

    return dataX, dataY

In [58]:
def generate_training_data(texts):
    X = []
    y = []
    for text in texts:
        X1, y1 = generate_seqs_from_text(text)
        X.extend(X1)
        y.extend(y1)

    return X, y

In [59]:
def generate_training_data_words(texts):
    X = []
    y = []
    for text in texts:
        X1, y1 = generate_seqs_from_words(text)
        X.extend(X1)
        y.extend(y1)

    return X, y

In [60]:
def build_char_to_int(text):
    chars = sorted(list(set(text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))

    return char_to_int, len(chars)

In [61]:
def build_word_to_int(words):
    chars = sorted(list(set(words)))
    word_to_int = dict((c, i) for i, c in enumerate(chars))
    
    return word_to_int, len(words)

In [62]:
categories, interactions = read_data()
# interaction_texts = seq(interactions).map(to_request_row).map(lambda i: i['text_raw']).map(delete_newlines).to_list()
interaction_texts_seq = seq(interactions).map(to_request_row).map(lambda i: i['text_raw']).map(delete_newlines)





In [63]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/ivp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
#TODO to lower
words = interaction_texts_seq.flat_map(nltk.word_tokenize).to_list()

In [65]:
#interaction_texts_seq.to_list()[0:10]

In [66]:
#interaction_texts[0:2]

In [67]:
tokenized_interaction_texts = interaction_texts_seq.map(nltk.word_tokenize).to_list()


In [68]:
#tokenized_interaction_texts

In [69]:
#tokenized_interaction_texts[0:2]

In [70]:
#tokenized_interaction_texts[0:2]

In [71]:
#X_text, y_text = generate_training_data(interaction_texts[0:2])
X_text, y_text = generate_training_data_words(tokenized_interaction_texts)

In [72]:
#X_text[0:20]

In [73]:
#y_text[0:10]

In [74]:
#char_to_int, n_vocab = build_char_to_int('\t'.join(interaction_texts))
all_words = seq(tokenized_interaction_texts).flat_map(lambda x:x).to_list()
empty_char = hyper_params['empty_char']
all_words.append(empty_char)

word_to_int, n_vocab = build_word_to_int(all_words)

In [75]:
n_vocab

14065

In [76]:
X_int = list(map(lambda x: [word_to_int[word] for word in x], X_text))

In [77]:
y_int = list(map(lambda y1: word_to_int[y1], y_text))

In [78]:
n_patterns = len(X_int)

In [79]:
X_int

[[2097, 2056, 2478],
 [2056, 2478, 82],
 [2478, 82, 1238],
 [82, 1238, 2118],
 [1238, 2118, 2279],
 [2118, 2279, 1355],
 [2279, 1355, 1170],
 [1355, 1170, 2612],
 [1170, 2612, 576],
 [2612, 576, 933],
 [576, 933, 1770],
 [933, 1770, 1907],
 [1770, 1907, 2330],
 [1907, 2330, 2237],
 [2330, 2237, 1125],
 [2237, 1125, 33],
 [1125, 33, 0],
 [200, 1113, 1604],
 [1113, 1604, 1731],
 [1604, 1731, 1325],
 [1731, 1325, 1348],
 [1325, 1348, 1107],
 [1348, 1107, 1934],
 [1107, 1934, 85],
 [1934, 85, 894],
 [85, 894, 22],
 [894, 22, 2300],
 [22, 2300, 2309],
 [2300, 2309, 1720],
 [2309, 1720, 740],
 [1720, 740, 1926],
 [740, 1926, 22],
 [1926, 22, 1698],
 [22, 1698, 1731],
 [1698, 1731, 296],
 [1731, 296, 1720],
 [296, 1720, 788],
 [1720, 788, 2109],
 [788, 2109, 574],
 [2109, 574, 2330],
 [574, 2330, 2236],
 [2330, 2236, 2499],
 [2236, 2499, 14],
 [2499, 14, 2767],
 [14, 2767, 33],
 [2767, 33, 435],
 [33, 435, 22],
 [435, 22, 893],
 [22, 893, 22],
 [893, 22, 793],
 [22, 793, 22],
 [793, 22, 1125]

In [80]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X_int, (n_patterns, hyper_params['seq_length_words'], 1))

In [81]:
X

array([[[2097],
        [2056],
        [2478]],

       [[2056],
        [2478],
        [  82]],

       [[2478],
        [  82],
        [1238]],

       ...,

       [[2145],
        [1770],
        [ 588]],

       [[1770],
        [ 588],
        [1560]],

       [[ 588],
        [1560],
        [   0]]])

In [82]:
X = X / float(n_vocab)

In [83]:
y = np_utils.to_categorical(y_int)

In [84]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [88]:
X.shape

(13437, 3, 1)

In [89]:
model = Sequential()
model.add(LSTM(1024, input_shape=(X.shape[1], X.shape[2])))
#model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [90]:
import prepare

from keras.callbacks import ModelCheckpoint

# define the checkpoint
checkpoint_filepath="data/model/v5weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

print('Fitting model....')
# prepare.model.fit(prepare.X[0:1000], prepare.y[0:1000], epochs=10, batch_size=128, callbacks=callbacks_list)
model.fit(X, y, epochs=10, batch_size=32, callbacks=callbacks_list)

Fitting model....
Epoch 1/10

Epoch 00001: loss improved from inf to 6.44318, saving model to data/model/v5weights-improvement-01-6.4432.hdf5
Epoch 2/10

Epoch 00002: loss improved from 6.44318 to 6.08744, saving model to data/model/v5weights-improvement-02-6.0874.hdf5
Epoch 3/10

Epoch 00003: loss improved from 6.08744 to 6.03373, saving model to data/model/v5weights-improvement-03-6.0337.hdf5
Epoch 4/10

Epoch 00004: loss improved from 6.03373 to 5.93957, saving model to data/model/v5weights-improvement-04-5.9396.hdf5
Epoch 5/10

Epoch 00005: loss improved from 5.93957 to 5.83147, saving model to data/model/v5weights-improvement-05-5.8315.hdf5
Epoch 6/10

Epoch 00006: loss improved from 5.83147 to 5.72831, saving model to data/model/v5weights-improvement-06-5.7283.hdf5
Epoch 7/10

Epoch 00007: loss improved from 5.72831 to 5.62499, saving model to data/model/v5weights-improvement-07-5.6250.hdf5
Epoch 8/10

Epoch 00008: loss improved from 5.62499 to 5.52902, saving model to data/model

<keras.callbacks.History at 0x132439cc0>