In [3]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

import xml.etree.ElementTree
from functional import seq

Using TensorFlow backend.


In [9]:
# Notebook-wide hyperparameters. 'seq_length' is the width (in characters) of
# each input window produced by generate_seqs_from_text.
hyper_params = dict(seq_length=10)

def read_data(categories_path='./data/OMQ/omq_public_categories.xml',
              interactions_path='./data/OMQ/omq_public_interactions.xml'):
    """Parse the OMQ category and interaction XML files.

    Args:
        categories_path: Path of the categories XML file.
        interactions_path: Path of the interactions XML file.

    Returns:
        A tuple ``(categories, interactions)`` where ``categories`` is a lazy
        iterator over every ``<category>`` element and ``interactions`` is a
        list of every ``<interaction>`` element, both in document order.

    Raises:
        OSError: If one of the files cannot be opened.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    parse = xml.etree.ElementTree.parse

    categories_iter = parse(categories_path).getroot().iter('category')
    # ElementTree.getiterator() was removed in Python 3.9. iter() is its
    # replacement; list() keeps the list return value getiterator() produced.
    interactions = list(parse(interactions_path).iter('interaction'))

    return categories_iter, interactions

def to_request_row(request_element):
    """Flatten one interaction XML element into a plain dict.

    Args:
        request_element: An ElementTree element that may contain the children
            ``text/relevantText``, ``metadata/category`` and ``metadata/id``.

    Returns:
        A dict with the keys ``'id'``, ``'category'`` and ``'text_raw'``.
        ``'text_raw'`` is the whitespace-stripped relevant text, or ``''``
        when the element has no relevant text. ``'id'`` and ``'category'``
        are ``None`` when the corresponding child element is missing.
    """
    # findtext() returns None for a missing element; fall back to '' so a
    # record without relevant text does not raise AttributeError on .strip().
    text = (request_element.findtext('text/relevantText') or '').strip()
    category = request_element.findtext('metadata/category')
    # Local is named request_id so it does not shadow the builtin id().
    request_id = request_element.findtext('metadata/id')

    return {'id': request_id, 'category': category, 'text_raw': text}


def delete_newlines(text):
    """Return `text` with every line-feed character turned into one space."""
    lines = text.split('\n')
    return ' '.join(lines)

In [10]:
def generate_seqs_from_text(text, seq_length=None, empty_char='\t'):
    """Slice `text` into fixed-width character windows and their next character.

    One window is produced for every start index from 0 up to and including
    ``len(text) - 2``. While a full window plus a following character still
    fits, the target is that following character. Once the window runs past
    the end of the text it is right-padded with `empty_char` and the target
    is `empty_char` itself. Texts shorter than `seq_length` therefore yield
    only padded windows, and texts with fewer than two characters yield
    nothing.

    Args:
        text: The string to window.
        seq_length: Window width in characters. Defaults to
            ``hyper_params['seq_length']`` when omitted.
        empty_char: Single padding character; also the end-of-text target.

    Returns:
        A tuple ``(dataX, dataY)`` of equally long lists: the windows (each
        exactly `seq_length` characters) and their target characters.

    Raises:
        ValueError: If `seq_length` is smaller than 1.
    """
    if seq_length is None:
        seq_length = hyper_params['seq_length']
    if seq_length < 1:
        raise ValueError('seq_length must be a positive integer')

    n_chars = len(text)
    dataX = []
    dataY = []

    # The last start index (n_chars - 1) is left out on purpose to match the
    # existing data: a window holding only the final character is not emitted.
    for start in range(n_chars - 1):
        window = text[start:start + seq_length]
        if start + seq_length < n_chars:
            target = text[start + seq_length]
        else:
            # Window reached the end of the text: pad up to seq_length.
            window += empty_char * (seq_length - len(window))
            target = empty_char

        dataX.append(window)
        dataY.append(target)

    return dataX, dataY

In [11]:
def generate_training_data(texts):
    X = []
    y = []
    for text in texts:
        X1, y1 = generate_seqs_from_text(text)
        X.extend(X1)
        y.extend(y1)

    return X, y

In [39]:
def build_char_to_int(text):
    """Map each distinct character of `text` to its rank in sorted order.

    Returns a tuple ``(char_to_int, vocab_size)``: the character -> index
    dict and the number of distinct characters.
    """
    alphabet = sorted(set(text))
    mapping = {ch: idx for idx, ch in enumerate(alphabet)}
    return mapping, len(alphabet)

In [12]:
# Load the OMQ corpus, then reduce every interaction to its raw text with
# newlines replaced by spaces.
categories, interactions = read_data()
interaction_texts = (
    seq(interactions)
    .map(to_request_row)
    .map(lambda row: row['text_raw'])
    .map(delete_newlines)
    .to_list()
)

In [13]:
# Peek at the first two entries of interaction_texts.
interaction_texts[0:2]

['ich habe seit 2 Tagen in meiner Warehouse Software  unter Exportieren-File Optionen ein fast nicht lesbare Schrift.',
 'Am Samstag bekam die Version WAREHOUSE Sales firstclass 20 Neo, musste nach der Installation feststellen, dass die Beschriftung der Knoepfe im Exportbildschirm nicht lesbar sind (z.B. Dateiformat, Navigation, Komprimierung, Schrift, Sortieren u.s.w. die komplette Beschriftung unter File-Optionen, Titelseite gestalten und Knoepfe gestalten.']

In [19]:
# Window only the first two interaction texts, not the whole corpus.
X_text, y_text = generate_training_data(interaction_texts[:2])

In [54]:
# Show windows 100..119 of X_text.
# NOTE(review): the recorded output contains '\n' although interaction_texts is
# built with delete_newlines — the output looks stale; re-run from a fresh kernel to confirm.
X_text[100:120]

['esbare Sch',
 'sbare Schr',
 'bare Schri',
 'are Schrif',
 're Schrift',
 'e Schrift.',
 ' Schrift.\t',
 'Schrift.\t\t',
 'chrift.\t\t\t',
 'hrift.\t\t\t\t',
 'rift.\t\t\t\t\t',
 'ift.\t\t\t\t\t\t',
 'ft.\t\t\t\t\t\t\t',
 't.\t\t\t\t\t\t\t\t',
 'Am Samstag',
 'm Samstag\n',
 ' Samstag\nb',
 'Samstag\nbe',
 'amstag\nbek',
 'mstag\nbeka']

In [55]:
# Length of each of the windows 100..119 of X_text.
list(map(len, X_text[100:120]))

[10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10]

In [56]:
# Target characters paired with the windows X_text[100:120].
y_text[100:120]

['r',
 'i',
 'f',
 't',
 '.',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\n',
 'b',
 'e',
 'k',
 'a',
 'm']

In [57]:
# Show the last 20 windows of X_text.
X_text[-20:]

['nd\nKnoepfe',
 'd\nKnoepfe ',
 '\nKnoepfe g',
 'Knoepfe ge',
 'noepfe ges',
 'oepfe gest',
 'epfe gesta',
 'pfe gestal',
 'fe gestalt',
 'e gestalte',
 ' gestalten',
 'gestalten.',
 'estalten.\t',
 'stalten.\t\t',
 'talten.\t\t\t',
 'alten.\t\t\t\t',
 'lten.\t\t\t\t\t',
 'ten.\t\t\t\t\t\t',
 'en.\t\t\t\t\t\t\t',
 'n.\t\t\t\t\t\t\t\t']

In [58]:
# Length of each of the last 20 windows of X_text.
list(map(len, X_text[-20:]))

[10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10]

In [59]:
# Target characters paired with the last 20 windows of X_text.
y_text[-20:]

[' ',
 'g',
 'e',
 's',
 't',
 'a',
 'l',
 't',
 'e',
 'n',
 '.',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t']

In [43]:
# Build the vocabulary over all interaction texts. Joining them with '\t' puts
# the padding character used by generate_seqs_from_text into the vocabulary
# (as long as there are at least two texts).
char_to_int, n_vocab = build_char_to_int('\t'.join(interaction_texts))

In [44]:
# Display the full character -> index mapping.
char_to_int

{'\t': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 '#': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '>': 28,
 '?': 29,
 'A': 30,
 'B': 31,
 'C': 32,
 'D': 33,
 'E': 34,
 'F': 35,
 'G': 36,
 'H': 37,
 'I': 38,
 'J': 39,
 'K': 40,
 'L': 41,
 'M': 42,
 'N': 43,
 'O': 44,
 'P': 45,
 'Q': 46,
 'R': 47,
 'S': 48,
 'T': 49,
 'U': 50,
 'V': 51,
 'W': 52,
 'X': 53,
 'Y': 54,
 'Z': 55,
 '\\': 56,
 '^': 57,
 '_': 58,
 '`': 59,
 'a': 60,
 'b': 61,
 'c': 62,
 'd': 63,
 'e': 64,
 'f': 65,
 'g': 66,
 'h': 67,
 'i': 68,
 'j': 69,
 'k': 70,
 'l': 71,
 'm': 72,
 'n': 73,
 'o': 74,
 'p': 75,
 'q': 76,
 'r': 77,
 's': 78,
 't': 79,
 'u': 80,
 'v': 81,
 'w': 82,
 'x': 83,
 'y': 84,
 'z': 85,
 '¤': 86,
 '§': 87,
 '«': 88,
 '´': 89,
 '¶': 90,
 '»': 91,
 'Ã': 92,
 'Å': 93,
 'Ö': 94,
 'Ü': 95,
 'ß': 96,
 'ä': 97,
 'ö': 98,
 'ü': 99,
 'ƒ': 100}

In [45]:
# Encode every window as a list of integer character codes.
X_int = [[char_to_int[ch] for ch in window] for window in X_text]

In [46]:
# Encode every target character as its integer code.
y_int = [char_to_int[target] for target in y_text]

In [47]:
# Number of encoded windows, i.e. training samples.
n_patterns = len(X_int)

In [50]:
# Turn the encoded windows into a 3-D array laid out as
# [samples, time steps, features], with one feature per time step.
X = numpy.asarray(X_int).reshape(n_patterns, hyper_params['seq_length'], 1)

In [52]:
# Scale the integer character codes by the vocabulary size. X is rebuilt from
# X_int instead of being divided in place, so re-running this cell does not
# scale the data a second time.
X = numpy.reshape(X_int, (n_patterns, hyper_params['seq_length'], 1)) / float(n_vocab)

In [54]:
# One-hot encode the targets. num_classes is pinned to the vocabulary size so
# the width of y (and of the softmax layer sized from y.shape[1]) does not
# depend on which characters happen to occur as targets in the sample.
y = np_utils.to_categorical(y_int, num_classes=n_vocab)

In [55]:
# Display the one-hot encoded target matrix.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [56]:
# Character-level model: one 256-unit LSTM over the (time steps, features)
# input, 20% dropout, and a softmax over the one-hot target classes.
model = Sequential([
    LSTM(256, input_shape=X.shape[1:]),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [57]:
import os

import prepare

from keras.callbacks import ModelCheckpoint

# NOTE(review): this cell fits `prepare.model` on `prepare.X` / `prepare.y`,
# not the `model`, `X` and `y` built in the cells above — confirm which
# model is meant to be trained here.

# define the checkpoint
checkpoint_filepath="data/model/v2weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Create the checkpoint directory up front so that saving the first checkpoint
# cannot fail on a missing directory after a long epoch.
os.makedirs(os.path.dirname(checkpoint_filepath), exist_ok=True)
# Keep a checkpoint only when the training loss improves.
checkpoint = ModelCheckpoint(checkpoint_filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

print('Fitting model....')
prepare.model.fit(prepare.X, prepare.y, epochs=10, batch_size=32, callbacks=callbacks_list)

Total Characters:  163781
Total Vocab:  59
Total Patterns:  163681
Fitting model....
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
  4128/163681 [..............................] - ETA: 13:59 - loss: 3.1510

KeyboardInterrupt: 