In [4]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

import xml.etree.ElementTree
from functional import seq

In [18]:
# Tunable settings shared by the cells below.
# seq_length: number of characters in every input window.
hyper_params = dict(seq_length=10)

def read_data(data_dir='./data/OMQ'):
    """Parse the OMQ category and interaction XML files.

    Args:
        data_dir: Directory containing ``omq_public_categories.xml`` and
            ``omq_public_interactions.xml``. Defaults to ``./data/OMQ``.

    Returns:
        A ``(categories_iter, interactions)`` tuple:

        * ``categories_iter`` -- a one-shot iterator over every ``<category>``
          element of the categories file (it is exhausted after one pass).
        * ``interactions`` -- a list of every ``<interaction>`` element of the
          interactions file.
    """
    import os

    categories_path = os.path.join(data_dir, 'omq_public_categories.xml')
    interactions_path = os.path.join(data_dir, 'omq_public_interactions.xml')

    categories_iter = xml.etree.ElementTree.parse(categories_path).getroot().iter('category')

    # ElementTree.getiterator() was removed in Python 3.9; it used to return
    # list(self.iter(tag)), so materialising iter() keeps the same return type.
    interactions = list(xml.etree.ElementTree.parse(interactions_path).iter('interaction'))

    return categories_iter, interactions

def to_request_row(request_element):
    """Flatten one ``<interaction>`` XML element into a plain dict.

    Args:
        request_element: An ElementTree element that may contain
            ``text/relevantText``, ``metadata/category`` and ``metadata/id``
            children.

    Returns:
        A dict with keys ``'id'``, ``'category'`` and ``'text_raw'``.
        ``'text_raw'`` is the relevant text with surrounding whitespace
        stripped, or ``''`` when the element has no ``text/relevantText``
        child. ``'id'`` and ``'category'`` are ``None`` when absent.
    """
    # default='' keeps .strip() from failing on interactions without text.
    text = request_element.findtext('text/relevantText', default='').strip()
    category = request_element.findtext('metadata/category')
    # Named request_id (not id) to avoid shadowing the builtin.
    request_id = request_element.findtext('metadata/id')

    return {'id': request_id, 'category': category, 'text_raw': text}

In [30]:
def generate_seqs_from_text(text, seq_length=None, empty_char='\t'):
    """Slice ``text`` into fixed-length character windows with next-char targets.

    One sample is produced for every start index ``0 .. len(text) - 2``
    (the last character never starts a window, so texts shorter than two
    characters yield no samples).

    * While a full window plus a following character is available, the input
      is ``text[i:i + seq_length]`` and the target is the character right
      after it.
    * Otherwise the remaining characters are right-padded with ``empty_char``
      up to ``seq_length`` and the target is ``empty_char``. This branch also
      covers texts shorter than ``seq_length``.

    Args:
        text: The string to slice.
        seq_length: Window length. ``None`` (default) falls back to
            ``hyper_params['seq_length']``.
        empty_char: Padding / end-of-text marker, tab by default.

    Returns:
        ``(dataX, dataY)`` -- two equally long lists of input windows and
        their target characters.
    """
    if seq_length is None:
        seq_length = hyper_params['seq_length']

    dataX = []
    dataY = []
    n_chars = len(text)

    for i in range(n_chars - 1):
        if i < n_chars - seq_length:
            seq_in = text[i:i + seq_length]
            seq_out = text[i + seq_length]
        else:
            # Fewer than seq_length + 1 characters remain: pad the tail.
            tail = text[i:]
            seq_in = tail + empty_char * (seq_length - len(tail))
            seq_out = empty_char

        dataX.append(seq_in)
        dataY.append(seq_out)

    return dataX, dataY

In [40]:
def generate_training_data(texts):
    """Build one flat training set from several texts.

    Runs ``generate_seqs_from_text`` on each text in order and concatenates
    the per-text results.

    Args:
        texts: Iterable of strings.

    Returns:
        ``(X, y)`` -- the input windows and the target characters of all
        texts, in input order.
    """
    inputs, targets = [], []
    for windows, next_chars in map(generate_seqs_from_text, texts):
        inputs += windows
        targets += next_chars

    return inputs, targets

In [31]:
# Load the XML data and keep only the stripped request text of each interaction.
categories, interactions = read_data()
interaction_texts = (
    seq(interactions)
    .map(lambda element: to_request_row(element)['text_raw'])
    .to_list()
)


In [44]:
# Peek at the first two extracted texts.
interaction_texts[:2]

['ich habe seit 2 Tagen in meiner Warehouse Software \nunter Exportieren-File\nOptionen ein fast nicht lesbare Schrift.',
 'Am Samstag\nbekam\ndie Version WAREHOUSE Sales firstclass 20 Neo, musste nach der Installation\nfeststellen, dass die Beschriftung der Knoepfe im Exportbildschirm\nnicht lesbar sind (z.B. Dateiformat, Navigation, Komprimierung, Schrift,\nSortieren\nu.s.w. die komplette Beschriftung unter File-Optionen, Titelseite gestalten\nund\nKnoepfe gestalten.']

In [45]:
# Build windows and targets from the first two texts only (small smoke test).
X_text, y_text = generate_training_data(interaction_texts[:2])

In [54]:
# Input windows 100-119: the displayed slice covers the tab-padded end of the
# first text followed by the first windows of the second text.
X_text[100:120]

['esbare Sch',
 'sbare Schr',
 'bare Schri',
 'are Schrif',
 're Schrift',
 'e Schrift.',
 ' Schrift.\t',
 'Schrift.\t\t',
 'chrift.\t\t\t',
 'hrift.\t\t\t\t',
 'rift.\t\t\t\t\t',
 'ift.\t\t\t\t\t\t',
 'ft.\t\t\t\t\t\t\t',
 't.\t\t\t\t\t\t\t\t',
 'Am Samstag',
 'm Samstag\n',
 ' Samstag\nb',
 'Samstag\nbe',
 'amstag\nbek',
 'mstag\nbeka']

In [55]:
# Sanity check: each of these windows should have length seq_length.
list(map(len, X_text[100:120]))

[10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10]

In [56]:
# Target characters for samples 100-119 (same index range as the
# X_text[100:120] cell); '\t' marks positions past the end of a text.
y_text[100:120]

['r',
 'i',
 'f',
 't',
 '.',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\n',
 'b',
 'e',
 'k',
 'a',
 'm']

In [57]:
# Last 20 input windows: the tail of the second text, including its
# tab-padded final windows.
X_text[-20:]

['nd\nKnoepfe',
 'd\nKnoepfe ',
 '\nKnoepfe g',
 'Knoepfe ge',
 'noepfe ges',
 'oepfe gest',
 'epfe gesta',
 'pfe gestal',
 'fe gestalt',
 'e gestalte',
 ' gestalten',
 'gestalten.',
 'estalten.\t',
 'stalten.\t\t',
 'talten.\t\t\t',
 'alten.\t\t\t\t',
 'lten.\t\t\t\t\t',
 'ten.\t\t\t\t\t\t',
 'en.\t\t\t\t\t\t\t',
 'n.\t\t\t\t\t\t\t\t']

In [58]:
# Sanity check: the padded tail windows should also have length seq_length.
list(map(len, X_text[-20:]))

[10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10]

In [59]:
# Target characters for the last 20 samples; '\t' marks positions past the
# end of the text.
y_text[-20:]

[' ',
 'g',
 'e',
 's',
 't',
 'a',
 'l',
 't',
 'e',
 'n',
 '.',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t',
 '\t']