In [None]:
import numpy as np
import pandas as pd
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dense
# from keras import regularizers
# from keras.models import Model
# from keras.utils import to_categorical

with open('data/additional/preprocessed_data.json') as f:
    data = pd.read_json(f)

In [None]:
headers = list(data.Header)
train = headers[:5000]
test = headers[-500:]
del data

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train)

In [3]:
MAX_LEN = max(len(line.split()) for line in train)
VOCAB_SIZE = len(tokenizer.word_index)+1
print("Vocab size:", VOCAB_SIZE)
print("Max lenght:", MAX_LEN)

Vocab size: 10120
Max lenght: 34


In [4]:
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

def encode_output(sequences, vocab_size):
    ylist = []
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [5]:
from keras.models import Sequential
from keras.layers import RepeatVector, TimeDistributed
def define_model(vocab_size, timesteps, n_units):
    model = Sequential()
    model.add(Embedding(vocab_size, n_units, input_length=timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    return model

In [6]:
trainX = encode_sequences(tokenizer, MAX_LEN, train)
trainY = encode_output(trainX, VOCAB_SIZE)

testX = encode_sequences(tokenizer, MAX_LEN, test)
testY = encode_output(testX, VOCAB_SIZE)

model = define_model(VOCAB_SIZE, MAX_LEN, 128)
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.fit(trainX, trainY, epochs=30, validation_data=(testX,testY))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1a671daef88>

In [10]:
def word_for_id(integer, tokenizer):
    for word, i in tokenizer.word_index.items():
        if i == integer:
            return word
    return None

def predict_sequence(model, tokenizer, source, i):
    prediction = model.predict(source, verbose=0)[i]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [35]:
#predict_sequence(model, tokenizer, trainX, 1)
integers = None
testX[0], testY[0]

(array([  32, 1409, 2042,   67,   43,   41,   32, 8895,    9,   14, 6450,
          65,  247,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]], dtype=float32))

In [46]:
preds = model.predict(testX)
integers = [np.argmax(v) for v in preds[0]]
print(integers)
[word_for_id(i, tokenizer) for i in integers]

[38, 12, 5, 5, 9, 9, 14, 8, 8, 8, 12, 12, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


['ny',
 'og',
 'i',
 'i',
 'til',
 'til',
 'at',
 'på',
 'på',
 'på',
 'og',
 'og',
 '3f',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [45]:
integers = [np.argmax(v) for v in testY[0]]
print(integers)
[word_for_id(i, tokenizer) for i in integers]

[32, 1409, 2042, 67, 43, 41, 32, 8895, 9, 14, 6450, 65, 247, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


['de',
 'kalder',
 'ham',
 'se',
 'der',
 'får',
 'de',
 'professionelle',
 'til',
 'at',
 'måbe',
 'tv',
 '2',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [19]:
integers

[2321,
 2322,
 3810,
 76,
 19,
 708,
 3811,
 611,
 364,
 19,
 3812,
 1,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [37]:
with open('headers.txt', 'w', encoding='utf-8') as f:
    strheaders = [x.replace('\n','') for x in headers if len(x)>10]
    strheaders = [x.replace('\t','') for x in strheaders]
    strheaders = '\n'.join(strheaders)
    f.write(strheaders)

In [17]:
strheaders = [x.replace('\n','') for x in headers if len(x)>10]
print(len(strheaders))
strheaders = '\n'.join(strheaders)

strheaders = strheaders.split('\n')
print(len(strheaders))
np.argmin([len(x) for x in strheaders])

308119
308119


38461

In [21]:
with open('headers.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(strheaders))

In [87]:
np.argmin([len(x) for x in splt])

38461

In [15]:
[x for x in strheaders if '\\' in x]

[]

In [88]:
splt[38461]

'Lukket fest'

In [38]:
with open('headers.txt', 'r', encoding='utf-8') as f:
    h = f.read()

In [39]:
h = headers.split('\n')

In [40]:
np.argmin([len(x) for x in h])

0

In [43]:
headers

IndexError: list index out of range