# LSTM Autoencoder Implementation
This is the notebook for the autoencoder we will use for automatic feature extraction. At the moment it is more of a test, than actual implementation. 

In [69]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [70]:
max_len = 200
max_features = 100000
epochs = 15

#Load some data. Will be tested on headlines
with open('data/additional/preprocessed_data.json') as f:
    data = pd.read_json(f)

headers = list(data.Header)
OG_X = data.Header

In [71]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(OG_X))
X_old = tokenizer.texts_to_sequences(OG_X)
X = pad_sequences(X_old, maxlen=max_len)
print("vectors:", X_old[0], X[0])
print(headers[0], "--->", tokenizer.sequences_to_texts(X)[0])

vectors: [3616, 3820, 50212, 64, 12, 999, 4092, 302, 369, 12, 71810, 1, 4] [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0 

In [9]:
from keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dense
from keras import regularizers
from keras.models import Model

inp = Input(shape=(max_len,))

encoder = Embedding(max_features, 50)(inp)
encoder = Bidirectional(LSTM(75, return_sequences=True))(encoder)
encoder = Bidirectional(LSTM(25, return_sequences=True, activity_regularizer=regularizers.l1(10e-5)))(encoder)

decoder = Bidirectional(LSTM(75, return_sequences=True))(encoder)
decoder = GlobalMaxPooling1D()(decoder)
decoder = Dense(50, activation='relu')(decoder)
decoder = Dense(max_len)(decoder)

model = Model(inputs=inp, outputs=decoder)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 50)           5000000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 200, 150)          75600     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200, 50)           35200     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 200, 150)          75600     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 150)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                7550

In [10]:
#model.fit(X, X, epochs=epochs, batch_size=64, verbose=1)
#model.save_weights(f'models/model{epochs}.h5')

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [11]:
model.evaluate(X,X)



[1049805.25, 0.907718300819397]

In [106]:
st = headers[0]
print(st)
st = tokenizer.texts_to_sequences([st])
st = pad_sequences(st, maxlen=max_len)
ny = model.predict(st)
ny = np.rint(ny).astype(np.int64)
tokenizer.sequences_to_texts(ny)[0]

Utilfreds passager: »Prøv selv en tur klokken 7.30 en hverdagsmorgen« - Svirdur.dk


"svirdur svirdur svirdur svirdur er det i skal er dk er blev i bladet der for 2 efter hvad danmarks endnu sætter kina syv finde 23 fart væltet forgæves kur ansøgning mh370 tsi vanvid anholdt lad lussing tdc fc 1980'er chef 14"

In [49]:
from keras.preprocessing.text import hashing_trick, one_hot, text_to_word_sequence
from matplotlib import pyplot as plt

In [41]:
text = 'Jeg en gård mig bygge vil'
seq = text_to_word_sequence(text)
vocab_size = len(seq)
result = one_hot(text, round(vocab_size*1.3))
result

[7, 3, 3, 5, 3, 3]

In [155]:
corpus = [
    'string 1',
    'string 2',
    'Niels er Niels',
    'Niels er string',
    'Niels går der'
    'Der er en and'
    'En and er en string'
]
t = Tokenizer()
t.fit_on_texts(corpus)
vocab_size = len(t.word_index)+1
X = t.texts_to_sequences(corpus)
X = pad_sequences(X, maxlen=8, padding='post')

#Encode output X
ylist = []
for sequence in X:
    encoded = keras.utils.to_categorical(sequence, num_classes=vocab_size)
    ylist.append(encoded)
y = np.array(ylist)
y = y.reshape(X.shape[0], X.shape[1], vocab_size)

def word_for_id(integer, tokenizer):
    for word, i in tokenizer.word_index.items():
        if i == integer:
            return word
    return None

def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = []
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [148]:
from keras.models import Sequential
from keras.layers import RepeatVector, TimeDistributed
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

model = define_model(vocab_size, vocab_size, 8, 8, 100)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [150]:
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1cc42753848>

In [151]:
X[0]

array([1, 5, 0, 0, 0, 0, 0, 0])

In [156]:
predict_sequence(model, t, X[0])

''

In [161]:
pred = model.predict(X[0])
pred.shape

(8, 8, 11)