# LSTM Autoencoder Implementation
This is the notebook for the autoencoder we will use for automatic feature extraction. At the moment it is more of a test than an actual implementation.

In [1]:
import os

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [2]:
# Hyper-parameters used throughout the notebook.
max_len = 50           # fixed sequence length used when padding headlines
max_features = 100000  # vocabulary cap for the tokenizer and embedding layer
epochs = 4             # number of training epochs

# Load some data. Will be tested on headlines.
# The encoding is passed explicitly: the headlines contain non-ASCII
# characters, and open() would otherwise fall back to the platform's locale
# encoding, which is not UTF-8 everywhere.
with open('data/additional/preprocessed_data.json', encoding='utf-8') as f:
    data = pd.read_json(f)

# `headers` is a plain list of the raw headline strings; `OG_X` keeps the
# same column as a pandas Series.
headers = list(data.Header)
OG_X = data.Header

In [3]:
# Fit a tokenizer on the headlines. filters="" disables character filtering,
# so punctuation stays attached to the tokens.
tokenizer = Tokenizer(filters="", num_words=max_features)
tokenizer.fit_on_texts(list(OG_X))

# X_old holds the variable-length id sequences; X is the same data padded to
# a fixed width of max_len.
X_old = tokenizer.texts_to_sequences(OG_X)
X = pad_sequences(X_old, maxlen=max_len)

# Sanity check on the first headline: raw ids, padded ids, and a round trip
# back to text.
print("vectors:", X_old[0], X[0])
# Only the first row is decoded; decoding all of X just to read element 0
# would walk the entire corpus for no benefit.
print(headers[0], "--->", tokenizer.sequences_to_texts(X[:1])[0])

vectors: [3505, 20096, 57837, 66, 14, 1030, 3695, 88602, 14, 88603, 1, 8] [    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0  3505 20096 57837    66    14  1030  3695 88602    14 88603
     1     8]
Utilfreds passager: »Prøv selv en tur klokken 7.30 en hverdagsmorgen« - Svirdur.dk ---> utilfreds passager: »prøv selv en tur klokken 7.30 en hverdagsmorgen« - svirdur.dk


In [4]:
from keras import regularizers
from keras.layers import (
    Bidirectional,
    Dense,
    Embedding,
    GlobalMaxPooling1D,
    Input,
    LSTM,
)
from keras.models import Model

# --- Encoder: token ids -> embeddings -> two stacked bidirectional LSTMs ---
inp = Input(shape=(max_len,))
embedded = Embedding(input_dim=max_features, output_dim=50)(inp)
wide_encoding = Bidirectional(LSTM(units=75, return_sequences=True))(embedded)
# Narrow layer with an L1 activity penalty (10e-5 == 1e-4).
bottleneck_rnn = LSTM(
    units=25,
    return_sequences=True,
    activity_regularizer=regularizers.l1(10e-5),
)
encoder = Bidirectional(bottleneck_rnn)(wide_encoding)

# --- Decoder: widen again, pool over time, project back to max_len values ---
expanded = Bidirectional(LSTM(units=75, return_sequences=True))(encoder)
pooled = GlobalMaxPooling1D()(expanded)
hidden = Dense(units=50, activation='relu')(pooled)
# Linear output of width max_len, i.e. one real number per sequence position.
# NOTE(review): together with the MSE loss below this regresses directly on
# integer token ids - confirm that this is the intended objective.
decoder = Dense(units=max_len)(hidden)

model = Model(inputs=inp, outputs=decoder)
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 50)            5000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 150)           75600     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 50)            35200     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 150)           75600     
_________________________________________________________________
global_max_pooling1d (Global (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                7550  

In [5]:
# Make sure the output directory exists *before* training, so a missing
# folder cannot cause the save to fail after the epochs have already run.
weights_path = f'models/model{epochs}.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# Autoencoder training: the padded sequences are both input and target.
model.fit(X, X, epochs=epochs, batch_size=64, verbose=1)
model.save_weights(weights_path)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [6]:
# Score the model on the same data it was trained on; the result lists the
# compiled loss followed by the compiled metric(s).
model.evaluate(x=X, y=X)



[23527452.0, 0.576382577419281]

In [103]:
# Push the first headline through the model and decode the prediction.
first_headline = headers[0]
print(first_headline)

# `st`: the headline as a padded id matrix with a single row.
st = pad_sequences(tokenizer.texts_to_sequences([first_headline]), maxlen=max_len)

# `ny`: the model output rounded to the nearest integer so that it can be
# looked up as token ids.
ny = np.rint(model.predict(st)).astype(np.int64)
tokenizer.sequences_to_texts(ny)[0]

Utilfreds passager: »Prøv selv en tur klokken 7.30 en hverdagsmorgen« - Svirdur.dk


'er er på til på - | | i om på kroner går fra verdens have øst mellem næste hvis hele læs historien hus fortæller strand qvortrup larmer 81-årig kinderne sareen: klaver amager-metro invest'

In [101]:
# Overwrite positions 38, 39 and 40 of the (single) input row with token id 8.
# This mutates `st` in place.
st[0, 38:41] = 8
st

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     8,     8,     8,    66,    14,  1030,  3695,
        88602,    14, 88603,     1,     8]])

In [102]:
# Decode the first row of `st` back into text.
tokenizer.sequences_to_texts(st[:1])[0]

'svirdur.dk svirdur.dk svirdur.dk selv en tur klokken 7.30 en hverdagsmorgen« - svirdur.dk'

In [49]:
# Turn the first row of the id matrix `st` back into its text form.
tokenizer.sequences_to_texts(st[:1])[0]

'utilfreds passager: »prøv selv en tur klokken 7.30 en hverdagsmorgen« - svirdur.dk'

In [74]:
# Zero out the first 38 positions of the predicted row. This mutates `ny`
# in place.
ny[0, :38] = 0
ny

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,   409,   852,  2156,  4306,  9056, 13246, 24793,
        28297, 26146, 58938,   -36, 10739]], dtype=int64)

In [75]:
# Decode the first row of the predicted ids `ny` back into text.
tokenizer.sequences_to_texts(ny[:1])[0]

'hus fortæller strand qvortrup larmer 81-årig kinderne sareen: klaver amager-metro invest'