In [12]:
import numpy as np
from keras.models import Model
from keras.layers import Input, LSTM, RepeatVector, Embedding, Bidirectional, Activation, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import csv
import codecs

In [13]:
def verbatim_to_char_seq(name, char_indices, maxLen):
    name_chars = list(name)
    name_chars_indices = list(map(lambda char: char_indices[char], name_chars))
    return sequence.pad_sequences([name_chars_indices], maxLen, padding="post", truncating="post")[0]

In [14]:
#import verbatims, put into two lists
verbatims = []
allowedChars = []
with codecs.open('Data/Catagorization training data.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        temp = line[1].replace('\n', ' ').replace('\r', ' ').replace('<', ' ').replace('>', ' ').replace('*', ' ')
        temp = temp.replace('%', ' ').replace('&', ' ').replace('#', ' ').replace('~', ' ').replace('@', ' ')
        temp = temp.replace('=', ' ').replace('`', ' ').replace(';', ' ').replace('_', ' ').replace('+', ' ')
        temp = (temp[:198] + '..') if len(temp) > 200 else temp
        verbatims.append(temp)

In [15]:
char_list = list(set(''.join(verbatims)))
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))

In [16]:
# get rid of duplicates
verbatims = list(set(verbatims))

In [17]:
#determine the maximum length of the verbatims
maxLen = 0
for v in verbatims:
    if len(v) > maxLen:
        maxLen = len(v)
print(maxLen)

#if the max length is < 50, pad verbatim
#if maxLen < 50:
#    maxLen = 50

200


In [18]:
#create actual dataset to be fed into keras model
X = []

for n in verbatims:
    X.append(verbatim_to_char_seq(n, char_indices, maxLen))
    
X = np.array(X).astype(np.uint8)

print(X.shape)

(118479, 200)


In [19]:
import pickle
pickle.dump((char_list, char_indices, indices_char, X), open('char_autoenc.pkl','wb'))

In [3]:
import pickle
char_list, char_indices, indices_char, X = pickle.load(open( 'char_autoenc.pkl', "rb" ) )

In [20]:
print('Vectorization...')
maxLen = X.shape[1]
import numpy as np

Z = np.zeros((len(X), maxLen, len(char_list)), dtype=np.bool)
for i, seq in enumerate(X):
    for t, word in enumerate(seq[:-1]):
        Z[i, t, word] = 1
        
print(Z[0])

Vectorization...
[[False False False ..., False False False]
 [False False False ..., False False False]
 [False False False ..., False False False]
 ..., 
 [ True False False ..., False False False]
 [ True False False ..., False False False]
 [False False False ..., False False False]]


In [27]:
import keras.backend as K
#create keras model
batch_size = 512
nb_epoch = 200

latent_dim = 64

timesteps = maxLen
inputs = Input(shape=(maxLen, len(char_list)))
print("inputs", K.int_shape(inputs) )

#(number of samples, number of timesteps, number of features)
encoded = LSTM(latent_dim)(inputs)
print("encoded", K.int_shape(encoded))

decoded = RepeatVector(timesteps)(encoded)
print("decoded", K.int_shape(decoded))

decoded = LSTM(len(char_list), return_sequences=True)(decoded)
print("output", K.int_shape(decoded))

# is sigmoid the best choice?
activation = Activation('sigmoid')(decoded)

sequence_autoencoder = Model(inputs, activation)
encoder = Model(inputs, encoded)

#optimizer = Adam(lr = 0.005)
#optimizer = RMSprop(lr=0.01)
sequence_autoencoder.compile(loss='categorical_crossentropy', optimizer='adam')

early_stopping = EarlyStopping(patience=5, verbose=1)
#checkpointer = ModelCheckpoint(filepath='char_lstm2_keras_weights.hdf5', verbose=1, save_best_only=True)
#model2.load_weights('char_lstm2_keras_weights.hdf5')


inputs (None, 200, 48)
encoded (None, 64)
decoded (None, 200, 64)
output (None, 200, 48)


In [None]:
#train model on test data
sequence_autoencoder.fit(Z, Z, 
          batch_size=batch_size, 
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.1)

Train on 106631 samples, validate on 11848 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200

In [None]:
sequence_autoencoder.save("seq2seq_model.h5")
#from keras.models import load_model
#sequence_autoencoder = load_model("seq2seq_model.h5")