In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import csv
import codecs

Using Theano backend.


In [2]:
def verbatim_to_char_seq(name, char_indices, maxLen):
    """Encode a piece of text as a fixed-length sequence of character indices.

    Args:
        name: the text to encode; it is iterated character by character.
        char_indices: mapping from a character to its integer index.
        maxLen: length passed to ``sequence.pad_sequences``.

    Returns:
        The single row produced by ``sequence.pad_sequences`` for this text
        (padding/truncation follow that function's defaults).

    Raises:
        KeyError: if a character is missing from ``char_indices`` (when it is
            a plain dict).
    """
    encoded = [char_indices[ch] for ch in name]
    # pad_sequences works on a batch, so wrap the one sequence and unwrap it again.
    padded_batch = sequence.pad_sequences([encoded], maxLen)
    return padded_batch[0]

In [3]:
# Read the categorised comments into two parallel lists:
# column 0 is the verbatim text, column 1 is its label.
verbatims = []
labels = []

# Open with newline='' as the csv module requires, so that line breaks (and
# other Unicode line separators) embedded in quoted fields are handed to the
# csv parser intact instead of being split into separate records.
with open('allitcomments_categorized.csv', 'r', encoding='utf8', newline='') as f:
    for row in csv.reader(f):
        # Blank lines come back as [] and malformed rows may lack a label;
        # skip them rather than failing with an IndexError.
        if len(row) < 2:
            continue
        verbatims.append(row[0])
        labels.append(row[1])

In [4]:
# Build the character and label vocabularies plus their forward/reverse lookups.
#
# * sorted(...) makes the index assignment deterministic; iterating a bare set
#   of strings can change order between kernel restarts, which would make
#   previously saved weights inconsistent with the rebuilt mappings.
# * Slot 0 of char_list is a placeholder ('' never occurs when iterating a
#   string), so real characters are numbered from 1. Index 0 is what
#   pad_sequences fills with and what Embedding(mask_zero=True) masks, so it
#   must not also represent a real character. len(char_list) therefore already
#   counts the padding slot.
char_list = [''] + sorted(set(''.join(verbatims)))
labels_list = sorted(set(labels))

char_indices = {c: i for i, c in enumerate(char_list)}
indices_char = {i: c for i, c in enumerate(char_list)}
label_indices = {l: i for i, l in enumerate(labels_list)}
indices_label = {i: l for i, l in enumerate(labels_list)}

In [5]:
# Drop blank verbatims and exact duplicate (verbatim, label) pairs,
# keeping the first occurrence of each pair in its original order.

# Newlines inside a verbatim are flattened to spaces first.
verbatims = [v.replace('\n', ' ') for v in verbatims]

objs = []
seen = set()  # O(1) membership test instead of rescanning the objs list for every row
for obj in zip(verbatims, labels):
    if obj[0].strip() and obj not in seen:
        seen.add(obj)
        objs.append(obj)

In [6]:
# Unzip the filtered (verbatim, label) pairs back into two parallel lists.
verbatims = [text for text, _ in objs]
labels = [label for _, label in objs]

In [7]:
# Length (in characters) of the longest verbatim; 0 when there are none.
maxLen = max((len(text) for text in verbatims), default=0)
print(maxLen)

# Note: a minimum length of 50 was tried here at some point and is currently disabled.

249


In [8]:
# Build the arrays fed to the Keras model:
#   X - one row of character indices per verbatim, padded to maxLen
#   y - one-hot encoded label per verbatim
#
# int32 is used for X because uint8 silently wraps any character index >= 256,
# which would corrupt the input as soon as the vocabulary grows past that.
X = np.array(
    [verbatim_to_char_seq(text, char_indices, maxLen) for text in verbatims],
    dtype=np.int32,
)

label_ids = np.array([label_indices[label] for label in labels])
# Pass the class count explicitly so y always has one column per known label,
# even if the highest-numbered label does not occur in the data.
# The builtin bool replaces the np.bool alias, which newer NumPy releases no longer provide.
y = np_utils.to_categorical(label_ids, len(labels_list)).astype(bool)

print(X.shape, y.shape)

(43492, 249) (43492, 137)


In [9]:
# Hold out 10% of the samples as a test set; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=4,
    test_size=0.1,
)

In [18]:
# Training hyper-parameters.
batch_size, epochs = 32, 100

# Character-level classifier: embedding -> single LSTM -> dropout -> softmax
# over the label vocabulary.
# NOTE(review): mask_zero=True makes the embedding treat index 0 as padding -
# confirm that char_indices never assigns 0 to a real character.
model = Sequential([
    Embedding(len(char_list), 64, input_length=maxLen, mask_zero=True),
    LSTM(
        64,
        activation="tanh",
        recurrent_activation="hard_sigmoid",
        kernel_initializer="glorot_uniform",
        recurrent_initializer="orthogonal",
        return_sequences=False,
    ),
    Dropout(0.5),
    Dense(len(labels_list)),
    Activation('softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy')

# Callbacks: write the best weights seen so far to disk, and stop after
# 5 epochs without improvement in the monitored quantity (callback default).
checkpointer = ModelCheckpoint(
    filepath='astro_lstm_keras_weights.hdf5',
    save_best_only=True,
    verbose=1,
)
early_stopping = EarlyStopping(verbose=1, patience=5)

In [19]:
# Train on the training split; 10% of it is set aside for validation, which
# is what the early-stopping and checkpoint callbacks react to.
model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
    callbacks=[early_stopping, checkpointer],
    verbose=1,
)

Train on 35227 samples, validate on 3915 samples
Epoch 1/100
  128/35227 [..............................] - ETA: 131198s - loss: 4.9170

KeyboardInterrupt: 

In [None]:
# Evaluate the checkpointed weights on the held-out test set.
model.load_weights('astro_lstm_keras_weights.hdf5')

# Predicted class = argmax over the softmax output. This is what
# Sequential.predict_classes computed, but it also works on Keras versions
# where that helper no longer exists.
preds = np.argmax(model.predict(X_test, batch_size=64, verbose=0), axis=1)
y_true = np.argmax(y_test, axis=1)

# Pass the full list of class ids explicitly: the test split may not contain
# every label, and without `labels` the entries of target_names would be
# matched against only the classes that happen to be present.
class_ids = list(range(len(labels_list)))

print('')
print(classification_report(y_true, preds, labels=class_ids, target_names=labels_list))
print('')
print(confusion_matrix(y_true, preds, labels=class_ids))