In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM, GRU, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
import csv
import codecs

Using TensorFlow backend.


In [2]:
def verbatim_to_char_seq(name, char_indices, maxLen):
    name_chars = list(name)
    name_chars_indices = list(map(lambda char: char_indices[char], name_chars))
    return sequence.pad_sequences([name_chars_indices], maxLen)[0]

In [None]:
#import verbatims, put into two lists
verbatims = []
labels = []
allowedChars = []
with codecs.open('Data/Catagorization training data.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        temp = line[1].replace('\n', ' ').replace('\r', ' ').replace('<', ' ').replace('>', ' ').replace('*', ' ')
        temp = temp.replace('%', ' ').replace('&', ' ').replace('#', ' ').replace('~', ' ').replace('@', ' ')
        temp = temp.replace('=', ' ').replace('`', ' ').replace(';', ' ').replace('_', ' ').replace('+', ' ')
        temp = (temp[:198] + '..') if len(temp) > 200 else temp
        verbatims.append(temp)
        labels.append(line[0])

In [None]:
char_list = list(set(''.join(verbatims)))
labels_list = list(set(labels))
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))
label_indices = dict((l, i) for i, l in enumerate(labels_list))
indices_label = dict((i, l) for i, l in enumerate(labels_list))

In [None]:
# get rid of duplicates and blanks

objs = []
for obj in list(zip(verbatims, labels)):
    if len(obj[0].strip()) != 0:
        objs.append(obj)

objs = list(set(objs))

In [None]:
#separate out verbatims and labels again
verbatims = []
labels = []

for n, l in objs:
    verbatims.append(n)
    labels.append(l)
    
print(len(verbatims))

In [None]:
#determine the maximum length of the verbatims
maxLen = 0
for v in verbatims:
    if len(v) > maxLen:
        maxLen = len(v)
print(maxLen)

#if the max length is < 50, pad verbatim
#if maxLen < 50:
#    maxLen = 50

In [None]:
#create actual dataset to be fed into keras model
X = []
y = []

for n, l in zip(verbatims, labels):
    X.append(verbatim_to_char_seq(n, char_indices, maxLen))
    y.append(label_indices[l])
    
X = np.array(X).astype(np.uint8)
y=np.array(y)
y = np_utils.to_categorical(np.array(y)).astype(np.bool)

print(X.shape, y.shape)

In [None]:
import pickle
pickle.dump((char_list, labels_list, char_indices, indices_char, label_indices, indices_label, X, y), open('char_lstm_file.pkl','wb'))

In [4]:
import pickle
char_list, labels_list, char_indices, indices_char, label_indices, indices_label, X, y = pickle.load(open( 'char_lstm_file.pkl', "rb" ) )

In [5]:
print(X.shape, y.shape)

(118640, 200) (118640, 119)


In [7]:
X[0]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, 19, 38, 24, 40, 38, 45,  3, 22, 30,  5, 44, 34, 45, 21,
        6, 44, 34, 34, 34, 21, 18,  3, 44, 45, 21, 38, 45], dtype=uint8)

In [3]:
#create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=4)
#y = np_utils.to_categorical(y).astype(np.bool)

NameError: name 'X' is not defined

In [None]:
#create keras model
batch_size = 512
nb_epoch = 200

model = Sequential()
model.add(Embedding(len(char_list), 64, input_length=maxLen, mask_zero=True))
model.add(Bidirectional(LSTM(64, activation="tanh", return_sequences=False, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid")))
model.add(Dropout(0.5))
model.add(Dense(len(labels_list)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='char_lstm_keras_weights.hdf5', verbose=1, save_best_only=True)
#model.load_weights('char_lstm_keras_weights.hdf5')

In [None]:
#train model on test data
model.fit(X_train, y_train, 
          batch_size=batch_size, 
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])

In [None]:
model.load_weights('char_lstm_keras_weights.hdf5')
preds = model.predict_classes(X_test, batch_size=64, verbose=0)

print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
print('')
print(confusion_matrix(np.argmax(y_test, axis=1), preds))

In [None]:
import pandas as pd
df=pd.DataFrame(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))

In [None]:
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
cm=confusion_matrix(np.argmax(y_test, axis=1), preds)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(len(labels_list))
plt.xticks(tick_marks, labels_list, rotation=45)
plt.yticks(tick_marks, labels_list)
thresh = cm.max() / 2.
#for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#    plt.text(j, i, cm[i, j],horizontalalignment="center",color="white" if cm[i, j] > thresh else "black")

#plt.tight_layout()
print(thresh)
plt.ylabel('True label')
plt.xlabel('Predicted label')

In [None]:
#create keras model
batch_size = 512
nb_epoch = 200

model2 = Sequential()
model2.add(Embedding(len(char_list), 64, input_length=maxLen, mask_zero=True))
model2.add(Bidirectional(LSTM(64, activation="tanh", return_sequences=True, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid")))
#model2.add(Dropout(0.5))
model2.add(LSTM(64, activation="tanh", return_sequences=False, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid"))
model2.add(Dropout(0.5))
model2.add(Dense(len(labels_list)))
model2.add(Activation('softmax'))

model2.compile(loss='categorical_crossentropy', optimizer='rmsprop')

early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='char_lstm2_keras_weights.hdf5', verbose=1, save_best_only=True)
#model2.load_weights('char_lstm2_keras_weights.hdf5')

In [None]:
#train model on test data
model2.fit(X_train, y_train, 
          batch_size=batch_size, 
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])

In [None]:
model2.load_weights('char_lstm2_keras_weights.hdf5')
preds = model2.predict_classes(X_test, batch_size=512, verbose=0)

print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
#print('')
#print(confusion_matrix(np.argmax(y_test, axis=1), preds))

In [None]:
#create keras model
batch_size = 512
nb_epoch = 200
maxLen=X.shape[1]

model3 = Sequential()
model3.add(Embedding(len(char_list), 64, input_length=maxLen, mask_zero=True))
model3.add(Bidirectional(LSTM(64, activation="tanh", return_sequences=True, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid")))
model3.add(LSTM(64, activation="tanh", return_sequences=True, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid"))
model3.add(LSTM(64, activation="tanh", return_sequences=False, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid"))
model3.add(Dropout(0.5))
model3.add(Dense(len(labels_list)))
model3.add(Activation('softmax'))

model3.compile(loss='categorical_crossentropy', optimizer='rmsprop')

early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='char_lstm3_keras_weights.hdf5', verbose=1, save_best_only=True)
model3.load_weights('char_lstm3_keras_weights.hdf5')

In [None]:
#train model on test data
model3.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])

In [None]:
model3.load_weights('char_lstm3_keras_weights.hdf5')
preds = model3.predict_classes(X_test, batch_size=512, verbose=0)

print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))

In [None]:
#create keras model
batch_size = 512
nb_epoch = 200
activations=['elu','softplus','softsign','relu','tanh','sigmoid','hard_sigmoid','linear']
models=[]
for activation1 in activations:
    for activation2 in activations:
        model3 = Sequential()
        model3.add(Embedding(len(char_list), 64, input_length=maxLen, mask_zero=True))
        model3.add(Bidirectional(LSTM(64, activation="tanh", return_sequences=True, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid")))
        #model2.add(Dropout(0.5))
        model3.add(LSTM(64, activation="tanh", return_sequences=True, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid"))
        #model3.add(Dropout(0.5))
        model3.add(LSTM(64, activation="tanh", return_sequences=False, kernel_initializer="glorot_uniform", recurrent_initializer="orthogonal", recurrent_activation="hard_sigmoid"))
        model3.add(Dropout(0.5))
        model3.add(Dense(len(labels_list)))
        model3.add(Activation('softmax'))
        model3.compile(loss='categorical_crossentropy', optimizer='adam')
        models.append([activation1, activation2, model])
        
early_stopping = EarlyStopping(patience=5, verbose=1)
#checkpointer = ModelCheckpoint(filepath='char_lstm3_keras_weights.hdf5', verbose=1, save_best_only=True)
#model2.load_weights('char_lstm3_keras_weights.hdf5')

In [None]:
#train models on test data
for model in models:
    m=model[2]
    m.fit(X_train, y_train, 
          batch_size=batch_size, 
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping])
    preds = m.predict_classes(X_test, batch_size=512, verbose=0)
    print(model[0], model[1])
    print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))

In [None]:
model3.load_weights('char_lstm3_keras_weights.hdf5')
preds = model2.predict_classes(X_test, batch_size=64, verbose=0)

print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
#print('')
#print(confusion_matrix(np.argmax(y_test, axis=1), preds))

In [None]:
v=[]
with codecs.open('Data/verbatims.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        v.append(line[0])

In [None]:
v[1]

In [None]:
X_real=[]
vo=map(lambda x:x.upper().replace("'","").replace("\"","").replace("?","").replace("~",""),v)
for n in vo:
    X_real.append(verbatim_to_char_seq(n, char_indices, maxLen))
    
X_real = np.array(X_real).astype(np.uint8)

In [None]:
p = model2.predict_classes(X_real, batch_size=64, verbose=0)

In [None]:
pArray = map(lambda x:indices_label[x],p)

In [None]:
pArray

In [None]:
with codecs.open('Data/verbatimResults.csv', 'w', encoding='ascii', errors='ignore') as f:
    wr = csv.writer(f, dialect='excel')
    for item in pArray:
        wr.writerow([item])