In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import numpy as np
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM, GRU, Bidirectional, TimeDistributed
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
import csv
import codecs
import pickle

Using TensorFlow backend.


In [2]:
def verbatim_to_char_seq(name, char_indices, maxLen):
    name_chars = list(name)
    name_chars_indices = list(map(lambda char: char_indices[char], name_chars))
    return sequence.pad_sequences([name_chars_indices], maxLen)[0]

In [3]:
def cleanVerbatim(verbatim):
    temp = (verbatim[:200]) if len(verbatim) > maxLen else verbatim
    temp = temp.upper().replace('\n', ' ').replace('\r', ' ').replace('<', ' ').replace('>', ' ').replace('*', ' ')
    temp = temp.replace('%', ' ').replace('&', ' ').replace('#', ' ').replace('~', ' ').replace('@', ' ').replace('"', ' ')
    temp = temp.replace('=', ' ').replace('`', ' ').replace(';', ' ').replace('_', ' ').replace('+', ' ').replace('$',' ').replace('/', ' ')
    temp = temp.replace(')', ' ').replace('(', ' ').replace(']', ' ').replace('[', ' ').replace(':', ' ').replace('\\', ' ').replace('-', ' ')
    return temp

In [4]:
#import verbatims, put into two lists
verbatims = []
labels = []
allowedChars = []
maxLen=200
with codecs.open('Data/Catagorization training data.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        verbatims.append(cleanVerbatim(line[1]))
        labels.append(line[0])

In [5]:
char_list = []
char_list = list(set(''.join(verbatims)))
labels_list = list(set(labels))
char_indices = dict((c, i+1) for i, c in enumerate(char_list))
indices_char = dict((i+1, c) for i, c in enumerate(char_list))
label_indices = dict((l, i) for i, l in enumerate(labels_list))
indices_label = dict((i, l) for i, l in enumerate(labels_list))
pickle.dump( [char_list, labels_list, char_indices, indices_char, label_indices, indices_label], open( "save.p", "wb" ) )

In [6]:
# get rid of duplicates and blanks
objs = []
for obj in list(zip(verbatims, labels)):
    if len(obj[0].strip()) != 0:
        objs.append(obj)

objs = list(set(objs))

In [7]:
#separate out verbatims and labels again
verbatims = []
labels = []

for n, l in objs:
    verbatims.append(n)
    labels.append(l)
    
print(len(verbatims))

118171


In [8]:
#determine the maximum length of the verbatims
maxLen = 0
for v in verbatims:
    if len(v) > maxLen:
        maxLen = len(v)
print(maxLen)

#if the max length is < 50, pad verbatim
#if maxLen < 50:
#    maxLen = 50

200


In [9]:
#create actual dataset to be fed into keras model
X = []
y = []

for n, l in zip(verbatims, labels):
    X.append(verbatim_to_char_seq(n, char_indices, maxLen))
    y.append(label_indices[l])
    
X = np.array(X).astype(np.uint8)
y=np.array(y)
y = np_utils.to_categorical(np.array(y)).astype(np.bool)

print(X.shape, y.shape)

(118171, 200) (118171, 119)


In [10]:
#create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
#y = np_utils.to_categorical(y).astype(np.bool)

In [11]:
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
#create keras model2, with 2 bidirectional LSTMs
batch_size = 2048
nb_epoch = 200
embedVectorLength = 8
model2 = Sequential()
model2.add(Embedding(len(char_list), embedVectorLength, input_length=maxLen, mask_zero=True))
model2.add(Bidirectional(LSTM(32)))
model2.add(LSTM(32))
model2.add(Dropout(0.5))
model2.add(TimeDistributed(Dense(len(labels_list))))
model2.add(Activation('softmax'))

model2.compile(loss='categorical_crossentropy', optimizer='adam')

early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='char_lstm2_keras_weights.hdf5', verbose=1, save_best_only=True)
#model2.load_weights('char_lstm2_keras_weights.hdf5')


ValueError: Input 0 is incompatible with layer lstm_8: expected ndim=3, found ndim=2

In [None]:
#train model on test data
model2.fit(X_train, y_train,
          batch_size=batch_size, 
          epochs=nb_epoch,
          verbose=1,
          shuffle=True,
          validation_split=0.02,
          callbacks=[early_stopping, checkpointer])

In [None]:
model2.load_weights('char_lstm2_keras_weights.hdf5')
#preds = model2.predict_classes(X_test, batch_size=64, verbose=0)

#print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
#print('')
#print(confusion_matrix(np.argmax(y_test, axis=1), preds))

In [None]:
cm=confusion_matrix(np.argmax(y_test, axis=1), preds)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(len(labels_list))
plt.xticks(tick_marks, labels_list, rotation=45)
plt.yticks(tick_marks, labels_list)
thresh = cm.max() / 2.
#for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
#    plt.text(j, i, cm[i, j],horizontalalignment="center",color="white" if cm[i, j] > thresh else "black")

#plt.tight_layout()
print(thresh)
plt.ylabel('True label')
plt.xlabel('Predicted label')

In [None]:
#Production verbatim classification
model2.load_weights('char_lstm2_keras_weights.hdf5')
v=[]
with codecs.open('Data/verbatims.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        v.append(line[0])

In [None]:
v[1]

In [None]:
X_real=[]
verbatims=[]
with codecs.open('Data/verbatims.csv', 'r', encoding='ascii', errors='ignore') as f:
    reader = csv.reader(f)
    for line in reader:
        temp = line[0].upper().replace('\n', ' ').replace('\r', ' ').replace('<', ' ').replace('>', ' ').replace('*', ' ')
        temp = temp.replace('%', ' ').replace('&', ' ').replace('#', ' ').replace('~', ' ').replace('@', ' ')
        temp = temp.replace('=', ' ').replace('`', ' ').replace(';', ' ').replace('_', ' ').replace('+', ' ')
        temp = (temp[:198] + '..') if len(temp) > maxLen else temp
        verbatims.append(temp)


for n in verbatims:
    X_real.append(verbatim_to_char_seq(n, char_indices, maxLen))
    
X_real = np.array(X_real).astype(np.uint8)

In [None]:
p = model2.predict_classes(X_real, batch_size=64, verbose=0)

In [None]:
pArray = map(lambda x:indices_label[x],p)

In [None]:
pArray

In [None]:
with codecs.open('Data/verbatimResults.csv', 'w', encoding='ascii', errors='ignore') as f:
    wr = csv.writer(f, dialect='excel')
    for item in pArray:
        wr.writerow([item])