# Overview
 This assignment trained bi-directional  LSTM to learn how to automatically punctuate a sentence. The set of operation it learns include: comma, period and question mark.
 
# Performance
              precision    recall  f1-score   support

           0       0.93      0.95      0.94    264993
           1       0.26      0.23      0.24     13813
           2       0.41      0.36      0.39     19383
           3       0.15      0.06      0.08      1331

    accuracy                            0.87    299520
    macro avg       0.44      0.40      0.41    299520
    weighted avg    0.86      0.87      0.87    299520


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Dropout
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
from sklearn.utils import class_weight, compute_sample_weight

import xml.etree.ElementTree as ET
import numpy as np
import re
import glob
import random
import pickle

LSTM_CHECKPOINT_NAME = 'checkpoint/2lstm_epoch25_chunck80_file350.h5'
BLSTM_CHECKPOINT_NAME = 'checkpoint/2blstm_epoch25_chunck80_file350.h5'
TOKENIZER_NAME = 'pickle/tokenizer_chunck80_file430.pickle'
TRAINING_SPLIT = 0.8
MAX_SEQUENCE_LENGTH = 80
MAX_NUM_WORDS = 20000
NUM_FILES = 350
NUM_EPOCH = 25
BATCH_SIZES = 128
FILENAMES = glob.glob('input/*.xml')

### Data Preprocessing

In [1]:
paragraphs = []
for filename in FILENAMES[:NUM_FILES]:
    tree = ET.parse(filename)
    root = tree.getroot()
    for wordElement in root.iter('post'):
        text = wordElement.text.lower()
        text = text.strip()
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " is", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r'[.]+', ".", text)
        text = re.sub(r'[?]+', "?", text)
        text = re.sub(r'[!]+', ".", text)
        text = re.sub(r'[:]+', ",", text)
        text = re.sub(r'[;]+', ",", text)
        text = re.sub(r'[^.,\?a-zA-Z ]', '', text)
        paragraphs.append(text)
        #print(paragraphs)
print(len(paragraphs))

Using TensorFlow backend.


8805


In [2]:
words_labels_join = []
for paragraph in paragraphs:
    words = paragraph.split()
    for word in words:
        if re.match("^[a-z]+$", word):
            words_labels_join.append((word, 0))
        if re.match("^[a-z]+,$", word):
            words_labels_join.append((word[:-1], 1))
        if re.match("^[a-z]+\.$", word):
            words_labels_join.append((word[:-1], 2))
        if re.match("^[a-z]+\?$", word):
            words_labels_join.append((word[:-1] , 3))
print(len(words_labels_join))

1497508


In [3]:
words_labels_chunk = [words_labels_join[i:i + MAX_SEQUENCE_LENGTH] for i in range(0, len(words_labels_join), MAX_SEQUENCE_LENGTH)]
words_labels_chunk = words_labels_chunk[:-1]
random.shuffle(words_labels_chunk)

features = [[x[0] for x in sublist] for sublist in words_labels_chunk]
labels = [[x[1] for x in sublist] for sublist in words_labels_chunk]

### Vectorizing Data

In [4]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token=1)
tokenizer.fit_on_texts(features)
features_numeric = tokenizer.texts_to_sequences(features)

with open(TOKENIZER_NAME, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Splitting Data

In [7]:
length = len(features_numeric)
train_feature = features_numeric[:int(length*0.8)]
test_feature = features_numeric[int(length*0.8):]
train_label = labels[:int(length*0.8)]
test_label = labels[int(length*0.8):]

In [8]:
train_label = to_categorical(np.asarray(train_label))
test_label = to_categorical(np.asarray(test_label))

In [None]:
#lstm_model = Sequential()
#lstm_model.add(InputLayer(input_shape=(MAX_SEQUENCE_LENGTH, )))
#lstm_model.add(Embedding(MAX_NUM_WORDS+1, 128))
#lstm_model.add(LSTM(64, return_sequences=True))
#lstm_model.add(LSTM(64, return_sequences=True))
#lstm_model.add(TimeDistributed(Dense(output_dim=4, activation='softmax')))
#lstm_model.compile(loss='categorical_crossentropy',
#          optimizer='adam',
#          metrics=['categorical_accuracy'],
#          sample_weight_mode='temporal')
#lstm_model.summary()

In [None]:
#lstm_model.fit(np.array(train_feature), 
#               train_label, 
#               batch_size=128, 
#               epochs=25, 
#               validation_split=0.2)

In [16]:
def Transform(sequences, index):
    label_sequences = []
    for categorical_sequence in sequences:
        label_sequence = []
        for categorical in categorical_sequence:
            label_sequence.append(index[np.argmax(categorical)])
        label_sequences.append(label_sequence)
    return label_sequences

### Building model

In [22]:
blstm_model = Sequential()
blstm_model.add(InputLayer(input_shape=(MAX_SEQUENCE_LENGTH, )))
blstm_model.add(Embedding(MAX_NUM_WORDS, 128))
blstm_model.add(Bidirectional(LSTM(64, return_sequences=True)))
blstm_model.add(Bidirectional(LSTM(64, return_sequences=True)))
#blstm_model.add(Dropout(0.2))
blstm_model.add(TimeDistributed(Dense(4)))
blstm_model.add(Activation('softmax'))
blstm_model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'],
              sample_weight_mode='temporal')
 
blstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 80, 128)           2560000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 80, 128)           98816     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 80, 128)           98816     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 80, 4)             516       
_________________________________________________________________
activation_1 (Activation)    (None, 80, 4)             0         
Total params: 2,758,148
Trainable params: 2,758,148
Non-trainable params: 0
_________________________________________________________________


### Training

In [23]:
blstm_model.fit(np.array(train_feature), 
          train_label, 
          batch_size=128, 
          epochs=NUM_EPOCH,
          validation_split=0.2)

Train on 11979 samples, validate on 2995 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x10fba30f0>

In [24]:
blstm_model.save(BLSTM_CHECKPOINT_NAME) 

In [25]:
blstm_y_pred = blstm_model.predict(np.array(test_feature), batch_size=BATCH_SIZES, verbose=1)



In [26]:
label_index = {0:0, 1:1, 2:2, 3:3}
text_label_trans = Transform(test_label, label_index)
blstm_y_pred_trans = Transform(blstm_y_pred, label_index)
print(classification_report(np.array(text_label_trans).flatten(), np.array(blstm_y_pred_trans).flatten(), labels=[0, 1, 2, 3]))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94    264993
           1       0.26      0.23      0.24     13813
           2       0.41      0.36      0.39     19383
           3       0.15      0.06      0.08      1331

    accuracy                           0.87    299520
   macro avg       0.44      0.40      0.41    299520
weighted avg       0.86      0.87      0.87    299520



### Demo

In [28]:
str_input = 'this is a string of text with no punctuation this is a new sentence' 
str_split = str_input.split()
str_chunk = [str_split[i:i + MAX_SEQUENCE_LENGTH] for i in range(0, len(str_split), MAX_SEQUENCE_LENGTH)]
str_numeric = np.array(tokenizer.texts_to_sequences(str_chunk))
str_pad = pad_sequences(str_numeric, MAX_SEQUENCE_LENGTH, padding='post')
blstm_str_pred = blstm_model.predict(str_pad, batch_size=64, verbose=1)
blstm_str_trans = Transform(blstm_str_pred, label_index)

result = []
for row, chunk in enumerate(str_chunk):
    for col, word in enumerate(chunk):
        if blstm_str_trans[row][col] == 0:
            result.append(word)
        if blstm_str_trans[row][col] == 1:
            result.append(word)
            result.append('<comma>')
        if blstm_str_trans[row][col] == 2:
            result.append(word)
            result.append('<period>')
        if blstm_str_trans[row][col] == 3:
            result.append(word)
            result.append('<question_mark>')
print(' '.join(result))

this is a string of text with no punctuation . this is a new sentence .
