In [0]:
# Dissertation: syntax highlighter for ordinary English text
# Last updated: 01/09/2019

# Please upload BBCdataset.zip first

In [0]:
# Unpack the uploaded archive. The loading cell reads its files from a
# BBCdataset/ directory (presumably created by this archive -- confirm layout).
!unzip BBCdataset.zip

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Dense, Embedding, GRU, Concatenate
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
import numpy as np
import io

from os import listdir
from os.path import isfile, join
# Collect every regular file in the dataset directory. os.listdir returns
# names in arbitrary order, so sort them: the train/test split is taken by
# position from total_data and would otherwise differ between machines/runs.
onlyfiles = sorted(f for f in listdir('BBCdataset') if isfile(join('BBCdataset', f)))

total_data = []
skipped_files = []  # files that could not be decoded as UTF-8
for i in onlyfiles:
    with io.open(join('BBCdataset', i), encoding='utf-8') as f:
        try:
            data = f.read()
        except UnicodeDecodeError:
            # Only decoding failures are tolerated; any other error (missing
            # permissions, keyboard interrupt, ...) should surface normally.
            skipped_files.append(i)
            continue
        total_data.append(data)

if skipped_files:
    print('skipped', len(skipped_files), 'file(s) that are not valid UTF-8')

In [0]:
# Hold out the final 24 documents for testing; everything before them is
# used for training.
train_data, test_data = total_data[:-24], total_data[-24:]

# Character vocabulary: every distinct character that occurs anywhere in the
# corpus (training and test documents alike), in sorted order.
chars = sorted(set(''.join(total_data)))
# Lookup tables: character -> integer id, and integer id -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [0]:
# Two models are provided: a 3-layer GRU with residual connections, and the
# same architecture preceded by an embedding layer.
# Implementation of Appendix A
def residual_GRU():
    """Build the 3-layer GRU character model that consumes one-hot input.

    The input is a variable-length sequence of vectors of size ``len(chars)``.
    Three GRU layers (512, 256 and 128 units) each return full sequences.
    The skip connections are concatenations along the feature axis: the
    second GRU reads ``[input, gru1]``, the third reads ``[gru1, gru2]``, and
    the softmax output layer reads ``[gru2, gru3]``. The last GRU layer is
    named ``'target'``.

    Returns:
        An uncompiled Keras ``Model`` mapping the one-hot sequence to a
        per-timestep softmax distribution over ``len(chars)`` characters.
    """
    vocab_size = len(chars)
    one_hot_in = Input(shape=(None, vocab_size))

    gru_a = GRU(512, return_sequences=True)(one_hot_in)
    skip_a = Concatenate(axis=-1)([one_hot_in, gru_a])

    gru_b = GRU(256, return_sequences=True)(skip_a)
    skip_b = Concatenate(axis=-1)([gru_a, gru_b])

    gru_c = GRU(128, return_sequences=True, name='target')(skip_b)
    skip_c = Concatenate(axis=-1)([gru_b, gru_c])

    char_probs = Dense(vocab_size, activation='softmax')(skip_c)
    return Model(one_hot_in, char_probs)

def embedding_residual_GRU():
    """Build the 3-layer GRU character model with a learned embedding input.

    The input is a variable-length sequence of integer character ids (as
    produced by ``char_indices``). Ids are embedded into 64-dimensional
    vectors, then passed through three GRU layers (512, 256 and 128 units)
    that return full sequences. The skip connections are concatenations along
    the feature axis: the second GRU reads ``[embedding, gru1]``, the third
    reads ``[gru1, gru2]``, and the softmax output layer reads
    ``[gru2, gru3]``. The last GRU layer is named ``'target'``.

    Returns:
        An uncompiled Keras ``Model`` mapping the id sequence to a
        per-timestep softmax distribution over ``len(chars)`` characters.
    """
    input_char = Input(shape=(None,))
    # The embedding table must have one row per character id. Size it from
    # the vocabulary rather than a hard-coded 88, so it stays in step with
    # the output layer and with whatever corpus was loaded.
    embed = Embedding(len(chars), 64)(input_char)
    hidden1 = GRU(512, return_sequences=True)(embed)
    concat1 = Concatenate(axis=-1)([embed, hidden1])
    hidden2 = GRU(256, return_sequences=True)(concat1)
    concat2 = Concatenate(axis=-1)([hidden1, hidden2])
    hidden3 = GRU(128, return_sequences=True, name='target')(concat2)
    concat3 = Concatenate(axis=-1)([hidden2, hidden3])
    output = Dense(len(chars), activation='softmax')(concat3)
    model = Model(input_char, output)

    return model

def encoding_input_output(structure):
    """Encode the global training windows as model inputs and targets.

    Reads the module-level ``prev_sentences``, ``next_sentences``, ``maxlen``,
    ``chars`` and ``char_indices``.

    Args:
        structure: ``'non-embedding'`` to one-hot encode the inputs (for the
            model without an embedding layer), or ``'embedding'`` to encode
            the inputs as integer character ids.

    Returns:
        A tuple ``(x, y)``. ``y`` is always a one-hot array of shape
        ``(len(next_sentences), maxlen, len(chars))``. ``x`` is a one-hot
        array of shape ``(len(prev_sentences), maxlen, len(chars))`` for
        ``'non-embedding'``, or an array of character ids built from
        ``prev_sentences`` for ``'embedding'``.

    Raises:
        ValueError: if ``structure`` is not one of the two supported values.
    """
    # Fail loudly on a typo instead of silently returning None, which would
    # only surface later as a confusing unpacking error in the caller.
    if structure not in ('non-embedding', 'embedding'):
        raise ValueError(
            "structure must be 'non-embedding' or 'embedding', got %r" % (structure,)
        )

    def one_hot(sentences):
        # One row per sentence, one column per time step, one slot per character.
        encoded = np.zeros((len(sentences), maxlen, len(chars)))
        for i, sentence in enumerate(sentences):
            for t, char in enumerate(sentence):
                encoded[i, t, char_indices[char]] = 1
        return encoded

    if structure == 'non-embedding':
        x = one_hot(prev_sentences)
    else:
        # The embedding layer looks vectors up by id, so pass raw indices.
        x = np.array([[char_indices[char] for char in sentence]
                      for sentence in prev_sentences])
    y = one_hot(next_sentences)
    return x, y

In [0]:
# Use the 3-layer GRU with residual connections (one-hot input, no embedding
# layer) as the example model. The embedding variant is built with
# embedding_residual_GRU() and needs inputs encoded with 'embedding' instead.
model = residual_GRU()

In [0]:
# Slice every training document into overlapping windows of maxlen
# characters, advancing step characters each time. Each input window is
# paired with the same window shifted one character to the right, which
# serves as the next-character target.
maxlen = 100
step = 40
prev_sentences = []
next_sentences = []
for text in train_data:
    # The range stops early enough that the shifted target window still fits.
    for start in range(0, len(text) - maxlen, step):
        end = start + maxlen
        prev_sentences.append(text[start:end])
        next_sentences.append(text[start + 1:end + 1])
print('number sequences:', len(prev_sentences))

In [0]:
# Encode the training windows for the model without an embedding layer:
# x holds one-hot inputs and y holds one-hot next-character targets, both of
# shape (number of windows, maxlen, len(chars)).
x, y = encoding_input_output('non-embedding')

In [0]:
# Optimiser: Adam with the AMSGrad variant enabled; other settings are left
# at their Keras defaults.
adam = Adam(amsgrad=True)

# Watch the validation loss with a patience of 3 epochs, and restore the
# weights from the best epoch when training stops.
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for at most 100 epochs in mini-batches of 64, with 5% of the samples
# held out for validation; early stopping normally ends the run sooner.
history = model.fit(
    x,
    y,
    epochs=100,
    batch_size=64,
    validation_split=0.05,
    callbacks=[early_stopping_cb],
)