### Imports/Data

In [1]:
import pandas as pd
import math
import numpy as np
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras.callbacks import EarlyStopping
from keras import metrics, optimizers
from keras import backend as K

# Reserved marker characters. Hindi letters are used because they won't be seen in the data,
# so a marker can never be confused with a real character.
START = 'श'  # start-of-word marker
END = 'स'    # end-of-word marker
PAD = 'आ'    # filler used to bring sequences to a common length
UNK = 'अ'    # stand-in for characters that are missing from a vocabulary

# Read the processed dataset and keep it as a NumPy array
DATASET_PATH = r"../data/processed_dataset.csv"
dataset = pd.read_csv(DATASET_PATH).to_numpy()

Using TensorFlow backend.


### Construct Character Vocabulary

1. Get two lists, one containing all of the characters used in the original documents and one containing all of the characters used in the modernized documents.
2. Add the special `UNK`, `PAD`, `START`, and `END` characters to each list.
3. Create a vocabulary (i.e. a dictionary) mapping each character to an integer.

In [2]:
def _character_inventory(words):
    """Return the sorted list of unique characters in `words` plus the special markers.

    The special characters are merged in with a set union, so a marker that also
    happens to occur in the data is listed only once. Duplicates would leave gaps
    in the vocabulary indices and make `len(vocab)` smaller than the largest index + 1.
    """
    characters = {character for word in words for character in word}
    return sorted(characters | {UNK, PAD, START, END})


# Original tokens are in the first column and modernized tokens are in the second column
original = list(dataset[:, 0])
modernized = list(dataset[:, 1])

# Each vocabulary maps a character to its position in the sorted character list
input_characters = _character_inventory(original)
input_vocab = {character: index for index, character in enumerate(input_characters)}

labels_characters = _character_inventory(modernized)
labels_vocab = {character: index for index, character in enumerate(labels_characters)}

### Vectorization and Batch Generation

In [3]:
def vectorize_sequence(seq, vocab):
    """Convert a sequence of tokens into the list of their vocabulary indices.

    Tokens that are not keys of `vocab` are looked up as the UNK token instead.
    """
    indices = []
    for tok in seq:
        # Fall back to UNK only when the token itself is unknown
        key = tok if tok in vocab else UNK
        indices.append(vocab[key])
    return indices


def unvectorize_sequence(seq, vocab):
    """Takes a sequence of integers and returns the corresponding sequence of vocabulary tokens.

    The lookup inverts `vocab` using the index stored for each token, so the
    result does not depend on the order in which tokens were inserted into the
    dictionary.

    Raises:
        KeyError: if an integer in `seq` is not the index of any token in `vocab`.
    """
    index_to_token = {index: token for token, index in vocab.items()}
    return [index_to_token[i] for i in seq]


def one_hot_encode_label(character, labels):
    """Build a one-hot float vector for `character` over the entries of `labels`.

    The vector has one position per item yielded by iterating `labels`; a position
    is 1.0 where the item equals `character` and 0.0 everywhere else.
    """
    flags = (1.0 if candidate == character else 0.0 for candidate in labels)
    return np.fromiter(flags, dtype=float)


def pad_sequences(sequences, pad_length, pad_value):
    """Takes a batch of sequences of different lengths and pads them with `pad_value` so that they are all the same length.

    A new list is returned and `sequences` itself is left unmodified, so the
    caller's data is never changed behind its back.

    Args:
        sequences: iterable of lists to pad.
        pad_length: target length; sequences already at least this long are kept as they are.
        pad_value: element appended (repeatedly) to the end of each short sequence.

    Returns:
        A list with one entry per input sequence, each at least `pad_length` long.
    """
    padded = []
    for sequence in sequences:
        missing = pad_length - len(sequence)
        if missing > 0:
            # Concatenation builds a new list, leaving the original sequence untouched
            sequence = sequence + [pad_value] * missing
        padded.append(sequence)
    return padded

def batch_generator(data, labels, vocab, labels_vocab, batch_size=1, max_length=10):
    """Generates batches of samples for training, cycling over the data indefinitely.

    Every word is truncated to `max_length` characters and wrapped in the START and
    END markers. Inputs become lists of vocabulary indices, labels become lists of
    one-hot vectors, and all sequences in a batch are padded to the length of the
    longest one.

    Args:
        data: iterable of input words.
        labels: iterable of target (normalized) words, aligned with `data`.
        vocab: character -> index mapping for the inputs.
        labels_vocab: character -> index mapping for the targets.
        batch_size: number of samples per batch.
        max_length: number of leading characters kept from each word
            (None keeps the whole word).

    Yields:
        Tuples `(batch_x, batch_y)` of NumPy arrays. The last batch of each pass
        over the data may hold fewer than `batch_size` samples; it is still
        yielded so that no sample is silently skipped.

    Raises:
        ValueError: if a full pass over the data produces no samples at all
            (otherwise the generator would loop forever without yielding).
    """
    # Loop-invariant padding values, computed once instead of once per batch
    pad_index = vocab[PAD]
    pad_label = one_hot_encode_label(PAD, labels_vocab)

    def assemble(batch_x, batch_y):
        """Pad both halves of a batch to a common length and convert them to arrays."""
        pad_length = max(len(sequence) for sequence in batch_x + batch_y)
        return (np.array(pad_sequences(batch_x, pad_length, pad_index)),
                np.array(pad_sequences(batch_y, pad_length, pad_label)))

    while True:
        batch_x = []
        batch_y = []
        produced = False
        for word, normalized_word in zip(data, labels):
            word = START + word[:max_length] + END
            normalized_word = START + normalized_word[:max_length] + END
            batch_x.append(vectorize_sequence(word, vocab))
            batch_y.append([one_hot_encode_label(character, labels_vocab) for character in normalized_word])
            if len(batch_x) >= batch_size:
                yield assemble(batch_x, batch_y)
                produced = True
                batch_x = []
                batch_y = []
        if batch_x:
            # Leftover samples that did not fill a whole batch
            yield assemble(batch_x, batch_y)
            produced = True
        if not produced:
            raise ValueError("batch_generator received no samples to generate batches from")

### Model

In [4]:
def make_model(input_vocab, labels_vocab, embedding_size, hidden_size, dropout):
    """Assemble and compile the character-level Keras model.

    The layer stack is: embedding -> dropout -> bidirectional LSTM (returning the
    full sequence) -> dropout -> per-timestep softmax over the label vocabulary.
    The model is compiled with Adadelta (gradient norm clipped at 1.0),
    categorical cross-entropy loss and the accuracy metric.
    """
    input_dim = len(input_vocab)
    output_dim = len(labels_vocab)

    layers = [
        Embedding(input_dim, embedding_size),
        Dropout(dropout),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(dropout),
        TimeDistributed(Dense(output_dim, activation='softmax')),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer=optimizers.Adadelta(clipnorm=1.0),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

### Training


In [6]:
# Shuffle a copy of the dataset. Shuffling `dataset` itself in place would make this
# cell non-idempotent: re-running it would shuffle already-shuffled data and change the split.
np.random.seed(1)
shuffled_dataset = dataset.copy()
np.random.shuffle(shuffled_dataset)

# Split dataset into three parts: 80% training, 10% dev, 10% test
split_points = [math.floor(.8*len(shuffled_dataset)), math.floor(.9*len(shuffled_dataset))]
train, dev, test = np.split(shuffled_dataset, split_points)

# Original words are the inputs, modernized words are the targets
train_x = train[:, 0]
train_y = train[:, 1]

batch_size = 1000
epochs = 3

model = make_model(input_vocab, labels_vocab, embedding_size=200, hidden_size=200, dropout=0.4)
# NOTE(review): patience=5 is larger than epochs=3, so EarlyStopping cannot trigger with these settings.
model.fit_generator(batch_generator(train_x, train_y, input_vocab, labels_vocab, batch_size),
                    epochs=epochs,
                    # A step count must be a whole number; round up so a partial final batch still counts as a step
                    steps_per_epoch=math.ceil(len(train_x) / batch_size),
                    callbacks=[EarlyStopping(monitor="acc", patience=5)])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x23f66297e80>

### Evaluation

Use `model.evaluate()`, not `model.evaluate_generator()`, which is incredibly slow since it pushes one sample through the network at a time. On the other hand, `model.evaluate()` allows for a large batch size.

In [7]:
# Original words (inputs) and modernized words (targets) of the dev split
dev_words = dev[:, 0]
dev_labels = dev[:, 1]

# Inputs: each word is cut to 10 characters, wrapped in START/END and mapped to vocabulary indices
dev_x = [vectorize_sequence(START + word[0:10] + END, input_vocab) for word in dev_words]

# Labels: same truncation and wrapping, then one one-hot vector per character
dev_y = [
    [one_hot_encode_label(label, labels_vocab) for label in START + normalized_word[0:10] + END]
    for normalized_word in dev_labels
]

# Pad everything to the length of the longest sequence so the lists become rectangular arrays
pad_length = max(len(sequence) for sequence in dev_x + dev_y)
dev_x = np.array(pad_sequences(dev_x, pad_length, input_vocab[PAD]))
dev_y = np.array(pad_sequences(dev_y, pad_length, one_hot_encode_label(PAD, labels_vocab)))

loss, acc = model.evaluate(dev_x, dev_y, batch_size=1000, verbose=1)

print('Loss:', loss, 'Acc:', acc)

Loss: 0.6172433609311805 Acc: 0.8793510425235568


#### Evaluation After Removing PAD Characters

Dev accuracy is relatively good (about 88% in the run above). However, there are a lot of padding tokens in the character sequences that the model outputs. For example, the output for the word "en" is "enआआआआआआआआआआआआआआआआआआआआआआआआ" because all words are padded to the maximum length. We need to remove all of the padding tokens in order to get a more useful accuracy metric.


In [9]:
# Possible output characters; labels_vocab was built with enumerate, so labels_list[i] is the character with index i
labels_list = list(labels_vocab.keys())

# Each prediction is something like a 26 x 57 array, where 26 is the number of characters in the word and 57 is the size of the vocabulary
dev_predictions = model.predict(dev_x, batch_size = 1000)

# For every character position, take the vocabulary index with the highest probability according to the model
predicted_indices = np.argmax(dev_predictions, axis=-1)

# Turn each row of indices back into characters and glue them into a word
predicted_words = [
    "".join(labels_list[vocab_index] for vocab_index in word_indices)
    for word_indices in predicted_indices
]
print(predicted_words[0])

# Strip every PAD, START and END character from each predicted word
predicted_words = [
    word.replace(PAD, "").replace(START, "").replace(END, "")
    for word in predicted_words
]

# The model only ever saw the first 10 characters of each label, so compare against those
dev_labels_truncated = [word[0:10] for word in dev_labels]

results = np.column_stack((predicted_words, dev_labels_truncated))

# Accuracy is the fraction of rows where the cleaned-up prediction matches the label exactly
matches = [1 if predicted == expected else 0 for predicted, expected in results]
accuracy = np.mean(matches)
print(f"Accuracy after removing all padding characters: {accuracy}")

शdemasसआआआआआ
Accuracy after removing all padding characters: 0.6223014977834903
