### Imports/Data

In [None]:
import pandas as pd
import math
import numpy as np
from collections import Counter
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras.callbacks import EarlyStopping
from keras import metrics, optimizers
from keras import backend as K

# Reserved marker characters (unknown, padding, start-of-word, end-of-word).
# Hindi letters are used because they are not expected to occur in the data,
# so they cannot be confused with real input characters.
UNK, PAD, START, END = 'अ', 'आ', 'श', 'स'

# Load the processed dataset and keep it as a plain NumPy array.
dataset = pd.read_csv(
    r"../data/processed_dataset.csv"
).to_numpy()

### Construct Character Vocabulary

1. Get two lists, one containing all of the characters used in the original documents and one containing all of the characters used in the modernized documents.
2. Add the special `UNK`, `PAD`, `START`, and `END` characters to each list.
3. Create a vocabulary (i.e. a dictionary) mapping each character to an integer.

In [None]:
# Original tokens are in the first column and modernized tokens are in the second column
original = list(dataset[:, 0])
modernized = list(dataset[:, 1])


def _build_vocab(words, specials):
    """Build a sorted character list and a character -> index vocabulary.

    Args:
        words: iterable of strings whose characters make up the vocabulary.
        specials: extra characters (UNK, PAD, START, END) to include.

    Returns:
        A ``(characters, vocab)`` pair: the sorted list of unique characters
        and a dict mapping each character to its position in that list.
    """
    characters = {character for word in words for character in word}
    # Add the specials BEFORE de-duplicating: if one of them also occurs in
    # the data, appending it afterwards would list it twice and leave a gap
    # in the index range (len(vocab) would be smaller than the largest index).
    characters.update(specials)
    ordered = sorted(characters)
    return ordered, {character: index for index, character in enumerate(ordered)}


input_characters, input_vocab = _build_vocab(original, [UNK, PAD, START, END])
labels_characters, labels_vocab = _build_vocab(modernized, [UNK, PAD, START, END])

### Vectorization and Batch Generation

In [None]:
def vectorize_sequence(seq, vocab):
    """Convert a sequence of tokens into the integers assigned to them in vocab.

    Any token that is not a key of ``vocab`` is encoded as the UNK token.
    """
    # Single pass: look the token up directly, falling back to UNK's index.
    return [vocab[tok] if tok in vocab else vocab[UNK] for tok in seq]


def unvectorize_sequence(seq, vocab):
    """Convert a sequence of integer indices back into their tokens.

    The lookup uses the index stored as each token's value in ``vocab``, not
    the position of the key in the dict, so the result is correct regardless
    of the order in which the vocabulary was built.

    Args:
        seq: iterable of integer indices.
        vocab: dict mapping token -> integer index.

    Returns:
        A list with the token for each index in ``seq``.

    Raises:
        IndexError: if an index in ``seq`` is not assigned to any token.
    """
    index_to_token = {index: token for token, index in vocab.items()}
    try:
        return [index_to_token[i] for i in seq]
    except KeyError as err:
        # Keep the exception type of the previous list-based lookup.
        raise IndexError(f"index {err.args[0]!r} is not in the vocabulary") from err


def one_hot_encode_label(character, labels):
    """Return a one-hot float vector marking where character occurs in labels.

    The vector has one entry per element yielded by iterating ``labels``
    (for a dict, its keys): 1.0 where the element equals ``character`` and
    0.0 everywhere else.
    """
    return np.array([float(candidate == character) for candidate in labels])


def pad_sequences(sequences, pad_length, pad_value):
    """Right-pad every sequence shorter than pad_length with pad_value.

    The list ``sequences`` is updated in place and also returned. Sequences
    that already have ``pad_length`` or more elements are left untouched.
    """
    for position, current in enumerate(sequences):
        missing = pad_length - len(current)
        if missing > 0:
            sequences[position] = current + [pad_value] * missing
    return sequences

def _finalize_batch(batch_x, batch_y, vocab, labels_vocab):
    """Pad one batch to a common length and convert it to a pair of arrays."""
    # Inputs and labels may differ in length, so pad both to the longest of either.
    pad_length = max(len(sequence) for sequence in batch_x + batch_y)
    batch_x = pad_sequences(batch_x, pad_length, vocab[PAD])
    batch_y = pad_sequences(batch_y, pad_length, one_hot_encode_label(PAD, labels_vocab))
    return np.array(batch_x), np.array(batch_y)


def batch_generator(data, labels, vocab, labels_vocab, batch_size=1):
    """Endlessly generate padded training batches.

    Each word and its normalized form are wrapped in START/END markers. The
    input word is encoded as a list of vocabulary indices and the normalized
    word as a list of one-hot vectors over ``labels_vocab``. Every sequence
    in a batch is padded with PAD to the length of the longest one.

    Args:
        data: sequence of input words (strings).
        labels: sequence of normalized words, aligned with ``data``.
        vocab: dict mapping input character -> index.
        labels_vocab: dict mapping label character -> index.
        batch_size: number of samples per batch.

    Yields:
        ``(batch_x, batch_y)`` tuples of NumPy arrays. The last batch of each
        pass over the data may hold fewer than ``batch_size`` samples.

    Raises:
        ValueError: if a pass over ``data``/``labels`` produces no samples.
    """
    while True:
        batch_x = []
        batch_y = []
        yielded = False
        for word, normalized_word in zip(data, labels):
            word = START + word + END
            normalized_word = START + normalized_word + END
            batch_x.append(vectorize_sequence(word, vocab))
            batch_y.append([one_hot_encode_label(character, labels_vocab) for character in normalized_word])
            if len(batch_x) >= batch_size:
                yield _finalize_batch(batch_x, batch_y, vocab, labels_vocab)
                yielded = True
                batch_x = []
                batch_y = []
        if batch_x:
            # Emit the leftover samples instead of discarding them; this also
            # covers the case where batch_size is larger than the dataset.
            yield _finalize_batch(batch_x, batch_y, vocab, labels_vocab)
        elif not yielded:
            # Nothing to yield on this pass: fail loudly rather than spin forever.
            raise ValueError("batch_generator received no samples")
                

### Model

In [None]:
def make_model(input_vocab, labels_vocab, embedding_size, hidden_size, dropout):
    """Assemble and compile the character-level sequence labelling model.

    The network is an embedding layer, dropout, a bidirectional LSTM that
    returns the full sequence, dropout again, and a per-timestep softmax over
    the label vocabulary. It is compiled with Adadelta (gradient norm clipped
    to 1.0), categorical cross-entropy loss and the accuracy metric.
    """
    input_dim = len(input_vocab)
    output_dim = len(labels_vocab)

    layers = [
        Embedding(input_dim, embedding_size),
        Dropout(dropout),
        Bidirectional(LSTM(hidden_size, return_sequences=True)),
        Dropout(dropout),
        TimeDistributed(Dense(output_dim, activation='softmax')),
    ]
    model = Sequential(layers)

    model.compile(
        optimizer=optimizers.Adadelta(clipnorm=1.0),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

### Training


In [None]:
# Embedding size 100, LSTM hidden size 10, dropout rate 0.3
model = make_model(input_vocab, labels_vocab, 100, 10, 0.3)

# Shuffle the dataset in place, with a fixed seed so the split is repeatable
np.random.seed(1)
np.random.shuffle(dataset)

# Split dataset into three parts: 80% training, 10% dev, 10% test
train, dev, test = np.split(dataset, [math.floor(.8*len(dataset)), math.floor(.9*len(dataset))])

# Column 0 holds the original tokens (inputs), column 1 the modernized tokens (labels)
train_x = train[:, 0]
train_y = train[:, 1]
dev_x = dev[:, 0]
dev_y = dev[:, 1]
test_x = test[:, 0]
test_y = test[:, 1]

batch_size = 500
epochs = 1

# steps_per_epoch must be an integer number of batches; round up rather than
# passing the float result of the division.
model.fit_generator(batch_generator(train_x, train_y, input_vocab, labels_vocab, batch_size),
                    epochs=epochs,
                    steps_per_epoch=math.ceil(len(train_x) / batch_size),
                    callbacks=[EarlyStopping(monitor="acc", patience=2)])
#print("Evaluating...")
#print(len(x_eval))
#print(len(y_eval))
#loss, acc = model.evaluate(x_eval, y_eval, batch_size=50, verbose=1)