# Typo correction #

## Preprocessing ##

In [89]:
from datetime import datetime
from typing import List
from keras import Model
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, TensorBoard, ModelCheckpoint
from keras.layers import Bidirectional
from keras.optimizers import Adam

class CharacterTokenizer:
    """Convert words to and from fixed-size one-hot character matrices.

    The alphabet consists of the ASCII letters (lower and upper case) plus
    space, underscore and hyphen. An encoded word is a matrix with
    ``max_word_len`` rows (one per character position) and one column per
    character of the alphabet.
    """

    def __init__(self, max_word_len: int):
        """Create the tokenizer and build its lookup tables.

        Args:
            max_word_len: Number of rows of every matrix produced by
                ``encode_one_hot``.
        """
        self._charset = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ _-')
        self._char_to_index_dict = {}
        self._index_to_char_dict = {}
        self._max_word_len = max_word_len
        self._init_translate_dicts()

    def _init_translate_dicts(self):
        """Fill the char -> index and index -> char tables from the charset."""
        for index, char in enumerate(self._charset):
            self._char_to_index_dict[char] = index
            self._index_to_char_dict[index] = char

    def get_charset_len(self) -> int:
        """Return the number of characters in the alphabet."""
        return len(self._charset)

    def encode_one_hot(self, word: str) -> 'np.ndarray':
        """Encode ``word`` as a ``(max_word_len, charset_len)`` float32 matrix.

        Row ``i`` has a single 1.0 in the column of the i-th character; rows
        past the end of ``word`` stay all-zero.

        Raises:
            KeyError: If ``word`` contains a character outside the alphabet.
            IndexError: If ``word`` is longer than ``max_word_len``.
        """
        one_hot = np.zeros((self._max_word_len, len(self._charset)), dtype=np.float32)
        for index, char in enumerate(word):
            one_hot[index, self._char_to_index_dict[char]] = 1.0
        return one_hot

    def decode_one_hot_prediction(self, one_hot_matrix: List) -> str:
        """Decode a matrix of per-position scores into a word.

        Every row contributes the character whose column holds the highest
        score, so the result always has one character per row (an all-zero
        row therefore decodes to the first character of the alphabet).
        """
        return ''.join(
            self._index_to_char_dict[int(np.argmax(vector))]
            for vector in one_hot_matrix
        )

    def decode_word(self, one_hot: List) -> str:
        """Decode an exact one-hot matrix back into a word.

        Rows may be NumPy arrays or plain Python sequences. Rows whose
        maximum is zero are skipped; every other row contributes the
        character at the first column that equals 1.0.

        Raises:
            IndexError: If a non-zero row contains no entry equal to 1.0.
        """
        chars = []
        for encoded_char in one_hot:
            # Convert first so the element-wise comparison below also works
            # for plain lists, not only for ndarray rows.
            row = np.asarray(encoded_char)
            if row.max() != 0:
                hot_columns = np.flatnonzero(row == 1.0)
                chars.append(self._index_to_char_dict[int(hot_columns[0])])
        return ''.join(chars)

In [90]:
from typing import List, Tuple
import numpy as np
from math import ceil

def load_dataset(filename: str, encoding: str = 'utf-8') -> Tuple:
    """Load a two-column, tab-separated text file.

    Each non-blank line is split on tab characters; the first field is
    appended to ``x`` and the second to ``y``. Any further fields are
    ignored. Blank lines are skipped, so an empty file yields two empty
    lists.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A tuple ``(x, y)`` of two lists of strings with equal length.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    x = []
    y = []
    with open(filename, encoding=encoding) as f:
        content = f.read().strip()
    for line in content.split('\n'):
        # Skip blank lines instead of failing on the missing second field.
        if not line.strip():
            continue
        elements = line.split('\t')
        x.append(elements[0])
        y.append(elements[1])
    return x, y

def add_padding_to_dataset(x: List, y: List, max_len:int) -> Tuple:
    """Right-pad every string of ``x`` and ``y`` with spaces up to ``max_len``.

    Both lists are modified in place and also returned. Strings that are
    already ``max_len`` characters or longer are left untouched. Only the
    first ``len(x)`` entries of ``y`` are processed.
    """
    for position, word in enumerate(x):
        # ljust never truncates, so over-long strings pass through unchanged.
        x[position] = word.ljust(max_len)
        y[position] = y[position].ljust(max_len)
    return x, y

def get_max_char_len(x, y) -> int:
    """Return the length of the longest item found in either ``x`` or ``y``.

    Raises:
        ValueError: If ``x`` or ``y`` is empty.
    """
    longest_in_x = max(map(len, x))
    longest_in_y = max(map(len, y))
    return max(longest_in_x, longest_in_y)

def encode(x: List, y: List, tokenizer: CharacterTokenizer) -> Tuple:
    """Replace every entry of ``x`` and ``y`` with its one-hot encoding.

    Both lists are overwritten in place, entry by entry, with the result of
    ``tokenizer.encode_one_hot`` and are also returned. Only the first
    ``len(x)`` entries of ``y`` are processed.
    """
    to_one_hot = tokenizer.encode_one_hot
    for position, raw_word in enumerate(x):
        x[position] = to_one_hot(raw_word)
        y[position] = to_one_hot(y[position])
    return x, y

def split_data(x: List, y: List, ratio: float = 0.8) -> Tuple:
    """Split ``x`` and ``y`` into leading (train) and trailing (test) parts.

    The first ``ceil(len(x) * ratio)`` entries of each list form the training
    part, the remainder the test part. No shuffling is performed.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as NumPy arrays.
    """
    boundary = ceil(len(x) * ratio)
    x_train, x_test = x[:boundary], x[boundary:]
    y_train, y_test = y[:boundary], y[boundary:]
    return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

In [91]:
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense

def autoencoder(input_shape) -> Model:
    """Build an (uncompiled) stack of Dense layers over ``input_shape``.

    ``input_shape`` is a 2-tuple whose second entry ``width`` sets the layer
    sizes. Seven ReLU layers with ``width - 5``, ``width - 15``,
    ``width - 20``, ``1000``, ``width - 20``, ``width - 15`` and
    ``width - 10`` units are followed by a sigmoid layer with ``width``
    units.
    """
    _, width = input_shape

    inputs = Input(shape=input_shape)

    # Encoder sizes, the wide middle layer, then the decoder sizes. Note the
    # last decoder layer uses width - 10 while the first encoder layer uses
    # width - 5.
    relu_units = (
        width - 5, width - 15, width - 20,
        1000,
        width - 20, width - 15, width - 10,
    )
    hidden = inputs
    for units in relu_units:
        hidden = Dense(units, activation='relu')(hidden)

    outputs = Dense(width, activation='sigmoid')(hidden)
    return Model(inputs, outputs)

def autoencoder_(input_shape) -> Model:
    """Build an (uncompiled) Dense autoencoder model over ``input_shape``.

    ``input_shape`` is a 2-tuple; its second entry ``x`` determines the
    layer sizes. Three ReLU encoder layers (``x - 5``, ``x - 15``,
    ``x - 20`` units) feed a 1000-unit ReLU layer, followed by three ReLU
    decoder layers (``x - 20``, ``x - 15``, ``x - 10`` units) and a sigmoid
    output layer with ``x`` units.
    """
    _, x = input_shape

    input_layer = Input(input_shape)

    encoder = Dense(x - 5, activation='relu')(input_layer)
    encoder = Dense(x - 15, activation='relu')(encoder)
    encoder = Dense(x - 20, activation='relu')(encoder)

    bottleneck = Dense(1000, activation='relu')(encoder)

    decoder = Dense(x - 20, activation='relu')(bottleneck)
    decoder = Dense(x - 15, activation='relu')(decoder)
    decoder = Dense(x - 10, activation='relu')(decoder)

    # Sigmoid keeps every output in [0, 1], one value per alphabet column.
    output_layer = Dense(x, activation='sigmoid')(decoder)
    model = Model(input_layer, output_layer)
    return model

## Training ##

In [92]:
import os

import tensorflow as tf
from tensorflow.keras import optimizers

# Load the word pairs, pad them to a common length and one-hot encode them.
X, Y = load_dataset('typo-corpus-r1.txt')
max_len = get_max_char_len(X, Y)
tokenizer = CharacterTokenizer(max_word_len=max_len)
X, Y = add_padding_to_dataset(X, Y, max_len)
X, Y = encode(X, Y, tokenizer)
X_train, Y_train, X_test, Y_test = split_data(X, Y, ratio=0.9)

# Build and compile the model.
ae = autoencoder_(input_shape=(max_len, tokenizer.get_charset_len()))
adam = optimizers.Adam(learning_rate=0.001, clipnorm=1)

ae.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

ae.summary()

# callbacks
# os.path.join keeps the log directory valid on every platform instead of
# hard-coding Windows backslashes.
log_dir = os.path.join("logs", "fit", datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

ae.fit(X_train, Y_train, epochs=50, batch_size=128, verbose=0,
       callbacks=[tensorboard_callback])

score = ae.evaluate(X_test, Y_test, verbose=0)
print(f'## \nTest set: \nLoss: {score[0]}\nAccuracy: {score[1]}')
print('## \nSample predictions:')

# Show at most 20 samples; iterating over the predictions themselves avoids
# an IndexError when the test split holds fewer than 20 rows.
predicted_y = ae.predict(X_test[:20])
for i in range(len(predicted_y)):
    print(f'Input: {tokenizer.decode_word(X_test[i])}')
    print(f'Predicted: {tokenizer.decode_one_hot_prediction(predicted_y[i])}')
    print(f'Real: {tokenizer.decode_word(Y_test[i])}\n')

# NOTE(review): recent Keras releases appear to require a `.keras` or `.h5`
# file name here rather than a directory; confirm against the installed version.
ae.save('models/')


AttributeError: Can't set the attribute "name", likely because it conflicts with an existing read-only @property of the object. Please choose a different name.