In [38]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Configurations

In [39]:
# Alphabet the model knows: lower-case Latin letters plus the German umlauts and ß.
letters = "abcdefghijklmnopqrstuvwxyzäüöß"

# Index 0 of `symbols` is the blank; word_to_indices also uses index 0 as padding.
symbols = " " + letters
symbol_count = len(symbols)

# Fixed number of symbol slots per encoded word (longer words are skipped while loading).
word_size = 32

# Length of the vector learned for every symbol.
vector_size = 16


# Data

In [40]:
def new_words_gen(filename, accent_char="'"):
    """Yield ``(word, accent)`` pairs read from an accent-annotated word list.

    Every line of ``filename`` (UTF-8) is stripped and lower-cased. The first
    occurrence of ``accent_char`` marks the accent; it is removed from the
    word and its index in the annotated line becomes the label.

    Lines are skipped when they
      * contain no accent mark,
      * still contain an accent mark after the first one was removed, or
      * are longer than ``word_size`` once the mark is removed.

    :param filename: path of the text file, one annotated word per line.
    :param accent_char: the character used as accent mark.
    :return: generator of ``(word_without_mark, accent_index)`` tuples.
    """
    with open(filename, 'r', encoding='utf8') as file:
        for raw_line in file:
            marked = raw_line.strip().lower()

            position = marked.find(accent_char)
            if position < 0:
                # No accent annotation on this line.
                continue

            # Cut out the single character at the accent position.
            word = marked[:position] + marked[position + 1:]

            # Skip words with several accents and words that are too long.
            if accent_char in word or len(word) > word_size:
                continue

            yield word, position


# Smoke check: display the first (word, accent index) pair parsed from the data file.
next(new_words_gen('./data/data.txt'))


('aachen', 2)

In [55]:
def word_to_indices(word):
    """Encode ``word`` as a list of exactly ``word_size`` symbol indices.

    Position ``i`` of the result always describes character ``i`` of ``word``,
    so an accent label that is an index into the word stays valid for the
    encoded sequence.

    * Characters are lower-cased one by one, because ``symbols`` only holds
      lower-case letters.
    * Characters that are not in ``symbols`` are encoded as ``0`` (the blank /
      padding symbol) rather than being dropped; dropping them would shift
      every later letter one slot to the left.
    * Words longer than ``word_size`` are truncated, shorter ones are
      right-padded with ``0``.

    :param word: the word to encode.
    :return: list of ``word_size`` ints, each an index into ``symbols``.
    """
    indices = []
    for c in word[:word_size]:
        lowered = c.lower()
        # str.find also matches multi-character substrings, so only look up
        # results of lower() that are still a single character.
        index = symbols.find(lowered) if len(lowered) == 1 else -1
        indices.append(index if index != -1 else 0)
    indices.extend([0] * (word_size - len(indices)))
    return indices

def new_indices_gen(words_gen):
    """Turn a stream of ``(word, accent)`` pairs into ``(indices, accent)`` pairs.

    Each word is encoded with ``word_to_indices``; the accent value is passed
    through unchanged.

    :param words_gen: iterable of ``(word, accent)`` tuples.
    :return: generator of ``(indices, accent)`` tuples.
    """
    yield from (
        (word_to_indices(text), accent_position)
        for text, accent_position in words_gen
    )


# Smoke check: print the first encoded sample as (padded index list, accent index).
print(next(new_indices_gen(new_words_gen('./data/data.txt'))))

([1, 1, 3, 8, 5, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 2)


In [42]:
# Materialise every (indices, accent) sample as two parallel arrays.
indices, labels = zip(*new_indices_gen(new_words_gen('./data/data.txt')))
indices = np.array(indices)
labels = np.array(labels)
dataset_size = len(indices)

dataset = tf.data.Dataset.from_tensor_slices((indices, labels))
del indices, labels

train_set_size = int(dataset_size * 0.7)

# Shuffle exactly once before splitting.
#  * reshuffle_each_iteration=False (with a fixed seed) keeps the order stable,
#    so take() and skip() always cut the same, disjoint train/validation
#    subsets. With the default (True) every epoch draws a new order and
#    validation samples end up in the training set.
#  * The buffer covers the whole dataset, so the shuffle is uniform instead of
#    only local to a 1024-element window of the file order.
dataset = dataset.shuffle(max(dataset_size, 1), seed=0, reshuffle_each_iteration=False)

# Only the training part is reshuffled on every epoch.
train_dataset = dataset.take(train_set_size).shuffle(max(train_set_size, 1))
valid_dataset = dataset.skip(train_set_size)
del dataset, train_set_size


# Letter to Vectors

In [43]:
class LetterToVector(tf.keras.layers.Layer):
    """Trainable lookup table that maps symbol indices to vectors.

    The table ``vectors`` has shape ``(symbol_count, vector_size)``: one row
    per entry of ``symbols``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Keyword arguments: the positional order of add_weight is not the
        # same in every Keras version.
        self.vectors = self.add_weight(
            name='vectors',
            shape=(symbol_count, vector_size),
            dtype=tf.float32,
        )

    def call(self, inputs, **kwargs):
        """Replace every index in ``inputs`` by its row of the table.

        :param inputs: integer tensor of symbol indices.
        :return: float tensor of shape ``inputs.shape + (vector_size,)``.
        """
        # Plain row lookup. No batch_dims: the table is shared by all examples
        # and has no leading batch axis of its own.
        return tf.gather(self.vectors, inputs, axis=0)


# LetterToVector()(tf.random.uniform((10, word_size), 0, symbol_count, tf.int32)).shape


# BRCNN

In [44]:
class BRCNN(tf.keras.layers.Layer):
    """Bidirectional recurrent layer unrolled over the ``word_size`` letter slots.

    Three small two-layer dense networks ("cells") are used:

    * ``forward_cell`` walks the word left to right,
    * ``backward_cell`` walks it right to left,
    * ``union_cell`` merges, per position, the forward and backward memories.

    A recurrent cell receives the current letter vector concatenated with its
    previous memory and returns the new memory, which is also its output.

    :param memory_size: width of the recurrent memory and of every output vector.
    :param hidden_size: width of the hidden dense layer inside each cell.
    """

    def __init__(self, memory_size=16, hidden_size=64, **kwargs):
        super().__init__(**kwargs)
        self.memory_size = memory_size
        self.hidden_size = hidden_size
        # Input width is letter vector + memory; it only equals vector_size * 2
        # when both happen to have the same width.
        self.forward_cell = self._make_dense_cell(vector_size + memory_size)
        self.backward_cell = self._make_dense_cell(vector_size + memory_size)
        self.union_cell = self._make_dense_cell(memory_size * 2)

    def _make_dense_cell(self, input_size):
        """Build one tanh cell: ``input_size -> hidden_size -> memory_size``."""
        return tf.keras.Sequential([
            tf.keras.layers.Dense(self.hidden_size, activation='tanh', input_shape=[input_size]),
            tf.keras.layers.Dense(self.memory_size, activation='tanh'),
        ])

    def _unroll(self, cell, vectors, batch_size):
        """Run ``cell`` over ``vectors`` in order, starting from a zero memory.

        Returns the list of memories, one per step, in the order visited.
        """
        memory = tf.zeros([batch_size, self.memory_size])
        outputs = []
        for vector in vectors:
            memory = cell(tf.concat([vector, memory], axis=-1))
            outputs.append(memory)
        return outputs

    def get_config(self):
        """Return the layer config including the constructor arguments."""
        config = super().get_config()
        config.update({'memory_size': self.memory_size, 'hidden_size': self.hidden_size})
        return config

    def call(self, inputs, **kwargs):
        """Apply the bidirectional recurrence.

        :param inputs: tensor whose second-to-last axis has length
            ``word_size`` and whose last axis holds the letter vectors.
        :return: tensor with the same leading axes and ``memory_size`` features
            per position.
        """
        vectors = tf.unstack(inputs, word_size, axis=-2)
        batch_size = tf.shape(inputs)[0]

        forward_outputs = self._unroll(self.forward_cell, vectors, batch_size)

        # The backward pass visits the positions last-to-first; flip its
        # outputs back so index i lines up with forward_outputs[i].
        backward_outputs = self._unroll(self.backward_cell, vectors[::-1], batch_size)
        backward_outputs.reverse()

        union_outputs = [
            self.union_cell(tf.concat([fo, bo], axis=-1))
            for fo, bo in zip(forward_outputs, backward_outputs)
        ]
        return tf.stack(union_outputs, axis=-2)


In [45]:
class Model(tf.keras.Model):
    """Accent predictor: a probability for every letter slot of a word.

    Pipeline: symbol indices -> ``LetterToVector`` -> ``BRCNN`` -> a small
    dense scorer that emits one number per position -> softmax over positions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ltv = LetterToVector()
        self.brcnn = BRCNN()
        self.scorer = tf.keras.Sequential([
            tf.keras.layers.Dense(16, activation='tanh'),
            tf.keras.layers.Dense(1),
        ])

    def call(self, inputs, training=None, mask=None):
        """Return the per-position accent probabilities for ``inputs``."""
        letter_features = self.brcnn(self.ltv(inputs))
        # The scorer ends in a single unit; index 0 drops that trailing axis.
        position_scores = self.scorer(letter_features)[..., 0]
        return tf.nn.softmax(position_scores, axis=-1)


In [46]:
model = Model()

# Model.call already returns softmax probabilities, so the loss is used with
# its default setting (probabilities, not logits). Labels are integer positions.
model.compile(
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
)

In [47]:
# Train with batches of 64; validation uses the held-out part of the data.
model.fit(
    x=train_dataset.batch(64),
    validation_data=valid_dataset.batch(64),
    epochs=100,
    callbacks=[
        tf.keras.callbacks.TensorBoard(),
        # tf.keras.callbacks.EarlyStopping(),
        tf.keras.callbacks.TerminateOnNaN(),
        # `model` is a subclassed Keras model; whole-model saving to HDF5 is
        # only supported for Sequential/functional models, so checkpoint the
        # weights only (which is also what the file name says).
        tf.keras.callbacks.ModelCheckpoint(
            './logs/weights_epoch{epoch}_vloss{val_loss}.hdf5',
            save_best_only=True,
            save_weights_only=True,
        ),
    ]
)

Train for 105 steps, validate for 45 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoc

<tensorflow.python.keras.callbacks.History at 0x17b489e4788>

In [65]:
words = [ "Anhaltend","anlaufgeschützt" ]
# The training words were lower-cased when they were loaded, so encode the
# lower-cased form; the original spelling is kept for printing.
indices = np.array([word_to_indices(word.lower()) for word in words])

probs = model.predict(indices)
positions = np.argmax(probs,axis=-1)

for word, position in zip(words, positions):
    # The training label is the index the accent mark occupied in the
    # annotated word, i.e. the mark sits directly before word[position].
    print(word[:position]+"'"+word[position:])


A'nhaltend
an'laufgeschützt
