In [3]:
import pandas as pd
from collections import Counter
import tensorflow as tf 
import os
from utils import * 
from model import Encoder, Decoder

## Data Engineering and Processing

In [4]:
# Load the free-text column of the CSV. 'latin9' is Python's alias for ISO-8859-15.
data = list(pd.read_csv('severeinjury.csv', encoding = 'latin9')['text'])
# text_processing / flatten come from the `utils` star import; only the first
# 1000 processed entries are kept.
text = text_processing(data)[:1000]

# Vocabulary: every distinct symbol produced by flatten(text).
characters = set(flatten(text))
n_char = len(characters)
# Real symbols are numbered from 4 upwards, leaving 0-3 free for special indices.
# The symbols are sorted before numbering: iteration order of a set of strings
# changes from one interpreter run to the next (hash randomisation), which would
# silently give a different char -> index table on every kernel restart.
# (assumes the symbols are mutually comparable, e.g. all str - TODO confirm)
char2idx = {char: idx for idx, char in enumerate(sorted(characters), start = 4)}

In [5]:

def get_index_char(char):
    """Return the integer index of ``char`` from the global ``char2idx`` table.

    Args:
        char: the symbol to look up (must be hashable).

    Returns:
        The index stored in ``char2idx``, or 3 when the symbol is not in the
        table. ``char2idx`` numbers known symbols from 4 upwards, so 3 never
        collides with a real symbol.
    """
    # dict.get replaces the former bare ``except:``, which also hid unrelated
    # failures (an undefined ``char2idx``, an unhashable argument, ...).
    return char2idx.get(char, 3)

# Corrupted copy of the clean text, used as model input (get_noisy_text is provided by utils).
noisy_text = get_noisy_text(text, char2idx)

# Encode every sequence as indices, with 1 prepended and 2 appended as boundary markers.
source_text_indexes = [[1] + [get_index_char(char) for char in seq] + [2] for seq in noisy_text]
target_text_indexes = [[1] + [get_index_char(char) for char in seq] + [2] for seq in text]

source_length = [len(seq) for seq in source_text_indexes]
# Fixed: lengths are now measured on the *target* sequences. They were previously
# taken from the source sequences, so whenever a noisy sequence and its clean
# counterpart differed in length the target padding width and the per-batch
# target slicing were wrong (pad_sequences truncates from the front by default,
# which would even drop the leading marker).
target_length = [len(seq) for seq in target_text_indexes]

# Right-pad with 0 up to the longest sequence on each side.
padded_source_text_indexes = tf.keras.preprocessing.sequence.pad_sequences(source_text_indexes, padding = 'post', maxlen = max(source_length))
padded_target_text_indexes = tf.keras.preprocessing.sequence.pad_sequences(target_text_indexes, padding = 'post', maxlen = max(target_length))

In [6]:
# One tf.data source per column: padded index matrices and the unpadded lengths.
ds_source, ds_source_length = (
    tf.data.Dataset.from_tensor_slices(padded_source_text_indexes),
    tf.data.Dataset.from_tensor_slices(source_length),
)
ds_target, ds_target_length = (
    tf.data.Dataset.from_tensor_slices(padded_target_text_indexes),
    tf.data.Dataset.from_tensor_slices(target_length),
)

# Each element is the 4-tuple (source row, target row, source length, target length).
dataset = tf.data.Dataset.zip(
    (ds_source, ds_target, ds_source_length, ds_target_length)
)

## Model Definition

In [7]:
# Sizes handed to Encoder / Decoder below (presumably embedding width and
# recurrent units - confirm against model.py). The decoder reuses the encoder size.
vector_size, enc_units = 128, 128
dec_units = enc_units

# reduction='none' keeps one loss value per element so padding can be masked later.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits = True,
    reduction = 'none',
)

# NOTE(review): the leading 100 is presumably the vocabulary size; if so it must be
# at least n_char + 4 to cover every index produced by char2idx - TODO confirm.
encoder = Encoder(100, vector_size, enc_units)
decoder = Decoder(100, vector_size, dec_units)

optimizer = tf.keras.optimizers.Adam(1e-4)


## Training Functions

In [8]:
def loss_function(real, pred):
    """Element-wise loss with padded positions zeroed out.

    Args:
        real: integer labels; a label of 0 marks a position to ignore.
        pred: predictions passed straight to the global ``loss_object``.

    Returns:
        The unreduced output of ``loss_object(real, pred)`` multiplied by a
        0/1 mask that is 0 wherever ``real`` equals 0.
    """
    per_position = loss_object(real, pred)
    # 1 where the label is a real token, 0 where it is padding; cast to the loss dtype.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype = per_position.dtype)
    return per_position * keep

def train_step(padded_source_char_lr, padded_target_char_lr, training):
    """Run one teacher-forced pass over a batch and, when training, one optimizer step.

    Args:
        padded_source_char_lr: integer index tensor for the source side, with 0
            used as padding (assumed to be (batch, source_len) - TODO confirm).
        padded_target_char_lr: integer index tensor for the target side, with 0
            used as padding; indexed as [:, t], so it has at least two axes.
        training: forwarded to the encoder and decoder as their ``training``
            flag. Gradients are computed and applied only when it is truthy.

    Returns:
        Scalar tensor: the batch mean of each sequence's average loss per
        predicted token.
    """
    with tf.GradientTape() as tape:
        # Zero initial state, one row per example; the same tensor is passed twice.
        hidden_states = tf.constant(0., shape = (padded_source_char_lr.shape[0], encoder.enc_units))
        hidden_states = [hidden_states, hidden_states]

        # The encoder returns three values (outputs plus two state tensors) and the
        # decoder takes both `dec_hidden` and `h`. The previous version unpacked
        # only two values and then read an undefined `h`, so it could only run
        # when a stray notebook-global `h` happened to exist.
        x, h, c = encoder(padded_source_char_lr, hidden_states, training = training)
        dec_hidden = [[h, c], [h, c]]

        # 1.0 for real tokens, 0.0 for padding.
        mask_input = tf.cast(tf.not_equal(padded_source_char_lr, 0), dtype = tf.float32)
        mask_output = tf.cast(tf.not_equal(padded_target_char_lr, 0), dtype = tf.float32)

        step_losses = []
        # Teacher forcing: feed the target token at position t, score against t + 1.
        for t in range(padded_target_char_lr.shape[1] - 1):
            dec_input = tf.expand_dims(padded_target_char_lr[:, t], 1)
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, h, x, mask_input, training = training)
            step_losses.append(tf.expand_dims(loss_function(padded_target_char_lr[:, t + 1], predictions), axis = 1))

        batch_loss = tf.concat(step_losses, axis = 1)
        # Average per sequence over its predicted tokens: every non-padding target
        # position except the first, which is only ever used as decoder input.
        batch_loss = tf.reduce_sum(batch_loss * mask_output[:, 1:], axis = 1) / (tf.reduce_sum(mask_output, axis = 1) - 1.)
        batch_loss = tf.reduce_mean(batch_loss)

    # Differentiate outside the tape context so the backward pass itself is not recorded.
    if training:
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

## Training

In [9]:
batch_size = 32
epochs = 1

for _ in range(epochs):
    # Draw fresh noise every epoch, then rebuild the source side of the dataset.
    # The target side (ds_target, ds_target_length) is reused unchanged.
    noisy_text = get_noisy_text(text, char2idx, p_error = 0.05)
    source_text_indexes = [[1] + [get_index_char(char) for char in seq] + [2] for seq in noisy_text]
    source_length = [len(seq) for seq in source_text_indexes]
    padded_source_text_indexes = tf.keras.preprocessing.sequence.pad_sequences(source_text_indexes, padding = 'post', maxlen = max(source_length))
    ds_source = tf.data.Dataset.from_tensor_slices(padded_source_text_indexes)
    ds_source_length = tf.data.Dataset.from_tensor_slices(source_length)
    dataset = tf.data.Dataset.zip((ds_source, ds_target, ds_source_length, ds_target_length))
    # Shuffle individual examples *before* batching. Shuffling after batching only
    # reorders fixed batches, so the same examples would always be grouped together.
    # (The dataset that used to be built ahead of the loop was always overwritten
    # here and has been removed.)
    ds = dataset.shuffle(len(padded_source_text_indexes)).batch(batch_size)

    progbar = tf.keras.utils.Progbar(len(padded_source_text_indexes), stateful_metrics = ['Batch Loss'])
    # Loop names are prefixed with `batch_` so they no longer overwrite the global
    # `source_length` / `target_length` lists.
    for padded_source_char_lr, padded_target_char_lr, batch_source_length, batch_target_length in ds:
        # Trim each batch to its own longest sequence to avoid stepping over pure padding.
        maxlen_source = tf.reduce_max(batch_source_length)
        maxlen_target = tf.reduce_max(batch_target_length)

        batch_loss = train_step(padded_source_char_lr[:, :maxlen_source], padded_target_char_lr[:, :maxlen_target], training = True)
        # 'Loss' is averaged by the progress bar; 'Batch Loss' is stateful and shows the latest value.
        values = [('Loss', batch_loss), ('Batch Loss', batch_loss)]
        progbar.add(padded_source_char_lr.shape[0], values = values)

KeyboardInterrupt: 

In [15]:
# Scratch cell: iterate once over `ds` with an empty body. The only effect is that
# the four loop names stay bound to the last batch yielded, for use in later cells.
# Note that this rebinds `source_length` and `target_length`.
for padded_source_char_lr, padded_target_char_lr, source_length, target_length in ds:
    pass

In [16]:
# Largest length value on each side of the current batch.
maxlen_source, maxlen_target = tf.reduce_max(source_length), tf.reduce_max(target_length)

# Keep only the first `maxlen_*` columns of each padded batch.
padded_source_char_lr, padded_target_char_lr = (
    padded_source_char_lr[:, :maxlen_source],
    padded_target_char_lr[:, :maxlen_target],
)

In [23]:
# Scratch cell: run the encoder by hand on the current batch.
# All-zero tensor with one row per batch example and encoder.enc_units columns.
hidden_states = tf.constant(0., shape = (padded_source_char_lr.shape[0], encoder.enc_units))
# The same zero tensor is supplied twice (presumably the two recurrent states - confirm in model.py).
hidden_states = [hidden_states, hidden_states]
# The encoder call yields three values here; x and h are later passed to the decoder.
x, h, c =  encoder(padded_source_char_lr, hidden_states, training = True)
# Initial decoder state: the (h, c) pair repeated twice
# (presumably one pair per decoder layer - TODO confirm).
dec_hidden = [[h, c], [h, c]]
# 1.0 where the index is non-zero, 0.0 where it is 0.
mask_input = tf.cast(tf.not_equal(padded_source_char_lr, 0), dtype = tf.float32)
mask_output = tf.cast(tf.not_equal(padded_target_char_lr, 0), dtype = tf.float32)
# Starts as an empty list; the decoding loop appends one loss tensor per step.
batch_loss = []


In [24]:
# Scratch cell: decode step by step with teacher forcing. At step t the decoder is
# fed the target token at position t and scored against the token at position t + 1.
# `batch_loss` must be a list on entry and is rebound to a tensor below, so this
# cell cannot be re-run without first resetting `batch_loss` to a list.
for t in range(padded_target_char_lr.shape[1] - 1):
    # Column t of the targets, with a length-1 axis inserted at position 1.
    dec_input = tf.expand_dims(padded_target_char_lr[:, t], 1)
    # The updated decoder state is carried into the next step; the third return value is discarded.
    predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, h, x, mask_input, training = True)
    loss = tf.expand_dims(loss_function(padded_target_char_lr[:, t + 1], predictions), axis = 1)
    batch_loss.append(loss)
# Join the per-step loss columns along axis 1.
batch_loss = tf.concat(batch_loss, axis = 1)
# Per-row average: masked loss sum divided by (number of non-zero target tokens - 1),
# since the first target position is only ever used as decoder input.
batch_loss = tf.reduce_sum(batch_loss * mask_output[:, 1:], axis = 1) / (tf.reduce_sum(mask_output, axis = 1) - 1.)
# Mean over all remaining elements, giving a single scalar.
batch_loss = tf.reduce_mean(batch_loss)

In [25]:
# Display the current value of `batch_loss` as the cell output.
batch_loss

<tf.Tensor: id=207949, shape=(), dtype=float32, numpy=4.642414>