In [1]:
# Install TensorFlow Addons, which supplies the CRF layer (tfa.layers.CRF) and
# the CRF log-likelihood (tfa.text.crf_log_likelihood) used later in this notebook.
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.14.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 11.2 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.14.0


In [2]:
# Upload the dataset from the local machine into the Colab runtime.
# load_data() later reads "preprocessed.txt" from the working directory.
from google.colab import files
uploaded = files.upload()

Saving preprocessed.txt to preprocessed.txt


In [3]:
import glob
import random

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa

In [4]:
def load_data(path="preprocessed.txt", max_len=15, train_end=113590,
              val_end=141987, seed=None):
    """Load the word/label file and return padded, shuffled data splits.

    Each row of the CSV holds a word and a string of digits (one digit per
    character of the word). Words are tokenised character by character and
    both words and labels are post-padded with zeros to ``max_len``.

    Args:
        path: CSV file with two unnamed columns, word and label.
        max_len: Length every sequence is padded/truncated to.
        train_end: Row index where the training split ends.
        val_end: Row index where the validation split ends; the remaining
            rows form the test split.
        seed: Optional seed for the shuffle. ``None`` gives a different
            order on every call.

    Returns:
        Tuple ``(training_inputs, training_outputs, validation_inputs,
        validation_outputs, test_inputs, test_outputs)`` of integer arrays.

    Raises:
        ValueError: If a label contains a non-digit character.
    """
    # dtype=str stops pandas from parsing all-digit labels as integers, which
    # would silently drop leading zeros and misalign labels with characters.
    dataframe = pd.read_csv(path,
                            sep=",",
                            encoding="ISO-8859-1",
                            names=["word", "label"],
                            dtype=str)
    # Missing cells are still NaN at this point; turn them into strings.
    dataframe = dataframe.astype(str)
    words = dataframe["word"].tolist()
    # One integer per label character.
    labels = [[int(flag) for flag in label] for label in dataframe["label"]]

    # Vectorises the words by treating each character as a token.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(words)
    sequences = tokenizer.texts_to_sequences(words)

    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding="post", maxlen=max_len
    )
    padded_outputs = tf.keras.preprocessing.sequence.pad_sequences(
        labels, padding="post", maxlen=max_len
    )

    # NOTE: the previous "normalisation" step divided the unpadded token lists
    # after the padded arrays had already been built, so it never affected the
    # returned data. It is intentionally not applied: the model's Embedding
    # layer consumes integer token ids, not scaled floats.

    # Shuffle inputs and outputs with one shared permutation. random.shuffle
    # must not be used on a 2-D NumPy array: its element swap operates on row
    # views and ends up duplicating some rows while losing others.
    order = np.random.default_rng(seed).permutation(len(padded_inputs))
    padded_inputs = padded_inputs[order]
    padded_outputs = padded_outputs[order]

    # Splits into training, validation, and test sets (the default indices
    # correspond to the original 64-16-20 split).
    training_inputs = padded_inputs[:train_end]
    training_outputs = padded_outputs[:train_end]
    validation_inputs = padded_inputs[train_end:val_end]
    validation_outputs = padded_outputs[train_end:val_end]
    test_inputs = padded_inputs[val_end:]
    test_outputs = padded_outputs[val_end:]

    return (training_inputs, training_outputs,
            validation_inputs, validation_outputs,
            test_inputs, test_outputs)

In [5]:
# Materialise the six arrays produced by load_data().
train_in, train_out, val_in, val_out, test_in, test_out = load_data()

# Pipeline settings: examples per batch and size of the shuffle buffer.
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 500

# Training pipeline: slice into (input, label) pairs, shuffle, then batch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_in, train_out))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pipeline, built the same way as the training one.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((val_in, val_out))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [14]:
def inception_module(inputs, units, residual=True):
    """1D Inception-style module with an optional residual connection.

    Four convolutional branches are applied to ``inputs``. Every branch
    shortens the time axis (strides / unpadded kernels), so each one is
    zero-padded at the end back to the time length of ``inputs`` before the
    branches are summed element-wise.

    Args:
        inputs: 3-D Keras tensor whose time dimension (axis 1) is statically
            known.
        units: Number of filters used by every convolution.
        residual: When truthy, ``inputs`` is concatenated onto the summed
            branches along the last axis.

    Returns:
        The summed branches, optionally concatenated with ``inputs``.
    """
    # Pad back to the length of the incoming sequence instead of a hard-coded
    # 15, so the module follows whatever sequence length the model uses.
    target_length = inputs.shape[1]

    def pad_to_input_length(branch):
        # Append zeros at the end of the time axis up to target_length.
        return tf.keras.layers.ZeroPadding1D(
            padding=(0, target_length - branch.shape[1])
        )(branch)

    # Branch 1: strided 1x1 convolution.
    inception_branch_1 = tf.keras.layers.Conv1D(units, kernel_size=1, strides=2, activation="tanh")(inputs)
    inception_branch_1 = pad_to_input_length(inception_branch_1)

    # Branch 2: 1x1 convolution followed by a strided width-3 convolution.
    inception_branch_2 = tf.keras.layers.Conv1D(units, kernel_size=1, activation="tanh")(inputs)
    inception_branch_2 = tf.keras.layers.Conv1D(units, kernel_size=3, strides=2, activation="tanh")(inception_branch_2)
    inception_branch_2 = pad_to_input_length(inception_branch_2)

    # Branch 3: strided average pooling followed by a width-3 convolution.
    inception_branch_3 = tf.keras.layers.AveragePooling1D(pool_size=3, strides=2)(inputs)
    inception_branch_3 = tf.keras.layers.Conv1D(units, kernel_size=3, activation="tanh")(inception_branch_3)
    inception_branch_3 = pad_to_input_length(inception_branch_3)

    # Branch 4: 1x1 convolution, then two width-3 convolutions (second strided).
    inception_branch_4 = tf.keras.layers.Conv1D(units, kernel_size=1, activation="tanh")(inputs)
    inception_branch_4 = tf.keras.layers.Conv1D(units, kernel_size=3, activation="tanh")(inception_branch_4)
    inception_branch_4 = tf.keras.layers.Conv1D(units, kernel_size=3, strides=2, activation="tanh")(inception_branch_4)
    inception_branch_4 = pad_to_input_length(inception_branch_4)

    inception_output = tf.keras.layers.add(
        [inception_branch_1, inception_branch_2, inception_branch_3, inception_branch_4]
    )
    if residual:
        inception_output = tf.keras.layers.concatenate([inception_output, inputs])
    return inception_output

In [15]:
def build_model():
    """Assemble the tagging network and return it as a Keras functional model.

    The length-15 input is embedded, passed through three stacked
    bidirectional GRU layers (the embedding is re-concatenated before the
    second and third), combined with the output of ``inception_module`` and
    the embedding itself, projected by a time-distributed dense layer and
    finally fed to a CRF layer with 2 units.

    Returns:
        A ``tf.keras.Model`` with the outputs ``[decoded_sequence,
        potentials, sequence_length, kernel]`` produced by the CRF layer.
    """
    token_ids = tf.keras.Input(shape=(15,))
    embedding = tf.keras.layers.Embedding(64, 256, mask_zero=True)(token_ids)

    # Recurrent stack: three BiGRU layers, with the embedding concatenated
    # back in between consecutive layers.
    num_recurrent_layers = 3
    recurrent = embedding
    for layer_index in range(num_recurrent_layers):
        recurrent = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(128, return_sequences=True)
        )(recurrent)
        if layer_index < num_recurrent_layers - 1:
            recurrent = tf.keras.layers.concatenate([recurrent, embedding])

    # Convolutional path computed directly from the embedding.
    convolutional = inception_module(embedding, 128, residual=False)

    # Merge both paths with the raw embedding and project per time step.
    merged = tf.keras.layers.concatenate([recurrent, convolutional, embedding])
    projected = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(256, activation="relu")
    )(merged)

    crf_outputs = tfa.layers.CRF(2)(projected)
    decoded_sequence, potentials, sequence_length, kernel = crf_outputs

    return tf.keras.Model(
        inputs=token_ids,
        outputs=[decoded_sequence, potentials, sequence_length, kernel],
    )


model = build_model()

  return py_builtins.overload_of(f)(*args)


In [16]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 15, 256)      16384       input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, 15, 256)      296448      embedding_2[0][0]                
__________________________________________________________________________________________________
conv1d_18 (Conv1D)              (None, 15, 128)      32896       embedding_2[0][0]                
____________________________________________________________________________________________

In [17]:
@tf.function
def crf_loss_func(potentials, sequence_length, kernel, y):
    """Return the scaled mean negative CRF log-likelihood for a batch.

    Args:
        potentials: Potentials output by the model's CRF layer.
        sequence_length: Sequence lengths output by the CRF layer.
        kernel: Kernel output by the CRF layer, passed on as the
            transition parameters.
        y: Ground-truth tag indices.

    Returns:
        Scalar tensor: mean over the batch of the negated log-likelihood
        multiplied by a fixed weight.
    """
    # crf_log_likelihood returns a pair; only the first element is needed.
    log_likelihood = tfa.text.crf_log_likelihood(
        potentials, y, sequence_length, kernel
    )[0]

    # Constant factor applied uniformly to every loss value.
    loss_weight = 4.108897148948174
    weighted_negative_ll = (-1 * log_likelihood) * loss_weight

    return tf.reduce_mean(weighted_negative_ll)


# Optimiser shared by every training step.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

# Running means of the per-batch loss, one per data split.
train_loss = tf.keras.metrics.Mean(name="train_loss")
validation_loss = tf.keras.metrics.Mean(name="val_loss")

# Running accuracies comparing decoded tags against the labels.
train_acc_metric = tf.keras.metrics.BinaryAccuracy()
val_acc_metric = tf.keras.metrics.BinaryAccuracy()

@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    """Run one optimisation step on a single batch.

    Computes the CRF loss plus any regularisation losses registered on the
    model, applies the gradients with the shared optimiser and updates the
    running training loss and accuracy metrics.

    Args:
        x: Batch of model inputs.
        y: Batch of ground-truth tag sequences.
    """
    with tf.GradientTape() as tape:
        # training=True makes training-dependent layers run in training mode,
        # mirroring the explicit training=False used for validation.
        decoded_sequence, potentials, sequence_length, kernel = model(x, training=True)
        crf_loss = crf_loss_func(potentials, sequence_length, kernel, y)
        # Add losses collected by the model's layers during the forward pass.
        loss = crf_loss + tf.reduce_sum(model.losses)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_acc_metric.update_state(y, decoded_sequence)
    train_loss(loss)

In [None]:
EPOCHS = 25
for epoch in range(EPOCHS):
    # Every epoch starts from clean running metrics.
    for metric in (train_loss, validation_loss, train_acc_metric, val_acc_metric):
        metric.reset_states()

    # Training pass: one optimisation step per batch.
    for x, y in train_dataset:
        train_step(x, y)

    print(f"E{epoch+1} loss: {train_loss.result():.4f}")
    print(f"E{epoch+1} binary_accuracy: {train_acc_metric.result():.4f}")

    # Validation pass: forward only, accumulating loss and accuracy.
    for x, y in validation_dataset:
        decoded_sequence, potentials, sequence_length, kernel = model(x, training=False)
        crf_loss = crf_loss_func(potentials, sequence_length, kernel, y)
        loss = crf_loss + tf.reduce_sum(model.losses)
        validation_loss(loss)
        val_acc_metric.update_state(y, decoded_sequence)

    val_loss = validation_loss.result()
    val_acc = val_acc_metric.result()
    print(f"E{epoch+1} val_binary_accuracy: {val_acc:.4f}")
    print(f"E{epoch+1} val_loss: {val_loss:.4f}")

  return py_builtins.overload_of(f)(*args)


E1 loss: 4.7259
E1 binary_accuracy: 0.9503


  "CRF decoding models have serialization issues in TF >=2.5 . Please see isse #2476"


E1 val_binary_accuracy: 0.9613
E1 val_loss: 3.6763


In [None]:
# Save the weights in TensorFlow checkpoint format. With a prefix that has no
# ".h5" suffix, Keras writes "my_checkpoint.index" and
# "my_checkpoint.data-*" shards; no file called "my_checkpoint" is created,
# so every file sharing the prefix is downloaded instead.
model.save_weights('my_checkpoint')
for checkpoint_file in sorted(glob.glob('my_checkpoint.*')):
    files.download(checkpoint_file)

In [None]:
def predict_token(word):
    """Encode one word (or several) into padded token ids for the model.

    The character-level tokenizer is rebuilt from the word column of
    ``./preprocessed.txt`` in the same way ``load_data`` builds it. Note that
    this re-reads the file and refits the tokenizer on every call.

    Args:
        word: A single string, or an iterable of strings.

    Returns:
        Integer array of shape ``(number_of_words, 15)``, zero-padded at the
        end of each row to match the model's input length.
    """
    dataframe = pd.read_csv("./preprocessed.txt",
                            sep=",",
                            encoding="ISO-8859-1",
                            names=["word", "label"])
    dataframe = dataframe.astype(str)
    words = dataframe['word'].tolist()

    tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
    tokenizer.fit_on_texts(words)

    # texts_to_sequences expects a collection of texts. A bare string would
    # be iterated character by character, yielding one single-token sequence
    # per character instead of one sequence for the whole word.
    texts = [word] if isinstance(word, str) else list(word)
    sequences = tokenizer.texts_to_sequences(texts)

    # Pad exactly like the training inputs so the result can be fed to the model.
    return tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding="post", maxlen=15
    )