In [50]:
import numpy as np
%tensorflow_version 2.x
import tensorflow as tf
import tensorflow_datasets as tfds

Function for turning a string tensor (a sequence over the characters A, C, G, T) into a flattened one-hot vector

In [51]:
def string_to_one_hot(strg):
  """Encode a nucleotide string tensor as a flattened one-hot vector.

  Every character is first rewritten to a digit (A -> 1, C -> 2, G -> 3,
  T -> 0); the digits are then split into single bytes, parsed as numbers and
  one-hot encoded with depth 4. The result is flattened to rank 1, so each
  input character contributes four consecutive entries.

  Args:
    strg: string tensor holding the sequence.
      NOTE(review): assumes it contains only the characters A, C, G and T;
      anything else is passed unchanged to the number parser - confirm.

  Returns:
    A rank-1 tensor with the concatenated one-hot codes.
  """
  # Applied in this order: A, C, G, T.
  nucleotide_to_digit = (('A', '1'), ('C', '2'), ('G', '3'), ('T', '0'))

  digits = strg
  for nucleotide, digit in nucleotide_to_digit:
    digits = tf.strings.regex_replace(digits, nucleotide, rewrite=digit)

  # one entry per character, parsed into class indices 0..3
  per_character = tf.strings.bytes_split(digits)
  indices = tf.cast(tf.strings.to_number(per_character), tf.uint8)

  # e.g. index 0 -> [1, 0, 0, 0]; all codes are concatenated into one vector
  return tf.reshape(tf.one_hot(indices, 4), (-1,))

Load the Data-Set

In [52]:
# Load slices of the 'genomics_ood' dataset as (sequence, label) pairs:
# the first 1,000 test examples and the first 100,000 training examples.
# The order of the unpacked names follows the order of the `split` list.
test_data, training_data = tfds.load(
    'genomics_ood',
    split=['test[0:1000]', 'train[0:100000]'],
    as_supervised=True,
)


In [53]:
#Pipeline
def prepare_dataset(dataset, shuffle, batch_size=128, shuffle_buffer_size=10000, num_classes=10):
  """Turn a dataset of (sequence string, integer label) pairs into model-ready batches.

  Args:
    dataset: tf.data.Dataset yielding (sequence, label) pairs.
    shuffle: whether to shuffle the individual samples. Use True for training
      data; evaluation metrics do not depend on sample order.
    batch_size: number of samples per batch.
    shuffle_buffer_size: number of samples held in the shuffle buffer.
    num_classes: depth of the one-hot encoded label.

  Returns:
    A tf.data.Dataset yielding (one-hot sequences, one-hot labels) batches.
  """
  # First transform the string tensors and labels to one-hot encodings,
  # i.e. the sequence "ATG" becomes the concatenation of three 4-element one-hot vectors.
  dataset = dataset.map(lambda seq, label: (string_to_one_hot(seq), tf.one_hot(label, num_classes)))

  # Shuffle BEFORE batching: shuffling after batch() only permutes whole
  # batches, so every batch would always contain the same samples.
  if shuffle:
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

  # batch the data into smaller processable chunks
  dataset = dataset.batch(batch_size)

  # always prepare some data while working on the last set (better performance)
  return dataset.prefetch(20)


# NOTE: this cell rebinds training_data / test_data. Re-run the loading cell
# before re-running this one, otherwise the pipeline is applied twice.
training_data = prepare_dataset(training_data, shuffle=True)
test_data = prepare_dataset(test_data, shuffle=False)


The Model


In [54]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer

# Here we create our model / network. The model is a callable object.
# The base class is referenced as tf.keras.Model so that defining a class named
# `Model` does not rely on (and then overwrite) an imported name `Model`.
class Model(tf.keras.Model):
  """Fully connected classifier: two sigmoid hidden layers and a softmax output.

  Both hidden layers have 256 units; the output layer has 10 units, one per
  class, and produces a probability distribution via softmax.
  """

  def __init__(self):
    # initialise the Keras base class so layers and variables are tracked
    super().__init__()

    # hidden layers with 256 neurons each and sigmoid as activation function
    self.hidden_layer_1 = tf.keras.layers.Dense(units=256, activation=tf.keras.activations.sigmoid)
    self.hidden_layer_2 = tf.keras.layers.Dense(units=256, activation=tf.keras.activations.sigmoid)

    # output layer with 10 neurons and softmax as activation function
    self.output_layer = tf.keras.layers.Dense(units=10, activation=tf.keras.activations.softmax)

  def call(self, input):
    """Forward step: propagate `input` through the network and return the prediction.

    Args:
      input: batch of input vectors.
        NOTE(review): presumably shaped (batch, features) as produced by the
        input pipeline - confirm.

    Returns:
      The softmax output of the last layer (10 values per input row).
    """
    hidden = self.hidden_layer_1(input)
    hidden = self.hidden_layer_2(hidden)
    return self.output_layer(hidden)

Training

In [55]:
def train_step(model, input, target, loss_function, optimizer):
  """Run one optimisation step on a single batch.

  Args:
    model: callable model exposing `trainable_variables`.
    input: batch of model inputs.
    target: batch of one-hot encoded targets.
    loss_function: callable invoked as `loss_function(target, prediction)`.
    optimizer: optimizer exposing `apply_gradients`.

  Returns:
    A tuple `(loss, accuracy)`: the loss value returned by `loss_function`
    for this batch and the fraction of samples whose predicted class
    (argmax) matches the target class. Both are computed from the prediction
    made BEFORE the weight update.
  """
  # Only the forward pass and the loss have to be recorded for automatic
  # differentiation; everything else happens outside the tape context.
  with tf.GradientTape() as tape:
    # make a prediction with the current weights
    prediction = model(input)
    # compute the loss given the target and the prediction
    loss = loss_function(target, prediction)

  # partial derivatives of the loss with respect to all trainable weights
  gradients = tape.gradient(loss, model.trainable_variables)
  # apply the computed weight updates
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  # As we already have the prediction, we can calculate the accuracy for this batch
  sample_accuracy = np.argmax(target, axis=1) == np.argmax(prediction, axis=1)

  # return the loss and the mean of the accuracies
  return loss, np.mean(sample_accuracy)

Testing


In [56]:
def test(model, test_data, loss_function):
  # testing the performance of the model on the test data
  # calculate the prediction loss via the respective loss function and the accuracy of the prediction.

  # initialize lists for saving the losses and accuracies of the model for each sample in the test data
  test_losses = []
  test_accuracies = []
    
  # iterate through the test data batches
  for (input, target) in test_data:
    # calculate what labels the model predicts for the current input
    prediction = model(input)
    # calculate the loss
    computed_loss = loss_function(prediction, target)
    # check if the labels predicted for the batch match with the correct target-labels
    sample_test_accuracy =  np.argmax(target, axis=1) == np.argmax(prediction, axis=1)
    # save the loss
    test_losses.append(computed_loss.numpy())
    # save the mean of the correct label predictions
    test_accuracies.append(np.mean(sample_test_accuracy))

  # calculate the mean of all the losses and accuracies
  test_loss = np.mean(test_losses)
  test_accuracy = np.mean(test_accuracies)
  return test_loss, test_accuracy


Initialization and training loop

In [None]:
#Initialize model
model = Model()

#hyperparameters
epochs = 10
learning_rate = 0.1
running_average_factor = 0.95

# Initialize the loss as categorical cross entropy.
loss = tf.keras.losses.CategoricalCrossentropy()
# Initialize the optimizer as standard gradient descent
optimizer = tf.keras.optimizers.SGD(learning_rate)

# To keep track of the progress, we use several lists, so we can plot the results later.
# Each list gets one entry for the untrained model plus one entry per epoch.
train_losses = []
train_accuracies = []

test_losses = []
test_accuracies = []

# testing how the model performs on the test data without training
test_loss, test_accuracy = test(model, test_data, loss)
# saving the results in the respective lists
test_losses.append(test_loss)
test_accuracies.append(test_accuracy)

# same for the training data, in order to see how it improves from an untrained state
train_loss, train_accuracy = test(model, training_data, loss)
train_losses.append(train_loss)
train_accuracies.append(train_accuracy)

# Iterating over the epochs and training the model.
# The datasets are NOT re-wrapped in shuffle() here: the input pipeline already
# contains a shuffle step, and adding another one per epoch would stack a new
# shuffle buffer onto the dataset every time.
for epoch in range(epochs):
  print("----------------------------------------------------------", '\n', "Epoch nr.", epoch)

  # exponential running average of the batch losses of this epoch (plain Python float)
  running_average = 0.0
  # batch accuracies of THIS epoch only; reset so earlier epochs do not leak into the mean
  sample_accuracies = []

  # iterate over the training data and perform one training step per batch
  for (batch_input, batch_label) in training_data:
    # perform the training step with the current input and label
    train_loss, train_accuracy = train_step(model, batch_input, batch_label, loss, optimizer)
    # update the running average of the loss
    running_average = running_average_factor * running_average + (1 - running_average_factor) * float(train_loss)
    # save the accuracy of the predictions in this batch
    sample_accuracies.append(train_accuracy)

  # save the training loss and the accuracy of the current epoch
  train_losses.append(running_average)
  train_accuracy = np.mean(sample_accuracies)
  train_accuracies.append(train_accuracy)

  # now evaluate the model performance on the test set and save the results in the respective lists
  test_loss, test_accuracy = test(model, test_data, loss)
  test_losses.append(test_loss)
  test_accuracies.append(test_accuracy)

  # Keeping track of the performance for each epoch
  print("Training accuracy: ", train_accuracy, "| Training loss: ", running_average)
  print("Test accuracy: ", test_accuracy, "| Test loss: ", test_loss)


Visualization

In [None]:
import matplotlib.pyplot as plt

# Loss curves: one figure showing the recorded training and test losses.
plt.figure()
# plt.plot returns a list with exactly one line here; unpack it to get the handle
[line1] = plt.plot(train_losses)
[line2] = plt.plot(test_losses)
plt.legend(handles=[line1, line2], labels=["training", "test"])
plt.ylabel("Loss")
plt.xlabel("Training steps")
plt.show()

In [None]:
# Accuracy curves: one figure showing the recorded training and test accuracies.
plt.figure()
# Unpack the single Line2D returned by each plot call. Without the `=` the
# legend would reuse the line handles left over from the loss figure.
line1, = plt.plot(train_accuracies)
line2, = plt.plot(test_accuracies)
plt.xlabel("Training steps")
plt.ylabel("Accuracy")
plt.legend((line1,line2),("training","test"))
plt.show()

