In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import Layer

# Task 1: Data Set

The following cell loads the first 10% of training and 1% of test examples of the 'genomics_ood' tensorflow dataset. The parameters in the tfds.load() function are shortly described, or more info please see the tensorflow overview on how to load datasets with tfds: https://www.tensorflow.org/datasets/overview

In [2]:
# TODO: CHANGE TO 10% FOR TRAIN DATA TO GET 100.000 TRAINING EXAMPLES!!!

(train_data, test_data), ds_info = tfds.load('genomics_ood', 
                                   split=['train[:1%]', 'test[:1%]'], # Only load the first 10% of train and 1% of the test examples of the dataset as we only want 100.000 train and 1.000 test examples
                                   as_supervised=True, # This allows to call data, label for train and test data
                                   shuffle_files=True, # Shuffle
                                   try_gcs=True, # Load data from GCS bucket
                                   download=False, # Do not download the dataset locally
                                   with_info=True # This is only necessary if one wants to load info on the dataset (to variable ds_info)
                                   )

When loading the data set, one can choose to also get info on the dataset by setting with_info=True in tfds.load(). To take a look at the info, run the following cell.

In [3]:
# Print general info on dataset
print(ds_info)

tfds.core.DatasetInfo(
    name='genomics_ood',
    version=0.0.1,
    description='Bacteria identification based on genomic sequences holds the promise of early
detection of diseases, but requires a model that can output low confidence
predictions on out-of-distribution (OOD) genomic sequences from new bacteria
that were not present in the training data.

We introduce a genomics dataset for OOD detection that allows other researchers
to benchmark progress on this important problem. New bacterial classes are
gradually discovered over the years. Grouping classes by years is a natural way
to mimic the in-distribution and OOD examples.

The dataset contains genomic sequences sampled from 10 bacteria classes that
were discovered before the year 2011 as in-distribution classes, 60 bacteria
classes discovered between 2011-2016 as OOD for validation, and another 60
different bacteria classes discovered after 2016 as OOD for test, in total 130
bacteria classes. Note that training, validation, 

Also, we can take a look at the data. Because we set as_supervised=True when loading the data, we can iterate over either train or test data by calling 'for data, label in train:' and print the info we want. In the following cell, the first 5 training examples and labels will be printed.

In [4]:
# If one wants to take a look at the data
for i, (input, label) in enumerate(train_data): # This is possible because we set as_supervised=True when loading the dataset
  print('(' + str(i+1) + ')')

  # Data is a of type string:
  tf.print('Input:', input) # Use tf.print to only print string element (for print(data), the entire tensor content will be printed)

  # Labels are int ranging from 0-9
  tf.print('Label:', label, '\n')

  # This is to interrupt after 5 examples where shown
  if(i >= 4):
    break

(1)
Input: "CACAGCCGGCCGCTGACCTGCTGGCCATCGGAGCGCTGGCCGGCCTCGAGGATATTGGCCAGCAGCAGGTGGATGTCTCGGGCATAGCGCTCCCCCTGGTAGGTGATGCGAATGCTGCGGCCCTGGCGCTCGGTCAGGGCAAAACCCAGGGTCTGCTCCAGGCTCTTGATCTGGTGGCTGATGGCGCTGGGCGTCAGGTTCAGCTCATTGGCGGCCTCGGCGACACTGCCCAGGCGGGCCACCGCGTCCA"
Label: 5 

(2)
Input: "AAACTATGTTATATTCACGATGATTAACTTACAAAGGAGTTTCAACTATGAAGATGATAAACAAATTAATCGTTCCGGTAACAGCTAGTGCTTTATTATTAGGCGCTTGTGGCGCTAGTGCCACAGACTCTAAAGAAAATACATTAATTTCTTCTAAAGCTGGAGACGTAACAGTTGCAGATACAATGAAAAAAATCGGTAAAGATCAAATTGCAAATGCATCATTTACTGAAATGTTAAATAAAATTTT"
Label: 7 

(3)
Input: "CGCCGGCACCGTTGCTGGCCAAAATCGCCGAGCGTCCGGATGCCGGCATGCATCGTGAATCGTCTTATCTGAAATGGCACTGGCGCGTTTGCCGGGAACTTCTCCAACGTCGGGAGCACGGGGCAACTCATGGCTAAACTCATCGTGGGCAACGTCGATAACGAAGCAATGATCGGGGACACGAAGCGTGCATCGCTTCCGCTTCGCCAGGTATCAGCGATTGCAGCAAGGCGCCTCGTCTGGCAGATGA"
Label: 1 

(4)
Input: "GCAGGTGCTGTTGGCCGGCACCAACCACCACATCCGCCTGCTGCAGAATGGCCAGCTGGCCTACACTGCCGAGCCGGTCAACGAAATCTATCGGCCTTCGATCGATGTGTTCTTCGAAAGCGTCGCGCGCTATTGGTCGGGCGATGCGGTGGGCGTGCTGCTC

The goal is to have the data one-hot-encoded. For this, we can first use the map function calling a function that encodes the letters to digits from 0-3, then constructs onehot encoding for each of the three numbers (0-3) and then applies this encoding to the previously 'translated' string.

In [5]:
# Mapping dictionary from letters to numbers
mapping = {'A': '0', 'C': '1', 'G': '2', 'T': '3'}

def one_hot_encoding(input, label):
  """
  Preprocess inputs and labels: Create onehot encoded input and labelling.
  """

  # Encode each letter to one number 0-3 according to 'mapping'-dictionary
  for key in mapping:
    input = tf.strings.regex_replace(input, key, mapping[key])
  
  # Split after each number
  split = tf.strings.bytes_split(input)

  # Encode input to Onehot encoding (numbers are strings atm, so convert those 
  # to ints before onehot encoding them)
  l = tf.cast(tf.strings.to_number(split), tf.uint8)
  onehot_input = tf.one_hot(l, 4)
  onehot_input = tf.reshape(onehot_input, (-1,))

  # Encode labels to onehot encoding
  onehot_label = tf.one_hot(label, 10)
  onehot_label = tf.reshape(onehot_label, (-1,10))

  return [onehot_input], onehot_label

In [6]:
train_data_new = tf.data.Dataset.from_tensors(train_data)
t = train_data_new.take(1)

for input in t:
  print(input)

<_VariantDataset shapes: ((), ()), types: (tf.string, tf.int64)>


In [7]:
# Preprocessing pipeline for train and test data
train_data_encoded = train_data.map(one_hot_encoding)
test_data_encoded = test_data.map(one_hot_encoding)

In [8]:
# TODO: this can be deleted, only to take a look 
this = train_data_encoded.take(1)

for input, label in this:
  print(input)

tf.Tensor(
[[1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.
  1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.
  1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1.
  0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
  1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
  0. 0. 0. 1. 1. 0. 0. 0. 1

# Task 2: Model

In [18]:
class Model(Layer):

  def __init__(self):
    # Call super-class (of Model)
    super(Model, self).__init__()

    # Get activation functions
    SIGMOID = tf.keras.activations.sigmoid
    SOFTMAX = tf.keras.activations.softmax

    # Define fully connected (Dense) layers: two hidden layers with 256 units each, one output layer
    self.hidden_layer_1 = tf.keras.layers.Dense(256, activation=SIGMOID)
    self.hidden_layer_2 = tf.keras.layers.Dense(256, activation=SIGMOID)

    # Using softmax as output for network because we have 10 categories
    self.output_layer = tf.keras.layers.Dense(10, activation=SOFTMAX)

  
  @tf.function
  def call(self, x):
    # Define the forward step through two hidden layers to output layer
    x = self.hidden_layer_1(x)
    x = self.hidden_layer_2(x)
    x = self.output_layer(x)
    return x


# Task 3: Training

In [17]:
# Define parameters
num_epochs = 10
learning_rate = 0.001
running_average_factor = 0.95
LOSS = tf.keras.losses.categorical_crossentropy
OPTIMIZER = tf.keras.optimizers.SGD(learning_rate)

# Initialize model
model = Model()

# Lists to store losses and accuracies for training and test steps
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

In [12]:
def train_step(model, input, target, loss_function, optimizer):
  """
  One training step on a given model with specified input, target, loss function 
  and optimizer.
  """
  with tf.GradientTape() as tape:
    prediction = model(input)
    loss = loss_function(target, prediction)
    gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  return loss

def test(model, test_data, loss_function):
  """
  Tests model on test data to receive loss and accuracy on previously unseen 
  data.
  """

  test_accuracy_aggregator = []
  test_loss_aggregator = []

  for (input, target) in test_data:
    prediction = model(input)
    sample_test_loss = loss_function(target, prediction)
    sample_test_accuracy = np.argmax(target, axis=1) == np.argmax(prediction, axis=1)
    sample_test_accuracy = np.mean(sample_test_accuracy)
    test_loss_aggregator.append(sample_test_loss.numpy())
    test_accuracy_aggregator.append(np.mean(sample_test_accuracy))

    test_loss = np.mean(test_loss_aggregator)
    test_accuracy = np.mean(test_accuracy_aggregator)

    return test_loss, test_accuracy

In [13]:
import time

for epoch in range(num_epochs):

  start = time.time()
  print('Start epoch ' + str(epoch) + '...')

  running_average = 0

  for i, (input, target) in enumerate(train_data_encoded):
    train_loss = train_step(model, input, target, LOSS, OPTIMIZER)
    running_average = running_average_factor * running_average + (1 - running_average_factor) * train_loss
  
  train_losses.append(running_average)

  test_loss, test_accuracy = test(model, test_data_encoded, LOSS)
  test_losses.append(test_loss)
  test_accuracies.append(test_accuracy)

  # Measure time required for each epoch
  print('... finished after ' + str(time.time() - start) + ' seconds')


Start epoch 0...
... finished after 67.1598310470581 seconds
Start epoch 1...
... finished after 65.94725322723389 seconds
Start epoch 2...


KeyboardInterrupt: ignored

In [14]:
test_accuracies

[1.0, 0.0]