# Study the Generative Model (GAN) for miRNA hairpin in Tensorflow

## Prepare the necessary packages

In [1]:
import tensorflow as tf
tf.enable_eager_execution()
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import random

from IPython import display
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Load the dataset
Here we use RNA data as our first practice. In miRNA hairpin database, most of the hairpin lengths are 60nt. Therefore we only take sequences which lengths are 60nt to do the training. For each sequence, the matrix is [[0,0,1,0,0]] which represent the [A,C,G,U,N] matrix. N means if the RNA has label than A, C, G, U, it will be labeled N.

In [0]:
s = glob.glob('/content/drive/My Drive/Colab Notebooks/*')

train_RNA = np.load('/content/drive/My Drive/Colab Notebooks/table_hairpin60_s.npy')


Now we can see the shape of RNA is 44455 sequence, with 60 nts and each length has 5 types of nucleotides.

In [3]:
train_RNA.shape

(44455, 60, 4)

In [4]:
train_RNA[1]

array([[1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [1, 0,

To feed the matrix to our model, we need to add one depth to train our model and change the type to float32. After reshape and change the type we need to normalize the images to the value between (-1,1)
Because the grayscale is 0-255, we will substract 127.5 which is (255/2) and divide them by 127.5

In [0]:
#train_RNA = train_RNA.reshape(train_RNA.shape[0], 60, 5, 1).astype('float32')
train_RNA = train_RNA.astype('float32')

In [0]:
train_RNA.shape

(44455, 60, 4)

In [0]:
BUFFER_SIZE = 5000
BATCH_SIZE = 256

### Use tf.data to create batches and shuffle the dataset

In [0]:
train_dataset = tf.data.Dataset.from_tensor_slices(train_RNA).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

## Create the model
We will use tf.keras.Sequential API to construct our model

###The Generator Model


In [0]:
def make_generator_model():
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Flatten(input_shape=(240,)))
  model.add(tf.keras.layers.Dense(4*60, activation=tf.nn.relu))
  model.add(tf.keras.layers.Dropout(0.2))
  model.add(tf.keras.layers.Dense(200, activation=tf.nn.relu))
  model.add(tf.keras.layers.Dropout(0.2))
  model.add(tf.keras.layers.Dense(4*60, activation=tf.nn.relu))
  model.add(tf.keras.layers.Reshape((60,4)))
  return model

##The Discriminator model

The discriminator is responsible for distinguishing fake images from real images. It's similar to a regular CNN-based image classifier.

In [0]:
def make_discriminator_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Flatten(input_shape=(60, 4)))
    model.add(tf.keras.layers.Dense(240, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(120, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(60, activation=tf.nn.relu))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(1, activation=tf.nn.softmax))     
    return model

##Define the loss functions and the optimizer

Let's define the loss functions and the optimizers for the generator and the discriminator.

###Generator loss¶
The generator loss is a sigmoid cross entropy loss of the generated RNA matrix and an array of ones, since the generator is trying to generate fake RNAs that resemble the real RNAs.


In [0]:
def generator_loss(generated_output):
    return tf.losses.sigmoid_cross_entropy(tf.ones_like(generated_output), generated_output)
  #Study the function tf.ones_like()

### Discriminator loss
The discriminator loss function takes two inputs: real RNA, and generated RNAs. Here is how to calculate the discriminator loss:

Calculate real_loss which is a sigmoid cross entropy loss of the real RNAs and an array of ones (since these are the real RNAs).
Calculate generated_loss which is a sigmoid cross entropy loss of the generated RNAs and an array of zeros (since these are the fake RNAs).
Calculate the total_loss as the sum of real_loss and generated_loss.

$$
Total loss = real loss + generate loss
$$

In [0]:
def discriminator_loss(real_output, generated_output):
    # [1,1,...,1] with real output since it is true and we want our generated examples to look like it
    real_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.ones_like(real_output), logits=real_output)

    # [0,0,...,0] with generated images since they are fake
    generated_loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=tf.zeros_like(generated_output), logits=generated_output)

    total_loss = real_loss + generated_loss

    return total_loss

### Setting the optimizers
The discriminator and the generator optimizers are different since we will train two networks separately.

In [0]:
generator_optimizer = tf.train.AdamOptimizer(1e-4)
discriminator_optimizer = tf.train.AdamOptimizer(1e-4)

In [0]:
generator = make_generator_model()
discriminator = make_discriminator_model()

In [0]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

## Set GAN for Training

In [16]:
EPOCHS = 50
noise_dim = 240 #create some noize
num_examples_to_generate = 5

def generate_random_vector(noise_dim, num_examples_to_generate):
  #In the tutorial, we will re-use this random vector used to seed the generator
  # it will be easier to see the improvement over time
  random_vec = []
  for i in range(num_examples_to_generate):
    for j in range(60):
      zero_matrix = [0,0,0,0]
      loc = random.randint(0,3)
      zero_matrix[loc] = 1
      random_vec.append(zero_matrix)
  random_vector_for_generation = tf.convert_to_tensor(np.array(random_vec),
                                                      dtype=tf.float32)
  random_vector_for_generation = tf.reshape(random_vector_for_generation,(num_examples_to_generate,noise_dim))
  return random_vector_for_generation

random_vector_for_generation = generate_random_vector(noise_dim, num_examples_to_generate)
#random_vector_for_generation = abs(tf.random_normal([num_examples_to_generate,
#                                                 noise_dim]))
random_vector_for_generation.shape

TensorShape([Dimension(5), Dimension(240)])

### Define training method
We start by iterating over the dataset. The generator is given a random vector as an input which is processed to output an RNA looking like a real miRNA. The discriminator is then shown the real miRNA sequence as well as the generated sequence.

Next, we calculate the generator and the discriminator loss. Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables.

### Review
tf.GradientTape()
generator()
discriminator

In [0]:
def train_step(images):
   # generating noise from a normal distribution
      noise = tf.random_normal([BATCH_SIZE, noise_dim])
      
      with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)
      
        real_output = discriminator(images, training=True)
        generated_output = discriminator(generated_images, training=True)
         
        gen_loss = generator_loss(generated_output)
        disc_loss = discriminator_loss(real_output, generated_output)
        
      gradients_of_generator = gen_tape.gradient(gen_loss, generator.variables)
      gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.variables)
      
      generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.variables))
      discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.variables))

In [0]:
def train(dataset, epochs):  
  for epoch in range(epochs):
    start = time.time()
    print('Start epoch {}'.format(epoch))
    for images in dataset:
      train_step(images)
    #display.clear_output(wait=True)
    
    # saving (checkpoint) the model every 15 epochs
    if (epoch + 1) % 5 == 0:
      #  checkpoint.save(file_prefix = checkpoint_prefix)
      generate_and_save_RNA(generator,epochs,random_vector_for_generation)
      
    
    print ('Time taken for epoch {} is {} sec'.format(epoch + 1,
                                                      time.time()-start))
    
  # generating after the final epoch
  #display.clear_output(wait=True)
 # generate_and_save_RNA(generator,
 #                          epochs,
 #                          random_vector_for_generation)

In [0]:
def generate_and_save_RNA(model, epoch, test_input):
  # make sure the training parameter is set to False because we
  # don't want to train the batchnorm layer when doing inference.
  predictions = model(test_input, training=False)
  predictions = tf.keras.backend.eval(predictions)
  predictions = np.array(predictions)
  sequence = np.array(['A','C','G','U'])
  
  for number in predictions:
    seq = ''
    for list_label in number:
      result = np.where(list_label == np.amax(list_label))
      seq = seq + sequence[result][0]
    print(seq)
  #print(predictions[0])
  #print(np.shape(predictions))

In [20]:
 
sequence = np.array(['A','C','G','U','N'])
test = [[0,0,0,1,0], [0,1,0,0,0]]
test2 = np.array(test)
seq = ''
for list_label in test2:
  result = np.where(list_label == np.amax(list_label))
  seq = seq + sequence[result][0]
seq

'UC'

## Now let's do the training
We will call the training step to do the training.
According to my experiment, the training completed in the first few steps. I will find other algorithms to improve the algorithms in the future.

In [21]:
%%time
train(train_dataset, EPOCHS)

Start epoch 0


W0731 22:11:35.757817 140134306776960 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Time taken for epoch 1 is 8.315684795379639 sec
Start epoch 1
Time taken for epoch 2 is 7.516160011291504 sec
Start epoch 2
Time taken for epoch 3 is 7.521599054336548 sec
Start epoch 3
Time taken for epoch 4 is 7.476722240447998 sec
Start epoch 4
UUGAGCAGCAAUGGUGCAGCUAGCCAAUUAAUUGGACCCACACGGGCAUGGAAGGCGCGA
UGGCUCAGAAUUAGACCAAGGAUCCACAUGGUAAAAACCCAUCCAUGAUAUAAAACUGGA
CAACGUAGAAAGGGAGCUGGGAGACCAACGACUUGUCUGAACCCUGCAUGUCAGACCGCG
CCGAGCAGCAACGGUGCACCAUUCCUGAUAUCCAGACUCUUGGAGGCAAGUUAAAUAAGG
UUACGCGCCGUGUUUGGAGUAAUUAAAAAAAAUAAACCGUACCGCGCAUAUCCGGUUAAU
Time taken for epoch 5 is 7.42682671546936 sec
Start epoch 5
Time taken for epoch 6 is 7.432483196258545 sec
Start epoch 6
Time taken for epoch 7 is 7.506712436676025 sec
Start epoch 7
Time taken for epoch 8 is 7.396493434906006 sec
Start epoch 8
Time taken for epoch 9 is 7.52196192741394 sec
Start epoch 9
UUGAGCAGCAAUGGUGCAGCUAGCCAAUUAAUUGGACCCACACGGGCAUGGAAGGCGCGA
UGGCUCAGAAUUAGACCAAGGAUCCACAUGGUAAAAACCCAUCCAUGAUAUAAAACUGGA
CAACGUAGAAAGGGAGC

In [23]:
seed = generate_random_vector(240,20)
generate_and_save_RNA(generator,1,seed)

CAACGCAACAUGGCUGGAGCUAGGCCGAAGACUAGAUUGAUAGAGGCCUGUCAGGUGCCA
GCAUUCAGCACGAGUGCAGUGCUCAUCAUGGAUUGACUGACAGCAGCAUGUCUUCCGAUG
UCACAUAGCAAGCUUUCAGUGCUUAAAGUAUCUUAGUCCAUUAUGGCAUGUCUGGUUAGA
AAACUCGGCGAUUUUGCAGUAAGUCUAAUGGCUUUACAAACUCGGUCCGAUCUGGUGGCG
CGGCUCGGCGAGAUGCCCCUAUAUCUGGUGCCUGUACCCGUCCGACCAUGUCAGGUUGGG
UUAAACAUCUAGGUAGAAGCGAGUCUAAUGGCGGGACCCAUCCACGCUUGUCUGGAGCAA
UAACGCGCCGUGUCAAAAGCUAGGCAAAUAACUUGCCCGUCAGGAUCAUGUCUGACUCGU
CAAAGUGGCAUGGGCGGACUGAUUCUAAUGGCUGGACCGAAACUGUCAUGACUGGCGAUG
UUGCGCAACAAGAGAGUCGCUUGCCUAAUAUCUAAACCGGAACGGACAUGUCAGAUGAGU
UAACGCAACGUGGGUGAAGUUAUCCAAAUGGAUUGACCGAUCACGGCAUAUCAGGCGAAU
AUACGCGACAUGGUACAACUUAGCAAAAAAGCUGGAACCCCUGCAACAAGUCAGGUUAGG
UUGCGCACCAAGGGACUCCCAAGACAAAUGCCUAGAUCCUAACCGCCAUGUCAGAUGAGG
UGGACCGGAGAGGUUAGCGCAAGGAAAACGUCUGUUCAUAUACGAGCUUGUUAGGUGAGG
UAAUGCGUCACGGGUGCCCCGCUACACAUGACAAGACACACAGCAGGAUGUCAUUCGAUG
UCGAGCAGCAAGAACGCAGGAAGUAUAAUGACUUGACUCAAACCGGCAUAUCGAAUUAGG
UCGCACAAGGCGGGUUGACCGAUCCUAAUGAUUUGACCCAUGCGGUCCUCUCAAAUGAGA
GUACAUAACAAGAGUGCAGGGCUU