### GANs for RNA Sequences

In [1]:
import os 
import torch
import torch.nn as nn 
import torch.optim as optim
import torch.utils.data
from torch.nn.modules.pooling import MaxPool2d

import numpy as np 
import matplotlib.pyplot as plt
import math
import pickle
import pandas as pd
from pathlib import Path
from google.colab import drive
import random

# Stop immediately if no CUDA device is visible to PyTorch.
assert torch.cuda.is_available()
device = "cuda"

# Mount Google Drive so files under /content/drive become readable.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Project folder on the mounted Google Drive; dataset paths are built from it.
root_dir = '/content/drive/MyDrive/ML4FG/Project' 

In [4]:
rnbs_HNRNPK_dir = Path(root_dir + '/datasets/rnbs_HNRNPK')

# Read the gzipped file as a headerless, tab-separated table with a single
# column that we name 'Sequence'.
HNRNPK_0 = pd.read_csv(
    rnbs_HNRNPK_dir / 'HNRNPK_0.fasta.gz',
    sep='\t',
    header=None,
    names=['Sequence'],
)
HNRNPK_0

Unnamed: 0,Sequence
0,AGGACCTGACCATACGATGA
1,ACTAAATCTCATGCAGGATA
2,TTATAGCCCGATTAGGAGGG
3,ATCGCCAATTGTCTTCAGAT
4,CATTATCGCTTTATACAACT
...,...
11509893,ACAGTCTGGTTTCGAACGCG
11509894,ACGAAATCTAGAGTAACTTA
11509895,GCCATGAGATCATACGTCTT
11509896,ACTCCACCGTTAGTATCCTT


Generator: maps a noise tensor of shape [batchsize, 100, 1, 1] to generated RNA sequences of shape [batchsize, 20].
  
Discriminator: maps an input of shape [batchsize, 20] to an output of shape [batchsize, 1] (a sigmoid score, trained towards 1 for real and 0 for generated sequences).

In [28]:
class DCGAN_Generator(nn.Module):
    """Generator: noise [batchsize, 100, 1, 1] -> output [batchsize, 20].

    Three transposed-convolution stages (each followed by batch norm and
    LeakyReLU) grow the 1x1 noise map to 4x4, 8x8 and 16x16. A last
    transposed convolution produces one 28x28 map, which is flattened
    (784 values) and projected by a linear layer to 20 values that a sigmoid
    squashes into the 0..1 range.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, stride, padding) of each normalised
        # up-sampling stage; all of them use a 4x4 kernel.
        stage_specs = (
            (100, 256, 1, 0),   # 1x1  -> 4x4
            (256, 128, 2, 1),   # 4x4  -> 8x8
            (128, 64, 2, 1),    # 8x8  -> 16x16
        )

        layers = []
        for c_in, c_out, stride, pad in stage_specs:
            layers.append(nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=stride, padding=pad))
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2))

        # Output head: 16x16 -> 28x28 single-channel map -> 784 -> 20 -> sigmoid.
        layers.append(nn.ConvTranspose2d(64, 1, kernel_size=4, stride=2, padding=3))
        layers.append(nn.Flatten())
        layers.append(nn.Linear(784, 20))
        layers.append(nn.Sigmoid())

        # Kept as one Sequential under the same attribute name so the
        # state_dict keys ('generator.<index>...') stay unchanged.
        self.generator = nn.Sequential(*layers)

    def forward(self, input):
        """Run the noise batch through the generator stack."""
        return self.generator(input)


class DCGAN_Discriminator(nn.Module):
    """Discriminator: input [batchsize, 20] -> score [batchsize, 1].

    The 20 input values of each sample are viewed as a [5, 2, 2] tensor,
    passed through two strided convolutions (each followed by batch norm and
    LeakyReLU), flattened to 128 features and mapped by a linear layer plus
    sigmoid to a single value between 0 and 1.
    """

    def __init__(self):
        super().__init__()

        layers = [
            # [B, 5, 2, 2] -> [B, 64, 2, 2]
            nn.Conv2d(5, 64, kernel_size=4, stride=4, padding=3),
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2),
            # [B, 64, 2, 2] -> [B, 128, 1, 1]
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=1),
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2),
            # [B, 128, 1, 1] -> [B, 128] -> [B, 1]
            nn.Flatten(),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]

        # Kept as one Sequential under the same attribute name so the
        # state_dict keys ('discriminator.<index>...') stay unchanged.
        self.discriminator = nn.Sequential(*layers)

    def forward(self, input):
        """Reshape each sample to [5, 2, 2] and score it."""
        as_grid = input.view(-1, 5, 2, 2)
        return self.discriminator(as_grid)


# Smoke test: push dummy batches through both networks and enforce the
# expected output shapes instead of only stating them in comments.
g=DCGAN_Generator()
batchsize=2
z=torch.zeros((batchsize, 100, 1, 1))
out = g(z)
print(out.size()) # should be size [batchsize, 20]
assert out.size() == (batchsize, 20), "generator output must be [batchsize, 20]"


d=DCGAN_Discriminator()
x=torch.zeros((batchsize, 20))
out = d(x)
print(out.size()) # should be size [batchsize, 1]
assert out.size() == (batchsize, 1), "discriminator output must be [batchsize, 1]"

torch.Size([2, 20])
torch.Size([2, 1])


In [29]:
import torch

def loss_discriminator(D, real, G, noise, Valid_label, Fake_label, criterion, optimizerD):
    '''
    Compute the discriminator loss for one batch.

    Steps:
      1. Score the real batch with D and compare it against Valid_label.
      2. Generate a fake batch with G(noise).
      3. Score the *detached* fake batch with D and compare it against
         Fake_label (detaching keeps this loss from back-propagating into G).
      4. Add the two terms.

    Args:
        D: discriminator, called as D(batch).
        real: batch of real samples fed to D.
        G: generator, called as G(noise).
        noise: input fed to G.
        Valid_label: targets for the real batch; expected to be 1-D with one
            entry per sample.
        Fake_label: targets for the fake batch; same layout as Valid_label.
        criterion: loss callable used as criterion(prediction, target).
        optimizerD: unused here; kept so existing callers keep working.

    Returns:
        (loss_D, fake_imgs): the summed loss and the non-detached generator
        output, which the caller can reuse for the generator loss.
    '''

    # reshape(-1) instead of squeeze(): squeeze() turns a [1, 1] output into a
    # 0-d tensor, which no longer matches a 1-element label vector and makes
    # BCELoss raise for a batch of one (e.g. a trailing partial batch).
    real_scores = D(real).reshape(-1)
    real_loss = criterion(real_scores, Valid_label)

    fake_imgs = G(noise)
    fake_scores = D(fake_imgs.detach()).reshape(-1)
    fake_loss = criterion(fake_scores, Fake_label)

    loss_D = real_loss + fake_loss

    return loss_D, fake_imgs

def loss_generator(netD, netG, fake, Valid_label, criterion, optimizerG):
    '''
    Compute the generator loss for one batch.

    The fake batch is scored by the discriminator and compared against the
    *valid* labels, so the loss is small when the discriminator rates the
    generated samples as real.

    Args:
        netD: discriminator, called as netD(fake).
        netG: unused here; kept so existing callers keep working.
        fake: generator output; it must still be attached to the generator's
            graph for the loss to produce generator gradients.
        Valid_label: targets; expected to be 1-D with one entry per sample.
        criterion: loss callable used as criterion(prediction, target).
        optimizerG: unused here; kept so existing callers keep working.

    Returns:
        loss_G: the loss value returned by criterion.
    '''

    # reshape(-1) instead of squeeze(): squeeze() turns a [1, 1] output into a
    # 0-d tensor, which no longer matches a 1-element label vector and makes
    # BCELoss raise for a batch of one.
    fake_scores = netD(fake).reshape(-1)
    loss_G = criterion(fake_scores, Valid_label)

    return loss_G
    

In [30]:
import torchvision.utils as vutils
from torch.optim.lr_scheduler import StepLR
import pdb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

netG = DCGAN_Generator().to(device)
netD = DCGAN_Discriminator().to(device)

from torchsummary import summary
# summary() prints the layer table itself and returns nothing, so wrapping it
# in print() only appended a stray "None" line after each table.
summary(netG,(100, 1, 1))
summary(netD,(1, 20))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
   ConvTranspose2d-1            [-1, 256, 4, 4]         409,856
       BatchNorm2d-2            [-1, 256, 4, 4]             512
         LeakyReLU-3            [-1, 256, 4, 4]               0
   ConvTranspose2d-4            [-1, 128, 8, 8]         524,416
       BatchNorm2d-5            [-1, 128, 8, 8]             256
         LeakyReLU-6            [-1, 128, 8, 8]               0
   ConvTranspose2d-7           [-1, 64, 16, 16]         131,136
       BatchNorm2d-8           [-1, 64, 16, 16]             128
         LeakyReLU-9           [-1, 64, 16, 16]               0
  ConvTranspose2d-10            [-1, 1, 28, 28]           1,025
          Flatten-11                  [-1, 784]               0
           Linear-12                   [-1, 20]          15,700
          Sigmoid-13                   [-1, 20]               0
Total params: 1,083,029
Trainable param

In [None]:
import torchvision.utils as vutils
from torch.optim.lr_scheduler import StepLR
import pdb

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's RNG before anything random happens so that weight
# initialisation, fixed_noise and the per-batch noise are repeatable when the
# notebook is restarted and re-run.
manual_seed = 0
torch.manual_seed(manual_seed)

# Size of z latent vector (i.e. size of generator input)
nz = 100

# Create the generator and discriminator
netG = DCGAN_Generator().to(device)
netD = DCGAN_Discriminator().to(device)

# Initialize BCELoss function
criterion = nn.BCELoss()

# Create latent vector to test the generator performance
fixed_noise = torch.randn(36, nz, 1, 1, device=device)

# Establish convention for real and fake labels during training
real_label = 1
fake_label = 0

# Adam hyper-parameters shared by both optimizers
learning_rate = 0.0002
beta1 = 0.5

optimizerD = optim.Adam(netD.parameters(), lr=learning_rate, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=learning_rate, betas=(beta1, 0.999))

# Training bookkeeping: generator snapshots, per-iteration losses, counters
img_list = []
real_img_list = []
G_losses = []
D_losses = []
iters = 0
num_epochs = 50

  
def load_param(num_eps):
    """Restore the weights of netG and netD from '/content/gan_<num_eps>.pt'.

    Args:
        num_eps: value substituted into the checkpoint file name.

    The file is expected to hold a dict with the keys 'netG' and 'netD', each
    a state_dict for the corresponding network.
    """
    # map_location remaps tensors that were saved from the GPU onto the current
    # device, so the checkpoint also loads when this cell fell back to the CPU.
    model_saved = torch.load('/content/gan_{}.pt'.format(num_eps), map_location=device)
    netG.load_state_dict(model_saved['netG'])
    netD.load_state_dict(model_saved['netD'])

# GAN Training Loop
# Each iteration performs one discriminator update followed by one generator
# update on the same batch, then records both losses.
# NOTE(review): gan_train_loader is not defined in the visible cells of this
# notebook - confirm it is created before this cell runs.
for epoch in range(num_epochs):
    for i, data in enumerate(gan_train_loader, 0):
        # The first element of each batch is used as the real samples.
        real = data[0].to(device)
        b_size = real.size(0)
        noise = torch.randn(b_size, nz, 1, 1, device=device)

        # One target per sample: all ones for "real", all zeros for "fake".
        Valid_label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
        Fake_label = torch.full((b_size,), fake_label, dtype=torch.float, device=device)

        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        # `fake` is the generator output for this batch and is reused below.
        loss_D, fake = loss_discriminator(netD, real, netG, noise, Valid_label, Fake_label, criterion, optimizerD)

        # zero_grad() also discards the discriminator gradients left behind by
        # the previous iteration's generator backward pass.
        optimizerD.zero_grad()
        loss_D.backward()
        optimizerD.step()

        # (2) Update G network: maximize log(D(G(z)))
        # The fake batch is scored by the just-updated discriminator against the valid labels.
        loss_G = loss_generator(netD, netG, fake, Valid_label, criterion, optimizerG) 

        optimizerG.zero_grad()
        loss_G.backward()
        optimizerG.step()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\t'
                  % (epoch, num_epochs, i, len(gan_train_loader),
                     loss_D.item(), loss_G.item()))

        # Save Losses for plotting later
        G_losses.append(loss_G.item())
        D_losses.append(loss_D.item())

        # Check how the generator is doing by saving G's output on fixed_noise
        # (every 500 iterations and once more on the very last batch).
        # NOTE(review): make_grid is an image utility; confirm it is the
        # intended way to store the generator's sequence outputs.
        if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(gan_train_loader)-1)):
            with torch.no_grad():
                fake = netG(fixed_noise).detach().cpu()
            img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

        iters += 1

        

# Loss curves, one point per training iteration.
plt.plot(G_losses, label="G")
plt.plot(D_losses, label="D")

plt.title("Generator and Discriminator Loss During Training")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()


# Store both networks' weights in one file named after the epoch count.
checkpoint = {}
checkpoint['netG'] = netG.state_dict()
checkpoint['netD'] = netD.state_dict()
torch.save(checkpoint, 'gan_{}.pt'.format(num_epochs))
