In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
%matplotlib notebook

In [2]:
path = '/home/daiict/CVShare/Jeni/hts/bioassay-datasets/'
p_fingerprints = []
labels = []
with open(path+'AID362red_train.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints.append(row[:112])
        labels.append(row[-1])

In [3]:
p_fingerprints = np.asarray(p_fingerprints)[1:]
p_fingerprints = p_fingerprints.astype(int)
#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = p_fingerprints.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(p_fingerprints))
print("total no of 0s",no_examples*ip_dim-np.sum(p_fingerprints))

(3423, 112)
('total no of 1s', 25982)
('total no of 0s', 357394)


In [4]:
p_fingerprints[(p_fingerprints==0)] = -1

In [5]:
labels2 = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels2[i] = 1
    else:
        labels2[i] = 0

In [6]:
no_active_ele = (sum(labels2))
print(no_active_ele)

[ 48.]


In [7]:
X_dim = ip_dim
h1_dim = 500
h2_dim = 500
h3_dim = 500
z_dim = 50

In [8]:
def get_train_batch(batch_size):
    samples = np.random.randint(low=0,high=no_examples,size=(batch_size,1))
    train_batch = p_fingerprints[samples].reshape(batch_size,ip_dim)
    train_batch = train_batch.astype(int)
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    target = Variable(torch.cuda.FloatTensor(labels2[samples]),requires_grad=False)
    
    return train_batch,target

In [9]:
class encoder(nn.Module):
    def __init__(self):
        super(encoder,self).__init__()
        self.l1 = nn.Linear(X_dim,h1_dim)
        self.l2 = nn.Linear(h1_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h3_dim)
        self.l4 = nn.Linear(h3_dim,z_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = self.l4(x)
        
        return x

In [10]:
class decoder(nn.Module):
    def __init__(self):
        super(decoder,self).__init__()
        self.l1 = nn.Linear(z_dim,h3_dim)
        self.l2 = nn.Linear(h3_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h1_dim)
        self.l4 = nn.Linear(h1_dim,X_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = F.tanh(self.l4(x))
        
        return x

In [11]:
class disc(nn.Module):
    def __init__(self):
        super(disc,self).__init__()
        self.lin1 = nn.Linear(z_dim+2,300)
        self.lin2 = nn.Linear(300,100)
        self.lin3 = nn.Linear(100,30)
        self.lin4 = nn.Linear(30,1)
        
    def forward(self,x):
        x = self.lin1(x)
        x = F.leaky_relu(x)
        x = self.lin2(x)
        x = F.leaky_relu(x)
        x = F.leaky_relu(self.lin3(x))
        x = F.sigmoid(self.lin4(x))
        
        return x

In [12]:
def add_label_info(y,batch_size):

    tmp = np.zeros((batch_size,2))
    tmp2 = np.zeros((batch_size,1))
    y = y.cpu().data.numpy().reshape(batch_size,1)
    tmp2[y==0] = 5
    tmp3 = np.zeros((batch_size,1))
    tmp3[y==1] = 5
    tmp = np.concatenate((tmp2,tmp3),1)
    label_info = torch.from_numpy((tmp)).cuda()
    return label_info

In [13]:
def train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size):
    
    for it in range(2000):
        x,y = get_train_batch(batch_size)
        z = Q(x)

        #Reconstruction
        
        x_recon = P(z)
        '''
        x_recon[x_recon<0] = 0
        x_recon[x_recon>0] = 1
        x_tar = Variable(torch.cuda.FloatTensor(x.size()),requires_grad=False)
        x_tar[x==-1] = 0
        x_tar[x==1] = 1'''
        
        criterion = nn.MSELoss()
        CEL = criterion(x_recon, x)
        
        CEL.backward()
        Q_solver.step()
        P_solver.step()
        
        Q.zero_grad()
        P.zero_grad()
        
        #Discriminator
        
        label_info = (add_label_info(y,batch_size))
        z_false = np.concatenate((z.cpu().data.numpy(),label_info.cpu().numpy()),1)
        z_false = Variable(torch.FloatTensor(z_false).cuda())
        #z_false = torch.cat((z,label_info),1)
        z_true = np.random.rand(batch_size,z_dim)
        z_true = np.concatenate((z_true,label_info.cpu().numpy()),1)
        z_true = Variable(torch.FloatTensor(z_true).cuda())
        #z_true = torch.cat((z_true,label_info),1)
        z_true_op = Variable(D(z_true).data,requires_grad=False)
        
        z_false_op = D(z_false)
        add_small = 1e-20
        
        
        criterion = nn.BCELoss()
        loss_d = criterion(z_false_op,z_true_op)
        #loss_d = -torch.mean(torch.log(z_true_op + add_small) + torch.log(1 - z_false_op + add_small))
        loss_d.backward(retain_variables = True)
        D_solver.step()
        D.zero_grad()
        
        #Updating the encoder
        
        G_loss = -torch.mean(torch.log(z_false_op+1e-20))
        G_loss.backward()
        Q_solver.step()
        Q_solver.zero_grad()
        
        
        
        
        if(it%50==0):
            #print(extra_loss.data[0],CEL.data[0])
            print('recon_loss:', CEL.data[0],'disc_loss:', loss_d.data[0],'gen_loss: ',G_loss.data[0])
            #print(x_recon[0][:50].cpu().data.numpy().T)
            #print()
            #print(x[0][:50].cpu().data.numpy().T)
           # print()
    return Q,P

In [14]:
def generate_model():
    Q = encoder()
    Q.cuda()
    Q_solver = optim.Adam(Q.parameters(),lr=1e-4)
    E_solver = optim.Adam(Q.parameters(),lr = 1e-5)
    P = decoder()
    P.cuda()
    P_solver = optim.Adam(P.parameters(),lr = 1e-4)
    D = disc()
    D.cuda()
    D_solver = optim.Adam(D.parameters(),lr = 1e-4)
    batch_size = 120
    Q,P = train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size)
    
    return Q,P

In [15]:
Q,P = generate_model()

('recon_loss:', 0.9965453147888184, 'disc_loss:', 0.6874924898147583, 'gen_loss: ', 0.5845605731010437)
('recon_loss:', 0.22615621984004974, 'disc_loss:', 0.7811915278434753, 'gen_loss: ', 0.2230936586856842)
('recon_loss:', 0.20745374262332916, 'disc_loss:', 0.7680944800376892, 'gen_loss: ', 0.22499129176139832)
('recon_loss:', 0.20160159468650818, 'disc_loss:', 0.7696407437324524, 'gen_loss: ', 0.2243676632642746)
('recon_loss:', 0.17638222873210907, 'disc_loss:', 0.7617305517196655, 'gen_loss: ', 0.22113513946533203)
('recon_loss:', 0.14872747659683228, 'disc_loss:', 0.745427131652832, 'gen_loss: ', 0.19750909507274628)
('recon_loss:', 0.14413218200206757, 'disc_loss:', 0.7322324514389038, 'gen_loss: ', 0.21477489173412323)
('recon_loss:', 0.12986476719379425, 'disc_loss:', 0.7394369840621948, 'gen_loss: ', 0.20232196152210236)
('recon_loss:', 0.10781777650117874, 'disc_loss:', 0.741156816482544, 'gen_loss: ', 0.19699278473854065)
('recon_loss:', 0.10728238523006439, 'disc_loss:', 0

In [16]:
entire_batch,batch_labels = get_train_batch(no_examples)
z_encoded = Q(entire_batch)
batch_labels_np = batch_labels.cpu().data.numpy()

In [17]:
'''x_encoded = z_encoded.cpu().data.numpy()[:,0]
y_encoded = z_encoded.cpu().data.numpy()[:,1]
w_encoded = z_encoded.cpu().data.numpy()[:,2]

batch_labels_np = batch_labels_np.astype(int)
print(batch_labels_np.dtype)
print(batch_labels_np.shape)
batch_labels_np = list(batch_labels_np.reshape(no_examples,1))

colors = []
for l in batch_labels_np:
    colors.append("C"+str(int(l)))
    
#plt.scatter(x_encoded,y_encoded,c=colors)
fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(x_encoded,y_encoded,w_encoded,c=colors)
plt.show()'''

'x_encoded = z_encoded.cpu().data.numpy()[:,0]\ny_encoded = z_encoded.cpu().data.numpy()[:,1]\nw_encoded = z_encoded.cpu().data.numpy()[:,2]\n\nbatch_labels_np = batch_labels_np.astype(int)\nprint(batch_labels_np.dtype)\nprint(batch_labels_np.shape)\nbatch_labels_np = list(batch_labels_np.reshape(no_examples,1))\n\ncolors = []\nfor l in batch_labels_np:\n    colors.append("C"+str(int(l)))\n    \n#plt.scatter(x_encoded,y_encoded,c=colors)\nfig = plt.figure()\nax = Axes3D(fig)\nax.scatter(x_encoded,y_encoded,w_encoded,c=colors)\nplt.show()'

In [18]:
def sample_z(size):
    ind = torch.cuda.LongTensor(torch.randperm(no_examples)[:size].numpy())
    return z_encoded[ind], batch_labels[ind]

In [19]:
def train_disc(d,d_optim):
    for ep in range(2000):
        d_optim.zero_grad()
        x,true_l = sample_z(50)
        true_l = true_l.view(true_l.size()[0],)
        p_labels = d(x)
        weights = torch.Tensor([1,10]).cuda()
        criteria = nn.CrossEntropyLoss(weight=weights)
        true_l = true_l.type(torch.cuda.LongTensor)
        loss = criteria(p_labels,true_l)
        loss.backward(retain_variables = True)
        d_optim.step()
        
        if(ep%50==49):
            print(loss.data[0])
            
    return d

In [20]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator,self).__init__()
        self.l1 = nn.Linear(z_dim,400)
        self.l2 = nn.Linear(400,400)
        self.l3 = nn.Linear(400,70)
        self.l4 = nn.Linear(70,2)
        
    def forward(self,x):
        x = F.tanh(self.l1(x))
        x = F.tanh(self.l2(x))
        x = F.tanh(self.l3(x))
        x = (self.l4(x))
        
        return x

In [21]:
def gen_disc_model():
    d = Discriminator()
    d.cuda()
    d_optim = optim.Adam(d.parameters(),lr=1e-4)
    d = train_disc(d,d_optim)
    return d

In [22]:
d = gen_disc_model()

0.484246611595
0.197930753231
0.206379532814
0.13697257638
0.102194577456
0.535872042179
0.174371153116
0.27784961462
0.283189356327
0.260330915451
0.204557299614
0.135556429625
0.0916865393519
0.0415781922638
0.130641147494
0.176254764199
0.149524062872
0.218980535865
0.111575298011
0.037403807044
0.0844438448548
0.121269144118
0.0809474438429
0.0588055811822
0.0393026173115
0.043043538928
0.0594484135509
0.0616584680974
0.133588597178
0.0627502799034
0.0601399801672
0.175496920943
0.186965689063
0.488889932632
0.0300031118095
0.0738388374448
0.107866667211
0.0545143298805
0.0103065669537
0.0345447957516


# Check on Training Data 

In [23]:
#entire_batch,batch_labels = get_train_batch(no_examples)
train_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints)))
train_op = d(train_encoded)

In [24]:
#print(train_op.cpu().data.numpy())

In [25]:
train_op = train_op.cpu().data.numpy()
train_op = np.argmax(train_op,axis=1)
#batch_labels = batch_labels.view(batch_labels.size()[0],1)
#batch_labels = batch_labels.data.cpu().numpy()
#print(batch_labels)
#batch_labels = batch_labels.astype(int)
cf = metrics.confusion_matrix(l,train_op)
print('tn, fp, fn, tp: ',cf.ravel())

('tn, fp, fn, tp: ', array([3324,   52,   16,   31]))


In [26]:
print(train_op)

[0 1 0 ..., 0 0 0]


In [28]:
print(sum(batch_labels))

[ 47.]
