In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
import pickle
import time
%matplotlib notebook

In [2]:
with open('screen_info.txt','rb') as fl:
    t = pickle.load(fl)
fnames = t[0]
totf = t[1]
binf = t[2]
runfile = 2
fname = fnames[runfile]
bf = binf[runfile]

In [3]:
path = os.getcwd() + '/bioassay-datasets/'
p_fingerprints = []
c_fingerprints = []
labels = []
with open(path+fname+'red_train.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints.append(row[:bf])
        c_fingerprints.append(row[bf:-1])
        labels.append(row[-1])

In [4]:
p_fingerprints = np.asarray(p_fingerprints)[1:]
p_fingerprints = p_fingerprints.astype(int)
#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = p_fingerprints.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(p_fingerprints))
print("total no of 0s",no_examples*ip_dim-np.sum(p_fingerprints))

(7986, 121)
('total no of 1s', 42893)
('total no of 0s', 923413)


In [5]:
p_fingerprints[(p_fingerprints==0)] = -1

In [6]:
labels2 = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels2[i] = 1
    else:
        labels2[i] = 0
labels2 = labels2.astype(int)

In [7]:
no_active_ele = (sum(labels2))
print(no_active_ele)

[22]


In [8]:
X_dim = ip_dim
h1_dim = 500
h2_dim = 500
h3_dim = 500
z_dim = 60

In [9]:
def get_train_batch(batch_size):
    samples = np.random.randint(low=0,high=no_examples,size=(batch_size,1))
    train_batch = p_fingerprints[samples].reshape(batch_size,ip_dim)
    train_batch = train_batch.astype(int)
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    target = Variable(torch.cuda.FloatTensor(labels2[samples]),requires_grad=False)
    
    return train_batch,target

In [10]:
class encoder(nn.Module):
    def __init__(self):
        super(encoder,self).__init__()
        self.l1 = nn.Linear(X_dim,h1_dim)
        self.l2 = nn.Linear(h1_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h3_dim)
        self.l4 = nn.Linear(h3_dim,z_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = self.l4(x)
        
        return x

In [11]:
class decoder(nn.Module):
    def __init__(self):
        super(decoder,self).__init__()
        self.l1 = nn.Linear(z_dim,h3_dim)
        self.l2 = nn.Linear(h3_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h1_dim)
        self.l4 = nn.Linear(h1_dim,X_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = F.tanh(self.l4(x))
        
        return x

In [12]:
class disc(nn.Module):
    def __init__(self):
        super(disc,self).__init__()
        self.lin1 = nn.Linear(z_dim+2,500)
        self.lin2 = nn.Linear(500,100)
        self.lin3 = nn.Linear(100,100)
        self.lin4 = nn.Linear(100,30)
        self.lin5 = nn.Linear(30,1)
        
    def forward(self,x):
        x = F.selu(self.lin1(x))
        x = F.selu(self.lin2(x))
        x = F.selu(self.lin3(x))
        x = F.selu(self.lin4(x))
        x = F.sigmoid(self.lin5(x))
        return x

In [13]:
def add_label_info(y,batch_size):

    tmp = np.zeros((batch_size,2))
    tmp2 = np.zeros((batch_size,1))
    y = y.cpu().data.numpy().reshape(batch_size,1)
    tmp2[y==0] = 5
    tmp3 = np.zeros((batch_size,1))
    tmp3[y==1] = 5
    tmp = np.concatenate((tmp2,tmp3),1)
    label_info = torch.from_numpy((tmp)).cuda()
    return label_info

In [14]:
def train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size):
    
    for it in range(3500):
        x,y = get_train_batch(batch_size)
        z = Q(x)

        #Reconstruction
        
        x_recon = P(z)
        '''
        x_recon[x_recon<0] = 0
        x_recon[x_recon>0] = 1
        x_tar = Variable(torch.cuda.FloatTensor(x.size()),requires_grad=False)
        x_tar[x==-1] = 0
        x_tar[x==1] = 1'''
        
        criterion = nn.MSELoss()
        CEL = criterion(x_recon, x)
        
        CEL.backward(retain_graph=True)
        Q_solver.step()
        P_solver.step()
        
        Q.zero_grad()
        P.zero_grad()
        
        #Discriminator
        
        label_info = (add_label_info(y,batch_size))
        z_false = np.concatenate((z.cpu().data.numpy(),label_info.cpu().numpy()),1)
        z_false = Variable(torch.FloatTensor(z_false)).cuda()
        #z_false = torch.cat((z,label_info),1)
        z_true = np.random.rand(batch_size,z_dim)
        z_true = np.concatenate((z_true,label_info.cpu().numpy()),1)
        z_true = Variable(torch.FloatTensor(z_true).cuda())
        #z_true = torch.cat((z_true,label_info),1)
        z_true_op = Variable(D(z_true).data,requires_grad=False)
        
        z_false_op = D(z_false)
        add_small = 1e-20
        
        
        criterion = nn.BCELoss()
        loss_d = criterion(z_false_op,z_true_op)
        #loss_d = -torch.mean(torch.log(z_true_op + add_small) + torch.log(1 - z_false_op + add_small))
        loss_d.backward(retain_graph=True)
        D_solver.step()
        D.zero_grad()
        
        #Updating the encoder
        
        G_loss = -torch.mean(torch.log(z_false_op+1e-20))
        G_loss.backward(retain_graph=True)
        Q_solver.step()
        Q_solver.zero_grad()
        
        
        
        
        if(it%50==0):
            #print(extra_loss.data[0],CEL.data[0])
            print('recon_loss:', CEL.data[0],'disc_loss:', loss_d.data[0],'gen_loss: ',G_loss.data[0])
            #print(x_recon[0][:50].cpu().data.numpy().T)
            #print()
            #print(x[0][:50].cpu().data.numpy().T)
           # print()
    return Q,P

In [15]:
def generate_model():
    Q = encoder()
    Q.cuda()
    Q_solver = optim.Adam(Q.parameters(),lr=1e-4)
    E_solver = optim.Adam(Q.parameters(),lr = 1e-5)
    P = decoder()
    P.cuda()
    P_solver = optim.Adam(P.parameters(),lr = 1e-4)
    D = disc()
    D.cuda()
    D_solver = optim.Adam(D.parameters(),lr = 1e-3)
    batch_size = 120
    Q,P = train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size)
    
    return Q,P

In [16]:
Q,P = generate_model()

('recon_loss:', 0.9992547035217285, 'disc_loss:', 0.6928920745849609, 'gen_loss: ', 0.67038494348526)
('recon_loss:', 0.16585776209831238, 'disc_loss:', 0.00829379539936781, 'gen_loss: ', 3.362654024385847e-05)
('recon_loss:', 0.1497836858034134, 'disc_loss:', 0.040067605674266815, 'gen_loss: ', 0.002255338476970792)
('recon_loss:', 0.15118305385112762, 'disc_loss:', 0.015966640785336494, 'gen_loss: ', 0.00035635617678053677)
('recon_loss:', 0.1404351443052292, 'disc_loss:', 0.026898503303527832, 'gen_loss: ', 0.0032471977174282074)
('recon_loss:', 0.13914090394973755, 'disc_loss:', 0.00925043411552906, 'gen_loss: ', 0.00039606387144885957)
('recon_loss:', 0.14799293875694275, 'disc_loss:', 0.0125607093796134, 'gen_loss: ', 0.005583351012319326)
('recon_loss:', 0.12539042532444, 'disc_loss:', 0.008006248623132706, 'gen_loss: ', 0.0005909120081923902)
('recon_loss:', 0.13333380222320557, 'disc_loss:', 0.007734369020909071, 'gen_loss: ', 0.00038184135337360203)
('recon_loss:', 0.11660851

In [17]:
encoder_path = os.getcwd() + '/model_enc_' + str(fname)
torch.save(Q.state_dict(),encoder_path)

In [18]:
#entire_batch,batch_labels = get_train_batch(no_examples)
tic = time.time()
## It takes too much memory. Split in chunks and 
z_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints)))
toc = time.time()
print(toc-tic)

5.41320896149


In [19]:
generate_new_z = False

### generate new samples from orignal ones 

In [20]:
if generate_new_z == True:
    n_comb = 5
    n_samples = 100
    extra_samples = torch.cuda.FloatTensor(n_samples,z_dim)
    extra_labels = Variable(torch.ones(n_samples).cuda())
    for i in range(n_samples):
        #coeff = np.random.rand(n_comb,1)
        coeff = Variable(torch.randn(n_comb,1)).cuda()
        active_z_encoded = z_encoded[torch.cuda.FloatTensor(labels2)==1]
        tmp_rand_nos = torch.randperm(int(no_active_ele))
        rand_nos = tmp_rand_nos[0:n_comb].cuda()
        rand_z = torch.transpose(z_encoded[rand_nos],0,1)
        extra_samples[i] = torch.cuda.FloatTensor(torch.matmul(rand_z,coeff).data)
    extra_samples = Variable(extra_samples)
    
    new_z_encoded = torch.cat((z_encoded,extra_samples),0)
    new_labels = torch.cat((Variable(torch.cuda.FloatTensor(labels2)),extra_labels),0)
    perm = torch.randperm(no_examples+n_samples).cuda()
    new_z_encoded = new_z_encoded[perm]
    new_labels = new_labels[perm]
    batch_labels_np = new_labels.cpu().data.numpy()

In [21]:
# x_encoded = z_encoded.cpu().data.numpy()[:,0]
# y_encoded = z_encoded.cpu().data.numpy()[:,1]
# w_encoded = z_encoded.cpu().data.numpy()[:,2]

# # batch_labels_np = batch_labels_np.astype(int)
# # print(batch_labels_np.dtype)
# # print(batch_labels_np.shape)
# batch_labels_np = list(labels2)

# colors = []
# for l in batch_labels_np:
#     colors.append("C"+str(int(l)))
    
# #plt.scatter(x_encoded,y_encoded,c=colors)
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(x_encoded,y_encoded,w_encoded,c=colors)
# plt.show()

In [22]:
def sample_z(size):
    if generate_new_z == True:
        print("gng here")
        ind = torch.cuda.LongTensor(torch.randperm(no_examples+n_samples)[:size].numpy())
        return new_z_encoded[ind], new_labels[ind]
    else:
        ind = torch.cuda.LongTensor(torch.randperm(no_examples)[:size].numpy())
        return z_encoded[ind], Variable(torch.cuda.LongTensor(labels2)[ind],requires_grad = False)

In [23]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator,self).__init__()
        self.l1 = nn.Linear(z_dim,800)
        self.l2 = nn.Linear(800,500)
        self.l3 = nn.Linear(500,400)
        self.l4 = nn.Linear(400,70)
        self.l5 = nn.Linear(70,2)
        
    def forward(self,x):
        x = F.selu(self.l1(x))
        x = F.selu(self.l2(x))
        x = F.selu(self.l3(x))
        x = F.relu(self.l4(x))
        x = (self.l5(x))
        
        return x

In [24]:
def gen_disc_model(w):
    d = Discriminator().cuda()
    d_optim = optim.Adam(d.parameters(),lr=1e-4)
    d = train_disc(d,d_optim,w)
    return d

In [25]:
def train_disc(d,d_optim,w):
    for ep in range(2000):
        d_optim.zero_grad()
        x,true_l = sample_z(200)
        true_l = true_l.view(true_l.size()[0],)
        p_labels = d(x)
        weights = torch.Tensor([1,w]).cuda()
        criteria = nn.CrossEntropyLoss(weight=weights)
        true_l = true_l.type(torch.cuda.LongTensor)
        loss = criteria(p_labels,true_l)
        loss.backward(retain_graph=True)
        d_optim.step()
        
#         if(ep%50==49):
#             print(loss.data[0])
            
    return d

In [26]:
weights = np.linspace(5,30,25)
# with open("cnt_test_good)weights.txt",'rb') as f:
#     weights = pickle.load(f)
# print(weights)

In [None]:
# if generate_new_z == True:
#     train_encoded = (new_z_encoded)
#     labels_final = batch_labels_np
# else:
train_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints)))
labels_final = labels2
fn_min  = 48
    
cm_autoencoder = []
cm_autoencoder.append(weights)

In [None]:
model_path = os.getcwd() + '/model_autoencoder_' + str(fname)
for w in weights:
    print("w: ",w)
    d = gen_disc_model(w)
    train_op = d(train_encoded).cpu().data.numpy()
    train_op = np.argmax(train_op,axis=1)
    cf = metrics.confusion_matrix(labels_final,train_op)
    [tn, fp, fn, tp]  = cf.ravel()
    print('tn, fp, fn, tp: ',cf.ravel())
    if(fn < fn_min):
        fn_min = fn
        torch.save(d.state_dict(),model_path)
        print("saving model on weight: ",w)

('w: ', 5.0)
('tn, fp, fn, tp: ', array([7955,    9,    9,   13]))
('saving model on weight: ', 5.0)
('w: ', 6.041666666666667)
('tn, fp, fn, tp: ', array([7955,    9,   10,   12]))
('w: ', 7.0833333333333339)
('tn, fp, fn, tp: ', array([7943,   21,    8,   14]))
('saving model on weight: ', 7.0833333333333339)
('w: ', 8.125)


In [None]:
with open("autoencoder_arti_ex_cm",'wb') as f:
    pickle.dump(cm_autoencoder,f)

# Check on Training Data 

In [None]:
cm_autoencoder

In [None]:
cm_autoencoder

In [None]:
import pickle

In [None]:
with open("autoencoder_1.txt",'wb') as fb:
    pickle.dump(cm_autoencoder,fb)