In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
import pickle
import time
import sampling_with_data_cleaning as sdc
%matplotlib notebook

In [2]:
with open('screen_info.txt','rb') as fl:
    t = pickle.load(fl)
fnames = t[0]
totf = t[1]
binf = t[2]
runfile = 0
fname = fnames[runfile]
bf = binf[runfile]

In [3]:
path = os.getcwd() + '/bioassay-datasets/'
p_fingerprints = []
c_fingerprints = []
labels = []
with open(path+fname+'red_train.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints.append(row[:bf])
        c_fingerprints.append(row[bf:-1])
        labels.append(row[-1])

In [4]:
p_fingerprints = np.asarray(p_fingerprints)[1:]
print(p_fingerprints.shape)
print(p_fingerprints[1:5,-1])

(3423, 112)
['0' '0' '1' '0']


In [5]:
p_fingerprints = np.asarray(p_fingerprints)[1:]
p_fingerprints = p_fingerprints.astype(int)
#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = p_fingerprints.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(p_fingerprints))
print("total no of 0s",no_examples*ip_dim-np.sum(p_fingerprints))

(3422, 112)
('total no of 1s', 25981)
('total no of 0s', 357283)


In [6]:
labels2 = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels2[i] = 1
    else:
        labels2[i] = 0
labels2 = labels2.astype(int)

In [7]:
p_fingerprints,labels2 = sdc.clean_data(p_fingerprints,labels2)
no_examples,ip_dim = p_fingerprints.shape

Adding 106 new samples
removing 61 samples
[3422  968  967 2277 3260 2966 3090 1862 1324  770 2147  749 2145 1668  694
 3051 2535 3311 3363 3364 3176 1049 3045 3049 2285 2290 1156 1801 3212 1554
 1860 3064 1858 1476 2423 2422 2534  123 1284  126  125  122 1980 2483 2661
 2441 2260 1967 2259 2012 2225  852  843 3173 3107 3367 3263 3264 3096 3281
 3270]


In [8]:
no_active_ele = (sum(labels2))
print(no_active_ele)

[ 154.]


In [9]:
X_dim = ip_dim
h1_dim = 500
h2_dim = 500
h3_dim = 500
z_dim = 60

In [10]:
def get_train_batch(batch_size):
    samples = np.random.randint(low=0,high=no_examples,size=(batch_size,1))
    train_batch = p_fingerprints[samples].reshape(batch_size,ip_dim)
    train_batch = train_batch.astype(int)
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    target = Variable(torch.cuda.FloatTensor(labels2[samples]),requires_grad=False)
    
    return train_batch,target

In [11]:
class encoder(nn.Module):
    def __init__(self):
        super(encoder,self).__init__()
        self.l1 = nn.Linear(X_dim,h1_dim)
        self.l2 = nn.Linear(h1_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h3_dim)
        self.l4 = nn.Linear(h3_dim,z_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = self.l4(x)
        
        return x

In [12]:
class decoder(nn.Module):
    def __init__(self):
        super(decoder,self).__init__()
        self.l1 = nn.Linear(z_dim,h3_dim)
        self.l2 = nn.Linear(h3_dim,h2_dim)
        self.l3 = nn.Linear(h2_dim,h1_dim)
        self.l4 = nn.Linear(h1_dim,X_dim)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        x = F.leaky_relu(self.l3(x))
        x = F.tanh(self.l4(x))
        
        return x

In [13]:
class disc(nn.Module):
    def __init__(self):
        super(disc,self).__init__()
        self.lin1 = nn.Linear(z_dim+2,500)
        self.lin2 = nn.Linear(500,100)
        #self.lin3 = nn.Linear(100,100)
        self.lin4 = nn.Linear(100,30)
        self.lin5 = nn.Linear(30,1)
        
    def forward(self,x):
        x = F.selu(self.lin1(x))
        x = F.selu(self.lin2(x))
        #x = F.selu(self.lin3(x))
        x = F.selu(self.lin4(x))
        x = F.sigmoid(self.lin5(x))
        return x

In [14]:
def add_label_info(y,batch_size):

    tmp = np.zeros((batch_size,2))
    tmp2 = np.zeros((batch_size,1))
    y = y.cpu().data.numpy().reshape(batch_size,1)
    tmp2[y==0] = 5
    tmp3 = np.zeros((batch_size,1))
    tmp3[y==1] = 5
    tmp = np.concatenate((tmp2,tmp3),1)
    label_info = torch.from_numpy((tmp)).cuda()
    return label_info

In [15]:
def train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size):
    
    for it in range(3500):
        x,y = get_train_batch(batch_size)
        z = Q(x)

        #Reconstruction
        
        x_recon = P(z)
        '''
        x_recon[x_recon<0] = 0
        x_recon[x_recon>0] = 1
        x_tar = Variable(torch.cuda.FloatTensor(x.size()),requires_grad=False)
        x_tar[x==-1] = 0
        x_tar[x==1] = 1'''
        
        criterion = nn.MSELoss()
        CEL = criterion(x_recon, x)
        
        CEL.backward(retain_graph=True)
        Q_solver.step()
        P_solver.step()
        
        Q.zero_grad()
        P.zero_grad()
        
        #Discriminator
        
        label_info = (add_label_info(y,batch_size))
        z_false = np.concatenate((z.cpu().data.numpy(),label_info.cpu().numpy()),1)
        z_false = Variable(torch.FloatTensor(z_false)).cuda()
        #z_false = torch.cat((z,label_info),1)
        z_true = np.random.rand(batch_size,z_dim)
        z_true = np.concatenate((z_true,label_info.cpu().numpy()),1)
        z_true = Variable(torch.FloatTensor(z_true).cuda())
        #z_true = torch.cat((z_true,label_info),1)
        z_true_op = Variable(D(z_true).data,requires_grad=False)
        
        z_false_op = D(z_false)
        add_small = 1e-20
        
        
        criterion = nn.BCELoss()
        loss_d = criterion(z_false_op,z_true_op)
        #loss_d = -torch.mean(torch.log(z_true_op + add_small) + torch.log(1 - z_false_op + add_small))
        loss_d.backward(retain_graph=True)
        D_solver.step()
        D.zero_grad()
        
        #Updating the encoder
        
        G_loss = -torch.mean(torch.log(z_false_op+1e-20))
        G_loss.backward(retain_graph=True)
        Q_solver.step()
        Q_solver.zero_grad()
        
        
        
        
        if(it%50==0):
            #print(extra_loss.data[0],CEL.data[0])
            print('recon_loss:', CEL.data[0],'disc_loss:', loss_d.data[0],'gen_loss: ',G_loss.data[0])
            #print(x_recon[0][:50].cpu().data.numpy().T)
            #print()
            #print(x[0][:50].cpu().data.numpy().T)
           # print()
    return Q,P

In [16]:
def generate_model():
    Q = encoder()
    Q.cuda()
    Q_solver = optim.Adam(Q.parameters(),lr=1e-4)
    E_solver = optim.Adam(Q.parameters(),lr = 1e-5)
    P = decoder()
    P.cuda()
    P_solver = optim.Adam(P.parameters(),lr = 1e-4)
    D = disc()
    D.cuda()
    D_solver = optim.Adam(D.parameters(),lr = 1e-3)
    batch_size = 120
    Q,P = train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size)
    
    return Q,P

In [17]:
Q,P = generate_model()

('recon_loss:', 0.06175412982702255, 'disc_loss:', 0.6929045915603638, 'gen_loss: ', 0.7217796444892883)
('recon_loss:', 0.047042906284332275, 'disc_loss:', 0.00024133353144861758, 'gen_loss: ', 1.0049680895463098e-05)
('recon_loss:', 0.03817024081945419, 'disc_loss:', 0.00020901851530652493, 'gen_loss: ', 8.547125617042184e-06)
('recon_loss:', 0.03266599401831627, 'disc_loss:', 0.00026663977769203484, 'gen_loss: ', 1.3540688087232411e-05)
('recon_loss:', 0.03228125348687172, 'disc_loss:', 0.00012333346239756793, 'gen_loss: ', 6.604869668080937e-06)
('recon_loss:', 0.023817162960767746, 'disc_loss:', 0.00022105086827650666, 'gen_loss: ', 1.3725653843721375e-05)
('recon_loss:', 0.022138474509119987, 'disc_loss:', 0.0001817665179260075, 'gen_loss: ', 1.3478643268172164e-05)
('recon_loss:', 0.02096492238342762, 'disc_loss:', 0.00012767301814164966, 'gen_loss: ', 7.5894636211160105e-06)
('recon_loss:', 0.016020575538277626, 'disc_loss:', 0.0001512121525593102, 'gen_loss: ', 1.4077148989599

In [18]:
encoder_path = os.getcwd() + '/model_enc_' + str(fname)
torch.save(Q.state_dict(),encoder_path)

In [19]:
#entire_batch,batch_labels = get_train_batch(no_examples)
tic = time.time()
## It takes too much memory. Split in chunks and 
z_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints)))
toc = time.time()
print(toc-tic)
z_encoded = z_encoded.cpu().data.numpy()

1.45639705658


In [20]:
# mu,std = np.mean(z_encoded,axis=0),np.std(z_encoded,axis=0)
# print mu.shape
# z_encoded = (z_encoded - mu)/std

In [21]:
#z_encoded,labels2 = sdc.clean_data(z_encoded,labels2,k=8)
no_examples,ip_dim = z_encoded.shape
z_encoded = z_encoded.astype(float)
print no_examples

3467


In [22]:
#z_encoded = (z_encoded + mu)*std
labels2 = labels2.astype(int)

In [23]:
# x_encoded = z_encoded.cpu().data.numpy()[:,0]
# y_encoded = z_encoded.cpu().data.numpy()[:,1]
# w_encoded = z_encoded.cpu().data.numpy()[:,2]

# # batch_labels_np = batch_labels_np.astype(int)
# # print(batch_labels_np.dtype)
# # print(batch_labels_np.shape)
# batch_labels_np = list(labels2)

# colors = []
# for l in batch_labels_np:
#     colors.append("C"+str(int(l)))
    
# #plt.scatter(x_encoded,y_encoded,c=colors)
# fig = plt.figure()
# ax = Axes3D(fig)
# ax.scatter(x_encoded,y_encoded,w_encoded,c=colors)
# plt.show()

In [24]:
def sample_z(size):
    if generate_new_z == True:
        print("gng here")
        ind = torch.cuda.LongTensor(torch.randperm(no_examples+n_samples)[:size].numpy())
        return new_z_encoded[ind], new_labels[ind]
    else:
        ind = torch.cuda.LongTensor(torch.randperm(no_examples)[:size].numpy())
        return z_encoded[ind], Variable(torch.cuda.LongTensor(labels2)[ind],requires_grad = False)

In [25]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator,self).__init__()
        self.l1 = nn.Linear(z_dim,500)
        self.l2 = nn.Linear(500,500)
        self.l3 = nn.Linear(500,100)
        #self.l4 = nn.Linear(400,70)
        self.l5 = nn.Linear(100,2)
        
    def forward(self,x):
        x = F.selu(self.l1(x))
        x = F.selu(self.l2(x))
        x = F.selu(self.l3(x))
        #x = F.relu(self.l4(x))
        x = (self.l5(x))
        
        return x

In [26]:
# def gen_disc_model(w):
#     d = Discriminator().cuda()
#     d_optim = optim.Adam(d.parameters(),lr=1e-4)
#     d = train_disc(d,d_optim,w)
#     return d

In [27]:
# def train_disc(d,d_optim,w):
#     for ep in range(2000):
#         d_optim.zero_grad()
#         x,true_l = sample_z(200)
#         true_l = true_l.view(true_l.size()[0],)
#         p_labels = d(x)
#         weights = torch.Tensor([1,w]).cuda()
#         criteria = nn.CrossEntropyLoss(weight=weights)
#         true_l = true_l.type(torch.cuda.LongTensor)
#         loss = criteria(p_labels,true_l)
#         loss.backward(retain_graph=True)
#         d_optim.step()
        
# #         if(ep%50==49):
# #             print(loss.data[0])
            
#     return d

In [28]:
def get_train_batch_z(batch_size,validation_iter=0,binary=True):
    
    if validation_iter == 0: #no validation
        curr_data_size = no_examples
        labels_train = labels2
    else:
        curr_data_size = int(no_examples*0.8)
        interval_size = int(no_examples*0.2)
        
        if(val_iter==1):
            s_ind1 = int((validation_iter)*interval_size)
            end_ind1 = int((validation_iter+1)*interval_size)
            s_ind2 = int((validation_iter + 1) * interval_size)
            end_ind2 = int(no_examples)
        else:
            s_ind1 = 0
            end_ind1 = int((validation_iter)*interval_size)
            s_ind2 = int((validation_iter + 1) * interval_size)
            end_ind2 = int(no_examples)
        
        #print("train_ind ",s_ind1,end_ind1,s_ind2,end_ind2)
        indices = range(s_ind1,end_ind1) + range(s_ind2,end_ind2)
        p_train_data = z_encoded[indices]
        labels_train = labels2[indices]
                               
    samples = np.random.randint(low=0,high=curr_data_size,size=(batch_size,1))
    if binary == True:
        train_batch = p_train_data[samples].reshape(batch_size,ip_dim)
        train_batch = train_batch.astype(float)
    else:
        None
    
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    target = Variable(torch.cuda.LongTensor(labels_train[samples]),requires_grad=False)
    target = target.view(batch_size,)
    return train_batch,target

In [29]:
def get_val_data_z(validation_iter,binary = True):
    interval_size = int(no_examples)*0.2
    s_ind = int((validation_iter-1)*interval_size)
    e_ind = int((validation_iter) * interval_size)
    if(binary==True):
        train_data = z_encoded[s_ind:e_ind]
    else:
        None
    labels_val = labels2[s_ind:e_ind]   
    #print("val ind ",s_ind,e_ind)
    return Variable(torch.cuda.FloatTensor(train_data)),labels_val  

In [30]:
min_fn = 15
max_fp = 160
cm_list = []
batch_size = 128
for i in range(1,6):
    val_iter = i
    print("val iter: ",val_iter)
    
   
    #weights_array = [7]
    weights = np.linspace(15,30,3)
    for i,w in enumerate(weights): 
        mydisc = Discriminator().cuda()
        optimizer = torch.optim.Adagrad(mydisc.parameters(),lr=1e-3)
        criterion = nn.CrossEntropyLoss(weight=torch.cuda.FloatTensor([1,w]))

        for ep in range(3000):
            train_batch,target = get_train_batch_z(batch_size,binary = True,validation_iter = val_iter)
            model_op = mydisc(train_batch)
            #print(model_op.type)
            #print(target.type)
            loss = criterion(model_op,target)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    #     if(ep%30==29):
    #         print(loss.data[0])

        ## After training check on cross validation data
        val_data,labels_val = get_val_data_z(val_iter,binary = True)
        train_op = mydisc(val_data)
        train_op = train_op.cpu().data.numpy()
        pred_labels = np.argmax(train_op,axis=1)
        #tmp_labels = tmp_labels.data.cpu().numpy()
        #print(sum(tmp_labels))
        cf = metrics.confusion_matrix(labels_val,pred_labels).ravel()
        #print(val_iter,w)
        print('tn, fp, fn, tp: ',cf)
        [tn,fp,fn,tp] = cf
        wcf = [val_iter] + [w] + [cf]
        if(fn < 3):
            cm_list.append(wcf)
            if(fn<4):
                if(fp < max_fp):
                    max_fp = fp
                    model_path = os.getcwd() + '/disc_for_aae' + fname
                    torch.save(mydisc.state_dict(),model_path)
                    print("saving model on val: ",val_iter," and weight: ",w)

        


('val iter: ', 1)
('tn, fp, fn, tp: ', array([607,  38,   2,  46]))
('saving model on val: ', 1, ' and weight: ', 15.0)
('tn, fp, fn, tp: ', array([628,  17,   6,  42]))
('tn, fp, fn, tp: ', array([579,  66,   0,  48]))
('val iter: ', 2)
('tn, fp, fn, tp: ', array([665,  28,   0,   0]))
('saving model on val: ', 2, ' and weight: ', 15.0)
('tn, fp, fn, tp: ', array([643,  50,   0,   0]))
('tn, fp, fn, tp: ', array([642,  51,   0,   0]))
('val iter: ', 3)
('tn, fp, fn, tp: ', array([674,  20,   0,   0]))
('saving model on val: ', 3, ' and weight: ', 15.0)
('tn, fp, fn, tp: ', array([659,  35,   0,   0]))
('tn, fp, fn, tp: ', array([662,  32,   0,   0]))
('val iter: ', 4)
('tn, fp, fn, tp: ', array([676,  17,   0,   0]))
('saving model on val: ', 4, ' and weight: ', 15.0)
('tn, fp, fn, tp: ', array([667,  26,   0,   0]))
('tn, fp, fn, tp: ', array([673,  20,   0,   0]))
('val iter: ', 5)
('tn, fp, fn, tp: ', array([555,  34,  48,  57]))
('tn, fp, fn, tp: ', array([555,  34,  43,  62]))
('

In [31]:
print labels_val.shape, pred_labels.shape
print pred_labels[:10]
#pred_labels = np.reshape(pred_labels.shape[0],1)
cf = metrics.confusion_matrix(labels_val,pred_labels).ravel()
print cf

(694, 1) (694,)
[0 0 0 0 0 0 0 0 0 0]
[544  45  39  66]


In [32]:
# model_path = os.getcwd() + '/model_autoencoder_' + str(fname)
# for w in weights:
#     print("w: ",w)
#     d = gen_disc_model(w)
#     train_op = d(train_encoded).cpu().data.numpy()
#     train_op = np.argmax(train_op,axis=1)
#     cf = metrics.confusion_matrix(labels_final,train_op)
#     [tn, fp, fn, tp]  = cf.ravel()
#     print('tn, fp, fn, tp: ',cf.ravel())
#     if(fn < fn_min):
#         fn_min = fn
#         torch.save(d.state_dict(),model_path)
#         print("saving model on weight: ",w)

# Check on Testing Data 

In [33]:
p_fingerprints_test = []
c_fingerprints_test = []
labels_test = []
with open(path+fname+'red_test.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints_test.append(row[:bf])
        c_fingerprints_test.append(row[bf:-1])
        labels_test.append(row[-1])
        
p_fingerprints_test = np.asarray(p_fingerprints_test)[1:]
p_fingerprints_test = p_fingerprints_test.astype(int)
p_fingerprints_test[(p_fingerprints_test==0)] = -1

c_fingerprints_test = np.asarray(c_fingerprints_test)[1:]
c_fingerprints_test = c_fingerprints_test.astype(float)

#Normalise the features
c_fingerprints_test = (c_fingerprints_test - np.mean(c_fingerprints_test,axis=0))/np.std(c_fingerprints_test,axis=0)

fingerprints_test = np.concatenate((p_fingerprints_test,c_fingerprints_test),axis=1)

#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples_test , ip_dim_test) = fingerprints_test.shape
labels_test = labels_test[1:]

In [34]:
labels2_test = np.zeros((len(labels_test),1))
for i,l in enumerate(labels_test):
    if l=='Active':
        labels2_test[i] = 1
    else:
        labels2_test[i] = 0
labels2_test = labels2_test.astype(int)
total_pos = np.sum(labels2_test)
print("tot_positive",total_pos)

('tot_positive', 12)


In [35]:
p_fingerprints_test = Variable(torch.cuda.FloatTensor(p_fingerprints_test))
z_test = Q(p_fingerprints_test)
test_op = mydisc(z_test)

In [36]:
test_op = test_op.cpu().data.numpy()
pred_labels = np.argmax(test_op,axis=1)
cf = metrics.confusion_matrix(labels2_test,pred_labels).ravel()
print('tn, fp, fn, tp: ',cf)

('tn, fp, fn, tp: ', array([840,   4,  12,   0]))
