In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
import numpy.linalg as la
import pickle
%matplotlib notebook

In [2]:
with open('screen_info.txt','rb') as fl:
    t = pickle.load(fl)
fnames = t[0]
totf = t[1]
binf = t[2]
runfile = 0
fname = fnames[runfile]
bf = binf[runfile]

In [3]:
path = '/home/daiict/CVShare/Jeni/hts/bioassay-datasets/' + fname + 'red_train.csv'
p_fingerprints = []
labels = []
with open(path) as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints.append(row[:112])
        labels.append(row[-1])

In [4]:
p_fingerprints = np.asarray(p_fingerprints)[1:]
p_fingerprints = p_fingerprints.astype(int)
#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = p_fingerprints.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(p_fingerprints))
print("total no of 0s",no_examples*ip_dim-np.sum(p_fingerprints))

p_fingerprints[(p_fingerprints==0)] = -1

(3423, 112)
('total no of 1s', 25982)
('total no of 0s', 357394)


In [5]:
labels2 = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels2[i] = 1
    else:
        labels2[i] = 0

no_active_ele = (sum(labels2))
labels2 = labels2.astype(int)
print(no_active_ele)

[ 48.]


In [6]:
fname

'AID362'

In [7]:
with open(os.getcwd() +  '/' + 'minham_' + fname) as f:
    p = pickle.load(f)
p_inactive = p[1]
p_active = p[0]

In [8]:
def get_train_batch(batch_size,keep_numpy=False):
    samples = np.random.randint(low=0,high=no_examples,size=(batch_size,1))
    train_batch = p_fingerprints[samples].reshape(batch_size,ip_dim)
    train_batch = train_batch.astype(int)
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    if(keep_numpy==False):
        target = (torch.cuda.LongTensor(labels2[samples]))
    else:
        target = labels2[samples]
    return train_batch,target

### Calculating p_vectors 

In [9]:
# p_active = np.float32(np.sum(p_fingerprints[labels2[:,0]==1],axis=0))/np.sum(labels2)
# p_inactive = np.float32(np.sum(p_fingerprints[labels2[:,0]==0],axis=0))/(no_examples-np.sum(labels2))

### Modifying p_vectors 

In [10]:
# p_active = p_active + 0.5*(p_active-p_inactive)
# p_active[p_active<0] = 0
# p_inactive = p_inactive + 0.5*(p_inactive - p_active)
# p_inactive[p_inactive<0] = 0

In [11]:
#find the angle between them
theta = np.dot(p_active,p_inactive)/(la.norm(p_active)*la.norm(p_inactive))
print(theta)

0.707106781187


In [12]:
class encoder(nn.Module):
    def __init__(self):
        super(encoder,self).__init__()
        self.l1 = nn.Linear(ip_dim,ip_dim+50)
        self.l2 = nn.Linear(ip_dim+50,ip_dim)
        self.l3 = nn.Linear(ip_dim,ip_dim)
        self.l4 = nn.Linear(ip_dim,ip_dim)
        
        
    def forward(self,x):
        x = F.tanh(self.l1(x))
        x = F.tanh(self.l2(x))
        x = F.tanh(self.l3(x))
        x = self.l4(x)
        #x = self.l4(x)
        
        return x

In [13]:
class decoder(nn.Module):
    def __init__(self):
        super(decoder,self).__init__()
        self.l1 = nn.Linear(ip_dim,ip_dim)
        self.l2 = nn.Linear(ip_dim,ip_dim)
        self.l3 = nn.Linear(ip_dim,ip_dim+50)
        self.l4 = nn.Linear(ip_dim+50,ip_dim)
        
        
    def forward(self,x):
        x = F.tanh(self.l1(x))
        x = F.tanh(self.l2(x))
        x = F.tanh(self.l3(x))
        x = F.tanh(self.l4(x))
        #x = self.l4(x)
        
        return x

In [14]:
class disc(nn.Module):
    def __init__(self):
        super(disc,self).__init__()
        self.lin1 = nn.Linear(ip_dim,500)
        self.lin2 = nn.Linear(500,500)
        self.lin3 = nn.Linear(500,100)
        self.lin4 = nn.Linear(100,1)
        
    def forward(self,x):
        x = F.tanh(self.lin1(x))
        x = F.tanh(self.lin2(x))
        x = F.tanh(self.lin3(x))
        x = F.sigmoid(self.lin4(x))
        
        return x

### Draw samples based on labels 

In [15]:
def draw_samples(batch_size,labels,var_active,var_inactive):
    z = np.zeros((batch_size,ip_dim))
    rand_vec = np.random.randn(batch_size,ip_dim)
    z[labels==1] = p_active + var_active*rand_vec[labels==1]
    z[labels==0] = p_inactive + var_inactive*rand_vec[labels==0]
    z = torch.cuda.FloatTensor(z)
    return z

In [16]:
def train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size):
    
    for it in range(2000):
        x,y = get_train_batch(batch_size,keep_numpy=True)
        y = y.reshape(batch_size,)
        z = Q(x)
        z_false = Q(x)
        
        
        #Reconstruction
        
        x_recon = P(z)
        criterion = nn.MSELoss()
        CEL = criterion(x_recon, x)
        CEL.backward()
        
        Q_solver.step()
        P_solver.step()
        
        Q.zero_grad()
        P.zero_grad()
        
        #Discriminator
        z_false_op = D(z_false)
        z_true = Variable(draw_samples(batch_size,y,var_active=0.01,var_inactive=0.1))
        
        
        z_true_op = Variable(D(z_true).data,requires_grad=False)
                
        add_small = 1e-20
        
        
        criterion = nn.BCELoss()
        loss_d = criterion(z_false_op,z_true_op)
        #loss_d = -torch.mean(torch.log(z_true_op + add_small) + torch.log(1 - z_false_op + add_small))
        loss_d.backward(retain_variables = True)
        D_solver.step()
        D.zero_grad()
        
        #Updating the encoder
        
        G_loss = -torch.mean(torch.log(z_false_op+1e-20))
        G_loss.backward()
        Q_solver.step()
        Q_solver.zero_grad()
        
        
        
        
        if(it%50==0):
            #print(extra_loss.data[0],CEL.data[0])
            print('recon_loss:', CEL.data[0],'disc_loss:', loss_d.data[0],'gen_loss: ',G_loss.data[0])
            #print(x_recon[0][:50].cpu().data.numpy().T)
            #print()
            #print(x[0][:50].cpu().data.numpy().T)
           # print()
    return Q,P

In [17]:
def generate_model():
    Q = encoder()
    Q.cuda()
    Q_solver = optim.Adam(Q.parameters(),lr=1e-4)
    E_solver = optim.Adam(Q.parameters(),lr = 1e-5)
    P = decoder()
    P.cuda()
    P_solver = optim.Adam(P.parameters(),lr = 1e-4)
    D = disc()
    D.cuda()
    D_solver = optim.Adam(D.parameters(),lr = 1e-3,weight_decay=1e-2)
    batch_size = 120
    Q,P = train_model(Q,Q_solver,P,P_solver,D,D_solver,batch_size)
    
    return Q,P

In [18]:
Q,P = generate_model()



('recon_loss:', 1.0012357234954834, 'disc_loss:', 0.6930471062660217, 'gen_loss: ', 0.6733259558677673)
('recon_loss:', 0.462269127368927, 'disc_loss:', 0.7001225352287292, 'gen_loss: ', 0.2221345752477646)
('recon_loss:', 0.2248225063085556, 'disc_loss:', 0.8317672610282898, 'gen_loss: ', 0.1590990424156189)
('recon_loss:', 0.21316368877887726, 'disc_loss:', 0.7657734155654907, 'gen_loss: ', 0.2741232216358185)
('recon_loss:', 0.20338603854179382, 'disc_loss:', 0.8075255155563354, 'gen_loss: ', 0.22618059813976288)
('recon_loss:', 0.2133547067642212, 'disc_loss:', 0.7730756402015686, 'gen_loss: ', 0.28262388706207275)
('recon_loss:', 0.21085059642791748, 'disc_loss:', 0.8310187458992004, 'gen_loss: ', 0.22243432700634003)
('recon_loss:', 0.19576016068458557, 'disc_loss:', 0.8372853398323059, 'gen_loss: ', 0.21625152230262756)
('recon_loss:', 0.20875658094882965, 'disc_loss:', 0.8298947811126709, 'gen_loss: ', 0.23181568086147308)
('recon_loss:', 0.19523419439792633, 'disc_loss:', 0.78

In [19]:
train_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints)))

In [20]:
train_encoded_np = train_encoded.data.cpu().numpy()
#train_encoded_np = np.tile(train_encoded_np,(len(train_encoded_np),1))
dist_active = la.norm(train_encoded_np - np.tile(p_active,(no_examples,1)),axis=1)
dist_inactive = la.norm(train_encoded_np - np.tile(p_inactive,(no_examples,1)),axis=1)
d = dist_active - dist_inactive
pred_labels = np.zeros((no_examples,1))
pred_labels[d>=0] = 1
cf = metrics.confusion_matrix(y_pred=pred_labels,y_true=labels2)
print('tn, fp, fn, tp: ',cf.ravel())

('tn, fp, fn, tp: ', array([3375,    0,   48,    0]))


### Test file

In [21]:
p_fingerprints_test = []
labels = []
path = '/home/daiict/CVShare/Jeni/hts/bioassay-datasets/' + fname + 'red_test.csv'
with open(path) as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints_test.append(row[:112])
        labels.append(row[-1])

In [22]:
p_fingerprints_test = np.asarray( p_fingerprints_test)[1:]
p_fingerprints_test =  p_fingerprints_test.astype(int)
#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) =  p_fingerprints_test.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum( p_fingerprints_test))
print("total no of 0s",no_examples*ip_dim-np.sum( p_fingerprints_test))

p_fingerprints_test[( p_fingerprints_test==0)] = -1

(856, 112)
('total no of 1s', 6305)
('total no of 0s', 89567)


In [23]:
labels_test = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels_test[i] = 1
    else:
        labels_test[i] = 0

no_active_ele = (sum(labels_test))
labels_test = labels_test.astype(int)
print(no_active_ele)

[ 12.]


In [24]:
import time
tic = time.time()
test_encoded = Q(Variable(torch.cuda.FloatTensor(p_fingerprints_test)))
toc = time.time()
print(toc-tic)

0.0581939220428


In [25]:
test_encoded_np = test_encoded.data.cpu().numpy()
#train_encoded_np = np.tile(train_encoded_np,(len(train_encoded_np),1))
dist_active = la.norm(test_encoded_np - np.tile(p_active,(no_examples,1)),axis=1)
dist_inactive = la.norm(test_encoded_np - np.tile(p_inactive,(no_examples,1)),axis=1)
d = dist_active - dist_inactive
pred_labels = np.zeros((no_examples,1))
pred_labels[d>=0] = 1
cf = metrics.confusion_matrix(y_pred=pred_labels,y_true=labels_test)
print('tn, fp, fn, tp: ',cf.ravel())

('tn, fp, fn, tp: ', array([844,   0,  12,   0]))
