In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
import pickle
import time
from sklearn.kernel_approximation import RBFSampler
%matplotlib notebook


In [2]:
with open('screen_info.txt','rb') as fl:
    t = pickle.load(fl)
fnames = t[0]
totf = t[1]
binf = t[2]
runfile = 0
fname = fnames[runfile]
bf = binf[runfile]

### X_features are real valued features. We apply kernel on it 

In [3]:
path = os.getcwd() + '/bioassay-datasets/'
Y_fingerprints = []
X_features = []
labels = []
with open(path+fname+'red_train.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        Y_fingerprints.append(row[:bf])
        X_features.append(row[bf:-1])
        labels.append(row[-1])

In [4]:
Y_fingerprints = np.asarray(Y_fingerprints)[1:]
Y_fingerprints = Y_fingerprints.astype(int)

X_features = np.asarray(X_features)[1:]
X_features = X_features.astype(float)

#Normalise the features
#c_fingerprints = (c_fingerprints - np.mean(c_fingerprints,axis=0))/np.std(c_fingerprints,axis=0)

#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = Y_fingerprints.shape
labels = labels[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(Y_fingerprints))
print("total no of 0s",no_examples*ip_dim-np.sum(Y_fingerprints))

(3423, 112)
('total no of 1s', 25982)
('total no of 0s', 357394)


In [5]:
Y_fingerprints[(Y_fingerprints==0)] = -1

labels2 = np.zeros((len(labels),1))
for i,l in enumerate(labels):
    if l=='Active':
        labels2[i] = 1
    else:
        labels2[i] = 0
labels2 = labels2.astype(int)

In [6]:
feat_dim = 250

rbf_feature = RBFSampler(gamma=1,n_components = feat_dim, random_state=1)
c_fingerprints = rbf_feature.fit_transform(X_features)
p_fingerprints = rbf_feature.fit_transform(Y_fingerprints)

In [7]:
ind = np.arange(no_examples)
np.random.shuffle(ind)
p_fingerprints = p_fingerprints[ind]
c_fingerprints = c_fingerprints[ind]
labels2 = labels2[ind]

In [8]:
no_active_ele = (sum(labels2))
print(no_active_ele)
batch_size = 400

[48]


In [9]:
def get_train_batch(batch_size,validation_iter=0,binary=True):
    
    if validation_iter == 0: #no validation
        curr_data_size = no_examples
        labels_train = labels2
    else:
        curr_data_size = int(no_examples*0.8)
        interval_size = int(no_examples*0.2)
        
        if(val_iter==1):
            s_ind1 = int((validation_iter)*interval_size)
            end_ind1 = int((validation_iter+1)*interval_size)
            s_ind2 = int((validation_iter + 1) * interval_size)
            end_ind2 = int(no_examples)
        else:
            s_ind1 = 0
            end_ind1 = int((validation_iter)*interval_size)
            s_ind2 = int((validation_iter + 1) * interval_size)
            end_ind2 = int(no_examples)
        
        #print("train_ind ",s_ind1,end_ind1,s_ind2,end_ind2)
        indices = range(s_ind1,end_ind1) + range(s_ind2,end_ind2)
        c_train_data = c_fingerprints[indices]
        p_train_data = p_fingerprints[indices]
        labels_train = labels2[indices]
                               
    samples = np.random.randint(low=0,high=curr_data_size,size=(batch_size,1))
    if binary == True:
        train_batch = p_train_data[samples].reshape(batch_size,feat_dim)
        train_batch = train_batch.astype(int)
    else:
        train_batch = c_train_data[samples].reshape(batch_size,feat_dim)
        train_batch = train_batch.astype(float)
    
    train_batch = torch.cuda.FloatTensor(train_batch)
    train_batch = Variable(train_batch,requires_grad=False).cuda()
    target = Variable(torch.cuda.LongTensor(labels_train[samples]),requires_grad=False)
    target = target.view(batch_size,)
    return train_batch,target

In [10]:
def get_val_data(validation_iter,binary = True):
    interval_size = int(no_examples)*0.2
    s_ind = int((validation_iter-1)*interval_size)
    e_ind = int((validation_iter) * interval_size)
    if(binary==True):
        train_data = p_fingerprints[s_ind:e_ind]
    else:
        train_data = c_fingerprints[s_ind:e_ind]
    labels_val = labels2[s_ind:e_ind]   
    #print("val ind ",s_ind,e_ind)
    #print train_data.shape, labels_val.shape
    return Variable(torch.cuda.FloatTensor(train_data)),labels_val  

In [11]:
class c_mlp(nn.Module):
    def __init__(self):
        super(c_mlp,self).__init__()
        self.l1 = nn.Linear(feat_dim,1000)
        self.l2 = nn.Linear(1000,500)
        #self.l3 = nn.Linear(1000,500)
        self.l4 = nn.Linear(500,500)
        self.l5 = nn.Linear(500,50)
        self.l6 = nn.Linear(50,2)
        
    def forward(self,x):
        x = F.leaky_relu(self.l1(x))
        x = F.leaky_relu(self.l2(x))
        #x = F.leaky_relu(self.l3(x))
        x = F.leaky_relu(self.l4(x))
        x = F.leaky_relu(self.l5(x))
        x = (self.l6(x))
        
        return x

In [12]:
min_fn = 15
max_fp = 200
maxtmpdiff = -2
cm_list = []
get_model = 1
for val_iter in range(1,6):
    print("val iter: ",val_iter)
    
   
    #weights_array = [5]
    weights_array = np.linspace(50,70,10)
    for i,w in enumerate(weights_array): 
        mymlp = c_mlp().cuda()
        optimizer = torch.optim.Adagrad(mymlp.parameters(),lr=1e-4)
        criterion = nn.CrossEntropyLoss(weight=torch.cuda.FloatTensor([1,w]))

        for ep in range(2000):
            train_batch,target = get_train_batch(batch_size,binary = True,validation_iter = val_iter)
            model_op = mymlp(train_batch)
            #print(model_op.type)
            #print(target.type)
            loss = criterion(model_op,target)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

    #     if(ep%30==29):
    #         print(loss.data[0])

        ## After training check on cross validation data
        val_data,labels_val = get_val_data(val_iter,binary = True)
        train_op = mymlp(val_data)
        train_op = train_op.cpu().data.numpy()
        pred_labels = np.argmax(train_op,axis=1)
        #tmp_labels = tmp_labels.data.cpu().numpy()
        #print(sum(tmp_labels))
        cf = metrics.confusion_matrix(labels_val,pred_labels).ravel()
        #print(val_iter,w)
        print('tn, fp, fn, tp: ',cf)
        [tn,fp,fn,tp] = cf
        wcf = [val_iter] + [w] + [cf]
        tmpdiff = tp-fn
        if(tmpdiff >= maxtmpdiff):
            cm_list.append(wcf)
            if(tmpdiff>= maxtmpdiff):
                if(fp <= max_fp):
                    max_fp = fp
                    maxtmpdiff = tmpdiff
                    model_path = os.getcwd() + '/kernel_mac' + fname + '_' + str(get_model)
                    torch.save(mymlp.state_dict(),model_path)
                    print("saving model on val: ",val_iter," and weight: ",w)
                    if(get_model<4):
                        get_model = get_model + 1
                    else:
                        break

('val iter: ', 1)
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([  0, 680,   0,   4]))
('tn, fp, fn, tp: ', array([680,   0,   4,   0]))
('tn, fp, fn, tp: ', array([  0, 680,   0,   4]))
('val iter: ', 2)
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([677,   0,   8,   0]))
('tn, fp, fn, tp: ', array([673,   4,   7,   1]))
('tn, fp, fn, 

### Test the model 

In [13]:
path = os.getcwd() + '/bioassay-datasets/'
p_fingerprints_test = []
X_features_test = []
labels_t = []
with open(path+fname+'red_test.csv') as csvfile:
    readcsv = csv.reader(csvfile)
    for row in readcsv:
        p_fingerprints_test.append(row[:bf])
        X_features_test.append(row[bf:-1])
        labels_t.append(row[-1])

In [14]:
p_fingerprints_test = np.asarray(p_fingerprints_test)[1:]
p_fingerprints_test = p_fingerprints_test.astype(int)

X_features_test = np.asarray(X_features_test)[1:]
X_features_test = X_features_test.astype(float)

#Normalise the features
#c_fingerprints = (c_fingerprints - np.mean(c_fingerprints,axis=0))/np.std(c_fingerprints,axis=0)

#p2_fingerprints = np.ones(p_fingerprints.shape)
(no_examples , ip_dim) = X_features_test.shape
labels_t = labels_t[1:]
print(no_examples,ip_dim)
print("total no of 1s",np.sum(p_fingerprints_test))
print("total no of 0s",no_examples*ip_dim-np.sum(p_fingerprints_test))

(856, 32)
('total no of 1s', 6305)
('total no of 0s', 21087)


In [15]:
p_fingerprints_test[(p_fingerprints_test==0)] = -1

labels2_t = np.zeros((len(labels_t),1))
for i,l in enumerate(labels_t):
    if l=='Active':
        labels2_t[i] = 1
    else:
        labels2_t[i] = 0
labels2_t = labels2_t.astype(int)

In [16]:
feat_dim = 250

rbf_feature = RBFSampler(gamma=1,n_components = feat_dim, random_state=1)
trans = rbf_feature.fit(X_features)
c_fingerprints_test = trans.transform(X_features_test)

In [17]:
testmlp = c_mlp().cuda()
model_path = os.getcwd() + '/kernel_mac' + fname + '_3'
testmlp.load_state_dict(torch.load(model_path))

In [18]:
op = testmlp(Variable(torch.cuda.FloatTensor(c_fingerprints_test)))
op = op.cpu().data.numpy()
pred_labels = np.argmax(op,axis=1)
cf = metrics.confusion_matrix(labels2_t,pred_labels).ravel()
#print(val_iter,w)
print('tn, fp, fn, tp: ',cf)

('tn, fp, fn, tp: ', array([838,   6,  11,   1]))


In [19]:
np.sum(labels2_t)

12