In [1]:
import numpy as np
import csv
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from torch.autograd import Variable
from mpl_toolkits.mplot3d import Axes3D
import sklearn.metrics as metrics
import pickle
import time
from sklearn.kernel_approximation import RBFSampler
import sampling_with_data_cleaning as sdc
%matplotlib notebook
from utils import *

In [2]:
# Index handed to get_features to choose which feature set to load.
findex = 0

# Fetch the un-cleaned training features/labels and shuffle them together.
xtmp, y = shuffle(*get_features(findex, cleaned=False))

# Labels are used as integer class ids further down.
y = y.astype(int)

# Row count of the raw (pre-cleaning) feature matrix.
no_ex = xtmp.shape[0]

### The raw features are real-valued; we apply an RBF kernel approximation to them

In [3]:
# Width of the kernel-approximation feature vector (also the MLP input width).
feat_dim = 250

# Fixed random_state so the sampled projection is reproducible.
rbf_feature = RBFSampler(gamma=1, n_components=feat_dim, random_state=1)

# Fit on the raw training features, then map those same features.
x_t = rbf_feature.fit(xtmp).transform(xtmp)

In [4]:
# Sanity check: (number of examples, feat_dim).
print(x_t.shape)

(3423, 250)


In [5]:
# Collapse the labels to a flat 1-D vector with one entry per example.
y = np.reshape(y, (y.shape[0],))

### Data Cleaning 

In [6]:
# Run the project's data-cleaning step on the kernel features and labels.
# NOTE(review): this rebinds `y`, so re-running the cell feeds already-cleaned
# labels back in alongside the un-cleaned `x_t`; restart from the load cell
# before re-executing.
# NOTE(review): the recorded output reports samples being added and removed,
# so `x`/`y` presumably no longer have `no_ex` rows -- confirm that the
# fold helpers that receive `no_ex` still index the right range.
x,y = sdc.clean_data(x_t,y,k=8)

here
Adding 174 new samples
removing 74 samples
[ 821 1917 2121   35 2723 1497  746  227 1508 1022  669  886    0 1025  984
 2971 3319 2282 2989  118 2286 2566 2757 2341 1254 3172 2322 1065  473 1522
  108 1671 2309  422 3336 3228 3201  270 1087  819 1667  731 2531  626 2774
 1367  166 2896 3329 2283  793 2116  418  423  823 1114 2564 2668  790 1922
 2561  625  428 3154 3407 3257 1328  970 1808  337 2957 1242 1417 3076]


In [7]:
class c_mlp(nn.Module):
    """Fully connected binary classifier over the kernel features.

    Layer widths are in_dim -> 1000 -> 1000 -> 500 -> 500 -> 500 -> 50 -> 2,
    with a leaky ReLU after every layer except the last, which returns raw
    class scores (logits).

    Parameters
    ----------
    in_dim : int, optional
        Width of the input feature vector. When omitted, the module-level
        ``feat_dim`` is used, which is what the original constructor did.
    """

    def __init__(self, in_dim=None):
        super(c_mlp,self).__init__()
        if in_dim is None:
            # Backward-compatible default: read the notebook-level setting.
            in_dim = feat_dim
        self.l1 = nn.Linear(in_dim,1000)
        self.l2 = nn.Linear(1000,1000)
        self.l3 = nn.Linear(1000,500)
        self.l4 = nn.Linear(500,500)
        self.l5 = nn.Linear(500,500)
        # `l6` used to be assigned twice; the first (500 -> 500) layer was
        # overwritten immediately and never used. Only the effective
        # 500 -> 50 layer is kept, so parameter names and shapes (and thus
        # saved state_dicts) are unchanged.
        self.l6 = nn.Linear(500,50)
        self.l7 = nn.Linear(50,2)

    def forward(self,x):
        """Return the (batch, 2) class scores for a (batch, in_dim) input."""
        for layer in (self.l1, self.l2, self.l3, self.l4, self.l5, self.l6):
            x = F.leaky_relu(layer(x))
        # No activation on the output layer: the loss expects raw scores.
        return self.l7(x)

In [10]:
# Cross-validated sweep over the loss weight of class 1.
# A model is checkpointed when its recall on the validation fold is within 2%
# of the best recall seen so far and its false positives stay under 20% of
# the fold size.
max_percent = 0.7    # recall bar; raised to the recall of each saved model
model_no = 1         # numeric suffix of the next checkpoint file
max_models = 5       # the test cell further down loads checkpoints 1..5
weights_array = np.linspace(30,50,5)   # candidate class-1 weights (loop-invariant)

for val_iter in range(1,6):
    # Stop the whole sweep once the checkpoint budget is used up. Previously
    # the `break` only left the inner loop, and only after another full
    # training run had already been paid for.
    if model_no > max_models:
        break
    print("val iter: ",val_iter)

    for w in weights_array:
        if model_no > max_models:
            break

        mymlp = c_mlp().cuda()
        optimizer = torch.optim.Adagrad(mymlp.parameters(),lr=1e-3)
        # Class 1 errors are weighted w times as much as class 0 errors.
        criterion = nn.CrossEntropyLoss(weight=torch.cuda.FloatTensor([1,w]))

        for ep in range(2500):
            ind = get_train_ind(val_iter=val_iter,no_examples=no_ex)
            xtrain,ytrain = get_train_batch(x,y,batch_size=100,indices=ind)

            model_op = mymlp(xtrain)

            loss = criterion(model_op,ytrain)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        ## After training check on cross validation data
        xval,yval = get_val_data(x,y,no_examples=no_ex,val_iter=val_iter)
        min_fp = xval.size()[0]*0.2   # false-positive cap: 20% of the fold
        yval = yval.reshape(yval.shape[0],)
        train_op = mymlp(xval)
        train_op = train_op.cpu().data.numpy()
        pred_labels = np.argmax(train_op,axis=1)

        # Fix the label set so the matrix is always 2x2, even when a fold or
        # the predictions contain a single class.
        cf = metrics.confusion_matrix(yval,pred_labels,labels=[0,1]).ravel()
        [tn,fp,fn,tp] = cf
        print('tn, fp, fn, tp: ',cf)

        # Recall on class 1; a fold without positives scores 0 instead of
        # dividing by zero.
        positives = tp + fn
        percent = float(tp)/float(positives) if positives > 0 else 0.0

        if percent > max_percent*0.98 and fp < min_fp:
            max_percent = percent
            print("min fp, max_percent",fp,percent)
            model_path = os.getcwd() + '/kernel_clean_' + str(findex)+'_model'+str(model_no)
            torch.save(mymlp.state_dict(),model_path)
            print("saving model on val: ",val_iter," and weight: ",w)
            model_no = model_no + 1
        


('val iter: ', 1)
('tn, fp, fn, tp: ', array([680,   7,   3,  14]))
('min fp, max_percent', 7, 0.8235294117647058)
('saving model on val: ', 1, ' and weight: ', 30.0)
('tn, fp, fn, tp: ', array([678,   9,   1,  16]))
('min fp, max_percent', 9, 0.9411764705882353)
('saving model on val: ', 1, ' and weight: ', 35.0)
('tn, fp, fn, tp: ', array([678,   9,   0,  17]))
('min fp, max_percent', 9, 1.0)
('saving model on val: ', 1, ' and weight: ', 40.0)
('tn, fp, fn, tp: ', array([681,   6,   4,  13]))
('tn, fp, fn, tp: ', array([681,   6,   4,  13]))
('val iter: ', 2)
('tn, fp, fn, tp: ', array([692,   4,   3,   6]))
('tn, fp, fn, tp: ', array([688,   8,   0,   9]))
('min fp, max_percent', 8, 1.0)
('saving model on val: ', 2, ' and weight: ', 35.0)
('tn, fp, fn, tp: ', array([689,   7,   1,   8]))
('tn, fp, fn, tp: ', array([691,   5,   1,   8]))
('tn, fp, fn, tp: ', array([692,   4,   0,   9]))
('min fp, max_percent', 4, 1.0)
('saving model on val: ', 2, ' and weight: ', 50.0)
('val iter: ',

### Test the model 

In [17]:
# Same feature width as used for training.
feat_dim = 250

# Held-out split for the feature set selected by findex.
xtest, ytest = get_features(findex, train=False)

# Per-example vote counter, filled in by the ensemble loop below.
final_pred = np.zeros((ytest.shape[0],))
print(final_pred.shape)

(856,)


In [18]:
# Debug check: container type of the freshly loaded test features.
print(type(xtest))

<type 'numpy.ndarray'>


In [20]:
# Rebuild the sampler with the training-time settings (same gamma, width and
# random_state) and fit it on the raw training features.
rbf_feature = RBFSampler(gamma=1, n_components=feat_dim, random_state=1)
trans = rbf_feature.fit(xtmp)

# Map the raw test features into the kernel feature space.
# NOTE(review): this rebinds `xtest`, so the cell must not be run twice.
xtest = trans.transform(xtest)

In [None]:
# Debug check: container type of xtest at this point in the notebook.
print(type(xtest))

In [23]:
# Score the test set with every saved checkpoint and add up the class-1 votes.

# Keep the GPU copy under its own name. Rebinding `xtest` to a Variable made
# this cell fail when executed a second time (a FloatTensor cannot be built
# from a Variable); the RuntimeError recorded below this cell is consistent
# with that.
xtest_gpu = Variable(torch.cuda.FloatTensor(xtest))

# Restart the tally on every run so re-executing the cell cannot double count.
final_pred = np.zeros(ytest.shape[0],)

for i in range(1,6):
    model_path = os.getcwd() + '/kernel_clean_' + str(findex)+'_model'+str(i)
    # The training sweep may have saved fewer than five checkpoints.
    if not os.path.exists(model_path):
        print("skipping missing checkpoint: " + model_path)
        continue

    test_model = c_mlp().cuda()
    test_model.load_state_dict(torch.load(model_path))
    test_op = test_model(xtest_gpu)
    test_op = test_op.cpu().data.numpy()

    # Hard 0/1 prediction of this model for every test example.
    pred_labels = np.argmax(test_op,axis=1).reshape(final_pred.shape[0],)
    print(pred_labels.shape)
    final_pred = pred_labels + final_pred
    print(final_pred.shape)

    # Per-model confusion matrix on the test set.
    cf = metrics.confusion_matrix(ytest,pred_labels).ravel()
    print('tn, fp, fn, tp: ',cf)

RuntimeError: already counted a million dimensions in a given sequence. Most likely your items are also sequences and there's no way to infer how many dimension should the tensor have

In [None]:
# Any-vote rule: an example is class 1 if at least one model voted for it,
# so vote counts above 1 are capped at 1 (in place).
np.minimum(final_pred, 1, out=final_pred)

# Confusion matrix of the combined prediction on the test set.
cf = metrics.confusion_matrix(ytest, final_pred).ravel()
print('tn, fp, fn, tp: ',cf)

In [None]:
# testmlp = c_mlp().cuda()
# model_path = os.getcwd() + '/kernel_mac' + fname + '_3'
# testmlp.load_state_dict(torch.load(model_path))

In [None]:
# op = testmlp(Variable(torch.cuda.FloatTensor(c_fingerprints_test)))
# op = op.cpu().data.numpy()
# pred_labels = np.argmax(op,axis=1)
# cf = metrics.confusion_matrix(labels2_t,pred_labels).ravel()
# #print(val_iter,w)
# print('tn, fp, fn, tp: ',cf)