In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [5]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [6]:
data = np.loadtxt("dataset/MADELON/madelon_train.data", dtype=np.int32)
validation_data = np.loadtxt("dataset/MADELON/madelon_valid.data", dtype=np.int32)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


In [7]:
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
print(labels.shape)
print(labels)
num_positive_labels = np.sum(labels == 1)
positive_label_indices = np.nonzero(labels == 1)[0]
print("Positive labels ", num_positive_labels)
print("Positive label indices ", positive_label_indices)
print(validation_labels.shape)
print(validation_labels)
num_positive_validation_labels = np.sum(validation_labels == 1)
positive_validation_label_indices = np.nonzero(validation_labels == 1)[0]
print("Positive validation labels ", num_positive_validation_labels)
print("Positive val label indices ", positive_validation_label_indices)

(2000,)
[-1 -1 -1 ..., -1  1  1]
Positive labels  1000
Positive label indices  [   3    4    5    6    8   12   17   19   21   23   26   28   31   32   33
   34   40   41   42   43   46   52   55   56   58   59   61   67   69   71
   74   78   82   86   93   95   97  102  104  109  114  115  118  119  121
  124  127  133  134  138  139  143  146  155  156  161  162  163  164  166
  167  170  171  172  174  175  177  180  181  182  187  188  189  191  192
  199  201  202  203  206  210  213  216  217  218  219  222  226  227  229
  230  231  233  236  237  238  240  241  245  246  251  258  259  260  262
  264  265  266  267  271  272  273  274  275  276  277  278  281  283  284
  285  286  287  288  290  291  292  293  295  298  300  302  303  304  306
  308  309  311  312  313  314  315  316  318  319  323  324  325  327  330
  331  333  335  336  337  338  339  340  341  343  346  348  349  350  351
  356  357  360  367  369  370  373  374  376  377  378  379  381  385  387
  388  39

### Normalize the data by getting using average and stdev

In [8]:
def get_average_and_stdev(input_array):
    avg = np.average(input_array, axis=0)
    return avg, np.std(input_array, axis=0)

In [9]:
def normalize(input_array, avg, stdev):
    avgremoved = input_array - avg[None,:]
    return avgremoved/stdev

#### Note that validation data is normalized using original bias and stdev

In [10]:
avg, stdev = get_average_and_stdev(data)
normalized = normalize(data, avg, stdev)
normalized_validation = normalize(validation_data, avg, stdev)
print(avg.shape)
print(normalized.shape)
print(normalized)
print(normalized_validation.shape)
print(normalized_validation)

(500,)
(2000, 500)
[[ 0.51050105 -0.21380941  0.69000739 ..., -0.31785592 -0.88612143
   0.2220861 ]
 [ 0.19898248 -0.84339156 -1.28996462 ...,  0.64109672  0.05435982
   1.03544641]
 [ 0.82201962  1.94002424 -0.28712165 ...,  0.41979996 -0.24122     0.29954898]
 ..., 
 [-0.26829537  1.11162669  3.10711607 ...,  1.23122143  0.40368142
  -0.35888555]
 [ 0.35474177 -0.0812658  -0.13283812 ..., -0.76044945  0.51116499
  -0.20395978]
 [-1.20285107  0.31636503 -1.05853932 ...,  0.41979996  0.21558517
   0.99671497]]
(600, 500)
[[  1.98982485e-01  -9.75935165e-01   7.28732554e-02 ...,   1.08369025e+00
    4.03681421e-01   7.64326305e-01]
 [  5.10501052e-01   8.13403564e-01  -4.41405188e-01 ...,   8.62393488e-01
    1.07545374e+00  -5.13811329e-01]
 [  1.98982485e-01   1.24417029e+00  -8.14102775e-02 ...,  -1.35057415e+00
    6.18030532e-04   4.93206201e-01]
 ..., 
 [  5.10501052e-01   5.12778089e-02   5.10009932e-01 ...,  -1.70324742e-01
   -6.44283395e-01   1.03544641e+00]
 [ -7.35573217e-0

In [11]:
import sklearn.svm

start = time.time()
classifier = sklearn.svm.SVC(
             C=10, kernel="rbf", gamma=0.001).fit(
                          X=normalized,
                          y=labels)
print("Gaussian SVM trained in:", round(time.time() - start, 2), "s")
print("Number of support vectors: ", classifier.n_support_[0])

Gaussian SVM trained in: 2.36 s
Number of support vectors:  969


In [12]:
start = time.time()
training_predictions = classifier.predict(normalized)
training_prediction_time = round(time.time() - start, 2)
print("Prediction time (training set) ", training_prediction_time)
print("Training set accuracy  ", np.sum(training_predictions == labels)/float(labels.shape[0]))
start = time.time()
validation_predictions = classifier.predict(normalized_validation)
validation_prediction_time = round(time.time() - start, 2)
print("Prediction time (validation set) ", validation_prediction_time)
print("Validation set accuracy  ", np.sum(validation_predictions == validation_labels)/float(validation_labels.shape[0]))
print("Validation set positives accuracy", np.sum(validation_predictions[positive_validation_label_indices] == validation_labels[positive_validation_label_indices])/float(num_positive_validation_labels) )

Prediction time (training set)  1.99
Training set accuracy   1.0
Prediction time (validation set)  0.53
Validation set accuracy   0.603333333333
Validation set positives accuracy 0.596666666667


| Kernel | Train pred time (s) | Train pred accuracy | Valid pred time (s) | Valid pred accuracy |  C | Gamma | Number SV |
|--------|---------------------|---------------------|---------------------|---------------------|----|-------|-----------|
| rbf    | 2.06                | 1.0                 | 0.55                | 0.49                | 0.1| 0.1   | 1000 |
| rbf    | 2.0                 | 1.0                 | 0.54                | 0.5                 | 0.1| 1     | 1000 |
| rbf    | 2.0                 | 1.0                 | 0.53                | 0.5                 | 0.1| 10    | 1000 |
| rbf    | 2.06                | 1.0                 | 0.55                | 0.495               | 1  | 0.1   | 1000 |
| rbf    | 1.99                | 1.0                 | 0.76                | 0.497               | 1  | 1     | 1000 |
| rbf    | 1.99                | 1.0                 | 0.53                | 0.5                 | 1  | 10    | 1000 |
| rbf    | 2.06                | 1.0                 | 0.55                | 0.501               | 10 | 0.1   | 1000 |
| rbf    | 2.06                | 1.0                 | 0.53                | 0.497               | 10 | 1     | 1000 |
| rbf    | 1.99                | 1.0                 | 0.53                | 0.5                 | 10 | 10    | 1000 |