In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [5]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [6]:
data = np.loadtxt("dataset/MADELON/madelon_train.data", dtype=np.int32)
validation_data = np.loadtxt("dataset/MADELON/madelon_valid.data", dtype=np.int32)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


In [7]:
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
print(labels.shape)
print(labels)
print("Positive labels ", np.sum(labels == 1))
print(validation_labels.shape)
print(validation_labels)
print("Positive validation labels ", np.sum(validation_labels == 1))

(2000,)
[-1 -1 -1 ..., -1  1  1]
Positive labels  1000
(600,)
[-1 -1 -1  1 -1  1 -1 -1 -1  1  1  1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1  1
  1  1 -1 -1 -1 -1  1  1 -1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1 -1 -1
  1 -1 -1  1  1 -1  1 -1 -1 -1  1 -1  1  1  1  1 -1  1 -1 -1  1 -1  1  1  1
 -1  1 -1  1 -1  1 -1 -1  1  1  1  1  1  1  1 -1 -1 -1 -1  1 -1 -1  1  1 -1
 -1  1 -1  1 -1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1 -1  1  1 -1 -1
  1 -1 -1  1 -1  1  1 -1  1 -1  1 -1  1 -1 -1 -1  1  1 -1 -1  1  1 -1 -1 -1
  1 -1  1 -1 -1  1  1  1  1 -1  1  1 -1 -1 -1 -1  1  1 -1  1  1  1 -1 -1 -1
 -1  1 -1  1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1  1  1 -1 -1 -1  1 -1  1 -1 -1
 -1  1  1  1 -1 -1  1  1  1  1 -1 -1  1 -1 -1 -1  1  1 -1 -1  1  1 -1 -1  1
 -1 -1 -1 -1  1 -1 -1  1  1  1  1 -1 -1 -1  1 -1 -1  1 -1 -1  1  1 -1 -1  1
  1 -1 -1 -1 -1  1 -1 -1  1  1  1  1  1 -1  1  1 -1 -1  1 -1  1  1 -1 -1 -1
 -1  1  1  1  1  1 -1  1  1  1 -1 -1  1  1 -1 -1 -1  1  1  1  1 -1 -1  1  1
 -1 -1  1  1  1  1 -1  1  

### Normalize the data by removing bias (average value) from each feature and then dividing by feature stdev

In [8]:
def get_average_and_stdev(input_array):
    bias = np.average(input_array, axis=0)
    biasremoved = input_array - bias[None,:]
    return bias, np.linalg.norm(biasremoved, axis=0)

In [9]:
def normalize(input_array, bias, stdev):
    biasremoved = input_array - bias[None,:]
    return biasremoved/stdev

#### Note that validation data is normalized using original bias and stdev

In [10]:
bias, stdev = get_average_and_stdev(data)
normalized = normalize(data, bias, stdev)
normalized_validation = normalize(validation_data, bias, stdev)
print(bias.shape)
print(normalized.shape)
print(normalized)
print(normalized_validation.shape)
print(normalized_validation)

(500,)
(2000, 500)
[[ 0.01141515 -0.00478092  0.01542903 ..., -0.00710747 -0.01981428
   0.004966  ]
 [ 0.00444938 -0.01885881 -0.02884449 ...,  0.01433536  0.00121552
   0.02315329]
 [ 0.01838092  0.04338026 -0.00642024 ...,  0.00938701 -0.00539384
   0.00669812]
 ..., 
 [-0.00599927  0.02485673  0.06947723 ...,  0.02753095  0.00902659
  -0.00802492]
 [ 0.00793227 -0.00181716 -0.00297035 ..., -0.01700417  0.01143    -0.00456068]
 [-0.02689657  0.00707414 -0.02366966 ...,  0.00938701  0.00482063
   0.02228722]]
(600, 500)
[[  4.44938363e-03  -2.18225737e-02   1.62949553e-03 ...,   2.42320507e-02
    9.02659099e-03   1.70908557e-02]
 [  1.14151506e-02   1.81882566e-02  -9.87012005e-03 ...,   1.92837046e-02
    2.40478767e-02  -1.14891706e-02]
 [  4.44938363e-03   2.78204935e-02  -1.82038915e-03 ...,  -3.01997560e-02
    1.38195828e-05   1.10284259e-02]
 ..., 
 [  1.14151506e-02   1.14660667e-03   1.14041688e-02 ...,  -3.80857702e-03
   -1.44066147e-02   2.31532856e-02]
 [ -1.64479172e-0

In [11]:
import sklearn.svm

start = time.time()
classifier = sklearn.svm.SVC(
             C=100, kernel="rbf", gamma=1.0).fit(
                          X=normalized,
                          y=labels)
print("Gaussian SVM trained in:", round(time.time() - start, 2), "s")
print("Number of support vectors: ", classifier.n_support_[0])

Gaussian SVM trained in: 2.53 s
Number of support vectors:  943


In [12]:
start = time.time()
training_predictions = classifier.predict(normalized)
training_prediction_time = round(time.time() - start, 2)
print("Prediction time (training set) ", training_prediction_time)
print("Training set accuracy  ", np.sum(training_predictions == labels)/float(labels.shape[0]))
start = time.time()
validation_predictions = classifier.predict(normalized_validation)
validation_prediction_time = round(time.time() - start, 2)
print("Prediction time (validation set) ", validation_prediction_time)
print("Validation set accuracy  ", np.sum(validation_predictions == validation_labels)/float(validation_labels.shape[0]))

Prediction time (training set)  1.93
Training set accuracy   1.0
Prediction time (validation set)  0.51
Validation set accuracy   0.593333333333


| Kernel | Train pred time (s) | Train pred accuracy | Valid pred time (s) | Valid pred accuracy |  C | Gamma | Number SV |
|--------|---------------------|---------------------|---------------------|---------------------|----|-------|-----------|
| rbf    | 1.98                | 0.691               | 0.52                | 0.5716              | 1  | 0.5   | 952|
| rbf    | 1.72                | 0.8215              | 0.45                | 0.5833              | 10 | 0.5   | 829|
| rbf    | 1.92                | 0.74                | 0.51                | 0.575               | 1  | 1.0   | 927|
| rbf    | 1.85                | 0.977               | 0.51                | 0.585               | 10 | 1.0   | 898|
| rbf    | 1.92                | 0.74                | 0.51                | 0.575               | auto | 1.0 | 927|
| rbf    | 2.21                | 0.691               | 0.55                | 0.572               | auto | 0.5 | 952|
| linear | 1.81                | 0.6865              | 0.48                | 0.572               | auto | NA | 937|
| linear | 1.45                | 0.745               | 0.4                 | 0.578               | 10   | NA | 786|
| rbf    | 1.93                | 1.0                 | 0.51                | 0.593               | 100  | 1.0 | 943|