In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [5]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [6]:
data = np.loadtxt("dataset/MADELON/madelon_train.data", dtype=np.int32)
validation_data = np.loadtxt("dataset/MADELON/madelon_valid.data", dtype=np.int32)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


### Subset the data to exclude irrelevant features

In [7]:
#relevant_features = np.array([339,443,473,49,379,476,242,106,319,29,452,434,129,282,454,154,337,65,494])
#relevant_features = np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494])
relevant_features=np.array([337, 29, 136, 443, 473, 366, 379, 49, 242, 476])
#relevant_features = np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494])
relevant_features = relevant_features - 1 # Convert 1 indexed to 0 indexed
data = data[:, relevant_features]
validation_data = validation_data[:, relevant_features]
print(data.shape)
print(validation_data.shape)

(2000, 40)
(600, 40)


In [8]:
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
print(labels.shape)
print(labels)

(2000,)
[-1 -1 -1 ..., -1  1  1]


In [9]:
num_positive_labels = np.sum(labels == 1)
positive_label_indices = np.nonzero(labels == 1)[0]
print("Positive labels ", num_positive_labels)
#print("Positive label indices ", positive_label_indices)
print(validation_labels.shape)
print(validation_labels)
num_positive_validation_labels = np.sum(validation_labels == 1)
positive_validation_label_indices = np.nonzero(validation_labels == 1)[0]
print("Positive validation labels ", num_positive_validation_labels)
#print("Positive val label indices ", positive_validation_label_indices)

Positive labels  1000
(600,)
[-1 -1 -1  1 -1  1 -1 -1 -1  1  1  1 -1  1  1 -1 -1  1 -1 -1 -1 -1 -1  1  1
  1  1 -1 -1 -1 -1  1  1 -1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1 -1 -1
  1 -1 -1  1  1 -1  1 -1 -1 -1  1 -1  1  1  1  1 -1  1 -1 -1  1 -1  1  1  1
 -1  1 -1  1 -1  1 -1 -1  1  1  1  1  1  1  1 -1 -1 -1 -1  1 -1 -1  1  1 -1
 -1  1 -1  1 -1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1 -1  1  1 -1 -1
  1 -1 -1  1 -1  1  1 -1  1 -1  1 -1  1 -1 -1 -1  1  1 -1 -1  1  1 -1 -1 -1
  1 -1  1 -1 -1  1  1  1  1 -1  1  1 -1 -1 -1 -1  1  1 -1  1  1  1 -1 -1 -1
 -1  1 -1  1 -1  1 -1  1 -1 -1  1 -1 -1 -1 -1  1  1 -1 -1 -1  1 -1  1 -1 -1
 -1  1  1  1 -1 -1  1  1  1  1 -1 -1  1 -1 -1 -1  1  1 -1 -1  1  1 -1 -1  1
 -1 -1 -1 -1  1 -1 -1  1  1  1  1 -1 -1 -1  1 -1 -1  1 -1 -1  1  1 -1 -1  1
  1 -1 -1 -1 -1  1 -1 -1  1  1  1  1  1 -1  1  1 -1 -1  1 -1  1  1 -1 -1 -1
 -1  1  1  1  1  1 -1  1  1  1 -1 -1  1  1 -1 -1 -1  1  1  1  1 -1 -1  1  1
 -1 -1  1  1  1  1 -1  1  1 -1 -1  1 -1  1  1 -1 -1 -1  1  

### Normalize the data by getting using average and stdev

In [10]:
def get_average_and_stdev(input_array):
    avg = np.average(input_array, axis=0)
    return avg, np.std(input_array, axis=0)

In [11]:
def normalize(input_array, avg, stdev):
    avgremoved = input_array - avg[None,:]
    return avgremoved/stdev

#### Note that validation data is normalized using original bias and stdev

In [12]:
avg, stdev = get_average_and_stdev(data)
normalized = normalize(data, avg, stdev)
normalized_validation = normalize(validation_data, avg, stdev)
print(avg.shape)
print(normalized.shape)
print(normalized)
print(normalized_validation.shape)
print(normalized_validation)

(40,)
(2000, 40)
[[-2.40497551 -1.46919561  0.59057274 ...,  0.77853397 -1.14592077
  -0.09063621]
 [ 0.7600082  -0.38077659 -1.00117219 ..., -0.34645009  0.84065868
  -1.24264196]
 [ 2.37546863  0.70764243 -1.46100962 ...,  0.10354353 -0.43451056
   1.20243147]
 ..., 
 [ 1.089694    1.31987813 -0.85968376 ..., -0.70644499 -0.66269874
   1.49239211]
 [-0.26201779 -0.65288135 -0.32910211 ...,  0.68853525 -1.33384045
   1.98610886]
 [-0.7565465   0.16343292 -0.54133477 ...,  0.1485429   0.98830985
  -0.37276007]]
(600, 40)
[[-0.06420631  0.63961624  0.90892173 ..., -0.59394659  0.6930075
  -1.86174709]
 [-0.95435798  0.70764243 -0.36447422 ..., -0.93144181 -0.77008141
   0.29336571]
 [ 0.69407104 -0.10867184 -0.18761367 ...,  0.30604067  0.41112798
  -0.01226847]
 ..., 
 [ 0.49625956  1.59198288 -1.0365443  ..., -1.4264348   0.37085948
  -1.00753875]
 [-1.21810662 -0.85695991 -0.08149734 ...,  0.46353843  0.46481932
   1.10839019]
 [ 1.48531696  0.70764243  1.01503806 ...,  0.19354226 -1

In [13]:
import sklearn.svm
start = time.time()
linear_classifier = sklearn.svm.LinearSVC().fit(
                            X=normalized,
                            y=labels)
print("Linear SVM trained in:", round(time.time() - start, 2), "s")

Linear SVM trained in: 0.36 s


In [14]:
start = time.time()
training_predictions = linear_classifier.predict(normalized)
training_prediction_time = round(time.time() - start, 2)
print("Linear Classifier Prediction time (training set) ", training_prediction_time)
print("Linear Classifier Training set accuracy  ", np.sum(training_predictions == labels)/float(labels.shape[0]))
start = time.time()
validation_predictions = linear_classifier.predict(normalized_validation)
validation_prediction_time = round(time.time() - start, 2)
print("Linear Classifier Prediction time (validation set) ", validation_prediction_time)
print("Linear Classifier Validation set accuracy  ", np.sum(validation_predictions == validation_labels)/float(validation_labels.shape[0]))
print("Linear Classifier Validation set positives accuracy", np.sum(validation_predictions[positive_validation_label_indices] == validation_labels[positive_validation_label_indices])/float(num_positive_validation_labels) )

Linear Classifier Prediction time (training set)  0.04
Linear Classifier Training set accuracy   0.611
Linear Classifier Prediction time (validation set)  0.0
Linear Classifier Validation set accuracy   0.583333333333
Linear Classifier Validation set positives accuracy 0.58


In [16]:
import sklearn.svm

start = time.time()
classifier = sklearn.svm.SVC(
             C=1, kernel="rbf", gamma=0.1).fit(
                          X=normalized,
                          y=labels)
print("Gaussian SVM trained in:", round(time.time() - start, 2), "s")
print("Number of support vectors: ", classifier.n_support_[0])

Gaussian SVM trained in: 0.44 s
Number of support vectors:  976


In [17]:
start = time.time()
training_predictions = classifier.predict(normalized)
training_prediction_time = round(time.time() - start, 2)
print("Prediction time (training set) ", training_prediction_time)
print("Training set accuracy  ", np.sum(training_predictions == labels)/float(labels.shape[0]))
start = time.time()
validation_predictions = classifier.predict(normalized_validation)
validation_prediction_time = round(time.time() - start, 2)
print("Prediction time (validation set) ", validation_prediction_time)
print("Validation set accuracy  ", np.sum(validation_predictions == validation_labels)/float(validation_labels.shape[0]))
print("Validation set positives accuracy", np.sum(validation_predictions[positive_validation_label_indices] == validation_labels[positive_validation_label_indices])/float(num_positive_validation_labels) )

Prediction time (training set)  0.33
Training set accuracy   0.9985
Prediction time (validation set)  0.09
Validation set accuracy   0.818333333333
Validation set positives accuracy 0.823333333333


| Kernel | Train pred time (s) | Train pred accuracy | Valid pred time (s) | Valid pred accuracy |  C | Gamma | Number SV |
|--------|---------------------|---------------------|---------------------|---------------------|----|-------|-----------|
| rbf    | 2.06                | 1.0                 | 0.55                | 0.49                | 0.1| 0.1   | 1000 |
| rbf    | 2.0                 | 1.0                 | 0.54                | 0.5                 | 0.1| 1     | 1000 |
| rbf    | 2.0                 | 1.0                 | 0.53                | 0.5                 | 0.1| 10    | 1000 |
| rbf    | 2.06                | 1.0                 | 0.55                | 0.495               | 1  | 0.1   | 1000 |
| rbf    | 1.99                | 1.0                 | 0.76                | 0.497               | 1  | 1     | 1000 |
| rbf    | 1.99                | 1.0                 | 0.53                | 0.5                 | 1  | 10    | 1000 |
| rbf    | 2.06                | 1.0                 | 0.55                | 0.501               | 10 | 0.1   | 1000 |
| rbf    | 2.06                | 1.0                 | 0.53                | 0.497               | 10 | 1     | 1000 |
| rbf    | 1.99                | 1.0                 | 0.53                | 0.5                 | 10 | 10    | 1000 |
| rbf    | 4.39                | 1.0                 | 1.32                | 0.6 (positives 0.59)| 1  | 0.01  | 1000 |
| rbf    | 4.37                | 1.0                 | 1.31                | 0.59 (positives 0.6)| 10  | 0.01  | 1000 |
| rbf    | 4.34                | 1.0                 | 1.31                | 0.59 (positives 0.6)| 100  | 0.01  | 1000 |
| rbf    | 4.08                | 0.8425              | 1.22                | 0.58 (positives 0.57)| 1  | 0.001  | 929 |
| rbf    | 4.25                | 1.0                 | 1.26                | 0.6 (positives 0.6)| 10  | 0.001  | 969 |
| rbf    | 4.34                | 0.699               | 1.31                | 0.6 (positives 0.52)| 0.1  | 0.001  | 1000 |
| rbf    | 4.16                | 0.955               | 1.26                | 0.6 (positives 0.58)| 5  | 0.001  | 962 |
| rbf    | 2.06                | 0.655               | 0.55                | 0.58 (positives 0.58)| 0.1  | 0.0001  | 1000 |
| rbf    | 2.04                | 0.662               | 0.54                | 0.59 (positives 0.56)| 1  | 0.0001  | 994 |
| rbf    | 2.0                 | 0.746               | 0.47                | 0.59 (positives 0.58)| 10  | 0.0001  | 857 |
| rbf    | 1.68                | 0.872               | 0.44                | 0.58 (positives 0.57)| 100  | 0.0001  | 790 |
| rbf    | 2.11                | 1.0                 | 0.5                 | 0.6 (positives 0.6)| 1000  | 0.0001  | 914 |
| rbf    | 1.99                | 1.0                 | 0.5                 | 0.6 (positives 0.6)| 500  | 0.0001  | 914 |
| rbf    | 1.76                | 0.9655              | 0.47                | 0.58 (positives 0.58)| 200  | 0.0001  | 843 |

### Results for training only with relevant features
| Kernel | Train pred time (s) | Train pred accuracy | Valid pred time (s) | Valid pred accuracy |  C | Gamma | Number SV |
|--------|---------------------|---------------------|---------------------|---------------------|----|-------|-----------|
| rbf    | 0.22                | 0.991               | 0.08                | 0.915               | 1  | 1     | 909 |

In [17]:
from sklearn.externals import joblib
# now you can save it to a file
joblib.dump(classifier, 'GaussianClassifier_Madelon_20Best_20Random_C1_G0_1.pkl')

['GaussianClassifier_Madelon_20Best_20Random_C1_G0_1.pkl']