In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


In [5]:
def get_average_and_stdev(input_array):
    avg = np.average(input_array, axis=0)
    return avg, np.std(input_array, axis=0)

In [6]:
def normalize(input_array, avg, stdev):
    avgremoved = input_array - avg[None,:]
    return avgremoved/stdev

## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [7]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [8]:
data = np.loadtxt("dataset/MADELON/madelon_train.data", dtype=np.int32)
validation_data = np.loadtxt("dataset/MADELON/madelon_valid.data", dtype=np.int32)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


### Normalize the data and validation data

In [9]:
avg, stdev = get_average_and_stdev(data)
normalized = normalize(data, avg, stdev)
normalized_validation = normalize(validation_data, avg, stdev)
print(avg.shape)
print(normalized.shape)
print(normalized)
print(normalized_validation.shape)
print(normalized_validation)

(500,)
(2000, 500)
[[ 0.51050105 -0.21380941  0.69000739 ..., -0.31785592 -0.88612143
   0.2220861 ]
 [ 0.19898248 -0.84339156 -1.28996462 ...,  0.64109672  0.05435982
   1.03544641]
 [ 0.82201962  1.94002424 -0.28712165 ...,  0.41979996 -0.24122     0.29954898]
 ..., 
 [-0.26829537  1.11162669  3.10711607 ...,  1.23122143  0.40368142
  -0.35888555]
 [ 0.35474177 -0.0812658  -0.13283812 ..., -0.76044945  0.51116499
  -0.20395978]
 [-1.20285107  0.31636503 -1.05853932 ...,  0.41979996  0.21558517
   0.99671497]]
(600, 500)
[[  1.98982485e-01  -9.75935165e-01   7.28732554e-02 ...,   1.08369025e+00
    4.03681421e-01   7.64326305e-01]
 [  5.10501052e-01   8.13403564e-01  -4.41405188e-01 ...,   8.62393488e-01
    1.07545374e+00  -5.13811329e-01]
 [  1.98982485e-01   1.24417029e+00  -8.14102775e-02 ...,  -1.35057415e+00
    6.18030532e-04   4.93206201e-01]
 ..., 
 [  5.10501052e-01   5.12778089e-02   5.10009932e-01 ...,  -1.70324742e-01
   -6.44283395e-01   1.03544641e+00]
 [ -7.35573217e-0

### Get labels

In [10]:
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
print(labels.shape)
print(labels)

(2000,)
[-1 -1 -1 ..., -1  1  1]


In [11]:
import sklearn.svm
def get_and_train_classifier(tr_set, tr_labels, C=1, gamma=0.1, probability=False):
    start = time.time()
    classifier = sklearn.svm.SVC(C=C, kernel="rbf", gamma=gamma, probability=probability).fit(
                          X=tr_set,
                          y=tr_labels)
    print("Gaussian SVM with C=", C, ", gamma=", gamma, ", probability=", probability, " trained in:", round(time.time() - start, 2), "s", " with support vectors: ", classifier.n_support_[0])
    return classifier

In [12]:
def get_classifier_accuracy(classifier, training_set, training_labels, validation_set, validation_labels):
    start = time.time()
    training_predictions = classifier.predict(training_set)
    training_prediction_time = round(time.time() - start, 2)
    training_accuracy = np.sum(training_predictions == training_labels)/float(training_labels.shape[0])
    print("Prediction time (training set) ", training_prediction_time,  ", accuracy: ", training_accuracy)
    start = time.time()
    validation_predictions = classifier.predict(validation_set)
    validation_prediction_time = round(time.time() - start, 2)
    validation_accuracy = np.sum(validation_predictions == validation_labels)/float(validation_labels.shape[0])
    print("Prediction time (validation set) ", validation_prediction_time, ", accuracy: ",  validation_accuracy)
    return training_accuracy, validation_accuracy

###  Run classifiers for top 10 and top 20 features for all 10 models

In [13]:
for model in range(1,11):
        file_name = "IG_M" + str(model) + "_N100_imp_feat_asc.npy"
        imp_feat_asc = np.load(file_name)
        print("**************************************")
        print("\tModel: ", file_name, ": ", imp_feat_asc)
        print("**************************************")
        print("Top 10 features: ")
        current_features = np.sort(imp_feat_asc[-10:])-1
        train = normalized[:, current_features]
        valid = normalized_validation[:, current_features]
        clsfr = get_and_train_classifier(train, labels)
        get_classifier_accuracy(clsfr, train, labels, valid, validation_labels)
        print("\nTop 20 features:")
        current_features = np.sort(imp_feat_asc[-20:])-1
        train = normalized[:, current_features]
        valid = normalized_validation[:, current_features]
        clsfr = get_and_train_classifier(train, labels)
        get_classifier_accuracy(clsfr, train, labels, valid, validation_labels)

**************************************
	Model:  IG_M1_N100_imp_feat_asc.npy :  [494 190 173 454  61 456 408 238 264  24  82 484  16  99 282 154 469 127
 157 434  65 251 224 129 337 106 453 319 137 452 443 473 422 339  29 150
 379 476  49 242]
**************************************
Top 10 features: 
Gaussian SVM with C= 1 , gamma= 0.1 , probability= False  trained in: 0.45 s  with support vectors:  660
Prediction time (training set)  0.17 , accuracy:  0.815
Prediction time (validation set)  0.05 , accuracy:  0.795

Top 20 features:
Gaussian SVM with C= 1 , gamma= 0.1 , probability= False  trained in: 0.25 s  with support vectors:  703
Prediction time (training set)  0.11 , accuracy:  0.9205
Prediction time (validation set)  0.03 , accuracy:  0.818333333333
**************************************
	Model:  IG_M2_N100_imp_feat_asc.npy :  [294 219 302  51  93  21 159 210 433 494 456 454 271 120 403 411 154 434
 106 282 241 345 468  81 129 142 485  65 337 339 473 319 413 452 443  29
 379  49 