In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np

In [3]:
import os
# Expose only CUDA device "0" to this kernel and anything it launches;
# the following cell echoes the variable to confirm the setting.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [5]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [6]:
# Read the MADELON training and validation feature matrices as int32 arrays
# (one sample per row, one feature per column).
data = np.loadtxt("dataset/MADELON/madelon_train.data", dtype=np.int32)
validation_data = np.loadtxt("dataset/MADELON/madelon_valid.data", dtype=np.int32)
# Sanity check: report (n_samples, n_features) for each matrix.
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


### Subset the data to exclude irrelevant features

In [7]:
# Earlier hard-coded candidate feature subsets, kept for reference (currently unused):
#relevant_features = np.array([339,443,473,49,379,476,242,106,319,29,452,434,129,282,454,154,337,65,494])
#relevant_features = np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494])
#relevant_features = np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494])
# Load the saved feature-id sets and select the entry at index 9.
# NOTE(review): presumably one feature set per model, index 9 being the tenth
# one -- confirm against whatever wrote TenMadelonModels.npy.
all_models = np.load("TenMadelonModels.npy")
features_list = all_models[9].tolist()  # 1-indexed feature ids, used later as feature names
relevant_features = np.array(all_models[9])
relevant_features = relevant_features - 1 # Convert 1 indexed to 0 indexed
# Keep only the selected columns.
# NOTE(review): this overwrites `data` and `validation_data`, so re-running this
# cell without re-running the load cell indexes the already-subset arrays.
data = data[:, relevant_features]
validation_data = validation_data[:, relevant_features]
print(data.shape)
# NOTE(review): prints the validation array contents, not its shape -- confirm intended.
print(validation_data)

(2000, 40)
[[454 490 510 ..., 454 538 259]
 [483 491 490 ..., 439 429 534]
 [457 479 474 ..., 494 517 495]
 ..., 
 [474 504 505 ..., 417 514 368]
 [461 468 490 ..., 501 521 638]
 [513 491 496 ..., 489 398 694]]


In [8]:
# Read the class labels as int32 vectors, one label per sample.
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
# The validation labels are read from dataset/ rather than dataset/MADELON/;
# the directory listing earlier in the notebook shows no validation-labels file
# inside dataset/MADELON.
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
# Sanity check on the training labels only.
print(labels.shape)
print(labels)

(2000,)
[-1 -1 -1 ..., -1  1  1]


### Normalize the data using the per-feature average and stdev

In [9]:
def get_average_and_stdev(input_array):
    """Compute per-column statistics of ``input_array``.

    Both statistics are reduced along axis 0, so for a 2-D input each returned
    array has one entry per column.

    Returns a ``(average, standard_deviation)`` tuple.
    """
    column_average = np.average(input_array, axis=0)
    column_stdev = np.std(input_array, axis=0)
    return column_average, column_stdev

In [10]:
def normalize(input_array, avg, stdev):
    """Shift ``input_array`` by ``avg`` and scale it by ``stdev``.

    ``avg`` is given a leading length-1 axis before the subtraction so it is
    broadcast across the rows of ``input_array``; the centred values are then
    divided by ``stdev``.

    Returns the centred and scaled array.
    """
    centred = input_array - avg[None, :]
    scaled = centred / stdev
    return scaled

#### Note that the validation data is normalized using the training data's average and stdev

In [11]:
# Per-feature statistics come from the training matrix only ...
avg, stdev = get_average_and_stdev(data)
normalized = normalize(data, avg, stdev)
# ... and the same training statistics are reused for the validation matrix.
normalized_validation = normalize(validation_data, avg, stdev)
# Row indices of validation samples whose label is 1, and their normalized rows.
validation_pos_indices = np.nonzero(validation_labels == 1)[0]
normalized_validation_pos=normalize(validation_data[validation_pos_indices], avg, stdev)
# Sanity check on the resulting shapes.
print(avg.shape)
print(normalized.shape)
print(normalized_validation.shape)

(40,)
(2000, 40)
(600, 40)


In [12]:
# Row indices of training samples labelled -1 (treated here as the "background" class).
background_indices = np.nonzero(labels == -1)[0]
#print(background_indices.shape)
#print(background_indices)
# Per-feature mean of the normalized training rows belonging to that class.
mean_bg_feature_values = np.average(normalized[background_indices], axis=0)
print(mean_bg_feature_values)
#print(mean_bg_feature_values.shape)

[-0.03609533 -0.00357137 -0.00671681 -0.11847695 -0.0338104   0.15032661
  0.00036677 -0.01377585 -0.12626289 -0.12675752 -0.00861636  0.03378346
 -0.04081777  0.00088387 -0.01811258 -0.03407828  0.03015419 -0.21782664
  0.03331272  0.03363522 -0.0126518   0.04952507 -0.00469233  0.01596474
  0.15167207  0.12178815 -0.00886381 -0.11596197  0.04598214  0.02981902
  0.02276057  0.11435599  0.0141396  -0.01160078  0.0955404   0.0064516
  0.02386877  0.11207091 -0.21993314  0.09505615]


In [13]:
# Sanity check: after normalization every training feature should have stdev 1.
print(np.std(normalized, axis=0))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.]


In [14]:
# NOTE(review): `sklearn.externals.joblib` only exists in older scikit-learn
# releases; newer ones require the standalone `joblib` package -- confirm the
# environment this notebook is meant to run in.
from sklearn.externals import joblib
# Load the previously trained classifier from disk. joblib.load unpickles the
# file, so only load model files from a trusted source.
classifier = joblib.load('GaussianClassifier_Model10_ProbTrue_Madelon_20Best_20Random_C1_G0_1.pkl')

In [15]:
def get_scores_from_explanation(exp, label=1, num_features=None):
    """Return an explanation's feature weights as a dense array ordered by feature index.

    Parameters
    ----------
    exp : object
        Explanation exposing ``as_map()``, which must return a mapping from a
        label to a sequence of ``(feature_index, weight)`` pairs.
    label : hashable, optional
        Key of the entry of ``as_map()`` to read. Defaults to 1, the key this
        function has always used.
    num_features : int, optional
        Length of the returned array. Defaults to the number of pairs in the
        explanation, which is only large enough when the explanation lists
        every feature.

    Returns
    -------
    numpy.ndarray
        1-D float array in which position ``i`` holds the weight of feature
        ``i``. Features the explanation does not mention are 0.

    Raises
    ------
    KeyError
        If ``label`` is not present in the explanation map.
    IndexError
        If a feature index does not fit in the returned array.
    """
    pairs = exp.as_map()[label]
    size = len(pairs) if num_features is None else num_features
    # Zero-filled (not np.empty) so features missing from the explanation get
    # a defined weight instead of uninitialized memory.
    scores = np.zeros(size)
    if len(pairs) > 0:
        indices, weights = zip(*pairs)
        # Force an integer dtype: numpy rejects float arrays as indices.
        scores[np.asarray(indices, dtype=np.intp)] = weights
    return scores

In [16]:
import lime
import lime.lime_tabular
# Explain every validation instance with LIME and collect the per-feature
# weights into `importance_scores`, one row per instance and one column per
# feature. The whole loop is timed.
start = time.time()
explainer = lime.lime_tabular.LimeTabularExplainer(normalized, feature_names=features_list)
# Take both dimensions from the data instead of hard-coding the feature count.
n_instances, n_features = normalized_validation.shape
# Preallocate the result: growing an array with np.append inside the loop
# re-copies everything collected so far on every iteration.
importance_scores = np.empty((n_instances, n_features))
for i in range(n_instances):
    # Ask for a weight for every feature so each row is fully populated.
    exp = explainer.explain_instance(normalized_validation[i], classifier.predict_proba, num_features=n_features)
    next_score = get_scores_from_explanation(exp)
    if (i % 30 == 0):
        print("In iteration ", i, " Got imp score ", next_score)
    importance_scores[i] = next_score
print("Importance scores computed in:", round(time.time() - start, 2), "s")

In iteration  0  Got imp score  [-0.01693925 -0.00362951 -0.00036468 -0.03061242 -0.00532423  0.00742123
 -0.00362864 -0.01914619 -0.00840506 -0.00873837  0.00640449  0.00345308
 -0.00502058  0.00166263 -0.00419897 -0.00186197 -0.00553322  0.01531226
  0.01609466  0.00415045 -0.0093911  -0.02029574 -0.00265662 -0.02438639
  0.00316419 -0.00749885  0.00209497 -0.03287207 -0.01335823  0.00034583
 -0.00889214  0.00274447 -0.00093281  0.00507974  0.00774107 -0.00046204
 -0.00235933  0.00210832  0.00933171  0.01173852]
In iteration  30  Got imp score  [-0.01961756  0.00680933 -0.00068598  0.02842665 -0.00332883  0.01065748
 -0.01830833 -0.02049413  0.0253819   0.02150715 -0.01158708 -0.00822915
 -0.00818258 -0.00473821 -0.00504931  0.00218959  0.00246252 -0.01638784
 -0.00078369 -0.00610778 -0.00350146  0.01915608  0.00245329  0.00949228
  0.0108469   0.02481458  0.00127959  0.02947773  0.01131364 -0.00105029
  0.0041225   0.02157163 -0.00504146  0.00386195 -0.01690236  0.00544813
 -0.00222

In [17]:
# Aggregate importance per feature: sum of absolute LIME weights over all
# explained instances (rows), giving one value per feature column.
summed = np.sum(np.absolute(importance_scores), axis=0)

In [18]:
# Column positions ordered from smallest to largest aggregate importance
# (np.argsort sorts ascending), so the most important feature comes last.
sorted_indices = np.argsort(summed)
print(sorted_indices)

[29 15 35  2 36 32 13  1 14 22 33 11 16 26 12 20  4 19 28 18 30 10 39  6 34
 23 21  7  0 31  5 37  8 24 25  9 27  3 17 38]


In [19]:
# Translate column positions back to the feature ids stored in features_list,
# keeping the ascending-importance order.
important_features_ascending = np.array(features_list)[sorted_indices]
print(important_features_ascending)
# The last k entries are the k features with the largest aggregate importance.
top_10_feature_set = set(important_features_ascending[-10:].flat)
top_20_feature_set = set(important_features_ascending[-20:].flat)
print(top_10_feature_set)
print(top_20_feature_set)

[434 208 456  30 464 451 170  29 187 319 452 154 218 352 164 285  54 282
 426 256 442 134 494  98 454 335 287 105  27 443  65 473 106 337 339 129
 379  49 242 476]
set([129, 49, 65, 106, 337, 242, 339, 473, 379, 476])
set([65, 98, 443, 134, 454, 129, 379, 105, 106, 494, 335, 49, 337, 242, 339, 473, 442, 27, 476, 287])


In [20]:
# Reference feature ids to compare the LIME ranking against (the same list as
# the second commented-out `relevant_features` candidate earlier in the notebook).
original_feature_set = set(np.array([
    29, 49, 65, 106, 129, 154, 242, 282, 319, 337,
    339, 379, 434, 443, 452, 454, 456, 473, 476, 494
]).flat)
# Count how many of the top-k features also appear in the reference set.
print("Number in top 10 ", len(top_10_feature_set & original_feature_set))
print("Number in top 20 ", len(top_20_feature_set & original_feature_set))

Number in top 10  10
Number in top 20  13
