In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import numpy as np
import random

### Fix the random seeds here (comment out the next cell to get fully random runs)

In [3]:
# Seed both NumPy's global RNG and the stdlib `random` module with fixed values
# so that repeated runs start from the same generator state.
np.random.seed(0)
random.seed(1001)

In [4]:
import time
import shap

Using TensorFlow backend.


In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

## Get the data

In [6]:
!echo $CUDA_VISIBLE_DEVICES

0


This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [7]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [8]:
# Read the training and validation feature matrices as int32 arrays, in that
# order, and report their (rows, columns) shapes.
data, validation_data = (
    np.loadtxt(matrix_path, dtype=np.int32)
    for matrix_path in (
        "dataset/MADELON/madelon_train.data",
        "dataset/MADELON/madelon_valid.data",
    )
)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


### Subset the data to exclude irrelevant features

In [9]:
# Earlier hard-coded feature selections, kept for reference (currently unused):
#relevant_features = np.array([339,443,473,49,379,476,242,106,319,29,452,434,129,282,454,154,337,65,494])
#relevant_features = np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494])
#relevant_features = np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494])

# Load the stored feature selections and use the entry at index 9 as the
# column subset for both the training and validation matrices.
all_models = np.load("TenMadelonModels.npy")
relevant_features = np.array(all_models[9])
relevant_features = relevant_features - 1 # Convert 1 indexed to 0 indexed
# NOTE(review): `data` and `validation_data` are re-bound to their own column
# subsets here, so this cell is not idempotent -- re-running it without first
# re-running the loading cell indexes the already-reduced arrays.
data = data[:, relevant_features]
validation_data = validation_data[:, relevant_features]
print(data.shape)
# Prints the validation matrix contents (not its shape).
print(validation_data)

(2000, 40)
[[454 490 510 ..., 454 538 259]
 [483 491 490 ..., 439 429 534]
 [457 479 474 ..., 494 517 495]
 ..., 
 [474 504 505 ..., 417 514 368]
 [461 468 490 ..., 501 521 638]
 [513 491 496 ..., 489 398 694]]


In [10]:
# Class labels for the training split, read as int32 (the recorded output
# shows the values -1 and 1).
labels = np.loadtxt("dataset/MADELON/madelon_train.labels", dtype=np.int32)
# NOTE(review): the validation labels are read from "dataset/", not from
# "dataset/MADELON/" like the other files; the directory listing recorded
# earlier shows no validation labels under dataset/MADELON -- confirm this
# location is intentional.
validation_labels= np.loadtxt("dataset/madelon_valid.labels", dtype=np.int32)
print(labels.shape)
print(labels)

(2000,)
[-1 -1 -1 ..., -1  1  1]


### Normalize the data using the per-feature average and stdev

In [11]:
def get_average_and_stdev(input_array):
    """Compute per-column statistics of ``input_array``.

    Both statistics are reduced along axis 0, so for a 2-D
    ``(samples, features)`` input each result has one entry per feature.

    Returns:
        A ``(average, stdev)`` tuple, where ``stdev`` is NumPy's default
        (population, ``ddof=0``) standard deviation.
    """
    column_average = np.average(input_array, axis=0)
    column_stdev = np.std(input_array, axis=0)
    return column_average, column_stdev

In [12]:
def normalize(input_array, avg, stdev):
    """Standardize ``input_array`` column-wise with precomputed statistics.

    Args:
        input_array: 2-D array of shape ``(samples, features)``.
        avg: 1-D array of per-feature averages to subtract.
        stdev: per-feature standard deviations to divide by.

    Returns:
        Array of the same shape as ``input_array`` holding
        ``(input_array - avg) / stdev``. Features whose ``stdev`` is exactly
        zero are only centred (divided by 1), so a constant column maps to
        finite values instead of NaN/inf.
    """
    avgremoved = input_array - avg[None, :]
    # A constant feature has stdev == 0; dividing by it would produce NaN
    # (0/0) or inf and poison everything computed downstream. Substitute 1
    # for those entries so the column is centred but otherwise left alone.
    safe_stdev = np.where(stdev == 0, 1.0, stdev)
    return avgremoved / safe_stdev

#### Note that the validation data is normalized using the average and stdev computed from the training data

In [13]:
# Fit the scaling statistics on the training matrix only, then reuse the same
# statistics for the validation matrix.
avg, stdev = get_average_and_stdev(data)
normalized = normalize(data, avg, stdev)
normalized_validation = normalize(validation_data, avg, stdev)

# Row positions of the validation samples labelled +1 and their normalized
# rows (normalization is element-wise, so slicing after scaling gives the
# same values as scaling the slice).
validation_pos_indices = np.flatnonzero(validation_labels == 1)
normalized_validation_pos = normalized_validation[validation_pos_indices]

print(avg.shape)
print(normalized.shape)
print(normalized_validation.shape)

(40,)
(2000, 40)
(600, 40)


In [14]:
# Mean normalized feature vector over the training samples labelled -1; the
# explainer cell below uses it as its single background row.
background_indices = np.flatnonzero(labels == -1)
mean_bg_feature_values = normalized[background_indices].mean(axis=0)
print(mean_bg_feature_values)

[-0.03609533 -0.00357137 -0.00671681 -0.11847695 -0.0338104   0.15032661
  0.00036677 -0.01377585 -0.12626289 -0.12675752 -0.00861636  0.03378346
 -0.04081777  0.00088387 -0.01811258 -0.03407828  0.03015419 -0.21782664
  0.03331272  0.03363522 -0.0126518   0.04952507 -0.00469233  0.01596474
  0.15167207  0.12178815 -0.00886381 -0.11596197  0.04598214  0.02981902
  0.02276057  0.11435599  0.0141396  -0.01160078  0.0955404   0.0064516
  0.02386877  0.11207091 -0.21993314  0.09505615]


In [15]:
# Sanity check: per-column stdev of the normalized training matrix.
print(normalized.std(axis=0))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.]


In [16]:
# scikit-learn deprecated its vendored joblib (sklearn.externals.joblib) and
# later removed it. Prefer the vendored copy when it still exists, so old
# environments behave exactly as before, and fall back to the standalone
# package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
classifier = joblib.load('GaussianClassifier_Model10_Madelon_20Best_20Random_C1_G0_1.pkl')

In [17]:
# Shapley explanations relative to a specific reference point.
from shap import KernelExplainer, DenseData
# Feature ids of the selected model as a plain list; these are the original
# 1-indexed ids (the subsetting cell subtracts 1 before using them as columns).
features_list = all_models[9].tolist()
# Background data is a single row: the mean normalized feature vector of the
# training samples labelled -1, tagged with the feature ids.
bg = DenseData(np.array(mean_bg_feature_values[None,:]), features_list)
# Attributions are computed for the classifier's decision_function output.
explainer = KernelExplainer(classifier.decision_function, bg)
#explanation = explainer.explain(normalized_validation[0]).effects
import warnings
# Silence every warning raised while the attributions are computed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # nsamples=100 is forwarded to shap; presumably the number of sampled
    # feature coalitions per explained row -- confirm for the installed version.
    shap_values = explainer.shap_values(normalized_validation, nsamples=100)
# The recorded run printed (600, 41): one column more than the 40 features.
# NOTE(review): presumably the extra trailing column is the base/expected
# value -- confirm against the installed shap version.
print(shap_values.shape)
# Disabled force-plot visualisations, kept for reference:
#shap.initjs()
#shap.force_plot(shap_values[0], np.array(features_list))
#for i in range(10):
#    shap.force_plot(shap_values[i], np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))
#shap.force_plot(shap_values, np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))

100%|██████████| 600/600 [00:23<00:00, 27.44it/s]

(600, 41)





In [18]:
#shap.force_plot(shap_values[1], np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))

In [19]:
#shap.force_plot(shap_values[2], np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))

In [20]:
#shap.force_plot(shap_values[3], np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))

In [21]:
#shap.force_plot(shap_values[4], np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494]))

In [22]:
# Inspect the attribution row computed for the validation sample at index 4.
print(shap_values[4])

[ 0.         -0.05735498  0.          0.          0.          0.11904855
  0.         -0.08946861  0.          0.          0.          0.          0.0298901
  0.          0.22722251  0.          0.          0.05213477  0.
 -0.04890878  0.         -0.0210808   0.          0.          0.          0.
  0.          0.          0.0131429   0.          0.          0.          0.
  0.          0.          0.         -0.01048636  0.          0.07766478
  0.         -0.47839448]


In [23]:
# Total absolute attribution per column, accumulated over all explained rows.
summed = np.abs(shap_values).sum(axis=0)

In [24]:
# Column indices ordered from smallest to largest total absolute attribution.
sorted_indices = summed.argsort()
print(sorted_indices)

[35 36 13 12  4 15 14 16  2 32 26 28 39 34 20 22 33 18  1 29 10 11  9 30 19
  8 21  5 23 31 25 24 37  6  0  7  3 27 17 38 40]


In [25]:
# Map the ranked column indices back to feature ids (least to most important).
#
# The attribution matrix can carry a trailing column that is not a feature
# (the recorded run had 41 columns for 40 features). Dropping "the last sorted
# index" only works if that column happens to have the largest total; if it
# does not, a real feature is lost and the stray index overruns features_list.
# Filter by index value instead, which also keeps every feature when the
# matrix has no extra column at all.
feature_order = sorted_indices[sorted_indices < len(features_list)]
important_features_ascending = np.array(features_list)[feature_order]
print(important_features_ascending)
# Persist the ranking (np.save appends the ".npy" extension).
np.save("SHAP_M10_N100_imp_feat_asc", important_features_ascending)
# The most important features sit at the end of the ascending ranking.
top_10_feature_set = set(important_features_ascending[-10:].flat)
top_20_feature_set = set(important_features_ascending[-20:].flat)
print(top_10_feature_set)
print(top_20_feature_set)

[456 464 170 164  54 208 187 218  30 451 352 426 494 454 285 319 452 256
  29 434 134 154 129 442 282 106 287  65 335 443 339 337 473  98  27 105
  49 379 242 476]
set([98, 379, 49, 105, 337, 242, 339, 473, 27, 476])
set([129, 98, 443, 49, 134, 65, 379, 105, 106, 335, 27, 337, 242, 339, 473, 154, 287, 476, 282, 442])


In [26]:
# Hard-coded reference set of 20 feature ids; report how many of them appear
# among the 10 and the 20 highest-ranked features.
original_feature_set = set(np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494]).flat)
print("Number in top 10 ", len(top_10_feature_set & original_feature_set))
print("Number in top 20 ", len(top_20_feature_set & original_feature_set))

Number in top 10  7
Number in top 20  13
