In [1]:
from __future__ import division, absolute_import, print_function
%matplotlib inline

In [2]:
import time
import numpy as np
import random

In [3]:
import os
# Restrict CUDA device visibility to device 0 for everything started from this kernel.
os.environ.update({'CUDA_VISIBLE_DEVICES': "0"})

In [4]:
!echo $CUDA_VISIBLE_DEVICES

0


### Fix random seeds here (comment out the next cell for fully random, non-reproducible runs)

In [5]:
# Seed Python's and NumPy's global generators so reruns draw the same numbers.
random.seed(1001)   # Python's built-in `random` module
np.random.seed(0)   # NumPy's legacy global generator

## Get the data

This is the data from the MADELON dataset http://archive.ics.uci.edu/ml/datasets/madelon

In [6]:
!ls dataset/MADELON

madelon.param	   madelon_train.data	 madelon_valid.data
madelon_test.data  madelon_train.labels


In [7]:
# Read the training and validation feature tables as int32 matrices
# (one row per sample, one column per feature).
data, validation_data = (
    np.loadtxt(path, dtype=np.int32)
    for path in ("dataset/MADELON/madelon_train.data",
                 "dataset/MADELON/madelon_valid.data")
)
print(data.shape)
print(validation_data.shape)

(2000, 500)
(600, 500)


### Subset the data to exclude irrelevant features

In [8]:
#relevant_features = np.array([339,443,473,49,379,476,242,106,319,29,452,434,129,282,454,154,337,65,494])
#relevant_features = np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494])
#relevant_features = np.array([18, 29, 45, 49, 65, 78, 80, 106, 122, 129, 136, 148, 154, 155, 189, 242, 254, 282, 302, 315, 319, 323, 337, 339, 366, 367, 379, 402, 403, 428, 434, 438, 443, 452, 454, 456, 470, 473, 476, 494])
# Use the feature ids stored at index 9 of the saved model array.
all_models = np.load("TenMadelonModels.npy")
# The stored ids are 1-indexed; shift them to 0-indexed column positions.
relevant_features = np.array(all_models[9]) - 1
# Keep the unshifted ids as a plain list; later cells use them as feature names.
features_list = all_models[9].tolist()
# NOTE: `data` and `validation_data` are overwritten with their column subsets,
# so this cell is meant to run once per kernel session.
data, validation_data = data[:, relevant_features], validation_data[:, relevant_features]
print(data.shape)
print(validation_data)

(2000, 40)
[[454 490 510 ..., 454 538 259]
 [483 491 490 ..., 439 429 534]
 [457 479 474 ..., 494 517 495]
 ..., 
 [474 504 505 ..., 417 514 368]
 [461 468 490 ..., 501 521 638]
 [513 491 496 ..., 489 398 694]]


In [9]:
# Read the label vectors as int32. The validation labels are read from
# `dataset/` rather than `dataset/MADELON/`; the directory listing earlier in
# the notebook shows no `madelon_valid.labels` inside `dataset/MADELON`.
labels, validation_labels = (
    np.loadtxt(path, dtype=np.int32)
    for path in ("dataset/MADELON/madelon_train.labels",
                 "dataset/madelon_valid.labels")
)
print(labels.shape)
print(labels)

(2000,)
[-1 -1 -1 ..., -1  1  1]


### Normalize the data using the per-feature average and stdev

In [10]:
def get_average_and_stdev(input_array):
    """Compute per-column statistics of ``input_array``.

    Both statistics are reduced along axis 0, so for a
    ``(n_samples, n_features)`` matrix each result has one entry per feature.

    Returns:
        A ``(average, stdev)`` tuple.
    """
    column_average = np.average(input_array, axis=0)
    column_stdev = np.std(input_array, axis=0)
    return column_average, column_stdev

In [11]:
def normalize(input_array, avg, stdev):
    """Standardize ``input_array`` as ``(input_array - avg) / stdev``.

    Args:
        input_array: array-like of values to standardize, typically
            ``(n_samples, n_features)``.
        avg: value(s) to subtract. May be a scalar or any array-like; arrays
            get a leading axis of length 1 before broadcasting, exactly as the
            ``avg[None, :]`` form did for a 1-D vector.
        stdev: value(s) to divide by. May be a scalar or any array-like.

    Returns:
        The standardized array. Entries whose ``stdev`` is exactly zero
        (constant features) are divided by 1 instead, so they come out as
        the centred value rather than NaN/inf.
    """
    values = np.asarray(input_array)
    center = np.asarray(avg)
    if center.ndim >= 1:
        # Same broadcast shape the original `avg[None, :]` produced.
        center = center[None, ...]
    scale = np.asarray(stdev)
    # A zero spread would turn the whole column into NaN/inf; fall back to 1.
    safe_scale = np.where(scale == 0, 1, scale)
    return (values - center) / safe_scale

#### Note that the validation data is normalized using the average and stdev computed from the training data

In [12]:
# Statistics come from the training matrix only and are reused for validation.
avg, stdev = get_average_and_stdev(data)
normalized, normalized_validation = (
    normalize(table, avg, stdev) for table in (data, validation_data)
)
# Rows of the validation set labelled +1, already normalized.
validation_pos_indices = np.flatnonzero(validation_labels == 1)
normalized_validation_pos = normalized_validation[validation_pos_indices]
print(avg.shape)
print(normalized.shape)
print(normalized_validation.shape)

(40,)
(2000, 40)
(600, 40)


In [13]:
# Mean normalized feature vector over the training rows labelled -1.
background_indices = np.flatnonzero(labels == -1)
mean_bg_feature_values = normalized[background_indices].mean(axis=0)
print(mean_bg_feature_values)

[-0.03609533 -0.00357137 -0.00671681 -0.11847695 -0.0338104   0.15032661
  0.00036677 -0.01377585 -0.12626289 -0.12675752 -0.00861636  0.03378346
 -0.04081777  0.00088387 -0.01811258 -0.03407828  0.03015419 -0.21782664
  0.03331272  0.03363522 -0.0126518   0.04952507 -0.00469233  0.01596474
  0.15167207  0.12178815 -0.00886381 -0.11596197  0.04598214  0.02981902
  0.02276057  0.11435599  0.0141396  -0.01160078  0.0955404   0.0064516
  0.02386877  0.11207091 -0.21993314  0.09505615]


In [14]:
# Sanity check: print the per-column standard deviation of the normalized training data.
print(normalized.std(axis=0))

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.]


In [15]:
# `sklearn.externals.joblib` only exists in older scikit-learn releases; fall
# back to the standalone `joblib` package when it is no longer bundled.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
classifier = joblib.load('GaussianClassifier_Model10_ProbTrue_Madelon_20Best_20Random_C1_G0_1.pkl')

In [16]:
def get_scores_from_explanation(exp, label=1, num_features=None):
    """Turn an explanation into a dense score vector indexed by feature position.

    Args:
        exp: object whose ``as_map()`` returns a mapping from label to a list
            of ``(feature_index, weight)`` pairs.
        label: key of the ``as_map()`` entry to read. Defaults to 1, the value
            that was previously hard-coded.
        num_features: length of the returned vector. When omitted it is the
            larger of the number of pairs and ``max(feature_index) + 1``, so an
            explanation that covers only some features can no longer index
            past the end of the output.

    Returns:
        1-D float array where position ``feature_index`` holds that feature's
        weight. Features absent from the explanation are 0.0 (previously they
        were left as uninitialised memory).
    """
    scoreslist = exp.as_map()[label]
    # An explicit integer dtype keeps the indexing valid for an empty explanation.
    indices = np.array([pair[0] for pair in scoreslist], dtype=np.intp)
    values = np.array([pair[1] for pair in scoreslist], dtype=float)
    if num_features is None:
        highest = int(indices.max()) + 1 if indices.size else 0
        num_features = max(len(scoreslist), highest)
    returnable = np.zeros(num_features)
    returnable[indices] = values
    return returnable

In [17]:
import lime
import lime.lime_tabular

# Explain every validation row with LIME and stack the per-feature weights
# into an (n_instances, n_features) matrix.
start = time.time()
explainer = lime.lime_tabular.LimeTabularExplainer(normalized, feature_names=features_list)
# Take the dimensions from the data instead of hard-coding the feature count.
n_instances, n_features = normalized_validation.shape
# Collect rows in a list; np.append inside the loop re-copied the whole
# accumulated array on every iteration.
score_rows = []
for i in range(n_instances):
    exp = explainer.explain_instance(normalized_validation[i], classifier.predict_proba,
                                     num_features=n_features, num_samples=100)
    next_score = get_scores_from_explanation(exp)
    if (i % 30 == 0):
        print("In iteration ", i, " Got imp score ", next_score)
    score_rows.append(next_score)
# The reshape keeps the (0, n_features) shape even when there are no rows.
importance_scores = np.asarray(score_rows, dtype=float).reshape(n_instances, n_features)
# Disabled alternative: scale each weight by the row's deviation from the mean background values.
#importance_scores = np.multiply(importance_scores.reshape(normalized_validation.shape[0], 40), normalized_validation - mean_bg_feature_values[None,:]) 
print("Importance scores computed in:", round(time.time() - start, 2), "s")

In iteration  0  Got imp score  [-0.00999705  0.009352    0.01243736 -0.03024458  0.00289736 -0.01019548
 -0.00247993  0.01107845 -0.0167779  -0.03087184  0.03488663  0.00032883
 -0.02473095 -0.00252216 -0.00405525  0.01976385 -0.04108559 -0.01790256
 -0.00402301  0.00854379 -0.02848948  0.0113931  -0.01217289 -0.06175826
  0.02001772 -0.02227115 -0.03158633 -0.04184051 -0.03929972  0.02435989
 -0.00838413 -0.01350088 -0.02444156  0.02952744 -0.01859225 -0.02027465
 -0.01761195  0.00713297  0.05810955  0.04965255]
In iteration  30  Got imp score  [-0.04199781 -0.02389431 -0.02006494  0.04274236  0.01176213 -0.03560963
 -0.00304001 -0.05867874  0.00928854  0.04414736 -0.04185279 -0.00816406
  0.00607156 -0.02283954  0.01376737 -0.01192572  0.00879736 -0.01142937
 -0.03562592 -0.01126067  0.00659652  0.01359022  0.00809469  0.01303306
 -0.02241622 -0.04942015  0.03496433  0.00840376 -0.01810821 -0.00848581
  0.00057604 -0.0213233  -0.01781949 -0.04071659 -0.02132292 -0.03258524
 -0.02454

In [18]:
# Total absolute importance of each feature, accumulated over all explained rows.
summed = np.abs(importance_scores).sum(axis=0)

In [19]:
# Column positions ordered from least to most total importance.
sorted_indices = summed.argsort()
print(sorted_indices)

[14 22 35  2 15 33 36  1 11 20 18  4 12 16 29 13 26  6 32 30 19 10 39 34 28
 21 23  7 31 37  5  0  8  9 24 27 25  3 17 38]


In [20]:
# Map the sorted column positions back to the feature ids in `features_list`;
# the most important features end up at the tail of the array.
important_features_ascending = np.array(features_list)[sorted_indices]
print(important_features_ascending)
np.save("LIME_M10_N100_imp_feat_asc", important_features_ascending)
# The last 10 / 20 entries are the highest-ranked features.
top_10_feature_set, top_20_feature_set = (
    set(important_features_ascending[-count:].flat) for count in (10, 20)
)
print(top_10_feature_set)
print(top_20_feature_set)

[187 319 456  30 208 452 464  29 154 285 256  54 164 218 434 170 352  98
 451 442 282 134 494 454 426 287 335 105 443 473  65  27 106 129 337 379
 339  49 242 476]
set([65, 27, 49, 129, 106, 337, 242, 339, 379, 476])
set([65, 27, 134, 454, 129, 379, 105, 426, 494, 335, 49, 337, 242, 339, 473, 282, 443, 476, 106, 287])


In [21]:
# Reference set of 20 feature ids (presumably the truly informative MADELON
# features -- confirm against the dataset documentation).
original_feature_set = set(np.array([ 29, 49, 65, 106, 129, 154, 242, 282, 319, 337, 339, 379, 434, 443, 452, 454, 456, 473, 476, 494]).flat)
# Count how many of the top-ranked features belong to the reference set.
print("Number in top 10 ", len(top_10_feature_set & original_feature_set))
print("Number in top 20 ", len(top_20_feature_set & original_feature_set))

Number in top 10  9
Number in top 20  14
