In [None]:
from __future__ import print_function
import numpy as np
np.random.seed(1)
import sys
import sklearn
import sklearn.ensemble
import sklearn.metrics
%load_ext autoreload
%autoreload 2
from anchor import utils
from anchor import anchor_tabular

## Preparing Graduate Admissions Dataset

In [None]:
import pandas as pd

In [None]:
# Read the raw graduate-admissions CSV into a DataFrame and preview its first rows.
raw_csv_path = './data/graduate-admissions/Admission_Predict_Ver1.1.csv'
df = pd.read_csv(raw_csv_path)
df.head()

### The dataset has no class labels, so we create a binary target: `Admit` = 1 when the chance of admission is >= 0.5, otherwise 0

In [None]:
# Binarise the continuous admission chance into a class label:
# Admit = 1 when the chance is >= 0.5, otherwise 0.
# A vectorised comparison replaces the per-row lambda passed to .apply();
# the resulting 0/1 values are the same.
# Note that the source column name 'Chance of Admit ' carries a trailing space.
df['Admit'] = (df['Chance of Admit '] >= 0.5).astype(int)
df.head()

In [None]:
# Write the frame to disk without the index column; this file is the one
# passed to utils.load_csv_dataset further down the notebook.
labelled_csv_path = './data/graduate-admissions/admissions.csv'
df.to_csv(labelled_csv_path, index=False)

In [None]:
# Display the column labels of df.
df.columns

### Loading the dataset
This dataset is about predicting whether an applicant is likely to be admitted to a graduate program (i.e. whether the chance of admission is at least 0.5).

In [None]:
# Column selection passed to utils.load_csv_dataset, given as integer positions.
# NOTE(review): the positions are assumed to follow the order of df.columns
# (and therefore of the CSV written from df) - confirm if the columns change.
feature_names = df.columns

# Position of the label column; presumably the 'Admit' column appended to df.
target_idx = 9

# Positions 1 through 7 are used as model inputs.
features_to_use = list(range(1, 8))

# Of those inputs, positions 3 and 7 are declared categorical.
categorical_features = [3, 7]

In [None]:
# Load the labelled CSV through anchor's helper using the column configuration above.
# skip_first=True presumably drops the header row, since the feature names are
# supplied explicitly - confirm against utils.load_csv_dataset.
dataset = utils.load_csv_dataset(
    './data/graduate-admissions/admissions.csv',
    target_idx,
    feature_names=feature_names,
    categorical_features=categorical_features,
    features_to_use=features_to_use,
    skip_first=True)

In [None]:
# Build the tabular anchor explainer from the dataset's metadata and full data,
# then fit it on the training and validation splits.
explainer = anchor_tabular.AnchorTabularExplainer(
    dataset.class_names,
    dataset.feature_names,
    dataset.data,
    dataset.categorical_names)
explainer.fit(
    dataset.train, dataset.labels_train,
    dataset.validation, dataset.labels_validation)

In [None]:
# Display the shape of the training split.
dataset.train.shape

In [None]:
# Summarise the training labels as (distinct label values, count of each)
# instead of dumping the whole label vector; this also shows the class balance.
np.unique(dataset.labels_train, return_counts=True)

In [None]:
# Train the black-box classifier that the anchor explanations will describe.
# random_state is fixed so that re-running this cell on its own rebuilds the
# same forest, instead of depending on the current state of NumPy's global RNG.
c = sklearn.ensemble.RandomForestClassifier(n_estimators=50, n_jobs=5, random_state=1)
c.fit(dataset.train, dataset.labels_train)

# Prediction callable used throughout the rest of the notebook; the bound
# method has the same call signature as the former lambda wrapper.
predict_fn = c.predict

# Accuracy on the training split and on the held-out test split.
print('Train', sklearn.metrics.accuracy_score(dataset.labels_train, predict_fn(dataset.train)))
print('Test', sklearn.metrics.accuracy_score(dataset.labels_test, predict_fn(dataset.test)))

### Getting an anchor
Below, we get an anchor for prediction number 0. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance.

In [None]:
# Explain the model's prediction for a single test instance.
idx = 0
# Reseed NumPy's global RNG immediately before requesting the explanation.
np.random.seed(1)
instance = dataset.test[idx]
# predict_fn expects a 2-D array, so the single row is reshaped to (1, n_features).
predicted_class = predict_fn(instance.reshape(1, -1))[0]
print('Prediction: ', explainer.class_names[predicted_class])
exp = explainer.explain_instance(instance, c.predict, threshold=0.95)

In [None]:
# Report the anchor's clauses together with its reported precision and coverage.
anchor_rule = ' AND '.join(exp.names())
anchor_precision = exp.precision()
anchor_coverage = exp.coverage()
print('Anchor: %s' % anchor_rule)
print('Precision: %.2f' % anchor_precision)
print('Coverage: %.2f' % anchor_coverage)

Note that we set the threshold to 0.95, so we guarantee (with high probability) that precision will be above 0.95 - that is, that predictions on instances where the anchor holds will be the same as the original prediction at least 95% of the time. Let's try it out on the test set.

In [None]:
# Get test examples where the anchor applies
# NOTE(review): rows are matched by exact equality with the explained instance
# on the anchor's feature columns. That corresponds to "the anchor holds" only
# if those columns are categorical/discretized in dataset.test - confirm.
anchor_cols = exp.features()
applies = np.all(dataset.test[:, anchor_cols] == dataset.test[idx][anchor_cols], axis=1)
fit_anchor = np.where(applies)[0]

# Prediction for the explained instance, used as the reference label.
reference_pred = predict_fn(dataset.test[idx].reshape(1, -1))

# Coverage: share of test rows that match; precision: share of the matching
# rows whose prediction equals the reference prediction.
print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(dataset.test.shape[0])))
print('Anchor test precision: %.2f' % (np.mean(predict_fn(dataset.test[fit_anchor]) == reference_pred)))

### Looking at a partial anchor
You can look at just part of the anchor - for example, the first two clauses. Note how these do not have enough precision, which is why the explainer added a third one

In [None]:
# Report the clauses, precision and coverage of the partial anchor at partial index 1.
partial_rule = ' AND '.join(exp.names(1))
partial_precision = exp.precision(1)
partial_coverage = exp.coverage(1)
print('Partial anchor: %s' % partial_rule)
print('Partial precision: %.2f' % partial_precision)
print('Partial coverage: %.2f' % partial_coverage)


In [None]:
# Test rows that agree with the explained instance on the partial anchor's columns.
# NOTE(review): as with the full anchor, exact equality on raw values assumes
# the involved columns are categorical/discretized - confirm.
partial_cols = exp.features(1)
partial_applies = np.all(dataset.test[:, partial_cols] == dataset.test[idx][partial_cols], axis=1)
fit_partial = np.where(partial_applies)[0]

# Prediction for the explained instance, used as the reference label.
partial_reference_pred = predict_fn(dataset.test[idx].reshape(1, -1))

print('Partial anchor test precision: %.2f' % (np.mean(predict_fn(dataset.test[fit_partial]) == partial_reference_pred)))
print('Partial anchor test coverage: %.2f' % (fit_partial.shape[0] / float(dataset.test.shape[0])))


## See a visualization of the anchor with examples, etc. (won't work if you're viewing this on GitHub)

In [None]:
# Render the explanation inline using the explanation object's notebook display method.
exp.show_in_notebook()