### Skope rules 

#### Skope rules exploits sklearn externals. There is a bug with last version of sklearn, use an older one.

In [1]:
import sklearn
from skrules import SkopeRules
import pickle



In [2]:
nome = 'adult'
title = "../datasets/train_set_"+nome+"_strat.p"
train = open(title,"rb")
train_set = pickle.load(train)
train_set.pop('fnlwgt')
title = "../datasets/train_label_"+nome+"_strat.p"
train_l = open(title,"rb")
train_label = pickle.load(train_l)
title = "../datasets/test_set_" + nome + "_strat.p"
test = open(title, "rb")
test_set = pickle.load(test)
title = "../datasets/test_label_" + nome + "_strat.p"
test_l = open(title, "rb")
test_label = pickle.load(test_l)

#### Skope-rules allows for the extraction of rules on the dataset, before the model. In the following, we define the SkopeRules model and then we fit it, obtaining the sets of rules to describe the dataset. 

In [3]:
feature_names = train_set.columns

##### There are several parameters to set: in particular, the min precision and recall have to be set.

In [4]:
clf = SkopeRules(max_depth_duplication=10,
                 max_depth=10,
                 n_estimators=50,
                 precision_min=0.40,
                 recall_min=0.10,
                 feature_names=feature_names)

In [5]:
import time
start = time.time()
for idx in range(0,1):
    X, y = train_set.values, train_label
    clf.fit(X, y == idx)
end = time.time()
print('Time to fit the model ', end - start)    

Time to fit the model  320.41760993003845


In [6]:
clf

SkopeRules(bootstrap=False, bootstrap_features=False,
           feature_names=Index(['age', 'workclass', 'education-num', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
       'hours-per-week', 'native-country'],
      dtype='object'),
           max_depth=10, max_depth_duplication=10, max_features=1.0,
           max_samples=0.8, max_samples_features=1.0, min_samples_split=2,
           n_estimators=50, n_jobs=1, precision_min=0.4, random_state=None,
           recall_min=0.1, verbose=0)

In [7]:
clf.rules_[1]

('age <= 28.5 and hours-per-week <= 45.5 and education-num <= 12.5 and marital-status > 1.5 and occupation > 1.5 and relationship > 2.5 and capital-gain <= 4718.5 and capital-loss <= 2218.5',
 (0.998687431217566, 0.21066967208102388, 2))

#### We can apply the SkopeRules model for each class, obtaining a set of rules for each class. In this way, we can have a better understanding of the classes.

In [8]:
i_cluster = 0
for i_cluster in range(2):
    X_train = train_set.values
    y_train = (train_label==i_cluster)*1
    skope_rules_clf = SkopeRules(max_depth_duplication=10,
                 max_depth=10,
                 n_estimators=20,
                 precision_min=0.40,
                 recall_min=0.10,
                 feature_names=feature_names)
    skope_rules_clf.fit(X_train, y_train)
    print('Cluster '+str(i_cluster)+':')
    print(skope_rules_clf.rules_[0:3])

Cluster 0:
[('age <= 28.5 and workclass <= 3.5 and hours-per-week <= 45.5 and education-num <= 12.5 and marital-status > 1.5 and relationship > 2.5 and race <= 3.5 and capital-gain <= 7073.5 and capital-loss <= 2218.5', (0.9988553987027852, 0.20687475306203082, 1)), ('age <= 28.5 and workclass <= 3.5 and hours-per-week <= 45.5 and education-num <= 12.5 and marital-status > 1.5 and occupation > 1.5 and relationship > 2.5 and capital-gain <= 4718.5 and capital-loss <= 2218.5', (0.9988349514563106, 0.20315955766192734, 1)), ('age <= 28.5 and hours-per-week <= 44.5 and education-num <= 12.5 and marital-status > 1.5 and occupation > 1.5 and relationship > 2.5 and race <= 3.5 and capital-gain <= 7073.5 and capital-loss <= 2218.5', (0.9988184324537219, 0.20087128712871288, 1))]
Cluster 1:
[('age <= 69.0 and age > 31.5 and hours-per-week > 26.5 and education-num > 12.5 and occupation <= 3.5 and relationship <= 2.5 and capital-gain <= 4447.0 and capital-loss <= 742.5', (0.7622080679405521, 0.17

SkopeRules can be employed also as an explanation method: in this case, the labels passed to the SkopeRule model are the labels predicted by the black box model 

In [9]:
clf = SkopeRules(max_depth_duplication=None,
                 n_estimators=50,
                 max_depth=30,
                 precision_min=0.40,
                 recall_min=0.10,
                 feature_names=feature_names)

In [10]:
from xgboost import XGBClassifier
bb = XGBClassifier(C= 1, penalty='l2', objective='binary:logistic', seed = 42, bootstrap=True, max_depth=90,learning_rate=0.1, n_estimators=500, tree_method='auto')
bb.fit(train_set.values, train_label.values)
y_score = bb.predict(train_set.values)



Parameters: { C, bootstrap, penalty } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [11]:
clf.fit(train_set, y_score)

KeyboardInterrupt: 

In [None]:
clf.rules_

In [None]:
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt
precision, recall, _ = precision_recall_curve(test_label, y_score)
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall curve')
plt.show()

In [None]:
precision

In [None]:
recall

In [None]:
# Get number of survival rules created
print(str(len(clf.rules_)) + ' rules have been built with ' +
      'SkopeRules.\n')
print('The 4 most performing "Adult rules" are the following one:\n')
for i_rule, rule in enumerate(clf.rules_[:4]):
    print(rule[0])

In [None]:
import numpy as np
import pandas as pd
def compute_y_pred_from_query(X, rule):
    score = np.zeros(X.shape[0])
    X = X.reset_index(drop=True)
    score[list(X.query(rule).index)] = 1
    return(score)

def compute_performances_from_y_pred(y_true, y_pred, index_name='default_index'):
    df = pd.DataFrame(data=
        {
            'precision':[sum(y_true * y_pred)/sum(y_pred)],
            'recall':[sum(y_true * y_pred)/sum(y_true)]
        },
        index=[index_name],
        columns=['precision', 'recall']
    )
    return(df)

def compute_train_test_query_performances(X_train, y_train, X_test, y_test, rule):
    
    y_train_pred = compute_y_pred_from_query(X_train, rule)
    y_test_pred = compute_y_pred_from_query(X_test, rule)
    
    performances = None
    performances = pd.concat([
        performances,
        compute_performances_from_y_pred(y_train, y_train_pred, 'train_set')],
        axis=0)
    performances = pd.concat([
        performances,
        compute_performances_from_y_pred(y_test, y_test_pred, 'test_set')],
        axis=0)
            
    return(performances)


for i in range(4):
    print('Rule '+str(i+1)+':')
    display(compute_train_test_query_performances(train_set, train_label,
                                                  test_set, test_label,
                                                  clf.rules_[i][0])
           )


In [None]:
train_set.columns

In [None]:
test_set.columns

In [None]:
clf.rules_[4]

In [None]:
n_rule_chosen = 6
y_pred = clf.predict_top_rules(test_set, n_rule_chosen)

print('The performances reached with '+str(n_rule_chosen)+' discovered rules are the following:')
compute_performances_from_y_pred(test_label, y_pred, 'test_set')