### Head-to-head comparison of IREP Python implementation and sklearn's Decision Tree Classifier 

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score

In [3]:
import ruleset

def make_irep_dataset(dataset_filename, random_state=42):
    # Load df
    df = pd.read_csv(dataset_filename)
    
    # Split
    train, test = train_test_split(df, test_size=.33, random_state=random_state)
    
    return train, test

In [4]:
def make_irep(train, class_feat, pos_class, random_state=42):
    # Train
    irep_clf = ruleset.IREP(class_feat=class_feat,pos_class=pos_class)
    irep_clf.fit(train, seed=random_state, prune=True, display=False)
    
    return irep_clf

In [5]:
def score_irep(irep_clf, test, class_feat):
    X_test = test.drop(class_feat,axis=1)
    y_test = test[class_feat]

    precision = irep_clf.score(X_test, y_test, precision_score)
    recall = irep_clf.score(X_test, y_test, recall_score)
    total_conds = sum([len(rule.conds) for rule in irep_clf.ruleset.rules])
    return precision, recall, total_conds

In [6]:
from sklearn.tree import DecisionTreeClassifier

def make_tree_dataset(dataset_filename, class_feat, pos_class, n_classes, random_state=42):
    
    # Load df
    df = pd.read_csv(dataset_filename)
    
    # sklearn's Tree (of all things!) doesn't directly take categorical features.
    # One-hot preprocess the data to make it work.
    le = LabelEncoder()
    df_le=df.apply(le.fit_transform)
    enc = OneHotEncoder(sparse=False)
    enc.fit(df_le)
    df_hot=enc.transform(df_le)
    
    # Split
    train, test = train_test_split(df_hot, test_size=.33, random_state=random_state)
    train_X = train[:,n_classes:]
    train_y = train[:,0]
    test_X = test[:,n_classes:]
    test_y = test[:,0]
    
    return train_X, train_y, test_X, test_y

In [72]:
def make_tree(train_X, train_y, max_depth=None, random_state=42):
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    tree.fit(train_X, train_y)
    return tree

In [8]:
def score_tree(tree_clf, test_X, test_y):
    predictions = tree_clf.predict(test_X)
    precision = precision_score(test_y, predictions)
    recall = recall_score(test_y, predictions)
    return precision, recall, tree_clf.tree_.node_count

In [81]:
import numpy as np
import math

def run_experiment(filename, class_feat, pos_class, n_classes, seed_range=(0,10)):
    irep_precision_list = []
    irep_recall_list = []
    total_conds_list = []
    
    tree_precision_list = []
    tree_recall_list = []
    tree_nodes_list = []
    
    for i in range(seed_range[1]):
        
        # Set random_state
        random_state = i
        
        # Run irep
        train, test = make_irep_dataset(filename, random_state=random_state)
        irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
        irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
        irep_precision, irep_recall, total_conds

        # Run tree
        tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
        tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
        tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
        tree_precision, tree_recall, tree_nodes

        irep_precision_list.append(irep_precision)
        irep_recall_list.append(irep_recall)
        total_conds_list.append(total_conds)

        tree_precision_list.append(tree_precision)
        tree_recall_list.append(tree_recall)
        tree_nodes_list.append(tree_nodes)
        
    print(np.mean(irep_precision_list), np.mean(irep_recall_list), np.mean(total_conds_list))
    print(np.mean(tree_precision_list), np.mean(tree_recall_list), np.mean(tree_nodes_list))
        
    tree_precision_list = []
    tree_recall_list = []
    tree_nodes_list = []
    
    for i in range(seed_range[1]):
        # Run max_depth-specified tree
        tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
        tree_clf = make_tree(tree_train_X, tree_train_y, max_depth=math.log2(np.mean(total_conds_list)), random_state=random_state)
        tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
        tree_precision, tree_recall, tree_nodes

        irep_precision_list.append(irep_precision)
        irep_recall_list.append(irep_recall)
        total_conds_list.append(total_conds)

        tree_precision_list.append(tree_precision)
        tree_recall_list.append(tree_recall)
        tree_nodes_list.append(tree_nodes)
    
    print(np.mean(tree_precision_list), np.mean(tree_recall_list), np.mean(tree_nodes_list))

          

In [82]:
datasets_path = '../datasets/'
random_state = 0

In [83]:
# Set up
dataset = 'house-votes-84.csv'
filename = datasets_path + dataset
class_feat = 'Party'
pos_class = 'democrat'
n_classes = 2

In [84]:
run_experiment(filename, class_feat, pos_class, n_classes)

0.9437519305307361 0.9388489194300581 3.1
0.9545119098462994 0.9631594347666965 37.0
0.9787234042553191 0.9892473118279568 3.0


### Performance: Tie

In [11]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.9753086419753086, 0.9404761904761905, 3)

In [12]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(0.975609756097561, 0.9523809523809523, 33)

### Speed: sklearn

In [13]:
%timeit irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)

KeyboardInterrupt: 

In [None]:
%timeit tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)

### Explainability: IREP

In [None]:
irep_clf

In [None]:
irep_clf.ruleset

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

### Cancer dataset

In [86]:
# Set up
dataset = 'breast-cancer.csv'
filename = datasets_path + dataset
class_feat = 'Recurrence'
pos_class = 'recurrence-events'
n_classes = 2
random_state=30

In [87]:
df = pd.read_csv(datasets_path+filename)
df.head()

Unnamed: 0,Recurrence,Age,Menopause,Tumor-size,Inv-nodes,Node-caps,Deg-malig,Breast,Quadrant,Irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


### Performance: IREP

In [88]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.75, 0.23076923076923078, 2)

In [89]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(0.7567567567567568, 0.8115942028985508, 123)

In [90]:
run_experiment(filename, class_feat, pos_class, n_classes)

0.4731328320802005 0.5800091916471227 2.2
0.7427184062196605 0.7232929986113384 124.8
0.7113450292397662 0.9484848484848485 1.4


### Speed: sklearn

In [None]:
%timeit irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)

In [None]:
%timeit tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)

### Explainability: IREP


In [42]:
irep_clf

<IREP object fit ruleset=[[Deg-malig=3^Node-caps=yes]]>

In [None]:
irep_clf.ruleset

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

### Tic-Tac-Toe dataset


In [91]:
# Set up
dataset = 'tic-tac-toe.csv'
filename = datasets_path + dataset
class_feat = 'Class'
pos_class = 'positive'
n_classes = 2
random_state = 42

In [92]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Top-left,Top-middle,Top-right,Middle-left,Middle-middle,Middle-right,Bottom-left,Bottom-middle,Bottom-right,Class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


### Performance: IREP

In [93]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.738831615120275, 1.0, 3)

In [94]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(0.38461538461538464, 0.32894736842105265, 301)

In [95]:
run_experiment(filename, class_feat, pos_class, n_classes)

0.7574227230946787 0.9586715528996294 7.7
0.3216647935667165 0.32690064994853596 339.0
0.32727272727272727 0.046753246753246755 8.6


  'precision', 'predicted', average, warn_for)


In [50]:
irep_clf

<IREP object fit ruleset=[[Middle-middle=x]v[Bottom-right=x]v[Top-left=x]]>

### Speed: sklearn



In [None]:
%timeit irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)

In [None]:
%timeit tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)

### Explainability: IREP


In [None]:
irep_clf.ruleset

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'max_depth': [1,2,3,4,5],
    'min_samples_leaf': [3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
    }
grid_clf = GridSearchCV(tree_clf, parameters, cv=5)
grid_clf.fit(tree_train_X, tree_train_y)
grid_clf.best_params_

In [None]:
grid_predictions = grid_clf.predict(tree_test_X)
precision = precision_score(grid_predictions, tree_test_y)
recall = recall_score(grid_predictions, tree_test_y)
print(f'precision {precision}, recall {recall}')

### Census Income dataset

In [102]:
# Set up
dataset = 'adult.csv'
filename = datasets_path + dataset
class_feat = 'income'
pos_class = '>50K'
n_classes = 2
random_state = 42

In [103]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [104]:
len(df[df['income']=='>50K']) / len(df)

0.2408095574460244

In [105]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.6594892338507762, 0.5073189522342064, 24)

In [None]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(0.6276595744680851, 0.44696969696969696, 397)

In [None]:
run_experiment(filename, class_feat, pos_class, n_classes)

In [56]:
for rule in irep_clf.ruleset.rules:
    print(rule)

[marital.status=Married-civ-spouse^education=Bachelors]
[marital.status=Married-civ-spouse^occupation=Prof-specialty]
[marital.status=Married-civ-spouse^occupation=Exec-managerial]
[marital.status=Married-civ-spouse^occupation=Sales^education=Masters^native.country=United-States]
[marital.status=Married-civ-spouse^education=Some-college^occupation=Sales]
[marital.status=Married-civ-spouse^education=Some-college^occupation=Craft-repair^workclass=Private^age=59]
[marital.status=Married-civ-spouse^education=Some-college^occupation=Craft-repair^workclass=Private^native.country=United-States^age=42]


### Speed: sklearn


In [29]:
%timeit irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)

2min 45s ± 9.44 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%timeit tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)

### Chess

In [21]:
# Set up
dataset = 'kr-vs-kp.csv'
filename = datasets_path + dataset
class_feat = 'won/lost'
pos_class = 'won'
n_classes = 2
random_state = 42

In [22]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,28,29,30,31,32,33,34,35,36,won/lost
0,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
1,f,f,f,f,t,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won
2,f,f,f,f,t,f,t,f,f,f,...,f,f,f,f,f,f,t,t,n,won
3,f,f,f,f,f,f,f,f,t,f,...,f,f,f,f,f,f,t,t,n,won
4,f,f,f,f,f,f,f,f,f,f,...,f,f,f,f,f,f,t,t,n,won


In [23]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.7741935483870968, 0.8617594254937163, 4)

In [24]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(0.9770992366412213, 0.9542066027689031, 277)

In [60]:
irep_clf

<IREP object fit ruleset=[[33=f^10=f]v[18=f^7=f]]>

### Mushroom

In [25]:
# Set up
dataset = 'mushroom.csv'
filename = datasets_path + dataset
class_feat = 'Poisonous/Edible'
pos_class = 'p'
n_classes = 2
random_state = 0

In [26]:
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Poisonous/Edible,Cap-shape,Cap-surface,Cap-color,Bruises?,Odor,Gill-attachment,Gill-spacing,Gill-size,Gill-color,...,Stalk-surface-below-ring,Stalk-color-above-ring,Stalk-color-below-ring,Veil-type,Veil-color,Ring-number,Ring-type,Sport-print-color,Population,Habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [27]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
irep_clf = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_clf, test, class_feat)
irep_precision, irep_recall, total_conds

(0.9219088937093276, 1.0, 12)

In [28]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

(1.0, 1.0, 27)

In [31]:
for rule in irep_clf.ruleset.rules:
    print(rule)

[Stalk-surface-above-ring=k^Gill-spacing=c]
[Gill-size=n]
[Cap-surface=s^Stalk-root=b]
[Cap-color=b^Stalk-root=b]
[Cap-color=p^Stalk-root=b]
[Ring-number=t^Population=v^Cap-color=w]


In [1]:
len('[marital.status=Married-civ-spouse^education.num=10-14^occupation=Exec-managerial^capital.gain=0-99999^hours.per.week=48-99] V [marital.status=Married-civ-spouse^education.num=10-14^occupation=Exec-managerial^workclass=Private^hours.per.week=48-99^education=Masters] V [marital.status=Married-civ-spouse^education.')

314