In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score

In [3]:
import irep

def make_irep_dataset(dataset_filename, random_state=42):
    """Read a CSV file and split it into train and test DataFrames.

    Parameters
    ----------
    dataset_filename : path of the CSV file, passed to ``pd.read_csv``.
    random_state : seed forwarded to ``train_test_split``.

    Returns
    -------
    (train, test) : the two parts of the frame; 33% of the rows are
        held out for testing.
    """
    frame = pd.read_csv(dataset_filename)

    # The class column stays inside both frames; callers separate it later.
    train_part, test_part = train_test_split(
        frame,
        test_size=.33,
        random_state=random_state,
    )
    return train_part, test_part

In [4]:
def make_irep(train, class_feat, pos_class, random_state=42):
    """Build an ``irep.IREP`` classifier and fit it on ``train``.

    Parameters
    ----------
    train : training data, passed as-is to ``IREP.fit``.
    class_feat : name of the class feature, forwarded to the constructor.
    pos_class : value of the positive class, forwarded to the constructor.
    random_state : forwarded to ``fit`` as its ``seed`` argument.

    Returns
    -------
    The ``irep.IREP`` instance after ``fit`` has been called on it.
    """
    model = irep.IREP(class_feat=class_feat, pos_class=pos_class)
    # Pruning is switched on and display output is switched off.
    model.fit(train, seed=random_state, prune=True, display=False)
    return model

In [5]:
def score_irep(irep_cls, test, class_feat):
    """Score a fitted IREP classifier on held-out data.

    Parameters
    ----------
    irep_cls : object exposing ``score(X, y, scorer)`` and ``ruleset.rules``.
    test : DataFrame that still contains the ``class_feat`` column.
    class_feat : name of the label column in ``test``.

    Returns
    -------
    (precision, recall, total_conds) : the two values returned by
        ``irep_cls.score`` with ``precision_score`` and ``recall_score``,
        and the sum of ``len(rule.conds)`` over every rule in the ruleset.
    """
    # Separate the features from the labels.
    features = test.drop(class_feat, axis=1)
    labels = test[class_feat]

    precision = irep_cls.score(features, labels, precision_score)
    recall = irep_cls.score(features, labels, recall_score)

    # Number of conditions across all rules.
    total_conds = sum(len(rule.conds) for rule in irep_cls.ruleset.rules)
    return precision, recall, total_conds

In [6]:
from sklearn.tree import DecisionTreeClassifier

def make_tree_dataset(dataset_filename, class_feat, pos_class, n_classes, random_state=42):
    """Load a CSV and return one-hot encoded train/test arrays for a decision tree.

    The target is built explicitly from ``class_feat`` and ``pos_class``
    instead of assuming that the label is the first CSV column and that the
    positive class sorts first among the label values.

    Parameters
    ----------
    dataset_filename : path of the CSV file, passed to ``pd.read_csv``.
    class_feat : name of the label column.
    pos_class : label value encoded as 1.0 in the target; every other
        value is encoded as 0.0.
    n_classes : kept only for backward compatibility with existing callers;
        no longer used, because the label column is located by name.
    random_state : seed forwarded to ``train_test_split``.

    Returns
    -------
    (train_X, train_y, test_X, test_y) : float numpy arrays. The X arrays
        hold one indicator column per (feature, value) pair; the y arrays
        hold 1.0 for ``pos_class`` and 0.0 otherwise.

    Raises
    ------
    KeyError : if ``class_feat`` is not a column of the CSV.
    ValueError : if ``pos_class`` never occurs in the label column.
    """
    # Load df
    df = pd.read_csv(dataset_filename)

    # Binary target taken from the named label column.
    labels = df[class_feat]
    y = (labels == pos_class).to_numpy(dtype=float)
    if not y.any():
        raise ValueError(
            "pos_class %r does not occur in column %r" % (pos_class, class_feat)
        )

    # sklearn's Tree doesn't directly take categorical features, so one-hot
    # encode every remaining column. Listing all columns explicitly makes
    # numeric-looking columns get encoded as categories too.
    features = df.drop(columns=[class_feat])
    X = pd.get_dummies(features, columns=list(features.columns)).to_numpy(dtype=float)

    # Split X and y together so rows stay aligned.
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=.33, random_state=random_state
    )

    return train_X, train_y, test_X, test_y

In [7]:
def make_tree(train_X, train_y, random_state=42):
    """Fit a ``DecisionTreeClassifier`` on the given training data.

    Parameters
    ----------
    train_X : training features, passed to ``fit``.
    train_y : training labels, passed to ``fit``.
    random_state : seed forwarded to the classifier constructor.

    Returns
    -------
    The classifier instance after ``fit`` has been called on it.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(train_X, train_y)
    return classifier

In [8]:
def score_tree(tree_cls, test_X, test_y):
    """Score a fitted decision tree on held-out data.

    Parameters
    ----------
    tree_cls : fitted classifier exposing ``predict`` and ``tree_.node_count``.
    test_X : test features, passed to ``predict``.
    test_y : true labels for ``test_X``.

    Returns
    -------
    (precision, recall, node_count) : ``precision_score`` and
        ``recall_score`` of the predictions against ``test_y``, and the
        value of ``tree_cls.tree_.node_count``.
    """
    predicted = tree_cls.predict(test_X)
    precision = precision_score(test_y, predicted)
    recall = recall_score(test_y, predicted)
    node_count = tree_cls.tree_.node_count
    return precision, recall, node_count

In [9]:
# Seed shared by the data splits and both classifiers below.
random_state = 0
# Directory holding the CSV datasets; dataset file names are appended to it.
datasets_path = '../../datasets/'

In [10]:
# Set up
# Which dataset to load.
dataset = 'house-votes-84.csv'
filename = datasets_path + dataset

# Label column and the value treated as the positive class.
class_feat = 'Party'
pos_class = 'democrat'
# Number of distinct label values, passed to make_tree_dataset.
n_classes = 2

In [13]:
# Run irep
train, test = make_irep_dataset(filename, random_state=random_state)
%timeit irep_cls = make_irep(train, class_feat, pos_class, random_state=random_state)
irep_precision, irep_recall, total_conds = score_irep(irep_cls, test, class_feat)
irep_precision, irep_recall, total_conds

869 ms ± 19.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


(0.9753086419753086, 0.9404761904761905, 3)

In [14]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, random_state=random_state)
%timeit tree_cls = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_cls, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

755 µs ± 95.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


(0.975609756097561, 0.9523809523809523, 33)