In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [3]:
import ripper

def make_ripper_dataset(dataset_filename, random_state=42):
    # Load df
    df = pd.read_csv(dataset_filename)
    
    # Split
    train, test = train_test_split(df, test_size=.33, random_state=random_state)
    
    return train, test

In [4]:
def make_ripper(train, class_feat, pos_class, k=1, random_state=42, verbosity=0):
    # Train
    irep_clf = ripper.RIPPER(class_feat=class_feat,pos_class=pos_class, k=k, verbosity=verbosity)
    irep_clf.fit(train, n_discretize_bins=5, seed=random_state)
    return irep_clf

In [5]:
def score_ripper(ripper_clf, test):
    X_test = test.drop(ripper_clf.class_feat,axis=1)
    y_test = test[class_feat]

    precision = ripper_clf.score(X_test, y_test)#, score_accuracy)
    recall = ripper_clf.score(X_test, y_test, recall_score)
    total_conds = ripper_clf.ruleset_.count_conds()
    return precision, recall, total_conds

In [6]:
datasets_path = '../datasets/'
random_state = 0

In [7]:
# Set up
dataset = 'house-votes-84.csv'
filename = datasets_path + dataset
class_feat = 'Party'
pos_class = 'democrat'
n_classes = 2
random_state=42
k=2

In [8]:
# Run ripper
k=1
random_state=2
verbosity=0
train, test = make_ripper_dataset(filename, random_state=random_state)
ripper_clf = make_ripper(train, class_feat, pos_class, k=k, random_state=random_state, verbosity=verbosity)
#timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

In [15]:
X_test = test.drop(ripper_clf.class_feat,axis=1)
y_test = test[class_feat].tolist()

In [16]:
ripper_clf.score(X_test, y_test)#, accuracy_score)

covered_indices {129, 258, 131, 3, 389, 262, 12, 13, 398, 20, 24, 411, 157, 286, 415, 32, 161, 289, 159, 29, 165, 417, 160, 40, 297, 170, 298, 172, 173, 428, 175, 41, 301, 42, 307, 180, 181, 182, 312, 418, 186, 192, 193, 321, 68, 69, 198, 70, 200, 328, 329, 203, 294, 205, 199, 337, 216, 348, 94, 354, 226, 100, 109, 237, 241, 243, 244, 381, 383}
predictions [True, False, True, True, True, True, True, True, False, True, False, False, True, True, False, False, False, True, True, True, True, True, False, True, True, False, False, False, False, False, True, True, False, False, False, True, True, True, False, False, True, True, False, False, False, True, False, False, False, False, False, True, True, False, True, False, False, True, False, True, True, True, False, False, False, False, True, False, False, False, True, True, True, False, False, False, False, False, False, True, True, False, False, True, False, True, True, False, True, False, True, True, False, True, False, False, True, False, 

0.8819444444444444

In [11]:
test_y

NameError: name 'test_y' is not defined

In [None]:
end`

In [None]:
# Set up
dataset = 'breast-cancer.csv'
filename = datasets_path + dataset
class_feat = 'Recurrence'
pos_class = 'recurrence-events'
n_classes = 2
random_state=30

In [None]:
# Run ripper
k=2
random_state=2
verbosity=5
train, test = make_ripper_dataset(filename, random_state=random_state)
timer = Timer()
ripper_clf = make_ripper(train, class_feat, pos_class, k=k, random_state=random_state, verbosity=verbosity)
timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

In [None]:
# Set up
dataset = 'adult.csv'
filename = datasets_path + dataset
class_feat = 'income'
pos_class = '>50K'
n_classes = 2
random_state = 0
k=2

In [None]:
train, test = make_ripper_dataset(filename, random_state=random_state)

In [None]:
import pickle
import base
ripper_clf = pickle.load(open('ripper_clf.pkl','rb'))
test_binned = base.bin_transform(test, ripper_clf.bin_transformer_)
score_ripper(ripper_clf, test_binned, class_feat)

In [None]:
pos_test,neg_test=base.pos_neg_split(test_binned, class_feat, pos_class)
len(pos_test)/len(test_binned)

In [None]:
from sklearn.tree import DecisionTreeClassifier

def make_tree_dataset(dataset_filename, class_feat, pos_class, n_classes, bin_transformer, random_state=42):
    
    # Load df
    df = pd.read_csv(dataset_filename)
    if bin_transformer is not None:
        df = base.bin_transform(df, bin_transformer)
    
    # DecisionTreeClassifier (of all things!) doesn't directly take categorical features.
    # We need to one-hot preprocess the data to make it work.
    le = LabelEncoder()
    df_le=df.apply(le.fit_transform)
    enc = OneHotEncoder(sparse=False)
    enc.fit(df_le)
    df_hot=enc.transform(df_le)
    
    # Split
    train, test = train_test_split(df_hot, test_size=.33, random_state=random_state)
    train_X = train[:,n_classes:]
    train_y = train[:,0]
    test_X = test[:,n_classes:]
    test_y = test[:,0]
    
    return train_X, train_y, test_X, test_y

In [None]:
def make_tree(train_X, train_y, max_depth=None, random_state=42):
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    tree.fit(train_X, train_y)
    return tree

In [None]:
def score_tree(tree_clf, test_X, test_y):
    predictions = tree_clf.predict(test_X)
    precision = precision_score(test_y, predictions)
    recall = recall_score(test_y, predictions)
    return precision, recall, tree_clf.tree_.node_count

In [None]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=ripper_clf.bin_transformer_, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
ripper_clf.ruleset_.count_conds()

In [None]:
# Run tree
import math
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=ripper_clf.bin_transformer_, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, max_depth=math.log2(ripper_clf.ruleset_.count_conds()), random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
end

In [None]:
# Run ripper
random_state=2
verbosity=5
train, test = make_ripper_dataset(filename, random_state=random_state)
timer = Timer()
ripper_clf = make_ripper(train, class_feat, pos_class, k=2, random_state=random_state, verbosity=verbosity)
timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

Should ruleset best dl be simply the best subset, rather than removing rules piecemeal?

In [None]:
# Set up
dataset = 'tic-tac-toe.csv'
filename = datasets_path + dataset
class_feat = 'Class'
pos_class = 'positive'
n_classes = 2
random_state=30

In [None]:
# Run ripper
random_state=2
verbosity=2
train, test = make_ripper_dataset(filename, random_state=random_state)
timer = Timer()
ripper_clf = make_ripper(train, class_feat, pos_class, k=k, random_state=random_state, verbosity=verbosity)
timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

In [None]:
score_ripper(ripper_clf, test, class_feat)

In [None]:
# Run tree
import math
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, max_depth=math.log2(ripper_clf.ruleset_.count_conds()), random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
# Run tree
import math
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
# Run ripper
random_state=2
verbosity=2
k=2
train, test = make_ripper_dataset(filename, random_state=random_state)
timer = Timer()
ripper_clf = make_ripper(train, class_feat, pos_class, k=k, random_state=random_state, verbosity=verbosity)
timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

In [None]:
score_ripper(ripper_clf, test, class_feat)

In [None]:
# Run tree
import math
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, max_depth=1, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
# Set up
dataset = 'kr-vs-kp.csv'
filename = datasets_path + dataset
class_feat = 'won/lost'
pos_class = 'won'
n_classes = 2
random_state = 42
k=2

In [None]:
# Run ripper
random_state=2
verbosity=2
k=2
train, test = make_ripper_dataset(filename, random_state=random_state)
timer = Timer()
ripper_clf = make_ripper(train, class_feat, pos_class, k=k, random_state=random_state, verbosity=verbosity)
timer.buzz()
#ripper_precision, ripper_recall, ripper_conds = score_ripper(ripper_clf, test, class_feat)
#ripper_precision, ripper_recall, ripper_conds

In [None]:
score_ripper(ripper_clf, test, class_feat)

In [None]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
from ripper import Rule, Cond
ripper_clf.ruleset_._set_possible_conds(train,train)
% timeit ripper.r_theory_bits(Rule([Cond('Quadrant','left_up'),Cond('Irradiat','no'),Cond('Age','40-49')]),ripper_clf.ruleset_.possible_conds, verbosity=verbosity)

In [None]:
%timeit ripper_clf.ruleset_.covers(test)

In [None]:
# Set up
dataset = 'soybean-small.csv'
filename = datasets_path + dataset
class_feat = 'class'
pos_class = 'D1'
n_classes = 2
random_state = 0
k=2

In [None]:
# Run tree
tree_train_X, tree_train_y, tree_test_X, tree_test_y = make_tree_dataset(filename, class_feat, pos_class, n_classes, bin_transformer=None, random_state=random_state)
tree_clf = make_tree(tree_train_X, tree_train_y, random_state=random_state)
tree_precision, tree_recall, tree_nodes = score_tree(tree_clf, tree_test_X, tree_test_y)
tree_precision, tree_recall, tree_nodes

In [None]:
tree_clf.predict(tree_train_X)[0]==False

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
export_graphviz(tree_clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
ripper.r_theory_bits(Rule([Cond('a','a'),Cond('a','a')]),[1,2,3,5,6,7,8])