In [1]:
import pandas as pd
import numpy as np
import pickle
import math
import random
import copy 
import functools

from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier

from slice_finder import SliceFinder
from decision_tree import DecisionTree

from ipywidgets import interact, interactive
from IPython.display import display

from bokeh.layouts import widgetbox, row
from bokeh.models import ColumnDataSource
from bokeh.models import HoverTool
from bokeh.models.widgets import DataTable, TableColumn  
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# Route Bokeh output to inline notebook cells.
output_notebook()

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
def discretization(data_frame, n_bin):
    ''' Replace high-cardinality columns with quantile-bin labels.

    A column is treated as high cardinality when it holds more distinct
    values than half the number of rows. Each such column is split at the
    edges returned by ``binning`` and every value is replaced by the string
    ``str([lower_edge, upper_edge])`` of the first bin that contains it
    (both edges inclusive).

    Parameters:
        data_frame: pandas DataFrame; modified in place.
        n_bin: number of quantile bins requested from ``binning``.

    Returns:
        The same ``data_frame`` object, for call chaining.
    '''
    n_rows = data_frame.shape[0]
    for col in data_frame.columns:
        if len(np.unique(data_frame[col])) <= n_rows / 2.:
            continue

        bin_edges = binning(data_frame[col], n_bin=n_bin)
        values = data_frame[col].values
        # Labels are built in a separate object array and written back as a
        # whole column. Writing floats/strings cell-by-cell into a numeric
        # column relies on implicit dtype upcasting, which recent pandas
        # versions deprecate.
        labels = values.astype(object)
        unassigned = np.ones(len(values), dtype=bool)
        for i in range(len(bin_edges) - 1):
            lower, upper = bin_edges[i], bin_edges[i + 1]
            # First matching bin wins, so a value sitting exactly on an edge
            # goes to the lower bin.
            in_bin = unassigned & (lower <= values) & (values <= upper)
            # Label by bin index rather than by looking the lower edge up
            # again, so duplicate edges cannot produce a wrong interval.
            labels[in_bin] = str([lower, upper])
            unassigned &= ~in_bin
        # Rows that match no bin (e.g. NaN) keep their original value.
        data_frame[col] = labels

    return data_frame

def binning(col, n_bin=20):
    ''' Return the n_bin + 1 quantile edges that split `col` into n_bin bins.

    Parameters:
        col: 1-D array-like of numeric values.
        n_bin: positive number of bins.

    Returns:
        Array of n_bin + 1 edges computed by scipy's `mquantiles` at evenly
        spaced probabilities from 0 to 1 inclusive.
    '''
    # linspace guarantees exactly n_bin + 1 probabilities ending at 1.0.
    # arange with a float step can round its length up and append an extra
    # probability above 1.
    probabilities = np.linspace(0., 1., n_bin + 1)
    bin_edges = stats.mstats.mquantiles(col, probabilities)
    return bin_edges


In [3]:
import time

# Column names are supplied explicitly through `names=`; "Target" is the
# column later used as the label.
columns = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Marital Status",
           "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
           "Hours per week", "Country", "Target"]

# The regex separator swallows whitespace around each comma, which requires
# the python parsing engine; "?" entries are read as missing values.
adult_data = pd.read_csv(
    "data/adult.data",
    names=columns,
    sep=r'\s*,\s*',
    engine='python',
    na_values="?")

n_bin = 10
adult_data = adult_data.dropna().reset_index(drop=True)
adult_data = discretization(adult_data, n_bin)

# encoders: column name -> fitted LabelEncoder, used later to decode values.
# cg_list: one flag per column, 1 when the column has object dtype, else 0.
encoders = {}
cg_list = []
for column in adult_data.columns:
    # `np.object` was removed in NumPy 1.24; the builtin `object` compares
    # equal to the same dtype on every NumPy version.
    if adult_data.dtypes[column] == object:
        cg_list.append(1)
    else:
        cg_list.append(0)
    le = LabelEncoder()
    adult_data[column] = le.fit_transform(adult_data[column])
    encoders[column] = le
    #print(column, le.classes_, le.transform(le.classes_))

In [4]:
# Features are every column except the last one ("Target"), which is the label.
X, y = adult_data[adult_data.columns.delete(-1)], adult_data["Target"]

# NOTE(review): the variable is called `lr` but holds a random forest.
# A fixed random_state makes the fitted model, and therefore the per-row
# losses and slice rankings derived from it, reproducible across runs.
lr = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0)
lr.fit(X, y)
# Accuracy is measured on the same rows the model was fitted on.
print("Model Accuracy: %f" % lr.score(X, y))

Model Accuracy: 0.842815


In [5]:
def evaluate_model(data, model, metric=log_loss):
    ''' Per-row value of `metric` for `model` on `data`.

    data is an (X, y) pair. Both parts are deep-copied, y is attached to X
    as a 'Label' column, rows containing NaN are dropped, and the metric is
    then evaluated one row at a time with labels=model.classes_.
    Returns a numpy array holding one metric value per remaining row.
    '''
    features = copy.deepcopy(data[0])
    targets = copy.deepcopy(data[1])
    features['Label'] = targets
    complete_rows = features.dropna()
    true_labels = complete_rows['Label'].values
    feature_matrix = complete_rows.drop(['Label'], axis=1).values

    probabilities = model.predict_proba(feature_matrix)
    known_classes = model.classes_
    # Each row is wrapped in a leading axis so the metric receives a batch
    # of size one.
    per_row = [
        metric(np.expand_dims(label, axis=0),
               np.expand_dims(proba, axis=0),
               labels=known_classes)
        for label, proba in zip(true_labels, probabilities)
    ]
    return np.array(per_row)

# Per-row losses of the fitted model on the same (X, y) it was trained on.
loss_list = evaluate_model((X, y), lr)

# SliceFinder Demo


## Lattice Searching

In [6]:
sf = SliceFinder((X, y), n_bin, loss_list)
recommendations = sf.find_slice(k=5, epsilon=0.4, degree=1, max_workers=4)

# Print each recommended slice with its filter values decoded back to the
# original labels, followed by its effect size, metric and size.
for rec in recommendations:
    print('\n=====================\nSlice description:')
    for feature, encoded_values in list(rec.filters.items()):
        decoded = ''
        if feature in encoders:
            decoder = encoders[feature]
            decoded = ''.join(
                '%s ' % (decoder.inverse_transform(item)[0])
                for item in encoded_values
            )
        print('%s:%s' % (feature, decoded))
    print('---------------------\neffect_size: %s' % (rec.effect_size))
    print('---------------------\nmetric: %s' % (rec.metric))
    print('size: %s' % (rec.size))

degree 1

Slice description:
Marital Status:Married-civ-spouse 
---------------------
effect_size: 0.6079689287553006
---------------------
metric: 0.5559068089364655
size: 14065

Slice description:
Relationship:Husband 
---------------------
effect_size: 0.5520947436120717
---------------------
metric: 0.5525768010286262
size: 12463

Slice description:
Relationship:Wife 
---------------------
effect_size: 0.4355302988843631
---------------------
metric: 0.6127947183347726
size: 1406

Slice description:
Capital Loss:1887 
---------------------
effect_size: 0.523434530734066
---------------------
metric: 0.6460278480363489
size: 155

Slice description:
Capital Gain:3103 
---------------------
effect_size: 0.8641719005855658
---------------------
metric: 0.8621161720694388
size: 94


## Decision Tree Training

In [7]:
dt = DecisionTree((X, y), (X, y), loss_list, cg_list)
dt = dt.fit()

dt_recs = dt.recommend_slices(k=5, min_effect_size=0.4)

for s in dt_recs:
    size = s.size
    eff_size = s.eff_size
    print('\n=====================\nSlice description:')
    # Walk from the recommended node up to the root, printing the split
    # condition stored on each parent along the way.
    node = s
    while node.parent is not None:
        operator = node.operator()
        feature, encoded_value = node.parent.desc[0], node.parent.desc[1]
        # inverse_transform is documented to take a 1-D array-like; newer
        # scikit-learn releases reject a bare scalar. Wrapping the value and
        # taking element 0 prints the same text on old and new versions.
        decoded_value = encoders[feature].inverse_transform([encoded_value])[0]
        print('%s %s %s' % (feature, operator, decoded_value))
        node = node.parent

    print('---------------------\neffect_size: %s' % (eff_size))
    print('size: %s' % (size))


Slice description:
Marital Status == Married-civ-spouse
---------------------
effect_size: 0.6079689287553006
size: 14065

Slice description:
Education-Num < 13
Capital Gain < 8614
Capital Gain >= 7298
Marital Status != Married-civ-spouse
---------------------
effect_size: 0.9592819257958988
size: 7

Slice description:
Hours per week >= 44
Age >= 28
Education-Num >= 13
Capital Gain < 7298
Marital Status != Married-civ-spouse
---------------------
effect_size: 0.4230697799512546
size: 855

Slice description:
Capital Loss >= 2231
Age < 28
Education-Num >= 13
Capital Gain < 7298
Marital Status != Married-civ-spouse
---------------------
effect_size: 1.3417907853504667
size: 5

Slice description:
Education-Num >= 15
Hours per week < 44
Age >= 28
Education-Num >= 13
Capital Gain < 7298
Marital Status != Married-civ-spouse
---------------------
effect_size: 0.48051614576832136
size: 101


# Visualization Tool

In [8]:
# Same search as the lattice demo, but with k=20 and epsilon=0.3.
# NOTE(review): `recommendations` is not read again in this cell; the slices
# plotted below come from the pickle files, presumably written by
# find_slice() as a side effect -- confirm against slice_finder.
sf = SliceFinder((X, y), n_bin, loss_list)
recommendations = sf.find_slice(k=20, epsilon=0.3, degree=1, max_workers=4)

# NOTE(review): pickle.load can execute arbitrary code; only load log files
# that this project produced itself.
with open('log/recommend.p','rb') as handle:
    slices = pickle.load(handle)
with open('log/uninteresting.p','rb') as handle:
    uninteresting = pickle.load(handle)  

def get_top_k_slices(candidates, min_effect_size):
    ''' Describe every candidate slice whose effect size reaches the threshold.

    Despite the name, nothing is truncated to a top k: candidates with
    effect_size < min_effect_size are skipped and the rest are kept in order.

    Returns four parallel lists: descriptions, sizes, effect sizes, metrics.
    Filter values of columns present in the module-level `encoders` are
    decoded with the matching encoder; other values are printed as-is, with
    two-element entries rendered as "lo ~ hi".
    '''
    def _render_values(column, raw_values):
        # Decoded labels when an encoder exists for the column.
        if column in encoders:
            encoder = encoders[column]
            return ''.join(
                '%s ' % (encoder.inverse_transform(item)[0])
                for item in raw_values
            )
        # Otherwise print the raw entries ordered by their first element.
        pieces = []
        for item in sorted(raw_values, key=lambda x: x[0]):
            if len(item) > 1:
                pieces.append('%s ~ %s' % (item[0], item[1]))
            else:
                pieces.append('%s ' % (item[0]))
        return ''.join(pieces)

    # `not (a < b)` rather than `a >= b` so unordered values (NaN) are kept.
    kept = [cand for cand in candidates
            if not (cand.effect_size < min_effect_size)]

    descriptions = [
        ''.join('%s:%s ' % (column, _render_values(column, raw_values))
                for column, raw_values in list(cand.filters.items()))
        for cand in kept
    ]
    sizes = [cand.size for cand in kept]
    effect_sizes = [cand.effect_size for cand in kept]
    metrics = [cand.metric for cand in kept]
    return descriptions, sizes, effect_sizes, metrics

# Describe the recommended slices (threshold 0 keeps everything).
description_, size_, effect_size_, metric_ = get_top_k_slices(slices, 0)
# Describe the uninteresting slices (candidates) the same way.
description_c, size_c, effect_size_c, metric_c = get_top_k_slices(uninteresting, 0)

# One shared data source feeds both the scatter plot and the table.
data = {
    'description': description_,
    'size': size_,
    'effect_size': effect_size_,
    'metric': metric_,
}
source = ColumnDataSource(data)

# Scatter plot of slice size against effect size, with hover details.
hover = HoverTool(tooltips=[
    (label, '@' + field)
    for label, field in (
        ("desc", "description"),
        ("size", "size"),
        ("effect_size", "effect_size"),
        ("metric", "metric"),
    )
])
TOOLS = [hover]
p = figure(
    tools=TOOLS,
    plot_width=400,
    plot_height=400,
    y_axis_label='Effect Size',
    x_axis_label='Size',
    title=None,
)
r = p.circle('size', 'effect_size', source=source)

# Data table bound to the same source as the glyphs.
# NOTE(review): `columns` rebinds the name that earlier held the CSV column
# names.
columns = [
    TableColumn(field=field, title=title)
    for field, title in (
        ("description", "Description"),
        ("size", "Size"),
        ("effect_size", "Effect Size"),
        ("metric", "Log Loss"),
    )
]
t = DataTable(source=r.data_source, columns=columns, width=600, height=400)

show(row(p, t), notebook_handle=True)

def update(k=30, min_eff_size=0.3):
    ''' Refresh the plot/table with the first k slices passing the threshold.

    Slices from both `slices` and `uninteresting` with an effect size of at
    least `min_eff_size` are ranked, the first k are written to the shared
    data source of the module-level renderer `r`, and the notebook output
    is refreshed.

    Parameters:
        k: maximum number of slices to show.
        min_eff_size: minimum effect size a slice needs to be shown.
    '''
    desc_, size_, effect_size_, metric_ = get_top_k_slices(slices+uninteresting, min_eff_size)
    # Descending sort on (-number of ':'-separated parts, size, effect size):
    # descriptions with fewer filters come first, then larger slices, then
    # larger effect sizes.
    idx = sorted(range(len(size_)), key=lambda x: (-len(desc_[x].split(':')), size_[x], effect_size_[x]), reverse=True)
    top = idx[:k]

    # Replace all columns in one assignment. Updating them one key at a time
    # leaves the source with columns of different lengths in between.
    r.data_source.data = {
        'description': np.array(desc_)[top],
        'size': np.array(size_)[top],
        'effect_size': np.array(effect_size_)[top],
        'metric': np.array(metric_)[top],
    }
    push_notebook()
    
# Sliders: k over 1..20, min_eff_size over 0..1 in steps of 0.05.
# NOTE(review): update() declares k=30 as its default, above the slider
# maximum; the rendered widget starts at 20.
w = interactive(update, k=(1,20), min_eff_size=(0, 1, 0.05))
display(w)

degree 1


interactive(children=(IntSlider(value=20, description='k', max=20, min=1), FloatSlider(value=0.3, description=…