## Global Settings

In [1039]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# import plotly.io as pio
# pio.renderers.default = "svg"  # makes the charts easier to share (e.g., on GitHub); requires: pip install -U kaleido; see https://plotly.com/python/renderers/

In [1040]:
# Named groups of pipeline identifiers; the chart cells below pass these lists
# to select which pipelines are plotted. (RI presumably stands for "rule
# induction" -- see the section headings further down.)
DEFAULT_RI_PIPELINES = ['NATIVE-RIPPER', 'LENC-CART:4', 'TREES-BRCG', 'TREES-CORELS', 'NATIVE-R2N']
RIPPER_PIPELINES = ['QUANTILE-RIPPER', 'TREES-RIPPER', 'NATIVE-RIPPER2', 'NATIVE-RIPPER3', 'NATIVE-RIPPER']
BRCG_PIPELINES = ['QUANTILE-BRCG', 'TREES-BRCG2', 'QUANTILE-BRCG2', 'TREES2-BRCG2', 'TREES2-BRCG', 'TREES-BRCG']
CORELS_PIPELINES = ['QUANTILE-CORELS','TREES-CORELS2', 'TREES2-CORELS2', 'TREES-CORELS']
CART_PIPELINES = ['LENC-CART:*', 'LENC-CART:6', 'LENC-CART:4']
# Concatenation of the four family lists plus NATIVE-R2N.
ALL_RI_PIPELINES = RIPPER_PIPELINES + BRCG_PIPELINES + CORELS_PIPELINES + CART_PIPELINES + ['NATIVE-R2N']
# The same lists with the LENC-XGB pipeline appended.
ALL_PIPELINES = ALL_RI_PIPELINES + ['LENC-XGB']
DEFAULT_PIPELINES = DEFAULT_RI_PIPELINES + ['LENC-XGB']

# Metric suffixes; combined with a pipeline name as '<pipeline>_<metric>'
# when color_map is built below.
metrics = ['f2', 'f', 'recall', 'precision', 'acc', 'adj_bal_acc', 'sum_preds']
# Plot style per series name: (line colour, line dash style, marker symbol).
# The chart functions index this tuple positionally ([0], [1], [2]).
style_map = {
    'LENC-XGB': ('gray', 'solid', 'circle'),
    'LENC-CART:4': ('mediumblue', 'dot', 'arrow-right'), 
    'LENC-CART:6': ('cornflowerblue', 'longdash', 'diamond'), 
    'LENC-CART:*': ('lightskyblue', 'solid', 'arrow-left'),
    'TREES-BRCG': ('lime', 'dash', 'cross'),
    'TREES2-BRCG': ('aquamarine', 'dash', 'cross'),
    'NATIVE-RIPPER': ('crimson', 'longdash', 'diamond'),
    'NATIVE-RIPPER2': ('magenta', 'longdash', 'diamond'),
    'NATIVE-RIPPER3': ('darkorchid', 'longdash', 'diamond'),
    'QUANTILE-RIPPER': ('salmon', 'longdash', 'diamond'),
    'TREES-RIPPER': ('orangered', 'longdash', 'diamond'),
    'TREES-CORELS': ('gold', 'dashdot', 'star'),
    'TREES-CORELS2': ('darkorange', 'dashdot', 'star'),
    'TREES2-CORELS2': ('wheat', 'dashdot', 'star'),
    'NATIVE-R2N': ('silver', 'dot', 'square'),
    'QUANTILE-RULENET': ('cyan', 'dot', 'square'),
    'QUANTILE-BRCG': ('plum', 'dash', 'cross'),
    'QUANTILE-CORELS': ('palevioletred', 'dashdot', 'star'),
    'TREES-BRCG2': ('forestgreen', 'dash', 'cross'),
    'QUANTILE-BRCG2': ('maroon', 'dash', 'cross'),
    'TREES2-BRCG2': ('yellowgreen', 'solid', 'cross'),
    # Entries for the derived 'maxRI*' series and the NATIVE-CONST series.
    'maxRI': ('crimson', 'solid', 'cross'),
    'maxRI50': ('orangered', 'solid', 'cross'),
    'maxRI25': ('darkorange', 'solid', 'cross'),
    'maxRI10': ('gold', 'solid', 'cross'),
    'maxRI5': ('palegoldenrod', 'solid', 'cross'),
    'NATIVE-CONST': ('darkkhaki', 'solid', 'cross')
}

# Colour lookup keyed by '<pipeline>_<metric>': every metric column of a
# pipeline gets that pipeline's line colour (first element of its style tuple).
color_map = {
    series_name + '_' + metric_name: series_style[0]
    for series_name, series_style in style_map.items()
    for metric_name in metrics
}

def rename( pipeline ):
    """Return the legend label for a pipeline: 'xgb' for 'LENC-XGB', otherwise the name unchanged."""
    return 'xgb' if pipeline == 'LENC-XGB' else pipeline
    
# Human-readable labels for metric keys; the chart functions use them for
# axis titles and figure titles.
long_name = {
    'f2': 'f2-score',
    'f': 'f-score',
    'acc': 'accuracy',
    'adj_bal_acc': 'balanced accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'nof_rules': 'number of rules',
    'sum_preds': 'sum of predicates',
    'avg_preds': 'average of predicates',
    'max_preds': 'maximum of predicates'
}

In [1041]:
def single_chart(pipelines, metric, df, df_long_name, x_sel='data_set', x_title=None, x_log=False):
    """Show one line chart of `metric` with one trace per entry of `pipelines`.

    Args:
        pipelines: series names; each must be a key of `style_map` and have a
            column '<name>_<metric>' in `df`.
        metric: metric key; must be a key of `long_name`.
        df: table holding the x column and the metric columns.
        df_long_name: text appended to the figure title after ' on '.
        x_sel: name of the column used for the x values.
        x_title: optional x-axis title; left unset when None.
        x_log: when true, the x-axis is switched to a log scale.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    metric_label = long_name[metric]
    fig = go.Figure(layout=dict(title=dict(text=metric_label+' on '+df_long_name)))

    for pipeline in pipelines:
        colour, dash, symbol = style_map[pipeline]
        fig.add_scatter(
            name=rename(pipeline),
            x=df[x_sel],
            y=df[pipeline + '_' + metric],
            mode='markers+lines',
            line=dict(width=2, color=colour, dash=dash),
            marker=dict(symbol=symbol),
            showlegend=True,
        )

    fig.update_layout(yaxis_title=metric_label, plot_bgcolor='whitesmoke')
    if x_title is not None:
        fig.update_layout(xaxis_title=x_title)
    if x_log:
        fig.update_xaxes(type="log")
    fig.update_layout(height=400, width=1000)
    fig.show()


def triple_chart(pipelines, performance_metric, complexity_metric, df, df_long_name, x_sel='data_set', x_title=None, x_log=False):
    """Show three stacked panels on a shared x-axis: runtime, complexity, performance.

    Row 1 (top) plots '<p>_runtime' + 1 for LENC-XGB and `pipelines`, row 2
    plots '<p>_<complexity_metric>' + 1 for `pipelines`, and row 3 (bottom)
    plots '<p>_<performance_metric>' for LENC-XGB, NATIVE-CONST and
    `pipelines`. Rows 1 and 2 use a log y-axis. Only the row-3 traces appear
    in the legend.

    Args:
        pipelines: list of pipeline names (keys of `style_map`).
        performance_metric: metric key for the bottom panel (key of `long_name`).
        complexity_metric: metric key for the middle panel (key of `long_name`).
        df: table holding the x column and all referenced metric columns.
        df_long_name: text appended to the figure title.
        x_sel: name of the column used for the x values.
        x_title: optional x-axis title, applied to the bottom panel.
        x_log: when true, all x-axes are switched to a log scale.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    fig = make_subplots(rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02)

    def add_row(names, suffix, row, plus_one, in_legend):
        # One markers+lines trace per name, styled from style_map.
        for pipeline in names:
            colour, dash, symbol = style_map[pipeline]
            values = df[pipeline + '_' + suffix]
            if plus_one:
                # shifted by one before plotting on a log axis
                # (presumably so zero values stay drawable)
                values = values + 1
            trace = go.Scatter(
                name=pipeline,
                x=df[x_sel],
                y=values,
                mode='markers+lines',
                line=dict(width=2, color=colour, dash=dash),
                marker=dict(symbol=symbol),
                showlegend=in_legend,
            )
            fig.add_trace(trace, row=row, col=1)

    # Bottom panel: predictive performance.
    add_row(['LENC-XGB', 'NATIVE-CONST'] + pipelines, performance_metric, 3, False, True)
    fig.update_yaxes(title=long_name[performance_metric], row=3, col=1)
    if x_title is not None:
        fig.update_xaxes(title=x_title, row=3, col=1)

    # Middle panel: model complexity.
    add_row(pipelines, complexity_metric, 2, True, False)
    fig.update_yaxes(type="log", row=2, col=1)
    fig.update_yaxes(title=long_name[complexity_metric], row=2, col=1)

    # Top panel: runtime.
    add_row(['LENC-XGB'] + pipelines, 'runtime', 1, True, False)
    fig.update_yaxes(type="log", row=1, col=1)
    fig.update_yaxes(title='time (secs)', row=1, col=1)

    if x_log:
        fig.update_xaxes(type="log")

    heading = long_name[performance_metric]+', '+ long_name[complexity_metric] + ' and training time on ' + df_long_name
    fig.update_layout(title_text=heading, plot_bgcolor='whitesmoke')
    fig.update_layout(height=600, width=1000)
    fig.show()


def dual_chart(pipelines, performance_metric, complexity_metric, df, df_long_name, x_sel='data_set', x_title=None, x_log=False):
    """Show two stacked panels on a shared x-axis: complexity above performance.

    Row 1 (top) plots '<p>_<complexity_metric>' + 1 for `pipelines` on a log
    y-axis; row 2 (bottom) plots '<p>_<performance_metric>' for LENC-XGB and
    `pipelines`. Only the row-2 traces appear in the legend.

    Args:
        pipelines: list of pipeline names (keys of `style_map`).
        performance_metric: metric key for the bottom panel (key of `long_name`).
        complexity_metric: metric key for the top panel (key of `long_name`).
        df: table holding the x column and all referenced metric columns.
        df_long_name: text appended to the figure title.
        x_sel: name of the column used for the x values.
        x_title: optional x-axis title, applied to the bottom panel.
        x_log: when true, all x-axes are switched to a log scale.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

    def add_row(names, suffix, row, plus_one, in_legend):
        # One markers+lines trace per name, styled from style_map.
        for pipeline in names:
            colour, dash, symbol = style_map[pipeline]
            values = df[pipeline + '_' + suffix]
            if plus_one:
                values = values + 1
            trace = go.Scatter(
                name=pipeline,
                x=df[x_sel],
                y=values,
                mode='markers+lines',
                line=dict(width=2, color=colour, dash=dash),
                marker=dict(symbol=symbol),
                showlegend=in_legend,
            )
            fig.add_trace(trace, row=row, col=1)

    # Bottom panel: predictive performance.
    add_row(['LENC-XGB'] + pipelines, performance_metric, 2, False, True)
    fig.update_yaxes(title=long_name[performance_metric], row=2, col=1)
    if x_title is not None:
        fig.update_xaxes(title=x_title, row=2, col=1)

    # Top panel: model complexity.
    add_row(pipelines, complexity_metric, 1, True, False)
    fig.update_yaxes(type="log", row=1, col=1)
    fig.update_yaxes(title=long_name[complexity_metric], row=1, col=1)

    if x_log:
        fig.update_xaxes(type="log")

    heading = long_name[performance_metric]+' and '+ long_name[complexity_metric] + ' on ' + df_long_name
    fig.update_layout(title_text=heading, plot_bgcolor='whitesmoke')
    fig.update_layout(height=500, width=1000)
    fig.show()

def dual_performance_chart(pipelines, performance_metric_1, performance_metric_2, df, df_long_name, x_sel='data_set', x_title=None, x_log=False):
    """Show two performance metrics in stacked panels on a shared x-axis.

    Row 2 (bottom) plots '<p>_<performance_metric_1>' and row 1 (top) plots
    '<p>_<performance_metric_2>', both for LENC-XGB followed by `pipelines`.
    Only the row-2 traces appear in the legend.

    Args:
        pipelines: list of pipeline names (keys of `style_map`).
        performance_metric_1: metric key for the bottom panel (key of `long_name`).
        performance_metric_2: metric key for the top panel (key of `long_name`).
        df: table holding the x column and all referenced metric columns.
        df_long_name: text appended to the figure title.
        x_sel: name of the column used for the x values.
        x_title: optional x-axis title, applied to the bottom panel.
        x_log: when true, all x-axes are switched to a log scale.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

    def add_row(suffix, row, in_legend):
        # One markers+lines trace per series, styled from style_map.
        for pipeline in ['LENC-XGB'] + pipelines:
            colour, dash, symbol = style_map[pipeline]
            trace = go.Scatter(
                name=pipeline,
                x=df[x_sel],
                y=df[pipeline + '_' + suffix],
                mode='markers+lines',
                line=dict(width=2, color=colour, dash=dash),
                marker=dict(symbol=symbol),
                showlegend=in_legend,
            )
            fig.add_trace(trace, row=row, col=1)

    # Bottom panel: first metric.
    add_row(performance_metric_1, 2, True)
    fig.update_yaxes(title=long_name[performance_metric_1], row=2, col=1)
    if x_title is not None:
        fig.update_xaxes(title=x_title, row=2, col=1)

    # Top panel: second metric.
    add_row(performance_metric_2, 1, False)
    fig.update_yaxes(title=long_name[performance_metric_2], row=1, col=1)

    if x_log:
        fig.update_xaxes(type="log")

    heading = long_name[performance_metric_1]+' and '+ long_name[performance_metric_2] + ' on ' + df_long_name
    fig.update_layout(title_text=heading, plot_bgcolor='whitesmoke')
    fig.update_layout(height=500, width=1000)
    fig.show()

def quad_chart(pipelines, performance_metric_1, performance_metric_2, complexity_metric, df, df_long_name, x_sel='data_set', x_title=None, x_log=False):
    """Show four stacked panels on a shared x-axis: runtime, complexity and two performance metrics.

    Row 1 (top) plots '<p>_runtime' + 1 for LENC-XGB and `pipelines`; row 2
    plots '<p>_<complexity_metric>' + 1 for `pipelines`; rows 3 and 4 plot
    `performance_metric_2` and `performance_metric_1` respectively for
    LENC-XGB, NATIVE-CONST and `pipelines`. Rows 1 and 2 use a log y-axis.
    Only the row-4 traces appear in the legend.

    Args:
        pipelines: list of pipeline names (keys of `style_map`).
        performance_metric_1: metric key for row 4 (key of `long_name`).
        performance_metric_2: metric key for row 3 (key of `long_name`).
        complexity_metric: metric key for row 2 (key of `long_name`).
        df: table holding the x column and all referenced metric columns.
        df_long_name: text appended to the figure title.
        x_sel: name of the column used for the x values.
        x_title: optional x-axis title; applied to rows 4 and 3.
        x_log: when true, all x-axes are switched to a log scale.

    The figure is displayed with `fig.show()`; nothing is returned.
    """
    fig = make_subplots(rows=4, cols=1, shared_xaxes=True, vertical_spacing=0.02)
    with_baselines = ['LENC-XGB', 'NATIVE-CONST'] + pipelines

    def add_row(names, suffix, row, plus_one, in_legend):
        # One markers+lines trace per name, styled from style_map.
        for pipeline in names:
            colour, dash, symbol = style_map[pipeline]
            values = df[pipeline + '_' + suffix]
            if plus_one:
                values = values + 1
            trace = go.Scatter(
                name=pipeline,
                x=df[x_sel],
                y=values,
                mode='markers+lines',
                line=dict(width=2, color=colour, dash=dash),
                marker=dict(symbol=symbol),
                showlegend=in_legend,
            )
            fig.add_trace(trace, row=row, col=1)

    # Row 4 (bottom): first performance metric.
    add_row(with_baselines, performance_metric_1, 4, False, True)
    fig.update_yaxes(title=long_name[performance_metric_1], row=4, col=1)
    if x_title is not None:
        fig.update_xaxes(title=x_title, row=4, col=1)

    # Row 3: second performance metric.
    add_row(with_baselines, performance_metric_2, 3, False, False)
    fig.update_yaxes(title=long_name[performance_metric_2], row=3, col=1)
    if x_title is not None:
        # NOTE(review): this also titles the interior row-3 x-axis -- confirm it is intended.
        fig.update_xaxes(title=x_title, row=3, col=1)

    # Row 2: model complexity.
    add_row(pipelines, complexity_metric, 2, True, False)
    fig.update_yaxes(type="log", row=2, col=1)
    fig.update_yaxes(title=long_name[complexity_metric], row=2, col=1)

    # Row 1 (top): runtime.
    add_row(['LENC-XGB'] + pipelines, 'runtime', 1, True, False)
    fig.update_yaxes(type="log", row=1, col=1)
    fig.update_yaxes(title='time (secs)', row=1, col=1)

    if x_log:
        fig.update_xaxes(type="log")

    heading = long_name[performance_metric_1]+', '+ long_name[performance_metric_2]+', '+ long_name[complexity_metric] + ' and training time on ' + df_long_name
    fig.update_layout(title_text=heading, plot_bgcolor='whitesmoke')
    fig.update_layout(height=600, width=1000)
    fig.show()

In [1042]:
# Load the result table from 'results_1.csv'; the bare name on the last line
# is the cell's displayed value.
full_df = pd.read_csv('results_1.csv')
full_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,TREES-BRCG2exception,TREES-BRCGexception,TREES2-BRCGexception,NATIVE-CONST_runtime,NATIVE-CONST_acc,NATIVE-CONST_adj_bal_acc,NATIVE-CONST_recall,NATIVE-CONST_precision,NATIVE-CONST_f2,NATIVE-CONST_f
0,german_credit,1000,21,7,13,300,0.3,credit,organic,20.0,...,,,,0.0,0.29,0.0,1.0,0.29,0.67,0.45
1,taiwan_credit,30000,24,14,9,6636,0.2212,credit,organic,23.0,...,,,,0.0,0.22,0.0,1.0,0.22,0.59,0.36
2,compas,7210,8,0,7,3250,0.4508,recidivism,organic,7.0,...,,,,0.0,0.46,0.0,1.0,0.46,0.81,0.63
3,fraud_detection,284807,31,30,0,492,0.0017,transaction-fraud,organic,30.0,...,,,,0.0,0.0,0.0,1.0,0.0,0.01,0.0
4,fraud_oracle,15420,33,8,24,923,0.0599,insurance-fraud,organic,32.0,...,,,,0.0,0.07,0.0,1.0,0.07,0.26,0.13
5,bike_75,8760,14,9,4,61,0.007,demand-prediction,organic,13.0,...,,,,0.0,0.01,0.0,1.0,0.01,0.03,0.01
6,bike_mean,8760,14,9,4,3523,0.4022,demand-prediction,organic,13.0,...,,,,0.0,0.42,0.0,1.0,0.42,0.78,0.59
7,orange_up,50000,213,174,38,3682,0.0736,product recommendation,organic,212.0,...,,,,0.0,0.07,0.0,1.0,0.07,0.28,0.13
8,orange_churn,50000,213,174,38,3672,0.0734,churn,organic,212.0,...,,,,0.01,0.08,0.0,1.0,0.08,0.29,0.14
9,adult,32561,13,4,8,7841,0.2408,income_prediction,organic,12.0,...,,,,0.01,0.23,0.0,1.0,0.23,0.6,0.37


In [1043]:
# Enriching result table

def max_performance(row, metric='f', max_rules=None):
    """Return the best value of `metric` over the pipelines in ALL_RI_PIPELINES for one row.

    Args:
        row: mapping-like row with '<pipeline>_<metric>' entries (and
            '<pipeline>_nof_rules' entries when `max_rules` is given).
        metric: metric suffix to look up.
        max_rules: when given, only pipelines whose '<pipeline>_nof_rules'
            is <= max_rules are considered.

    Returns:
        0 when no pipeline qualifies, otherwise `np.nanmax` of the collected
        values (NaN entries are ignored by nanmax).
    """
    candidates = []
    for pipeline in ALL_RI_PIPELINES:
        # A NaN rule count fails the <= comparison and is therefore skipped.
        if max_rules is not None and not (row[pipeline+'_nof_rules'] <= max_rules):
            continue
        candidates.append(row[pipeline+'_'+metric])
    if not candidates:
        return 0
    return np.nanmax(candidates)

# Add derived columns to full_df: 'maxRI_<m>' is the row-wise best value over
# all pipelines in ALL_RI_PIPELINES, and 'maxRI<r>_<m>' is the best value
# restricted to pipelines with at most r rules.
for m in ['f', 'f2', 'adj_bal_acc', 'recall', 'precision']:
    metric = 'maxRI_'+m
    full_df[metric] = full_df.apply(max_performance, metric=m, axis=1)
    for r in [5, 10, 25, 50]:
        metric = 'maxRI'+str(r)+'_'+m
        full_df[metric] = full_df.apply(max_performance, metric=m, max_rules=r, axis=1)


## Core Suite Definition and Stats


In [1044]:
# Core suite: rows whose origin is 'organic' and whose use_case is not 'other',
# sorted by pos_ratio with a fresh 0..n-1 index.
df_core_suite = full_df[(full_df['origin']=='organic') & (full_df['use_case']!='other')]
df_core_suite = df_core_suite.sort_values('pos_ratio', ignore_index=True)

df_core_suite

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision
0,fraud_detection,284807,31,30,0,492,0.0017,transaction-fraud,organic,30.0,...,0.73,0.67,0.73,0.73,0.73,1.0,1.0,1.0,1.0,1.0
1,bike_75,8760,14,9,4,61,0.007,demand-prediction,organic,13.0,...,0.93,0.93,0.93,0.93,0.93,1.0,1.0,1.0,1.0,1.0
2,fraud_oracle,15420,33,8,24,923,0.0599,insurance-fraud,organic,32.0,...,0.98,0.05,0.05,0.98,0.98,1.0,1.0,1.0,1.0,1.0
3,orange_churn,50000,213,174,38,3672,0.0734,churn,organic,212.0,...,0.15,0.02,0.02,0.04,0.04,1.0,1.0,1.0,1.0,1.0
4,orange_up,50000,213,174,38,3682,0.0736,product recommendation,organic,212.0,...,0.45,0.43,0.43,0.45,0.45,1.0,1.0,1.0,1.0,1.0
5,bank_marketing,4118,21,10,10,451,0.1095,product_recommendation,organic,20.0,...,0.53,0.36,0.48,0.48,0.48,1.0,1.0,1.0,1.0,1.0
6,telco_churn,3333,20,16,3,483,0.1449,churn,organic,19.0,...,0.77,0.6,0.77,0.77,0.77,1.0,1.0,1.0,1.0,1.0
7,taiwan_credit,30000,24,14,9,6636,0.2212,credit,organic,23.0,...,0.42,0.32,0.32,0.33,0.33,1.0,1.0,1.0,1.0,1.0
8,adult,32561,13,4,8,7841,0.2408,income_prediction,organic,12.0,...,0.85,0.85,0.85,0.85,0.85,1.0,1.0,1.0,1.0,1.0
9,house,22784,17,16,0,6744,0.296,price_prediction,organic,16.0,...,0.91,0.91,0.91,0.91,0.91,1.0,1.0,1.0,1.0,1.0


In [1045]:
# Descriptive columns of the core suite plus the XGBoost f2 column, sorted by
# that f2 column.
df_basic_stats = df_core_suite[['data_set', 'use_case', 'nof_rows', 'nof_col', 'nof_num_features', 'nof_cat_features', 'pos_ratio', 'LENC-XGB_f2']]
df_basic_stats = df_basic_stats.sort_values('LENC-XGB_f2', ignore_index=True)
df_basic_stats
# latex_table = df_basic_stats.to_latex(index=False)
# latex_table = latex_table.replace('\_', ' ')
# latex_table = latex_table.replace('-', ' ')
# print(latex_table)

# for x in trans_map:
#         latex_table = latex_table.replace(x, trans_map[x])

Unnamed: 0,data_set,use_case,nof_rows,nof_col,nof_num_features,nof_cat_features,pos_ratio,LENC-XGB_f2
0,orange_churn,churn,50000,213,174,38,0.0734,0.03
1,fraud_oracle,insurance-fraud,15420,33,8,24,0.0599,0.36
2,taiwan_credit,credit,30000,24,14,9,0.2212,0.4
3,german_credit,credit,1000,21,7,13,0.3,0.44
4,orange_up,product recommendation,50000,213,174,38,0.0736,0.47
5,bike_75,demand-prediction,8760,14,9,4,0.007,0.48
6,bank_marketing,product_recommendation,4118,21,10,10,0.1095,0.49
7,compas,recidivism,7210,8,0,7,0.4508,0.57
8,heloc,credit,10459,24,23,0,0.4781,0.67
9,telco_churn_2,churn,2000,14,9,4,0.5,0.68


In [1046]:
# Line chart of pos_ratio per data set on a log y-axis (rows are in the
# pos_ratio order established when df_core_suite was built).
fig = px.line(df_core_suite, x='data_set', y=['pos_ratio'], log_y=True, title='Core Suite: Imbalance')
fig.update_traces(mode='markers+lines')
fig.update_layout(height=500, width=1000)
fig.show()

## General Performance on Core Suite

### Result Table

In [1047]:
# Abbreviations for the (currently commented-out) LaTeX export, which replaces
# each key found in the generated table text with its value.
# The metric keys start with a literal backslash and are written as raw
# strings: '\_' is not a valid Python escape sequence, so the non-raw form
# triggers an invalid-escape warning. The key values themselves are unchanged.
trans_map = {
    'LENC-XGB': 'xgb/',
    'NATIVE-RIPPER': 'rip/',
    'TREES-BRCG': 'brcg/',
    'TREES-CORELS': 'cor/',
    'NATIVE-R2N': 'r2n/',
    r'\_f2': 'f2',
    r'\_recall': 're',
    r'\_precision': 'pr',
    # Insertion order matters for sequential replacement: the longer
    # '\_adj\_bal\_acc' key must come before '\_acc'.
    r'\_adj\_bal\_acc': 'ba',
    r'\_acc': 'acc',
    r'\_nof\_rules': 'nor',
    r'\_sum\_preds': 'sop',
}

# Create table
# One column per (metric, pipeline) pair, grouped by metric, for the core
# suite; rows sorted by the XGBoost f2 column.
# Note: this rebinds the module-level names `pipelines`, `metrics` and `df`.
pipelines = DEFAULT_PIPELINES
metrics = ['f2', 'recall', 'precision', 'adj_bal_acc']  # 'f', 'acc'
cols = ['data_set'] + [p+'_'+m for m in metrics for p in pipelines]
df = df_core_suite[cols]
df = df.sort_values('LENC-XGB_f2', ignore_index=True)
# latex_table = df.to_latex(index=False, longtable=True)
# for x in trans_map:
#         latex_table = latex_table.replace(x, trans_map[x])
# # print(latex_table)
df

Unnamed: 0,data_set,NATIVE-RIPPER_f2,LENC-CART:4_f2,TREES-BRCG_f2,TREES-CORELS_f2,NATIVE-R2N_f2,LENC-XGB_f2,NATIVE-RIPPER_recall,LENC-CART:4_recall,TREES-BRCG_recall,...,TREES-BRCG_precision,TREES-CORELS_precision,NATIVE-R2N_precision,LENC-XGB_precision,NATIVE-RIPPER_adj_bal_acc,LENC-CART:4_adj_bal_acc,TREES-BRCG_adj_bal_acc,TREES-CORELS_adj_bal_acc,NATIVE-R2N_adj_bal_acc,LENC-XGB_adj_bal_acc
0,orange_churn,0.0,0.02,0.0,0.0,,0.03,0.0,0.02,0.0,...,1.0,1.0,,0.47,0.0,0.02,0.0,0.0,,0.03
1,fraud_oracle,0.0,0.04,0.0,0.0,0.0,0.36,0.0,0.04,0.0,...,1.0,1.0,1.0,0.8,0.0,0.03,0.0,0.0,0.0,0.31
2,taiwan_credit,0.37,0.33,0.34,0.34,0.33,0.4,0.33,0.3,0.3,...,0.7,0.7,0.7,0.66,0.28,0.26,0.26,0.26,0.26,0.31
3,german_credit,0.2,0.52,0.23,0.11,0.0,0.44,0.17,0.51,0.2,...,0.68,0.67,1.0,0.51,0.12,0.36,0.16,0.07,0.0,0.26
4,orange_up,0.45,0.48,0.48,0.48,,0.47,0.41,0.43,0.43,...,0.81,0.8,,0.84,0.4,0.42,0.42,0.42,,0.41
5,bike_75,0.57,0.56,0.0,0.0,0.0,0.48,0.6,0.6,0.0,...,1.0,1.0,1.0,0.54,0.6,0.6,0.0,0.0,0.0,0.46
6,bank_marketing,0.42,0.41,0.37,0.37,0.32,0.49,0.4,0.39,0.34,...,0.56,0.58,0.62,0.54,0.35,0.35,0.31,0.31,0.27,0.43
7,compas,0.0,0.57,0.57,0.52,0.52,0.57,0.0,0.55,0.55,...,0.68,0.67,0.67,0.68,0.0,0.32,0.33,0.28,0.28,0.33
8,heloc,0.7,0.65,0.63,0.63,0.82,0.67,0.71,0.64,0.61,...,0.74,0.74,0.49,0.71,0.42,0.42,0.41,0.41,0.02,0.41
9,telco_churn_2,0.66,0.61,0.63,0.65,0.84,0.68,0.65,0.58,0.61,...,0.77,0.75,0.58,0.76,0.37,0.42,0.42,0.42,0.25,0.45


### Performance of rule induction vs. XGBoost

In [1048]:
# f-score of XGBoost, NATIVE-CONST and the derived maxRI* series on the core
# suite; data sets are ordered by the XGBoost f-score column.
main_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+main_metric, ignore_index=True)
single_chart(pipelines=['LENC-XGB', 'NATIVE-CONST', 'maxRI', 'maxRI50', 'maxRI25', 'maxRI10', 'maxRI5'], metric=main_metric, df=sorted_df, df_long_name='core suite')

In [1049]:
# Same comparison for adj_bal_acc, ordered by the XGBoost adj_bal_acc column.
sorted_df = df_core_suite.sort_values('LENC-XGB_adj_bal_acc', ignore_index=True)
single_chart(pipelines=['LENC-XGB', 'NATIVE-CONST', 'maxRI', 'maxRI50', 'maxRI25', 'maxRI10', 'maxRI5'], metric='adj_bal_acc', df=sorted_df, df_long_name='core suite')

In [1050]:
# Same comparison for f2, ordered by the XGBoost f2 column.
sorted_df = df_core_suite.sort_values('LENC-XGB_f2', ignore_index=True)
single_chart(pipelines=['LENC-XGB', 'NATIVE-CONST', 'maxRI', 'maxRI50', 'maxRI25', 'maxRI10', 'maxRI5'], metric='f2', df=sorted_df, df_long_name='core suite')

### Pipelines with standard settings on core suite

In [1051]:
# Default pipelines on the core suite: f-score, number of rules and runtime,
# with data sets ordered by the XGBoost f-score column.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

In [1052]:
# Default pipelines on the core suite: recall, precision, number of rules and
# runtime, with data sets ordered by the XGBoost recall column.
main_metric = 'recall'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+main_metric, ignore_index=True)
quad_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric_1=main_metric, performance_metric_2='precision', complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

In [1053]:
# Default pipelines on the core suite: f2, number of rules and runtime,
# ordered by the XGBoost f2 column.
perf_metric = 'f2'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

In [1054]:
# Default pipelines on the core suite: adj_bal_acc, number of rules and
# runtime, ordered by the XGBoost adj_bal_acc column.
perf_metric = 'adj_bal_acc'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

In [1055]:
# Default pipelines on the core suite: f-score with sum_preds as the
# complexity measure.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='sum_preds', df=sorted_df, df_long_name='core suite')

In [1056]:
# Default pipelines on the core suite: f-score with avg_preds as the
# complexity measure.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=DEFAULT_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='avg_preds', df=sorted_df, df_long_name='core suite')

## Pipelines with non-standard configurations on core suite

### Overview: All pipelines

In [1057]:
# Every pipeline in ALL_RI_PIPELINES on the core suite: f-score, number of
# rules and runtime.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=ALL_RI_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

### CART

In [1058]:
# CART_PIPELINES only: f-score, number of rules and runtime on the core suite.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=CART_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')
# dual_chart(pipelines=CART_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

### BRCG

In [1059]:
# BRCG_PIPELINES only: f-score, number of rules and runtime on the core suite.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=BRCG_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

### CORELS

In [1060]:
# CORELS_PIPELINES only: f-score, number of rules and runtime on the core suite.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=CORELS_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

### RIPPER

In [1061]:
# RIPPER_PIPELINES only: f-score, number of rules and runtime on the core suite.
perf_metric = 'f'
sorted_df = df_core_suite.sort_values('LENC-XGB_'+perf_metric, ignore_index=True)
triple_chart(pipelines=RIPPER_PIPELINES, performance_metric=perf_metric, complexity_metric='nof_rules', df=sorted_df, df_long_name='core suite')

## Synthetic Suites

### Synthetic Imbalance Suite

In [1062]:
# Rows of full_df whose data_set name starts with 'synth_ib'.
synth_imbalance_suite = [name for name in full_df['data_set'] if name.startswith('synth_ib')]
synth_imbalance_suite_df = full_df[full_df['data_set'].isin(synth_imbalance_suite)]
synth_imbalance_suite_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1063]:
# f2-score against pos_ratio (log x-axis) on the synthetic imbalance suite for
# the default pipelines plus two BRCG variants.
# The plotted metric was already f2; perf_metric now says so instead of holding
# an unused 'f'. single_chart() shows the figure itself and returns None, so
# its result is no longer bound to `fig`.
perf_metric = 'f2'
single_chart(pipelines=DEFAULT_RI_PIPELINES + ['QUANTILE-BRCG', 'TREES-BRCG2'],
             metric=perf_metric,
             df=synth_imbalance_suite_df,
             df_long_name='synthetic imbalance suite',
             x_sel='pos_ratio',
             x_title='imbalance (pos ratio)',
             x_log=True)

### Synthetic Noise Suite

In [1064]:
# Rows of full_df whose data_set name starts with 'synth_noise'; copied so a
# column can be added in the next cell without touching full_df.
synth_noise_suite = [name for name in full_df['data_set'] if name.startswith('synth_noise')]
synth_noise_suite_df = full_df[full_df['data_set'].isin(synth_noise_suite)].copy()
synth_noise_suite_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1065]:
def extract(name):
    """Return the third '_'-separated field of `name` read as the decimals of a fraction.

    The field is prefixed with '0.' and parsed as a float, e.g.
    'synth_noise_05' -> 0.05.
    """
    digits = name.split('_')[2]
    return float('0.' + digits)

# Derive the x value ('error_rate') from each data set name, then plot accuracy
# and number of rules against it.
synth_noise_suite_df['error_rate'] = synth_noise_suite_df['data_set'].map(extract)
dual_chart(pipelines=DEFAULT_RI_PIPELINES+['QUANTILE-BRCG', 'LENC-CART:*'],
           performance_metric='acc',
           complexity_metric='nof_rules',
           df=synth_noise_suite_df,
           df_long_name='synthetic noise suite',
           x_sel='error_rate',
           x_title='error rate')

### Synthetic Disjunctive Complexity Suite

In [1066]:
# Rows of full_df whose data_set name starts with 'synth_disj_' (copied so a
# column can be added later).
suite = [name for name in full_df['data_set'] if name.startswith('synth_disj_')]
disj_df = full_df[full_df['data_set'].isin(suite)].copy()
disj_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1067]:
def extract(name):
    """Return the third '_'-separated field of `name` as an int, e.g. 'synth_disj_8' -> 8."""
    fields = name.split('_')
    return int(fields[2])

# Derive 'label_complexity' from the data set name, sort by it, and plot
# accuracy and number of rules against it on a log x-axis.
disj_df['label_complexity'] = disj_df['data_set'].map(extract)
disj_df = disj_df.sort_values('label_complexity', ignore_index=True)

dual_chart(pipelines=['LENC-CART:*', 'TREES2-BRCG'] + DEFAULT_RI_PIPELINES,
           performance_metric='acc',
           complexity_metric='nof_rules',
           df=disj_df,
           df_long_name='synthetic disjunctive complexity suite',
           x_sel='label_complexity',
           x_title='number of rules used to create label',
           x_log=True)

# single_chart(pipelines=['LENC-XGB', 'LENC-CART:*', 'TREES2-BRCG'] + DEFAULT_RI_PIPELINES,
#              metric='acc',
#              df=disj_df,
#              df_long_name='synthetic disjunctive complexity suite',
#              x_sel='label_complexity',
#              x_title='number of rules used to create label',
#              x_log=True)


### Synthetic Linear Complexity Suite

In [1068]:
# Rows of full_df whose data_set name starts with 'synth_linear_' (copied so a
# column can be added later).
synth_linear_suite = [name for name in full_df['data_set'] if name.startswith('synth_linear_')]
synth_linear_suite_df = full_df[full_df['data_set'].isin(synth_linear_suite)].copy()
synth_linear_suite_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1069]:
def extract(name):
    """Return the third '_'-separated field of `name` as an int, plus one, e.g. 'synth_linear_3' -> 4."""
    fields = name.split('_')
    return int(fields[2]) + 1

# Derive 'label_complexity' from the data set name, sort by it, and plot
# accuracy and sum of predicates against it (linear x-axis).
synth_linear_suite_df['label_complexity'] = synth_linear_suite_df['data_set'].map(extract)
synth_linear_suite_df = synth_linear_suite_df.sort_values('label_complexity', ignore_index=True)

dual_chart(pipelines=['LENC-CART:*'] + DEFAULT_RI_PIPELINES,
           performance_metric='acc',
           complexity_metric='sum_preds',
           df=synth_linear_suite_df,
           df_long_name='synthetic linear complexity suite',
           x_sel='label_complexity',
           x_title='number of features used in sum to create label',
           x_log=False)

# single_chart(pipelines=['LENC-XGB', 'LENC-CART:*'] + DEFAULT_RI_PIPELINES,
#              metric='acc',
#              df=synth_linear_suite_df,
#              df_long_name='synthetic linear complexity suite',
#              x_sel='label_complexity',
#              x_title='number of features used in sum to create label',
#              x_log=False)


### Synthetic Conjunctive Complexity Suite

In [1070]:
# Rows of full_df whose data_set name starts with 'synth_conj_' (copied so a
# column can be added later).
synth_conj_suite = [name for name in full_df['data_set'] if name.startswith('synth_conj_')]
synth_conj_suite_df = full_df[full_df['data_set'].isin(synth_conj_suite)].copy()
synth_conj_suite_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1071]:
def extract(name):
    """Return the third '_'-separated field of `name` as an int, plus one, e.g. 'synth_conj_3' -> 4."""
    fields = name.split('_')
    return int(fields[2]) + 1

# Derive 'label_complexity' from the data set name, sort by it, and plot
# accuracy and maximum of predicates against it on a log x-axis.
synth_conj_suite_df['label_complexity'] = synth_conj_suite_df['data_set'].map(extract)
synth_conj_suite_df = synth_conj_suite_df.sort_values('label_complexity', ignore_index=True)

dual_chart(pipelines=['LENC-CART:*', 'TREES2-BRCG'] + DEFAULT_RI_PIPELINES,
           performance_metric='acc',
           complexity_metric='max_preds',
           df=synth_conj_suite_df,
           df_long_name='synthetic conjunctive complexity suite',
           x_sel='label_complexity',
           x_title='number of conjuncts used to create label',
           x_log=True)

# single_chart(pipelines=['LENC-XGB', 'LENC-CART:*', 'TREES2-BRCG'] + DEFAULT_RI_PIPELINES,
#              metric='acc',
#              df=synth_conj_suite_df,
#              df_long_name='synthetic conjunctive complexity suite',
#              x_sel='label_complexity',
#              x_title='number of conjuncts used to create label',
#              x_log=True)


### Synthetic Size Variation Suite

In [1072]:
# Rows of full_df whose data_set name starts with 'synth_size_' (copied so a
# column can be added later).
synth_size_suite = [name for name in full_df['data_set'] if name.startswith('synth_size_')]
synth_size_suite_df = full_df[full_df['data_set'].isin(synth_size_suite)].copy()
synth_size_suite_df

Unnamed: 0,data_set,nof_rows,nof_col,nof_num_features,nof_cat_features,nof_pos,pos_ratio,use_case,origin,LENC_nof_bin_cols,...,maxRI_recall,maxRI5_recall,maxRI10_recall,maxRI25_recall,maxRI50_recall,maxRI_precision,maxRI5_precision,maxRI10_precision,maxRI25_precision,maxRI50_precision


In [1073]:
def extract(name):
    """Return the third '_'-separated field of `name` as an int, plus one, e.g. 'synth_size_999' -> 1000."""
    fields = name.split('_')
    return int(fields[2]) + 1

# Derive the x value from the data set name (stored under 'label_complexity',
# shown as "number of rows in data set"), sort by it, and plot accuracy,
# number of rules and runtime for all pipelines on a log x-axis.
# NOTE(review): extract() adds 1 to the number parsed from the name -- confirm
# that this offset is intended for a row count.
synth_size_suite_df['label_complexity'] = synth_size_suite_df['data_set'].map(extract)
synth_size_suite_df = synth_size_suite_df.sort_values('label_complexity', ignore_index=True)

triple_chart(pipelines=ALL_RI_PIPELINES,
           performance_metric='acc',
           complexity_metric='nof_rules',
           df=synth_size_suite_df,
           df_long_name='synthetic size variation suite',
           x_sel='label_complexity',
           x_title='number of rows in data set',
           x_log=True)

## Trade-Off Predictive Performance vs. Model Complexity Chart for a single data set

In [1074]:
# Scatter plot for one data set: one point per pipeline, x = sum of predicates
# (log axis), y = f-score.
# Note: this cell rebinds the module-level names `df`, `pipelines`, `metric`
# and `metrics`.
data_set = 'electricity'
df = full_df[full_df['data_set'] == data_set]
# print(df)

pipelines = ALL_RI_PIPELINES # , 'QUANTILE-BRCG', 'QUANTILE-CORELS'
metric = 'f' # adj_bal_acc

metrics = [ p + '_' + metric for p in pipelines]
# Despite the name, df_f2 holds the columns of whatever `metric` is set above.
df_f2 = df[metrics]
# print(df_f2.head())
# Index label of the first matching row; raises IndexError when no row matches.
data_set_key = df_f2.index[0]

# Reduce each '<pipeline>_<metric>' column name to the text before the first
# underscore (the pipeline names in ALL_RI_PIPELINES contain no underscore),
# then transpose so pipelines become the index.
column_transform = {x:x.split('_')[0] for x in df_f2.columns.to_list()}
df_f2 = df_f2.rename(columns=column_transform).transpose()

# print(df_f2.head())

metric = 'sum_preds' # nof_rules sum_preds
# metrics = ['NATIVE-RIPPER_sum_preds', 'TREES-BRCG_sum_preds', 'TREES-CORELS_sum_preds', 'NATIVE-R2N_sum_preds', 'TREES-BRCG2_sum_preds']
metrics = [ p + '_' + metric for p in pipelines]

# Same reshaping for the complexity columns.
df_sum_preds = df[metrics]
column_transform = {x:x.split('_')[0] for x in df_sum_preds.columns.to_list()}
df_sum_preds = df_sum_preds.rename(columns=column_transform).transpose()

# print(df_sum_preds.head())
# print(df_sum_preds.index)


# Combine both measures into one frame indexed by pipeline name.
df_scatter = pd.DataFrame()
df_scatter['f_score'] = df_f2[data_set_key]
df_scatter['sum_preds'] = df_sum_preds[data_set_key]
# The result of this call is discarded (it is not the cell's last expression).
df_scatter.head()

# Colours and symbols are taken from style_map in index order so each pipeline
# keeps the styling used in the line charts.
fig = px.scatter(df_scatter, x="sum_preds", y="f_score", 
                color=df_scatter.index,
                symbol=df_scatter.index,
                color_discrete_sequence=[style_map[p][0] for p in df_scatter.index],  # ['crimson', 'limegreen', 'orange', 'blue' ]
                symbol_sequence=[style_map[p][2] for p in df_scatter.index],
                title="Trade-off between f-score and sum of predicates for {} set".format(data_set)) #  color="species",
fig.update_xaxes(type="log")
fig.update_layout(height=600, width=1000)
fig.show()