In [51]:
import pandas as pd
import pm4py
from logview.utils import LogViewBuilder
from logview.predicate import Query, EqToConstant, NotEqToConstant, GreaterEqualToConstant, LessThanConstant, StartWith, EndWith, DurationWithin
import plotly.express as px
import time
from datetime import timedelta
import string
import plotly.graph_objects as go
from plotly.colors import sample_colorscale
import numpy as np



In [52]:
# Load data
bpi_data = pd.read_csv("C:/Users/cshek/OneDrive/Bureaublad/Thesis/BPI_Challenge_2017.csv", sep=',', quotechar='"')
bpi_data.columns = bpi_data.columns.str.strip()
bpi_data['time'] = pd.to_datetime(bpi_data['time'], format='%Y/%m/%d %H:%M:%S.%f')
log = pm4py.format_dataframe(bpi_data, case_id='case', activity_key='event', timestamp_key='time')
display(log)

# Build LogView
log_view = LogViewBuilder.build_log_view(log)

Unnamed: 0,case,event,time,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,MonthlyCost,org:resource,Selected,...,Accepted,CreditScore,NumberOfTerms,EventOrigin,OfferedAmount,case:concept:name,concept:name,time:timestamp,@@index,@@case_index
0,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,0,0
1,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,1,0
2,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,2,0
3,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,WITHDRAW,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,3,0
4,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,RESUME,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,1202262,31508
1202263,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,SUSPEND,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,1202263,31508
1202264,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Offer,,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,1202264,31508
1202265,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Application,,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,1202265,31508


In [53]:
# CreditScore ‚â• 600
query_1 = Query('GoodCredit', [GreaterEqualToConstant('CreditScore', 600)])
result_set_1, complement_1 = log_view.evaluate_query('rs_GoodCredit', log, query_1)
log_view.label_result_set(result_set_1, 'GoodCredit')

# RequestedAmount ‚â• 10000
query_2 = Query('LoanOverThreshold', [GreaterEqualToConstant('RequestedAmount', 10000)])
result_set_2, complement_2 = log_view.evaluate_query('rs_LoanOverThreshold', result_set_1, query_2)
log_view.label_result_set(result_set_2, 'LoanOverThreshold')

# RequestedAmount < 15000
query_3 = Query('SmallAmount', [LessThanConstant('RequestedAmount', 15000)])
result_set_3, complement_3 = log_view.evaluate_query('rs_SmallAmount', result_set_2, query_3)
log_view.label_result_set(result_set_3, 'SmallAmount')

# ApplicationType = 'New credit'
query_4 = Query('IsNewCredit', [EqToConstant('ApplicationType', 'New credit')])
result_set_4, complement_4 = log_view.evaluate_query('rs_IsNewCredit', result_set_3, query_4)
log_view.label_result_set(result_set_4, 'IsNewCredit')

# Starts with A_Create Application
query_5 = Query('StartWithCreate', [StartWith(['A_Create Application'])])
result_set_5, complement_5 = log_view.evaluate_query('rs_StartWithCreate', log, query_5)
log_view.label_result_set(result_set_5, 'StartWithCreate')

# Duration between 2 and 7 days
query_6 = Query('ModerateDuration', [DurationWithin(172800, 604800)])
result_set_6, complement_6 = log_view.evaluate_query('rs_ModerateDuration', log, query_6)
log_view.label_result_set(result_set_6, 'ModerateDuration')

summary = log_view.get_summary()
summary

+----+----------------------+-------------------+----------------------+-----------------------+
|    | source_log           | query             | result_set           | labels                |
|----+----------------------+-------------------+----------------------+-----------------------|
|  0 | initial_source_log   | GoodCredit        | rs_GoodCredit        | ['GoodCredit']        |
|  1 | rs_GoodCredit        | LoanOverThreshold | rs_LoanOverThreshold | ['LoanOverThreshold'] |
|  2 | rs_LoanOverThreshold | SmallAmount       | rs_SmallAmount       | ['SmallAmount']       |
|  3 | rs_SmallAmount       | IsNewCredit       | rs_IsNewCredit       | ['IsNewCredit']       |
|  4 | initial_source_log   | StartWithCreate   | rs_StartWithCreate   | ['StartWithCreate']   |
|  5 | initial_source_log   | ModerateDuration  | rs_ModerateDuration  | ['ModerateDuration']  |
+----+----------------------+-------------------+----------------------+-----------------------+
+----+-------------------+----

{'evaluations':              source_log              query            result_set  \
 0    initial_source_log         GoodCredit         rs_GoodCredit   
 1         rs_GoodCredit  LoanOverThreshold  rs_LoanOverThreshold   
 2  rs_LoanOverThreshold        SmallAmount        rs_SmallAmount   
 3        rs_SmallAmount        IsNewCredit        rs_IsNewCredit   
 4    initial_source_log    StartWithCreate    rs_StartWithCreate   
 5    initial_source_log   ModerateDuration   rs_ModerateDuration   
 
                 labels  
 0         [GoodCredit]  
 1  [LoanOverThreshold]  
 2        [SmallAmount]  
 3        [IsNewCredit]  
 4    [StartWithCreate]  
 5   [ModerateDuration]  ,
 'queries':                query                              predicates
 0         GoodCredit                    (CreditScore >= 600)
 1  LoanOverThreshold              (RequestedAmount >= 10000)
 2        SmallAmount               (RequestedAmount < 15000)
 3        IsNewCredit   (ApplicationType in { 'New credit'

In [54]:
def get_lineage(registry, result_set_name):
    """
    Given a registry and a result_set name, return a filtered DataFrame
    showing the lineage of how that result_set was derived.
    """
    evaluations = registry['evaluations']
    
    lineage_rows = []

    def trace_back(current_result_set):
        for _, row in evaluations.iterrows():
            if row['result_set'] == current_result_set:
                lineage_rows.append(row)
                trace_back(row['source_log'])

    trace_back(result_set_name)
    
    # Reverse the result to show forward lineage
    lineage_df = pd.DataFrame(lineage_rows[::-1])
    return lineage_df

def add_case_durations(df):
    grouped = df.groupby("case:concept:name")["time:timestamp"]
    durations = (grouped.max() - grouped.min()).dt.total_seconds()
    df = df.copy()
    df["case_duration"] = df["case:concept:name"].map(durations)
    return df

def add_event_counts(df):
    event_counts = df.groupby("case:concept:name").size()
    df = df.copy()
    df["num_events"] = df["case:concept:name"].map(event_counts)
    return df

def add_avg_time_between_events(df):
    def avg_diff(x):
        diffs = x.sort_values().diff().dropna()
        return diffs.mean().total_seconds() if len(diffs) > 0 else 0
    avg_diffs = df.groupby("case:concept:name")["time:timestamp"].apply(avg_diff)
    df = df.copy()
    df["avg_time_between_events"] = df["case:concept:name"].map(avg_diffs)
    return df

def precompute_case_durations(log_df):
    """
    Adds a 'case_duration' column to the original log dataframe.
    """
    case_durations = (
        log_df.groupby("case:concept:name")["time:timestamp"]
        .agg(["min", "max"])
        .apply(lambda row: (row["max"] - row["min"]).total_seconds(), axis=1)
    )
    log_df = log_df.copy()
    log_df["case_duration"] = log_df["case:concept:name"].map(case_durations)
    return log_df

def format_seconds(seconds):
    if pd.isna(seconds):
        return "N/A"
    seconds = int(seconds)
    days, rem = divmod(seconds, 86400)
    hours, rem = divmod(rem, 3600)
    minutes = rem // 60
    parts = []
    if days: parts.append(f"{days}d")
    if hours: parts.append(f"{hours}h")
    if minutes: parts.append(f"{minutes}m")
    return " ".join(parts) if parts else "0m"

def compute_hover_data(df, metric):
    import numpy as np

    path_cols = [col for col in df.columns if col.startswith("Level")]
    
    # Compute ID and parent ID using cleaned labels
    def clean(label):
        return label.replace("üü° ", "").strip()

    cleaned_path = df[path_cols].map(clean)
    df["id"] = cleaned_path.agg(" ‚Üí ".join, axis=1)
    df["parent_id"] = cleaned_path.shift(axis=1).fillna("").agg(" ‚Üí ".join, axis=1)
    
    df.loc[df["parent_id"].str.strip() == "", "parent_id"] = None

    # Create lookup table for parent metrics
    parent_lookup = df.set_index("id")[[metric, "num_cases"]].to_dict("index")

    hover_cases = []
    hover_pct = []
    hover_metric = []
    hover_delta = []

    for _, row in df.iterrows():
        cur_val = row[metric]
        cur_cases = row["num_cases"]
        parent_info = parent_lookup.get(row["parent_id"])

        # Cases
        hover_cases.append(f"{cur_cases:,}")

        # Percentage of parent
        if parent_info and parent_info["num_cases"] > 0:
            pct = (cur_cases / parent_info["num_cases"]) * 100
            hover_pct.append(f"{pct:.1f}%")
        else:
            hover_pct.append("100%")

        # Metric
        if "duration" in metric or "time" in metric:
            # Format time in human readable form
            sec = cur_val
            d = int(sec // 86400)
            h = int((sec % 86400) // 3600)
            m = int((sec % 3600) // 60)
            hover_metric.append(f"{d}d {h}h {m}m")
        else:
            hover_metric.append(f"{cur_val:.2f}")

        # Delta
        if parent_info:
            diff = cur_val - parent_info[metric]
            if "duration" in metric or "time" in metric:
                sign = "-" if diff < 0 else "+"
                diff = abs(diff)
                d = int(diff // 86400)
                h = int((diff % 86400) // 3600)
                m = int((diff % 3600) // 60)
                hover_delta.append(f"{sign}{d}d {h}h {m}m")
            else:
                sign = "-" if diff < 0 else "+"
                hover_delta.append(f"{sign}{abs(diff):.2f}")
        else:
            hover_delta.append("‚Äî")

    df["hover_cases"] = hover_cases
    df["hover_pct"] = hover_pct
    df["hover_metric"] = hover_metric
    df["hover_delta"] = hover_delta
    return df

def compute_case_stats(df, name, label_path, metric):
    if df.empty:
        return {
            "subset_name": name,
            "label_path": " ‚Üí ".join(label_path),
            "num_cases": 0,
            metric: 0
        }

    df = df.drop_duplicates("case:concept:name")

    column = {
        "avg_case_duration_seconds": "case_duration",
        "avg_events_per_case": "num_events",
        "avg_time_between_events": "avg_time_between_events"
    }[metric]

    return {
        "subset_name": name,
        "label_path": " ‚Üí ".join(label_path),
        "num_cases": df["case:concept:name"].nunique(),
        metric: df[column].mean()
    }

def split_subsets(subsets, query_obj, filter_label, step_index, query_evaluator, filter_cache):
    new_subsets = []

    for subset in subsets:
        subset_df = subset["df"]
        subset_name = subset["name"]
        path = subset["label_path"]
        order_path = subset.get("order_path", [])  # track ordering

        cache_key = (subset_name, query_obj.name)
        if cache_key in filter_cache:
            df_filtered, df_complement = filter_cache[cache_key]
        else:
            df_filtered, df_complement = query_evaluator.evaluate(subset_df, query_obj)
            filter_cache[cache_key] = (df_filtered, df_complement)

        new_subsets.append({
            "df": df_filtered,
            "name": f"{subset_name}_F{step_index+1}",
            "label_path": path + [f"{filter_label} ‚úì"],
            "order_path": order_path + [0]
        })
        new_subsets.append({
            "df": df_complement,
            "name": f"{subset_name}_C{step_index+1}",
            "label_path": path + [f"{filter_label} ‚úó"],
            "order_path": order_path + [1]
        })

    return new_subsets

def recursively_apply_filters(selected_sequence_df, log_view, metric):
    initial_log_name = selected_sequence_df.iloc[0]['source_log']
    base_df = log_view.result_set_name_cache[initial_log_name]

    if metric == "avg_case_duration_seconds":
        initial_df = add_case_durations(base_df)
    elif metric == "avg_events_per_case":
        initial_df = add_event_counts(base_df)
    elif metric == "avg_time_between_events":
        initial_df = add_avg_time_between_events(base_df)
    else:
        raise ValueError(f"Unsupported metric: {metric}")

    main_path_leaf = None

    current_subsets = [{
        "df": initial_df,
        "name": initial_log_name,
        "label_path": ["Initial Source"],
        "order_path": [],
        "is_main_path": True
    }]

    filter_cache = {}

    query_map = {
        evaluation["query"].name: evaluation["query"]
        for result_set_id in log_view.query_registry.get_registered_result_set_ids()
        for evaluation in [log_view.query_registry.get_evaluation(result_set_id)]
    }

    query_expression_map = {
        evaluation["query"].name: evaluation["query"].as_string()
        for result_set_id in log_view.query_registry.get_registered_result_set_ids()
        for evaluation in [log_view.query_registry.get_evaluation(result_set_id)]
    }

    for i, row in selected_sequence_df.iterrows():
        query_obj = query_map.get(row["query"])
        query_expr = query_expression_map.get(row["query"], row["labels"])
        next_subsets = []

        for subset in current_subsets:
            df = subset["df"]
            if df.empty:
                continue

            path = subset["label_path"]
            order_path = subset["order_path"]
            is_main = subset["is_main_path"]

            cache_key = (subset["name"], query_obj.name)
            if cache_key in filter_cache:
                df_filtered, df_complement = filter_cache[cache_key]
            else:
                df_filtered, df_complement = log_view.query_evaluator.evaluate(df, query_obj)
                filter_cache[cache_key] = (df_filtered, df_complement)

            next_subsets.append({
                "df": df_filtered,
                "name": f"{subset['name']}_F{i+1}",
                "label_path": path + [f"{query_expr} ‚úî"],
                "order_path": order_path + [0],
                "is_main_path": is_main
            })
            next_subsets.append({
                "df": df_complement,
                "name": f"{subset['name']}_C{i+1}",
                "label_path": path + [f"{query_expr} ‚úò"],
                "order_path": order_path + [1],
                "is_main_path": False
            })

        current_subsets = next_subsets

    for subset in current_subsets:
        if subset["is_main_path"] and not subset["df"].empty:
            main_path_leaf = subset
            break

    main_path_label_set = set()
    if main_path_leaf:
        for i in range(len(main_path_leaf["label_path"])):
            main_path_label_set.add(tuple(main_path_leaf["label_path"][:i + 1]))

    result_rows = []
    for subset in current_subsets:
        df = subset["df"]
        if df.empty:
            continue

        label_path = subset["label_path"]
        display_path = []

        for i in range(len(label_path)):
            path_prefix = tuple(label_path[:i + 1])
            if path_prefix in main_path_label_set:
                display_path.append("üü° " + label_path[i])
            else:
                display_path.append(label_path[i])

        stats = compute_case_stats(df, subset["name"], display_path, metric)
        row = {
            **{f"Level{i+1}": label for i, label in enumerate(display_path)},
            "num_cases": stats["num_cases"],
            metric: stats[metric],
            "order_path": subset["order_path"]
        }
        result_rows.append(row)

    result_df = pd.DataFrame(result_rows)
    result_df = result_df.sort_values(by="order_path")

    return result_df, main_path_leaf["label_path"] if main_path_leaf else []

def icicle(result_set_name, metric="avg_case_duration_seconds", show_time=False):
    import time
    import plotly.express as px

    times = {}
    start_all = time.time()

    # Get lineage
    t0 = time.time()
    lineage = get_lineage(log_view.query_registry.summary(), result_set_name)
    times['lineage'] = time.time() - t0

    # Compute filtered subsets and stats
    t1 = time.time()
    icicle_df, main_path = recursively_apply_filters(lineage, log_view, metric=metric)
    icicle_df = compute_hover_data(icicle_df, metric)
    times['apply_filters'] = time.time() - t1

    # Define color settings
    color_schemes = {
        "avg_case_duration_seconds": "Blues",
        "avg_events_per_case": "Reds",
        "avg_time_between_events": "Greens"
    }
    color_labels = {
        "avg_case_duration_seconds": "Avg Case Duration (s)",
        "avg_events_per_case": "Avg Events per Case",
        "avg_time_between_events": "Avg Time Between Events (s)"
    }

    # Plot icicle
    fig = px.icicle(
        icicle_df,
        path=[col for col in icicle_df.columns if col.startswith("Level")],
        values="num_cases",
        color=metric,
        custom_data=["hover_cases", "hover_pct", "hover_metric", "hover_delta"],
        color_continuous_scale=color_schemes.get(metric, "Blues"),
        title=f"Icicle Chart for: {result_set_name}"
    )

    # Update hovertemplate
    fig.update_traces(
        hovertemplate=(
            "<b>%{label}</b><br>" +
            "Cases: %{customdata[0]}<br>" +
            "Share of parent: %{customdata[1]}<br>" +
            f"{color_labels.get(metric, metric)}: " + "%{customdata[2]}<br>" +
            "Œî from parent: %{customdata[3]}<extra></extra>"
        )
    )

    # Update layout
    fig.update_layout(
        margin=dict(t=40, l=0, r=0, b=0),
        coloraxis_colorbar=dict(title=color_labels.get(metric, metric))
    )

    # Show
    fig.show()
    times['plot'] = time.time() - t1
    times['total'] = time.time() - start_all

    # Show main path
    if main_path:
        print("\nMain Analysis:", " ‚Üí ".join(f"üü° {step}" for step in main_path))

    # Show timing
    if show_time:
        print("\n--- Execution Time (seconds) ---")
        for k, v in times.items():
            print(f"{k:25}: {v:.4f}")

def get_sibling_subsets(result_set_name, log_view):
    lineage_df = get_lineage(log_view.query_registry.summary(), result_set_name)
    if len(lineage_df) < 1:
        raise ValueError("Lineage not found.")

    last_query_row = lineage_df.iloc[-1]
    parent_log = last_query_row["source_log"]
    query_name = last_query_row["query"]
    label = last_query_row["labels"]
    step_index = len(lineage_df) - 1

    # Lookup actual query object from registry
    query_obj = None
    for rs_id in log_view.query_registry.get_registered_result_set_ids():
        evaluation = log_view.query_registry.get_evaluation(rs_id)
        if evaluation["query"].name == query_name:
            query_obj = evaluation["query"]
            break

    if query_obj is None:
        raise ValueError(f"Query object for '{query_name}' not found.")

    return parent_log, query_obj, label, step_index, lineage_df

def pie(result_set_name, metric="avg_case_duration_seconds"):
    from plotly.colors import sample_colorscale

    # Get lineage and query
    parent_log, query_obj, label, step_index, lineage_df = get_sibling_subsets(result_set_name, log_view)

    # Metric setup
    full_log = log_view.query_registry.get_initial_source_log()
    if metric == "avg_case_duration_seconds":
        full_log = add_case_durations(full_log)
        value_col = "case_duration"
        color_scheme = "Blues"
        color_title = "Avg Duration (s)"
    elif metric == "avg_events_per_case":
        full_log = add_event_counts(full_log)
        value_col = "num_events"
        color_scheme = "Reds"
        color_title = "Avg Events/Case"
    elif metric == "avg_time_between_events":
        full_log = add_avg_time_between_events(full_log)
        value_col = "avg_time_between_events"
        color_scheme = "Greens"
        color_title = "Avg Time Between Events (s)"
    else:
        raise ValueError("Unsupported metric")

    # Evaluate last query
    filtered, _ = log_view.query_evaluator.evaluate(full_log, query_obj)

    # Get query expressions
    query_expr_map = {
        ev["query"].name: ev["query"].as_string()
        for rs_id in log_view.query_registry.get_registered_result_set_ids()
        for ev in [log_view.query_registry.get_evaluation(rs_id)]
    }

    # Assign lineage path to cases
    case_paths = {}
    for _, row in lineage_df.iterrows():
        qname = row["query"]
        qexpr = query_expr_map.get(qname, qname)
        step_obj = next(
            (log_view.query_registry.get_evaluation(rs)["query"]
             for rs in log_view.query_registry.get_registered_result_set_ids()
             if log_view.query_registry.get_evaluation(rs)["query"].name == qname),
            None
        )
        if step_obj:
            passed_df, _ = log_view.query_evaluator.evaluate(full_log, step_obj)
            passed_cases = set(passed_df["case:concept:name"])
            for cid in filtered["case:concept:name"].unique():
                case_paths.setdefault(cid, [])
                case_paths[cid].append(f"{qexpr} ‚úÖ" if cid in passed_cases else f"{qexpr} ‚ùå")

    filtered = filtered.copy()
    filtered["path_label"] = filtered["case:concept:name"].map(
        lambda cid: " ‚Üí ".join(case_paths.get(cid, []))
    )

    # Aggregate
    grouped = (
        filtered.groupby("path_label")[["case:concept:name", value_col]]
        .agg(num_cases=("case:concept:name", "nunique"), avg_metric=(value_col, "mean"))
        .reset_index()
    )

    # Normalize color
    min_val, max_val = grouped["avg_metric"].min(), grouped["avg_metric"].max()
    normed = (grouped["avg_metric"] - min_val) / (max_val - min_val + 1e-9)
    color_values = sample_colorscale(color_scheme, normed.tolist())

    # Clean slice labels
    total_cases = grouped["num_cases"].sum()
    grouped["slice_label"] = grouped.apply(
        lambda r: f"{int(r['num_cases']):,} cases ({r['num_cases'] / total_cases:.0%})", axis=1
    )

    # Build chart
    fig = go.Figure(
        data=[go.Pie(
            labels=grouped["slice_label"],
            values=grouped["num_cases"],
            textinfo="label",
            customdata=grouped[["path_label", "num_cases", "avg_metric"]],
            hovertemplate=(
            "<b>%{customdata[0]}</b><br>" +
            "Cases: %{customdata[1]:,}<br>" +
            color_title + ": %{customdata[2]:.2f}<extra></extra>"
            ),
            marker=dict(colors=color_values)
        )]
    )

    # Add colorbar
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(
            colorscale=color_scheme,
            cmin=min_val,
            cmax=max_val,
            colorbar=dict(title=color_title, len=0.8, thickness=15),
            color=[min_val],
            showscale=True
        ),
        hoverinfo='none',
        showlegend=False
    ))

    fig.update_layout(
        title=f"Breakdown of Filter: {query_obj.as_string()}",
        width=800,
        height=700,
        showlegend=False,
        paper_bgcolor='white',
        plot_bgcolor='white',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )

    fig.show()

    # Print details
    print("\nFilter Paths:\n")
    for _, row in grouped.iterrows():
        print(f"- {row['slice_label']}: {row['path_label']} | Avg: {row['avg_metric']:.2f}")

In [55]:
icicle(result_set_name='rs_IsNewCredit', metric='avg_case_duration_seconds')
pie("rs_IsNewCredit", metric="avg_case_duration_seconds")


Main Analysis: üü° Initial Source ‚Üí üü° (CreditScore >= 600) ‚úî ‚Üí üü° (RequestedAmount >= 10000) ‚úî ‚Üí üü° (RequestedAmount < 15000) ‚úî ‚Üí üü° (ApplicationType in { 'New credit' }) ‚úî



Filter Paths:

- 2,461 cases (9%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 1690296.09
- 7,288 cases (26%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 1952329.13
- 4,976 cases (18%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 1625787.80
- 2,158 cases (8%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 2296449.44
- 5,845 cases (21%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 2392653.21
- 5,392 cases (19%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (Reque

In [56]:
icicle(result_set_name='rs_IsNewCredit', metric='avg_events_per_case')
pie("rs_IsNewCredit", metric="avg_events_per_case")


Main Analysis: üü° Initial Source ‚Üí üü° (CreditScore >= 600) ‚úî ‚Üí üü° (RequestedAmount >= 10000) ‚úî ‚Üí üü° (RequestedAmount < 15000) ‚úî ‚Üí üü° (ApplicationType in { 'New credit' }) ‚úî



Filter Paths:

- 2,461 cases (9%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 51.00
- 7,288 cases (26%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 54.12
- 4,976 cases (18%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 50.12
- 2,158 cases (8%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 34.30
- 5,845 cases (21%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 36.01
- 5,392 cases (19%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (RequestedAmount < 15000) ‚úÖ ‚

In [57]:
icicle(result_set_name='rs_IsNewCredit', metric='avg_time_between_events')
pie("rs_IsNewCredit", metric="avg_time_between_events")


Main Analysis: üü° Initial Source ‚Üí üü° (CreditScore >= 600) ‚úî ‚Üí üü° (RequestedAmount >= 10000) ‚úî ‚Üí üü° (RequestedAmount < 15000) ‚úî ‚Üí üü° (ApplicationType in { 'New credit' }) ‚úî



Filter Paths:

- 2,461 cases (9%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 34382.90
- 7,288 cases (26%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 37496.79
- 4,976 cases (18%): (CreditScore >= 600) ‚úÖ ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 33576.44
- 2,158 cases (8%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚úÖ ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 80848.06
- 5,845 cases (21%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚úÖ ‚Üí (RequestedAmount < 15000) ‚ùå ‚Üí (ApplicationType in { 'New credit' }) ‚úÖ | Avg: 80560.97
- 5,392 cases (19%): (CreditScore >= 600) ‚ùå ‚Üí (RequestedAmount >= 10000) ‚ùå ‚Üí (RequestedAmount