In [66]:
import os
import time

import pandas as pd
import plotly.express as px
import pm4py

from logview.predicate import Query, EqToConstant, NotEqToConstant, GreaterEqualToConstant, LessThanConstant, StartWith, EndWith, DurationWithin
from logview.utils import LogViewBuilder

In [67]:
# Load data
# The CSV location can be overridden through the BPI_2017_CSV environment variable so the
# notebook is not tied to one machine; the default is the path used originally.
BPI_CSV_PATH = os.environ.get(
    "BPI_2017_CSV",
    "C:/Users/cshek/OneDrive/Bureaublad/Thesis/BPI_Challenge_2017.csv",
)
bpi_data = pd.read_csv(BPI_CSV_PATH, sep=',', quotechar='"')
# Strip surrounding whitespace from the header names before columns are referenced by name.
bpi_data.columns = bpi_data.columns.str.strip()
bpi_data['time'] = pd.to_datetime(bpi_data['time'], format='%Y/%m/%d %H:%M:%S.%f')
# Map the raw 'case' / 'event' / 'time' columns onto the pm4py column names
# ('case:concept:name', 'concept:name', 'time:timestamp') used by the helpers below.
log = pm4py.format_dataframe(bpi_data, case_id='case', activity_key='event', timestamp_key='time')
display(log)

# Build LogView
log_view = LogViewBuilder.build_log_view(log)

Unnamed: 0,case,event,time,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,MonthlyCost,org:resource,Selected,...,Accepted,CreditScore,NumberOfTerms,EventOrigin,OfferedAmount,case:concept:name,concept:name,time:timestamp,@@index,@@case_index
0,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,0,0
1,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,1,0
2,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,2,0
3,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,WITHDRAW,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,3,0
4,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,RESUME,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,1202262,31508
1202263,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,SUSPEND,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,1202263,31508
1202264,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Offer,,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,1202264,31508
1202265,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Application,,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,1202265,31508


In [68]:
def _evaluate_and_label(label, predicates, source):
    """
    Build a query named `label`, evaluate it on `source` and label the matches.

    The evaluation is submitted to the notebook-level `log_view` under the
    result-set name 'rs_<label>', and the matching result set is labelled
    with `label`.

    Returns the tuple (query, result_set, complement).
    """
    query = Query(label, predicates)
    result_set, complement = log_view.evaluate_query(f'rs_{label}', source, query)
    log_view.label_result_set(result_set, label)
    return query, result_set, complement


# Queries 1-4 form a chain: each one is evaluated on the result set of the previous one.

# CreditScore ≥ 600
query_1, result_set_1, complement_1 = _evaluate_and_label(
    'GoodCredit', [GreaterEqualToConstant('CreditScore', 600)], log)

# RequestedAmount ≥ 10000
query_2, result_set_2, complement_2 = _evaluate_and_label(
    'LoanOverThreshold', [GreaterEqualToConstant('RequestedAmount', 10000)], result_set_1)

# RequestedAmount < 15000
query_3, result_set_3, complement_3 = _evaluate_and_label(
    'SmallAmount', [LessThanConstant('RequestedAmount', 15000)], result_set_2)

# ApplicationType = 'New credit'
query_4, result_set_4, complement_4 = _evaluate_and_label(
    'IsNewCredit', [EqToConstant('ApplicationType', 'New credit')], result_set_3)

# Queries 5 and 6 are evaluated on the full log, independently of the chain above.

# Starts with A_Create Application
query_5, result_set_5, complement_5 = _evaluate_and_label(
    'StartWithCreate', [StartWith(['A_Create Application'])], log)

# Duration between 2 and 7 days (172800 s = 2 days, 604800 s = 7 days)
query_6, result_set_6, complement_6 = _evaluate_and_label(
    'ModerateDuration', [DurationWithin(172800, 604800)], log)

In [None]:
def get_lineage(registry, result_set_name):
    """
    Trace how a result set was derived and return the chain of evaluations.

    Starting from `result_set_name`, every row of `registry['evaluations']`
    whose 'result_set' equals the current name is collected, and the search
    then continues from that row's 'source_log'. The collected rows are
    returned as a DataFrame in reverse order of discovery, i.e. the earliest
    derivation step comes first. The DataFrame is empty when no evaluation
    produced `result_set_name`.
    """
    evaluation_table = registry['evaluations']
    collected = []

    def walk(target):
        # Lazily scan the table so that the recursion is interleaved with the scan.
        producers = (
            record
            for _, record in evaluation_table.iterrows()
            if record['result_set'] == target
        )
        for record in producers:
            collected.append(record)
            walk(record['source_log'])

    walk(result_set_name)

    # Rows were gathered from the requested result set back to its origin;
    # flip them so the lineage reads forward.
    collected.reverse()
    return pd.DataFrame(collected)

def precompute_case_durations(log_df):
    """
    Return a copy of the log with a per-case 'case_duration' column.

    The duration of a case is the number of seconds between the earliest and
    the latest 'time:timestamp' among the events sharing the same
    'case:concept:name'. Every event row of a case receives the same value.

    Parameters:
        log_df: event log DataFrame with 'case:concept:name' and
            'time:timestamp' (datetime) columns. It is not modified.

    Returns:
        A copy of `log_df` with the additional float column 'case_duration'.
    """
    case_bounds = log_df.groupby("case:concept:name")["time:timestamp"].agg(["min", "max"])
    # Vectorised timedelta arithmetic instead of a Python-level lambda per case.
    case_durations = (case_bounds["max"] - case_bounds["min"]).dt.total_seconds()

    log_with_durations = log_df.copy()
    log_with_durations["case_duration"] = log_with_durations["case:concept:name"].map(case_durations)
    return log_with_durations

def compute_case_stats(df, name, label_path):
    """
    Summarise one subset of the log.

    Returns a dict with the subset name, the label path joined with " → ",
    the number of distinct 'case:concept:name' values, and the mean of
    'case_duration' taken over one row per case. An empty subset reports
    zero cases and a zero average.
    """
    joined_path = " → ".join(label_path)

    if df.empty:
        case_count, mean_duration = 0, 0
    else:
        # One row per case, so that long cases do not dominate the average.
        one_row_per_case = df.drop_duplicates("case:concept:name")
        case_count = df["case:concept:name"].nunique()
        mean_duration = one_row_per_case["case_duration"].mean()

    return {
        "subset_name": name,
        "label_path": joined_path,
        "num_cases": case_count,
        "avg_case_duration_seconds": mean_duration,
    }

def split_subsets(subsets, query_obj, filter_label, step_index, query_evaluator, filter_cache):
    """
    Split every subset into the part matching `query_obj` and its complement.

    Each input subset is a dict with the keys "df", "name" and "label_path".
    For every input subset two dicts of the same shape are produced, in
    order: the matching part (name suffix "_F<step>", path entry
    "<filter_label> ✓") followed by the complement (name suffix "_C<step>",
    path entry "<filter_label> ✗"), where <step> is `step_index + 1`.

    Evaluation results are stored in `filter_cache` under the key
    (subset name, query name) and reused when that key is already present.
    """
    children = []

    for parent in subsets:
        parent_df = parent["df"]
        parent_name = parent["name"]
        parent_path = parent["label_path"]

        key = (parent_name, query_obj.name)
        if key not in filter_cache:
            matched_df, unmatched_df = query_evaluator.evaluate(parent_df, query_obj)
            filter_cache[key] = (matched_df, unmatched_df)
        else:
            matched_df, unmatched_df = filter_cache[key]

        # Matching child first, complement second.
        children.extend((
            {
                "df": matched_df,
                "name": f"{parent_name}_F{step_index+1}",
                "label_path": parent_path + [f"{filter_label} ✓"],
            },
            {
                "df": unmatched_df,
                "name": f"{parent_name}_C{step_index+1}",
                "label_path": parent_path + [f"{filter_label} ✗"],
            },
        ))

    return children

def recursively_apply_filters(selected_sequence_df, log_view):
    """
    Apply a lineage of filters step by step and summarise the resulting subsets.

    Starting from the source log of the first lineage row, every step splits
    each current subset into a matching part and a complement (see
    `split_subsets`). The non-empty subsets left after the last step become
    one row each in the returned DataFrame.

    Parameters:
        selected_sequence_df: lineage DataFrame (as produced by `get_lineage`)
            with the columns 'source_log', 'query' and 'labels', ordered from
            the first filter to the last.
        log_view: object providing `query_registry`, `result_set_name_cache`
            and `query_evaluator`.

    Returns:
        DataFrame with the columns 'Level1' ... 'LevelN' (the label path),
        'num_cases' and 'avg_case_duration_seconds', ready for an
        icicle/sunburst chart.

    Raises:
        ValueError: if the lineage is empty, or if a lineage row names a query
            that is not present in the registry.
    """
    if selected_sequence_df.empty:
        raise ValueError("Lineage is empty: no evaluation found for the requested result set.")

    # Map query names to query objects for every registered result set.
    registry = log_view.query_registry
    query_map = {}
    for result_set_id in registry.get_registered_result_set_ids():
        query = registry.get_evaluation(result_set_id)["query"]
        query_map[query.name] = query

    # Get the initial log and precompute durations
    initial_log_name = selected_sequence_df.iloc[0]['source_log']
    initial_df = precompute_case_durations(log_view.result_set_name_cache[initial_log_name])

    current_subsets = [{
        "df": initial_df,
        "name": initial_log_name,
        "label_path": ["Initial Source"]
    }]

    filter_cache = {}

    # Apply filters iteratively. The step number comes from enumerate(): the index
    # labels of the lineage frame are inherited from the registry table and are
    # neither guaranteed to be positional nor to be integers.
    for step_index, (_, step) in enumerate(selected_sequence_df.iterrows()):
        query_name = step["query"]
        filter_label = step["labels"]
        query_obj = query_map.get(query_name)
        if query_obj is None:
            raise ValueError(f"Query '{query_name}' not found in registry.")

        # Empty subsets are dropped from the final result anyway, so do not spend
        # evaluator calls on them (assumes filtering an empty subset yields empty parts).
        current_subsets = [subset for subset in current_subsets if not subset["df"].empty]

        current_subsets = split_subsets(
            current_subsets,
            query_obj,
            filter_label,
            step_index,
            log_view.query_evaluator,
            filter_cache
        )

    # Compute stats
    result_rows = []
    for subset in current_subsets:
        subset_df = subset["df"]
        if subset_df.empty:
            continue
        label_path = subset["label_path"]
        stats = compute_case_stats(subset_df, subset["name"], label_path)
        result_rows.append({
            **{f"Level{level}": label for level, label in enumerate(label_path, start=1)},
            "num_cases": stats["num_cases"],
            "avg_case_duration_seconds": stats["avg_case_duration_seconds"]
        })

    return pd.DataFrame(result_rows)

def icicle(result_set_name, show_time=True):
    """
    Show an icicle chart of how `result_set_name` was derived.

    The lineage of the result set is looked up in the notebook-level
    `log_view`, the filters are re-applied to build the hierarchy, and the
    chart is displayed with `fig.show()`. Box sizes are the number of cases,
    colours the average case duration in seconds.

    Uses `time` and `plotly.express` (as `px`) from the notebook's import cell.

    Parameters:
        result_set_name: name of a result set registered in `log_view`.
        show_time: when True, print the time spent in each stage.

    Returns:
        None.
    """
    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for measuring elapsed intervals.
    times = {}
    start_all = time.perf_counter()

    # Lineage
    t0 = time.perf_counter()
    lineage = get_lineage(log_view.query_registry.summary(), result_set_name)
    times['lineage'] = time.perf_counter() - t0

    # Apply filters and prepare icicle data
    t1 = time.perf_counter()
    icicle_df = recursively_apply_filters(lineage, log_view)
    times['apply_filters'] = time.perf_counter() - t1

    # Plot
    t2 = time.perf_counter()
    fig = px.icicle(
        icicle_df,
        path=[col for col in icicle_df.columns if col.startswith("Level")],
        values='num_cases',
        color='avg_case_duration_seconds',
        color_continuous_scale='Blues',
        title=f'Icicle Chart for Result Set: {result_set_name}'
    )

    fig.update_layout(
        margin=dict(t=40, l=0, r=0, b=0),
        coloraxis_colorbar=dict(title="Average Case Duration (seconds)"),
    )

    fig.show()
    times['plot'] = time.perf_counter() - t2

    times['total'] = time.perf_counter() - start_all

    if show_time:
        print("\n--- Execution Time (seconds) ---")
        for k, v in times.items():
            print(f"{k:25}: {v:.4f}")


In [84]:
icicle('rs_IsNewCredit')


--- Execution Time (seconds) ---
lineage                  : 0.0077
apply_filters            : 10.5565
plot                     : 0.2171
total                    : 10.7813
