In [1]:
from typing import Tuple, Any

import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from src.transition_system import transition_system, indexs_for_window, list_to_str

import pandas as pd
import os
import random
import utils
from math import ceil
from timeout_decorator import timeout, TimeoutError
from collections import Counter
%load_ext autoreload
%autoreload 2

# Dataset: VINST (Volvo Belgium)
link: https://www.win.tue.nl/bpi/doku.php?id=2013:challenge

## Setup Variables

In [2]:
# Show every column when a dataframe is displayed.
pd.options.display.max_columns= None
# Number of seconds in one hour / one day.
SECONDS_TO_HOURS = 60 * 60
SECONDS_TO_DAYS = 60 * 60 * 24
case_id_name = 'SR_Number'  # The case identifier column name.
start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"  # The column that holds the activity label of each event.

## Load Data

In [3]:
# Preprocessed train/test splits live in `data_dir`; the raw event log is read from ../data.
data_dir = "../preprocessed_datasets/"
train_dataset_file = "train-set-cfe.csv"
test_dataset_file = "test-set-cfe.csv"
df = pd.read_csv("../data/VINST cases incidents.csv")  # Use full dataset for transition systems
df_train = pd.read_csv(os.path.join(data_dir, train_dataset_file))
df_test = pd.read_csv(os.path.join(data_dir, test_dataset_file))

### Basic Preprocessing of the dataset

In [4]:
# Replace every missing value in the full event log with the literal string "missing".
df = df.fillna("missing")

# Train on the first 20,000 rows only.
df_train = df_train[:20_000]  # 31_289 (presumably the full row count; confirm)
# df_test = df_test[: 19_041]
print(f"Rows in df_train: {len(df_train)}")
# df_train.info()

Rows in df_train: 20000


### Helper functions

In [5]:
def get_case_id(df, case_id_name=case_id_name, multi=False):
    """Return the single case identifier contained in `df`.

    Args:
        df (pd.DataFrame): Rows that are expected to belong to exactly one case.
        case_id_name (str): Name of the case identifier column. Defaults to the
            notebook-level `case_id_name`.
        multi (bool): Accepted for call compatibility; the body does not read it.

    Returns:
        The one distinct value found in the case identifier column.

    Raises:
        ValueError: If the column holds zero or several distinct values, because
            `.item()` only converts arrays of size one.
    """
    distinct_case_ids = df[case_id_name].unique()
    return distinct_case_ids.item()

def get_query_instance(sidx=14, eidx=16):
    """Slice the current step and the expected next step out of the notebook-level `X_train`.

    Args:
        sidx (int): Position of the current step.
        eidx (int): Exclusive end position; must equal `sidx + 2`.

    Returns:
        Tuple of two one-row slices of `X_train`: the row at `sidx` and the row at `eidx - 1`.

    Raises:
        AssertionError: If `eidx - sidx` is not 2.
    """
    assert eidx - sidx == 2, "One row represents the current action and the next one represents the suggested action"
    return X_train[sidx: sidx + 1], X_train[eidx - 1: eidx]

def activity_n_resources(df, resources_columns=None, threshold_percentage=100):
    """
    Collects the distinct value combinations found in `resources_columns`, most frequent first.
    E.g. [ ('activity_1', 'resource_1'), ..., ('activity_3', 'resource_5') ]
    Args:
        df (pd.DataFrame): Event log to scan.
        resources_columns (list): columns that contain the activity and resources. When None, the
            activity column plus 'Involved_ST_Function_Div', 'Involved_Org_line_3', 'Involved_ST',
            'Country' and 'Owner_Country' are used.
        threshold_percentage (int | float): Percentage of the distinct combinations to keep, counted
            from the most frequent one. One additional combination is always kept, so 0 still
            returns the most frequent combination and 100 returns all of them.
    Returns:
        List of tuples ordered by descending frequency (ties keep first-seen order). A single
        element contains the values of `resources_columns` for one distinct row of the dataframe.
    """
    if resources_columns is None:
        # raise TypeError("Please specify the columns that have resources")
        resources_columns = [activity_column_name, 'Involved_ST_Function_Div', 'Involved_Org_line_3',
                             'Involved_ST', 'Country', 'Owner_Country']

    threshold = threshold_percentage / 100

    # Build one tuple per row exactly once; itertuples yields nothing for an empty frame.
    row_combos = df[resources_columns].itertuples(index=False, name=None)

    # combo: combination
    resource_combo_frequency = Counter(row_combos)

    # most_common() sorts by count (descending) and keeps first-seen order among equal counts.
    sorted_combos = [combo for combo, _ in resource_combo_frequency.most_common()]
    amount_of_combos_to_select = int(len(sorted_combos) * threshold) + 1
    return sorted_combos[:amount_of_combos_to_select]

# current_step, query_instances = get_query_instance(14, 16)

# valid_resources = activity_n_resources(df_train, ["ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST"])
# Columns whose joint value must have been observed in the event log for a counterfactual to be kept.
resource_columns_to_validate = [activity_column_name, 'Involved_ST_Function_Div', 'Involved_Org_line_3',
                                'Involved_ST', 'Country', 'Owner_Country']
# Computed on the full event log `df` (not on df_train), with the default threshold of 100%.
valid_resources = activity_n_resources(df, resource_columns_to_validate)
# len(valid_resources)

In [6]:
# threshold = 0.8
# resource_combo_frequency = {}
# resource_column_names = [activity_column_name, 'Involved_ST_Function_Div', 'Involved_Org_line_3',
#                          'Involved_ST', 'Country', 'Owner_Country']
# valid_resource_combo = df[resource_column_names].apply(tuple, axis='columns')
#
# for elem in valid_resource_combo:
#     if resource_combo_frequency.get(elem):
#         resource_combo_frequency[elem] += 1
#     else:
#         resource_combo_frequency[elem] = 1
# # Creates a list of (combo, counts)
# resource_combo_counts = [ (k, v) for k, v in resource_combo_frequency.items() ]
# sorted_resource_combo_counts = sorted( resource_combo_counts, key=lambda item: item[1], reverse=True )
# sorted_combos = [combo for combo, _ in sorted_resource_combo_counts ]
# amount_of_combos_to_select = int( len(sorted_combos) * threshold ) + 1
# high_frequency_combos = sorted_combos[:amount_of_combos_to_select]

### Prepare the Test Dataset

In [7]:
def get_test(df, case_id_name, seed=None):
    """Cut every trace of `df` to a random 50-70% prefix to imitate running cases.

    Args:
        df (pd.DataFrame): Event log holding one or more traces.
        case_id_name (str): Column that identifies the trace each row belongs to.
        seed (int | None): When given, the cut points are drawn from a private
            `random.Random(seed)` so the result is reproducible. When None the
            module-level `random` state is used, as before.

    Returns:
        list[pd.DataFrame]: One truncated trace per case, in order of first appearance,
        each re-indexed from 0. Rows whose case id is NaN are skipped (groupby default).
    """
    rng = random if seed is None else random.Random(seed)
    truncated_traces = []

    # A single grouped pass replaces one full-frame boolean filter per case id.
    for _, df_trace in df.groupby(case_id_name, sort=False):
        # ceil enables cases with 1 row to pass through
        cut = ceil(len(df_trace) * rng.uniform(0.5, 0.7))
        truncated_traces.append(df_trace.iloc[:cut].reset_index(drop=True))

    return truncated_traces

# List of truncated test traces (one dataframe per case). The cut points are random and no
# seed is set in this notebook, so the list differs between runs.
test_cases = get_test(df_test, case_id_name)

In [8]:
# ### Features that can vary
# These are the only columns DiCE may change when it builds counterfactuals
# (passed as `features_to_vary`).
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']
# cols_to_vary.extend(["ACTIVITY"])

# option 2:
cols_to_vary = ["ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST"]

# Target column predicted by the regressor.
outcome_name = "lead_time"
def prepare_df_for_ml(df, outcome_name, columns_to_remove=None) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Splits an event-log dataframe into model features and the target column.

    :param pd.DataFrame df: event log. It is not modified; `drop` returns new frames.
    :param str outcome_name: name of the target column.
    :param list columns_to_remove: columns dropped in addition to the notebook-level case id
        column. Defaults to ["Change_Date+Time", "time_remaining"].
    :return: `(X, y)` where `X` is the feature dataframe and `y` is the target Series.
    :raises KeyError: if the case id column or any column in `columns_to_remove` is missing.
    """
    # Before training we need to remove the columns that are not needed by the ML model.
    if columns_to_remove is None:
        columns_to_remove = ["Change_Date+Time", "time_remaining"]
    features_and_target = (
        df.drop([case_id_name], axis="columns")
          .drop(columns_to_remove, axis="columns")
    )
    X = features_and_target.drop([outcome_name], axis=1)
    y = features_and_target[outcome_name]
    return X, y

X_train, y_train = prepare_df_for_ml(df_train, outcome_name)
# X_test, y_test = prepare_df_for_ml(df_test, outcome_name)

# Columns that are standard-scaled; they are also declared to DiCE as continuous features.
continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "# ACTIVITY=In Progress", "# ACTIVITY=Awaiting Assignment",
                       "# ACTIVITY=Resolved", "# ACTIVITY=Assigned", "# ACTIVITY=Closed", "# ACTIVITY=Wait - User", "# ACTIVITY=Wait - Implementation", "# ACTIVITY=Wait",
                       "# ACTIVITY=Wait - Vendor", "# ACTIVITY=In Call", "# ACTIVITY=Wait - Customer", "# ACTIVITY=Unmatched", "# ACTIVITY=Cancelled"]
# Columns that are one-hot encoded.
categorical_features = ["Status", "ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST", "SR_Latest_Impact", "Product", "Country", "Owner_Country",
                        "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore' lets the encoder accept categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the regressor to the preprocessing pipeline (the step is still named 'classifier').
# Now we have a full prediction pipeline.
# NOTE(review): no random_state is passed, so the fitted forest differs between runs.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', RandomForestRegressor(n_jobs=7))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
# The `_iris` suffix is a leftover name: the data object wraps the training features and target above.
d_iris = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
m_iris = dice_ml.Model(model=model, backend="sklearn", model_type='regressor')
method = "random"  # genetic, kdtree, random
# Despite its name, this explainer uses whatever `method` is set to above.
exp_genetic_iris = Dice(d_iris, m_iris, method=method)  # Method random does not support features_weights argument in generate_counterfactuals()
# exp_genetic_iris = Dice(d_iris, m_iris, method="kdtree")

## Experiment with Single Queries

In [27]:
# Positions of the df_train rows to inspect (eidx is exclusive).
sidx = 87
eidx = 97

df_train[sidx: eidx]
# Displays rows 87-96 of df_train.

Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
87,1-506071646,1299572184,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,81256.0,8430.0,29784.0,Tuesday,37206306.0,8,5,0,0,0,1,0,0,0,0,0,0,0,37287562.0
88,1-506071646,1299572298,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,81370.0,114.0,29898.0,Tuesday,37206192.0,9,5,0,0,0,1,0,0,0,0,0,0,0,37287562.0
89,1-506071646,1299572432,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,81504.0,134.0,30032.0,Tuesday,37206058.0,9,6,0,0,0,1,0,0,0,0,0,0,0,37287562.0
90,1-506071646,1299572566,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,81638.0,134.0,30166.0,Tuesday,37205924.0,10,6,0,0,0,1,0,0,0,0,0,0,0,37287562.0
91,1-506071646,1299651374,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,160446.0,78808.0,22574.0,Wednesday,37127116.0,10,6,0,0,0,2,0,0,0,0,0,0,0,37287562.0
92,1-506071646,1299651436,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,POLAND,160508.0,62.0,22636.0,Wednesday,37127054.0,11,6,0,0,0,2,0,0,0,0,0,0,0,37287562.0
93,1-506071646,1299686621,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,195693.0,35185.0,57821.0,Wednesday,37091869.0,11,7,0,0,0,2,0,0,0,0,0,0,0,37287562.0
94,1-506071646,1299686712,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,195784.0,91.0,57912.0,Wednesday,37091778.0,12,7,0,0,0,2,0,0,0,0,0,0,0,37287562.0
95,1-506071646,1299756545,Accepted,In Progress,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,265617.0,69833.0,41345.0,Thursday,37021945.0,12,8,0,0,0,2,0,0,0,0,0,0,0,37287562.0
96,1-506071646,1299757653,Accepted,Wait - User,A2_1,Org line A2,D8,Medium,PROD542,nl,POLAND,266725.0,1108.0,42453.0,Thursday,37020837.0,13,8,0,0,0,2,0,0,0,0,0,0,0,37287562.0


In [35]:
# Query a single training row: X_train[94:95].
sidx = 94
eidx = 95
# NOTE(review): y_train[sidx] is a label lookup; it matches position sidx only while df_train keeps its default 0..n-1 index.
total_time_upper_bound = int( y_train[sidx] * (90 / 100) )  # A percentage of the original total time of the trace
query_instances = X_train[sidx: eidx]
# Only the "genetic" branch passes feature_weights.
if method == "genetic":  # or method == "kdtree":
    feature_weights = {"ACTIVITY": 1.0, "Involved_ST_Function_Div": 1.0, "Involved_Org_line_3": 1.0, "Involved_ST": 1.0}
    # feature_weights = {"ACTIVITY": 1, "Involved_ST_Function_Div": 1}
    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=15, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary,
                                                    feature_weights=feature_weights)
else:
    # Other methods request 200 counterfactuals whose predicted lead time lies in [0, total_time_upper_bound].
    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=200, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)

cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:05<00:00,  5.83s/it]

Query instance (original outcome : 32735702)





Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Queued,Awaiting Assignment,A2_1,Org line A2,D8,Medium,PROD542,nl,Netherlands,195784.0,91.0,57912.0,Wednesday,12,7,0,0,0,2,0,0,0,0,0,0,0,32735702.0



Diverse Counterfactual set (new outcome: [0, 33558805])


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,-,-,A2_4,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-
1,-,In Progress,-,-,G114 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23892532.0
2,-,-,-,Org line V9,M21 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22147484.0
3,-,-,E_3,-,V45,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23880142.0
4,-,-,-,-,V9,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23627424.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-,-,-,-,G78,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,23627424.0
196,-,-,-,Other,W16 3rd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22147484.0
197,-,-,-,Org line V11,B14,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,22147484.0
198,-,Wait - Vendor,E_7,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


## Apply Transition System Validation to the above Single Query Results

In [39]:
# Window size used when building the transition graph below.
window_size = 4

# === Load the Transition Graph
# Built from the full event log `df`; the first return value of transition_system is discarded.
_, transition_graph = transition_system(df, case_id_name=case_id_name, activity_column_name=activity_column_name, window_size=window_size)

In [40]:
def prefix_of_activities(expected_activity_index=None, event_log=None, df_single_trace=None, window_size=3, activity_column_name=None):
    """ Retrieves the prefix of activities that precedes an expected activity, so that the next
    activity can later be validated against that prefix.

    The function supports 2 cases. 1st, pass `event_log` and `expected_activity_index` to single out
    the trace prefix from a log with many traces (e.g. df_train). 2nd, pass `df_single_trace` and the
    prefix is extracted from it directly.

    Note: the notebook also defines `get_prefix_of_activities` with the same contract, and a later
    cell rebinds the name `prefix_of_activities` to that function's result.

    Args:
        expected_activity_index (int): Index label (in `event_log`) of the expected activity.
            Only used when `df_single_trace` is None.
        event_log (pd.DataFrame): Dataframe containing many traces. E.g. df_train
        df_single_trace (pd.DataFrame): A dataframe that contains a single trace. E.g. a running trace or a test trace. It is expected
            that the index of this dataframe starts from 0. An assumption is that last activity/ row represents the expected activity,
            so the prefix of activities ends at the 2nd last activity. When using this parameter, `event_log` and
            `expected_activity_index` are ignored.
        window_size (int): Forwarded to `indexs_for_window` to choose where the prefix starts.
        activity_column_name (str): Column that holds the activity labels.

    Returns:
        The windowed activities, combined by `list_to_str`.

    Raises:
        TypeError: If a required argument is missing. This is the type the previous
            `raise "<text>"` statements ended up raising, now with a readable message.
        KeyError: If `df_single_trace` has no index label 0.
        IndexError: If `df_single_trace` has fewer than two rows.
    """
    # Error checking
    if activity_column_name is None:
        raise TypeError("Please specify activity_column_name")

    if df_single_trace is not None:
        # Check if indexes start from 0
        if 0 not in df_single_trace.index:
            raise KeyError("df_single_trace must be indexed from 0")
        if len(df_single_trace) < 2:
            raise IndexError("df_single_trace needs at least two rows: a prefix and the expected activity")

        trace = df_single_trace
        # Due to assumption that last activity is the expected activity so the prefix ends at the 2nd last activity.
        # Read it from the argument, not from the notebook-level `df_test_trace`.
        index_to_previous_activity = df_single_trace.index[-2]
    else:
        if event_log is None:
            raise TypeError("Please specify event_log!")
        if expected_activity_index is None:
            raise TypeError("Please specify expected_activity_index!")

        query_case_id = get_case_id( event_log[expected_activity_index: expected_activity_index+1] )

        # Isolate the query_case_id trace
        trace = event_log[ event_log[case_id_name] == query_case_id ]

        # Prefix ends before the expected activity timestamp
        index_to_previous_activity = expected_activity_index - 1

    start_index, end_index = indexs_for_window(index_to_previous_activity, window_size=window_size, end_exclusive=False)
    # loc is used to access the index values inside the dataframe (both ends inclusive)
    activities = trace.loc[start_index: end_index, activity_column_name].to_list()
    return list_to_str(activities)

# Case 1: For experimenting with single query
# prefix_of_activities(expected_activity_index=sidx, event_log=df_train, window_size=5, activity_column_name=activity_column_name)

# Case 2: When running in test mode
# prefix_of_activities(df_single_trace=df_test_trace, window_size=5, activity_column_name=activity_column_name)

In [43]:
def get_prefix_of_activities(expected_activity_index=None, event_log=None, df_single_trace=None, window_size=3, activity_column_name=None):
    """ Retrieves the prefix of activities that precedes an expected activity, so that the next
    activity can later be validated against that prefix.

    The function supports 2 cases. 1st, pass `event_log` and `expected_activity_index` to single out
    the trace prefix from a log with many traces (e.g. df_train). 2nd, pass `df_single_trace` and the
    prefix is extracted from it directly.

    Args:
        expected_activity_index (int): Index label (in `event_log`) of the expected activity.
            Only used when `df_single_trace` is None.
        event_log (pd.DataFrame): Dataframe containing many traces. E.g. df_train
        df_single_trace (pd.DataFrame): A dataframe that contains a single trace. E.g. a running trace or a test trace. It is expected
            that the index of this dataframe starts from 0. An assumption is that last activity/ row represents the expected activity,
            so the prefix of activities ends at the 2nd last activity. When using this parameter, `event_log` and
            `expected_activity_index` are ignored.
        window_size (int): Forwarded to `indexs_for_window` to choose where the prefix starts.
        activity_column_name (str): Column that holds the activity labels.

    Returns:
        The windowed activities, combined by `list_to_str`.

    Raises:
        TypeError: If a required argument is missing. This is the type the previous
            `raise "<text>"` statements ended up raising, now with a readable message.
        KeyError: If `df_single_trace` has no index label 0.
        IndexError: If `df_single_trace` has fewer than two rows.
    """
    # Error checking
    if activity_column_name is None:
        raise TypeError("Please specify activity_column_name")

    if df_single_trace is None:
        if event_log is None:
            raise TypeError("Please specify event_log!")
        if expected_activity_index is None:
            raise TypeError("Please specify expected_activity_index!")

        query_case_id = get_case_id( event_log[expected_activity_index: expected_activity_index+1] )

        # Isolate the query_case_id trace
        trace = event_log[ event_log[case_id_name] == query_case_id ]

        # Prefix ends before the expected activity timestamp
        index_to_previous_activity = expected_activity_index - 1
    else:
        # Check if indexes start from 0
        if 0 not in df_single_trace.index:
            raise KeyError("df_single_trace must be indexed from 0")
        if len(df_single_trace) < 2:
            raise IndexError("df_single_trace needs at least two rows: a prefix and the expected activity")

        trace = df_single_trace
        # The last row is the expected activity, so the prefix ends at the 2nd last row.
        # Read it from the argument, not from the notebook-level `df_test_trace`.
        index_to_previous_activity = df_single_trace.index[-2]

    start_index, end_index = indexs_for_window(index_to_previous_activity, window_size=window_size, end_exclusive=False)
    # loc is used to access the index values inside the dataframe (both ends inclusive)
    activities = trace.loc[start_index: end_index, activity_column_name].to_list()
    return list_to_str(activities)

# Case 1: For experimenting with single query
# prefix_of_activities = get_prefix_of_activities(expected_activity_index=sidx, event_log=df_train, window_size=window_size, activity_column_name=activity_column_name)

# Case 2: When running in test mode
# prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=window_size, activity_column_name=activity_column_name)

def validate_transition(cfe, prefix_of_activities=None, transition_graph=None, valid_resources=None,
                        resource_columns=None, activity_column=None):
    """ Keeps only the counterfactuals whose next activity and resource combination are valid.

    A counterfactual survives when (1) its activity is listed in `transition_graph` for the given
    prefix and (2) its values in `resource_columns` form a combination contained in `valid_resources`.

    Args:
        cfe (dice_ml.counterfactual_explanations.CounterfactualExplanations): Dice counterfactual explanations object.
            Only the first query (`cf_examples_list[0]`) is validated.
        prefix_of_activities: Key into `transition_graph` describing the activities seen so far.
        transition_graph: Mapping from a prefix to the collection of activities allowed next.
        valid_resources: Collection of allowed value tuples, in `resource_columns` order.
        resource_columns (list): Columns compared against `valid_resources`. Defaults to the
            notebook-level `resource_columns_to_validate`.
        activity_column (str): Column holding the suggested activity. Defaults to the
            notebook-level `activity_column_name`.

    Returns:
        pd.DataFrame: Copy of the counterfactuals that passed both checks, re-indexed from 0.

    Raises:
        TypeError: If a required argument is missing. This is the type the previous
            `raise "<text>"` statements ended up raising, now with a readable message.
        KeyError: If `prefix_of_activities` is not a key of `transition_graph` (plain dict lookup).
    """
    if cfe is None:
        raise TypeError("Please specify cfe!")
    if valid_resources is None:
        raise TypeError("Please specify valid_resources!")
    if transition_graph is None:
        raise TypeError("Please specify transition_graph")
    if prefix_of_activities is None:
        raise TypeError("Please specify prefix_of_activities")

    if resource_columns is None:
        resource_columns = resource_columns_to_validate
    if activity_column is None:
        activity_column = activity_column_name

    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()  # Get the counterfactual explanations dataframe from the object

    # === Verify the next activity
    # The graph is only consulted when there is something to check, and only once.
    if not cf_examples_df.empty:
        allowed_next_activities = transition_graph[prefix_of_activities]
        follows_prefix = [activity in allowed_next_activities for activity in cf_examples_df[activity_column]]
        cf_examples_df = cf_examples_df.loc[follows_prefix]
    cf_examples_df = cf_examples_df.reset_index(drop=True)

    # === Verify the associated resources
    try:
        allowed_combos = set(valid_resources)  # constant-time membership instead of a list scan per row
    except TypeError:
        allowed_combos = valid_resources  # unhashable elements: fall back to the given container
    has_valid_resources = [
        combo in allowed_combos
        for combo in cf_examples_df[resource_columns].itertuples(index=False, name=None)
    ]
    return cf_examples_df.loc[has_valid_resources].reset_index(drop=True)

# validate_transition(cfe, prefix_of_activities, transition_graph, valid_resources)

In [None]:
cfe.visualize_as_dataframe(show_only_changes=True)

# `cf_examples_df` exists only inside validate_transition, so the validated set is computed here.
# The prefix is taken from df_train for the row queried above (position `sidx`).
single_query_prefix = get_prefix_of_activities(expected_activity_index=sidx, event_log=df_train,
                                               window_size=window_size, activity_column_name=activity_column_name)
cf_examples_df = validate_transition(cfe, single_query_prefix, transition_graph, valid_resources)
print("Valid Counterfactual Set")
cf_examples_df


### Check Feature Importance score

In [None]:
# Local feature importance for the current `query_instances`, based on 10 counterfactuals
# generated with the same desired range and variable features as the single query above.
imp = exp_genetic_iris.local_feature_importance(query_instances, total_CFs=10, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)
imp.local_importance

## Experiment with Multiple Queries

In [None]:
# NOTE(review): `i` is only assigned by the test loop further down, so this cell fails on a fresh
# top-to-bottom run; it looks like a debugging peek at the trace after the last processed one.
test_cases[i+1]

In [45]:
@timeout(120)  # Abort a single query after 120 seconds
def generate_cfe(query_instances, total_time_upper_bound):
    """Generate counterfactuals for `query_instances` with the notebook-level DiCE explainer.

    The desired outcome range is [0, total_time_upper_bound]. `method`, `cols_to_vary` and the
    proximity/sparsity/diversity weights are read from notebook-level variables at call time.
    The "genetic" method asks for 10 counterfactuals and passes feature weights; every other
    method asks for 200.
    """
    shared_kwargs = dict(
        desired_range=[0, total_time_upper_bound],
        features_to_vary=cols_to_vary,
        proximity_weight=proximity_weight,
        sparsity_weight=sparsity_weight,
        diversity_weight=diversity_weight,
    )
    if method == "genetic":
        return exp_genetic_iris.generate_counterfactuals(
            query_instances,
            total_CFs=10,
            feature_weights={"ACTIVITY": 1, "Involved_ST_Function_Div": 1, "Involved_Org_line_3": 1, "Involved_ST": 1},
            **shared_kwargs,
        )
    return exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=200, **shared_kwargs)

In [47]:
# %%capture
i = 0
cfe_before_validation = []
cfe_after_validation = []
cfe_not_found = []
cases_includes_new_data = []
cases_too_small = []
for df_test_trace in test_cases:
    i+= 1
    if i == 20:
        break

    query_case_id = get_case_id(df_test_trace)

    if 0 < len(df_test_trace) <= 2:
        print("too small", i, df_test_trace[case_id_name].unique().item())
        cases_too_small.append( get_case_id(df_test_trace, multi=True) )
        continue

    X_test, y_test = prepare_df_for_ml(df_test_trace, outcome_name)
    # Access the last row of the truncated trace to replicate the behavior of a running trace
    total_time_upper_bound = int( y_test.iloc[-1] * (90 / 100) )  # A percentage of the original total time of the trace
    query_instances = X_test.iloc[-1:]
    proximity_weight = 0.2
    sparsity_weight = 0.2
    diversity_weight = 5.0

    try:
        cfe = generate_cfe( query_instances, total_time_upper_bound )
        cfe_before_validation.append( (query_case_id, cfe) )

        prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=0, activity_column_name=activity_column_name)
        cfe_df = validate_transition(cfe, prefix_of_activities=prefix_of_activities, transition_graph=transition_graph, valid_resources=valid_resources)

        if len(cfe_df) > 0:
            cfe_after_validation.append( (query_case_id, cfe_df) )

    except UserConfigValidationException:
        cfe_not_found.append( query_case_id )
    except TimeoutError:  # When function takes too long
        cfe_not_found.append( query_case_id )
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        cases_includes_new_data.append( query_case_id )

100%|██████████| 1/1 [00:05<00:00,  5.52s/it]
100%|██████████| 1/1 [00:05<00:00,  5.17s/it]
100%|██████████| 1/1 [00:05<00:00,  5.76s/it]
100%|██████████| 1/1 [00:00<00:00,  4.65it/s]


No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec


100%|██████████| 1/1 [00:05<00:00,  5.51s/it]
100%|██████████| 1/1 [00:07<00:00,  7.62s/it]
100%|██████████| 1/1 [00:09<00:00,  9.47s/it]
100%|██████████| 1/1 [00:07<00:00,  7.92s/it]
100%|██████████| 1/1 [00:07<00:00,  7.42s/it]
100%|██████████| 1/1 [00:09<00:00,  9.09s/it]
100%|██████████| 1/1 [00:08<00:00,  8.73s/it]
  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:07<00:00,  7.49s/it]
100%|██████████| 1/1 [00:00<00:00,  4.11it/s]


No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:06<00:00,  6.86s/it]
100%|██████████| 1/1 [00:06<00:00,  6.96s/it]
100%|██████████| 1/1 [00:00<00:00,  4.01it/s]


No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec


100%|██████████| 1/1 [00:00<00:00,  3.87it/s]

No Counterfactuals found for the given configuration, perhaps try with different parameters... ; total time taken: 00 min 00 sec





In [48]:
# One-row summary: size of every result list filled by the test loop.
data = {
    label: [len(bucket)]
    for label, bucket in (
        ("cfe_before_validation", cfe_before_validation),
        ("cfe_after_validation", cfe_after_validation),
        ("cfe_not_found", cfe_not_found),
        ("cases_includes_new_data", cases_includes_new_data),
        ("cases_too_small", cases_too_small),
    )
}
df_result = pd.DataFrame(data)
df_result

Unnamed: 0,cfe_before_validation,cfe_after_validation,cfe_not_found,cases_includes_new_data,cases_too_small
0,13,9,4,2,0


In [None]:
# NOTE(review): `cfes_list` is not assigned anywhere in the visible notebook, so this cell raises
# NameError on a fresh run. Presumably one of the (case id, result) lists from the loop above
# was meant, e.g. cfe_after_validation; confirm.
# for _, cfe in cfes_list:
#     cfe.visualize_as_dataframe(show_only_changes=True)
cfes_list[9][1]

## Observation
The output usually comes within 2 minutes

#### TODOS
- Check how many queries (rows) have valid CFEs
- How to prevent the CFEs from setting the ACTIVITY column to the value "Pending Liquidation request"
- Discover how DiCE handles constraints (for us: do it in a post-processing step)
- CE_OU - resource column, you can modify it
- Modify just the resource (col: CE_UO) & Activity (col: ACTIVITY)
- build a transition system to validate activity and then resource
- Do the above for total time case
- I
- Implement activity validation and resource validation to the above
-   To do this effectively maybe figure out how the DiCE post process the constraints and implement this in that layer

In [None]:
# NOTE(review): this cell passes desired_class="opposite", while the explainer above wraps a
# model declared with model_type='regressor' and every other cell uses desired_range.
# It looks like a leftover from a classification experiment; confirm before running.
query_instances = X_train[sidx: eidx]
cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)
cfe.visualize_as_dataframe(show_only_changes=True)

# (row index, DiCE result) for every row of X_train.
genetic_cfes = []

# One generate_counterfactuals call per training row.
for idx, query_instances in X_train.iterrows():
    # iterrows yields a Series; turn it back into a one-row dataframe.
    query_instances = query_instances.to_frame().transpose()
    # query_instances = X_train[0:1]  # an interesting query`

    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)

    genetic_cfes.append( (idx, cfe) )

- Implement post-processing steps
    - Build the transition system (`transition_system`) from full traces
    - 4-tuple comparison together with country and owner country



## Deprecated Code

In [None]:
def validate_transition(cfe):
    """ Deprecated function.
    Works with transition graph which has single activity as key.
    Running this cell rebinds `validate_transition` to this one-argument version.
    It reads `current_step`, `transition_graph`, `valid_resources` and `activity_column_name`
    from the notebook namespace.
    Args:
        cfe: Dice counterfactual object.
    Returns:
        Dataframe of the counterfactuals that passed the activity and resource checks,
        re-indexed from 0.
    """
    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()

    current_activity = current_step[activity_column_name].item()

    # === Verify the next activity
    rejected_activities = [
        idx
        for idx, suggested_next_activity in cf_examples_df[activity_column_name].items()
        if suggested_next_activity not in transition_graph[current_activity]
    ]
    cf_examples_df = cf_examples_df.drop(rejected_activities, axis='index').reset_index(drop=True)

    # === Verify the associated resources
    rejected_resources = [
        idx
        for idx, row in cf_examples_df[ ["ACTIVITY", "Involved_ST_Function_Div"] ].iterrows()
        if tuple(row) not in valid_resources
    ]
    return cf_examples_df.drop(rejected_resources, axis='index').reset_index(drop=True)