In [25]:
import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing

from src.transition_system import transition_system, indexs_for_window, list_to_str

import pandas as pd
import pickle
import os
import random
from math import ceil
from wrapt_timeout_decorator import timeout

# from multiprocessing import Process
# from timeout_decorator import timeout, TimeoutError
# import signal
from typing import Tuple, Any, List, Union
from collections import Counter
import utils
from time import sleep

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dataset: Bank Account Closure

## Setup Variables

In [27]:
# Notebook-wide configuration. Later cells read these names as globals.
pd.options.display.max_columns= None  # show every column when a DataFrame is displayed
KPI = "activity_occurrence"  # activity_occurrence, ... (checked by generate_cfe)
# Number of seconds in one hour / one day.
SECONDS_TO_HOURS = 60 * 60
SECONDS_TO_DAYS = 60 * 60 * 24
WINDOW_SIZE = 3                       # window_size handed to transition_system() when the transition graph is built
REDUCED_KPI_TIME = 90                 # NOTE(review): not referenced in the visible part of this notebook
TOTAL_CFS = 15                        # Number of CFs DiCE algorithm should produce
TRAIN_DATA_SIZE = 50_335             # 170_335 (number of leading df_train rows kept for training)
DICE_METHOD = "random"                # passed to Dice(...) as `method`
RESULTS_FILE_PATH_N_NAME = "experiment_results/random-full-activity_occu.csv"
# NOTE(review): the three weights below are not passed to any DiCE call in the visible cells.
proximity_weight = 0.2
sparsity_weight = 0.2
diversity_weight = 5.0

case_id_name = 'REQUEST_ID'  # The case identifier column name.
# start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"  # The column holding the activity label of each event.

## Load Data

In [28]:
# Locations of the preprocessed train/test splits and of the pickled, already truncated test traces.
data_dir = "../preprocessed_datasets/"
train_dataset_file = "bank_acc_train.csv"
test_dataset_file = "bank_acc_test.csv"
test_pickle_dataset_file = "bank_acc-test.pkl"
df = pd.read_csv("../data/completed.csv")  # Use full dataset for transition systems
df_train = pd.read_csv(os.path.join(data_dir, train_dataset_file))
df_test = pd.read_csv(os.path.join(data_dir, test_dataset_file))

### Basic Preprocessing of the dataset

In [29]:
# Replace every missing value of the full event log with the literal string "missing".
# NOTE: this cell rebinds `df` and `df_train`, so re-running it operates on the already processed frames.
df = df.fillna("missing")
# Keep only the first TRAIN_DATA_SIZE rows of the training split.
df_train = df_train[:TRAIN_DATA_SIZE]  # 31_066
# df_test = df_test[: 19_041]
print(f"Rows in df_train: {len(df_train):,}")
print(f"Rows in df_test: {len(df_test):,}")
# df_train.info()

Rows in df_train: 50,335
Rows in df_test: 6,391


### Helper functions

In [30]:
def variable_type_analysis(X_train, case_id_name, activity_name):
    """Classify the columns of an event log as quantitative, case-level or event-level.

    The case-id column, the activity column and every column whose name starts
    with '#' are skipped. The type of a column is decided from its first value
    (by position, so the index does not have to contain the label 0).

    Args:
        X_train (pd.DataFrame): Event log; one row per event.
        case_id_name (str): Name of the case identifier column.
        activity_name (str): Name of the activity column.

    Returns:
        Tuple[List[str], List[str], List[str]]: Three lists of column names.
            1st list: quantitative_attributes. Columns whose first value is not a string.
            2nd list: case_attributes. String columns that hold exactly one distinct
                      value within every case (one value per trace).
            3rd list: event_attributes. String columns that vary within at least one case.
        Rows whose case id is missing (NaN) are ignored when telling case
        attributes from event attributes. An empty frame yields three empty lists.
    """
    quantitative_attributes = []
    case_attributes = []
    event_attributes = []

    # Nothing to classify, and there is no "first value" to look at.
    if len(X_train) == 0:
        return quantitative_attributes, case_attributes, event_attributes

    # Group once; each string column only needs a per-case distinct count.
    traces = X_train.groupby(case_id_name)

    for col in X_train.columns:
        if col in (case_id_name, activity_name) or str(col).startswith('#'):
            continue

        if not isinstance(X_train[col].iloc[0], str):
            quantitative_attributes.append(col)
            continue

        # dropna=False counts NaN as a value, matching a plain `.unique()` per case.
        distinct_per_case = traces[col].nunique(dropna=False)
        if (distinct_per_case == 1).all():
            case_attributes.append(col)
        else:
            event_attributes.append(col)

    return quantitative_attributes, case_attributes, event_attributes

def get_case_id(df, case_id_name=case_id_name) -> Union[str, int]:  # multi=False
    """Return the single case identifier found in `df`.

    `.item()` only succeeds when the case-id column holds exactly one distinct
    value, so passing rows from several cases (or none) raises ValueError.
    """
    distinct_case_ids = df[case_id_name].unique()
    return distinct_case_ids.item()

def get_query_instance(sidx=14, eidx=16):
    """Slice two consecutive single-row frames out of the global `X_train`.

    Returns:
        (current_step, expected_next_step): the row at position `sidx` and the
        row at position `eidx - 1`, each as a one-row slice of `X_train`.
    """
    assert eidx - sidx == 2, "One row represents the current action and the next one represents the suggested action"
    current_rows = slice(sidx, sidx + 1)
    next_rows = slice(eidx - 1, eidx)
    return X_train[current_rows], X_train[next_rows]

def activity_n_resources(df, resources_columns=None, threshold_percentage=100):
    """Collect the most frequent (activity, resource, ...) combinations of an event log.

    Every row of `df[resources_columns]` is turned into a tuple, e.g.
    ('activity_1', 'resource_1'). The distinct tuples are ranked by how often
    they occur and the top share of them is returned.

    Args:
        df (pd.DataFrame): Event log.
        resources_columns (list): Columns that contain the activity and the resources.
            When None, a default list built from the global `activity_column_name`
            and five hard-coded column names is used.
        threshold_percentage (float): Share (0-100) of the distinct combinations to
            keep, most frequent first. One extra combination is always added on top
            of that share, so even 0 keeps the single most frequent combination.

    Returns:
        list[tuple]: Distinct combinations ordered from most to least frequent
        (ties keep first-appearance order). Empty when `df` has no rows.
    """
    from collections import Counter  # local import keeps the function self-contained

    if resources_columns is None:
        # raise TypeError("Please specify the columns that have resources")
        resources_columns = [activity_column_name, 'Involved_ST_Function_Div', 'Involved_Org_line_3',
                             'Involved_ST', 'Country', 'Owner_Country']

    threshold = threshold_percentage / 100

    # itertuples yields one plain tuple per row without a slow row-wise apply,
    # and yields nothing for an empty frame.
    combo_counts = Counter(df[resources_columns].itertuples(index=False, name=None))

    # most_common() sorts by count, descending, and is stable for equal counts.
    sorted_combos = [combo for combo, _ in combo_counts.most_common()]
    amount_of_combos_to_select = int(len(sorted_combos) * threshold) + 1
    return sorted_combos[:amount_of_combos_to_select]

# current_step, query_instances = get_query_instance(14, 16)

# valid_resources = activity_n_resources(df_train, ["ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST"])
# Columns whose joint value must have been observed in the full event log for a counterfactual to be kept.
resource_columns_to_validate = [activity_column_name, 'CE_UO', 'ROLE']
# Default threshold_percentage=100, so every observed combination is kept.
valid_resources = activity_n_resources(df, resource_columns_to_validate)
# len(valid_resources)

In [31]:
# # === Analysis of all the columns in the dataset
# variable_type_analysis(df, case_id_name=case_id_name, activity_name=activity_column_name)

### Prepare the Test Dataset

In [32]:
def get_test(df, case_id_name, rng=None):
    """Cut every trace of `df` to a random prefix, imitating running (unfinished) cases.

    For each case a fraction is drawn uniformly from [0.5, 0.7] and the first
    ceil(len(trace) * fraction) rows are kept.

    Args:
        df (pd.DataFrame): Event log with complete traces.
        case_id_name (str): Name of the case identifier column.
        rng (random.Random, optional): Source of randomness. Pass a seeded
            `random.Random(seed)` for a reproducible split. Defaults to the
            global `random` module, as before.

    Returns:
        list of DataFrames, one truncated trace per case in order of first
        appearance, each with a fresh 0-based index. Rows whose case id is
        missing (NaN) are skipped.
    """
    if rng is None:
        rng = random

    result_lst = []
    # sort=False keeps the cases in order of first appearance in `df`.
    for _, df_trace in df.groupby(case_id_name, sort=False):
        # ceil enables cases with 1 row to pass through
        cut = ceil(len(df_trace) * rng.uniform(0.5, 0.7))
        result_lst.append(df_trace.iloc[:cut].reset_index(drop=True))
    return result_lst

# test_cases = get_test(df_test, case_id_name)

# # === Pickle this dataset for comparison of different methods
# with open(os.path.join(data_dir, test_pickle_dataset_file), 'wb') as file:
#     pickle.dump(test_cases, file)

# === Unpickle the Standard test-set. To standardize the test across different parameters.
# WARNING: pickle.load can execute arbitrary code; only load a pickle file you created yourself.
with open(os.path.join(data_dir, test_pickle_dataset_file), 'rb') as file:
    test_cases = pickle.load(file)

In [34]:
# ### Features that can vary
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']

# Passed to DiCE as `features_to_vary`.
cols_to_vary = ["ACTIVITY", "CE_UO", "ROLE"]

# Target column predicted by the classifier and flipped by the counterfactuals.
outcome_name = "Back-Office Adjustment Requested"

def prepare_df_for_ml(df, outcome_name, columns_to_remove=None) -> tuple[pd.DataFrame, pd.Series]:
    """Split an event log into model features and target.

    The case-id column (global `case_id_name`) and `columns_to_remove` are
    dropped because they must not be seen by the ML model. The input frame
    is left untouched.

    :param pd.DataFrame df: event log containing the features and the target.
    :param str outcome_name: name of the target column.
    :param list columns_to_remove: extra columns to drop before training.
        Defaults to ["Change_Date+Time", "time_remaining"].
    :return: (X, y), where X is the feature DataFrame and y is the target Series.
    :raises KeyError: if any column to drop, or the target column, is absent.
    """
    if columns_to_remove is None:
        columns_to_remove = ["Change_Date+Time", "time_remaining"]

    model_frame = df.drop(columns=[case_id_name, *columns_to_remove])
    X = model_frame.drop(columns=[outcome_name])
    y = model_frame[outcome_name]
    return X, y

# Build the training features/target; the timestamp columns and time_remaining are removed as well.
X_train, y_train = prepare_df_for_ml(df_train, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])
# X_test, y_test = prepare_df_for_ml(df_test, outcome_name)

# Columns that are standard-scaled; the '# ACTIVITY=...' columns are listed here as continuous features too.
continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "activity_duration", '# ACTIVITY=Service closure Request with network responsibility',
                    '# ACTIVITY=Service closure Request with BO responsibility', '# ACTIVITY=Pending Request for Reservation Closure', '# ACTIVITY=Pending Liquidation Request',
                    '# ACTIVITY=Request completed with account closure', '# ACTIVITY=Request created', '# ACTIVITY=Authorization Requested',
                    '# ACTIVITY=Evaluating Request (NO registered letter)', '# ACTIVITY=Network Adjustment Requested', '# ACTIVITY=Pending Request for acquittance of heirs',
                    '# ACTIVITY=Request deleted', '# ACTIVITY=Back-Office Adjustment Requested', '# ACTIVITY=Evaluating Request (WITH registered letter)',
                    '# ACTIVITY=Request completed with customer recovery', '# ACTIVITY=Pending Request for Network Information',]
# Columns that are one-hot encoded.
categorical_features = ["CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore': a category unseen during fit does not raise at transform time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
# NOTE(review): no random_state is set on the forest, so the fitted model differs between runs.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', RandomForestClassifier(n_jobs=7))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
# NOTE(review): the `*_iris` / `exp_genetic_*` names are leftovers; the data is the bank event log
# and the explainer uses DICE_METHOD. They are kept because later cells refer to them.
d_iris = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
m_iris = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')
method = DICE_METHOD  # genetic, kdtree, random
exp_genetic_iris = Dice(d_iris, m_iris, method=method)  # Categorical features do not support features_weights argument in generate_counterfactuals()
# exp_genetic_iris = Dice(d_iris, m_iris, method="kdtree")

## Experiment with Single Queries

In [35]:
# Show a positional slice of the training log, restricted to the columns relevant for the experiment.
sidx = 612
eidx = 622

df_train[sidx: eidx][[ "REQUEST_ID", "CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "Back-Office Adjustment Requested"] ]
# df_train[sidx: eidx]

Unnamed: 0,REQUEST_ID,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,Back-Office Adjustment Requested
612,20182001337,Bank Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,0
613,20182001337,Bank Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0
614,20182001337,Bank Recess,1 - Client lost,Pending Liquidation Request,BOC,BACK-OFFICE,0
615,20182001491,Client Recess,1 - Client lost,Request created,00877,APPLICANT,1
616,20182001491,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),00037,DIRECTOR,1
617,20182001491,Client Recess,1 - Client lost,Service closure Request with network responsib...,00037,APPLICANT,1
618,20182001491,Client Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,1
619,20182001491,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,1
620,20182001491,Client Recess,1 - Client lost,Back-Office Adjustment Requested,BOC,BACK-OFFICE,0
621,20182001491,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0


In [36]:
# Generate counterfactuals for one training row (position 619) and display only the changed features.
sidx = 619
eidx = 620

query_instances = X_train[sidx: eidx]
# The permitted ACTIVITY values leave out 'Back-Office Adjustment Requested' (the outcome activity).
# NOTE(review): total_CFs is hard-coded to 15 here instead of using TOTAL_CFS.
cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=15, desired_class="opposite", features_to_vary=cols_to_vary,
                                                permitted_range = {"ACTIVITY": ['Service closure Request with network responsibility',
                                                                            'Service closure Request with BO responsibility',
                                                                            'Pending Request for Reservation Closure', 'Pending Liquidation Request',
                                                                            'Request created','Authorization Requested', 'Evaluating Request (NO registered letter)',
                                                                            'Network Adjustment Requested', 'Evaluating Request (WITH registered letter)',
                                                                            'Pending Request for Network Information']})  # 'Back-Office Adjustment Requested'

cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:01<00:00,  1.39s/it]

Query instance (original outcome : 1)





Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,26354849,810,45978,Friday,64,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1



Diverse Counterfactual set (new outcome: 0.0)


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,-,-,-,-,APPLICANT,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
1,-,-,Service closure Request with BO responsibility,00096,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
2,-,-,Evaluating Request (WITH registered letter),00208,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
3,-,-,-,00207,DIRECTOR,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
4,-,-,-,00396,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
5,-,-,-,00594,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
6,-,-,Service closure Request with network responsib...,00533,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
7,-,-,-,00204,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
8,-,-,-,00535,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
9,-,-,-,00652,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0


## Apply Transition System Validation to the above Single Query Results

In [37]:
## Apply Transition System Validation to the above Single Query Results
# === Load the Transition Graph
# Built from the full event log `df`; only the second return value of transition_system() is used.
# The graph is indexed by a prefix-of-activities string (see validate_transition).
_, transition_graph = transition_system(df, case_id_name=case_id_name, activity_column_name=activity_column_name, window_size=WINDOW_SIZE)

def get_prefix_of_activities(expected_activity_index=None, event_log=None, df_single_trace=None, window_size=3, activity_column_name=None):
    """Retrieve the prefix of activities that precedes an expected activity.

    The prefix can later be used to validate the next activity. The function
    supports two modes:
      1. `df_single_trace` is given: the prefix is taken from that trace.
      2. otherwise: the trace is singled out of `event_log` via the case id of
         the row at position `expected_activity_index`.

    Args:
        expected_activity_index (int): Index of the expected activity inside
            `event_log`. Only used in mode 2.
        event_log (pd.DataFrame): Dataframe containing many traces, e.g. df_train.
            Only used in mode 2. NOTE(review): it is sliced by position and then by
            label, which presumably requires a default 0..n-1 index; confirm.
        df_single_trace (pd.DataFrame): A dataframe that contains a single trace,
            e.g. a running trace or a test trace. Its index must start from 0 and
            it needs at least 2 rows: the last row is assumed to be the expected
            activity, so the prefix ends at the 2nd last row. When given,
            `event_log` and `expected_activity_index` are ignored.
        window_size (int): Forwarded to `indexs_for_window`.
        activity_column_name (str): Name of the activity column. Required.

    Returns:
        The prefix of activities converted by `list_to_str`.

    Raises:
        TypeError: if a required argument is missing.
        KeyError: if `df_single_trace` has no row labelled 0.
        IndexError: if `df_single_trace` has fewer than 2 rows.
    """
    # Error checking. Raising a bare string is itself a TypeError in Python 3,
    # so TypeError keeps the exception type and adds the intended message.
    if activity_column_name is None:
        raise TypeError("Please specify activity_column_name")

    if df_single_trace is not None:
        # Check if indexes start from 0
        if 0 not in df_single_trace.index:
            raise KeyError("df_single_trace must have an index that starts from 0")
        if len(df_single_trace) < 2:
            raise IndexError("df_single_trace needs at least 2 rows: a prefix and the expected activity")

        trace = df_single_trace
        # Due to assumption that last activity is the expected activity so the prefix ends at the 2nd last activity
        index_to_previous_activity = df_single_trace.index[-2]
    else:
        if event_log is None:
            raise TypeError("Please specify event_log!")
        if expected_activity_index is None:
            raise TypeError("Please specify expected_activity_index!")

        query_case_id = get_case_id( event_log[expected_activity_index: expected_activity_index+1] )

        # Isolate the query_case_id trace
        trace = event_log[ event_log[case_id_name] == query_case_id ]

        # Prefix ends before the expected activity timestamp
        index_to_previous_activity = expected_activity_index - 1

    start_index, end_index = indexs_for_window(index_to_previous_activity, window_size=window_size, end_exclusive=False)
    # loc is used to access the index values inside the dataframe
    prefix_of_activities = trace.loc[start_index: end_index, activity_column_name].to_list()
    return list_to_str(prefix_of_activities)


def validate_transition(cfe, prefix_of_activities=None, transition_graph=None, valid_resources=None):
    """Filter DiCE counterfactuals down to those that are feasible in the process.

    A counterfactual is kept only if
      1. its activity is among `transition_graph[prefix_of_activities]`, and
      2. its values in the global `resource_columns_to_validate` columns form a
         tuple that is contained in `valid_resources`.

    Args:
        cfe (dice_ml.counterfactual_explanations.CounterfactualExplanations): Dice
            counterfactual explanations object. Only the first query instance
            (`cf_examples_list[0]`) is validated.
        prefix_of_activities: Key into `transition_graph`, e.g. the value returned by
            `get_prefix_of_activities`.
        transition_graph: Mapping from a prefix of activities to the activities
            allowed next.
        valid_resources: Iterable of hashable tuples with the allowed
            (activity, resource, ...) combinations.

    Returns:
        pd.DataFrame: Copy of the counterfactuals that passed both checks, with a
        fresh 0-based index. The frame stored in `cfe` is not modified.

    Raises:
        TypeError: if one of the arguments is missing.
        KeyError: if `prefix_of_activities` is not a key of `transition_graph`
            (only looked up when there is at least one counterfactual).
    """
    # Raising a bare string is itself a TypeError in Python 3, so TypeError keeps
    # the exception type and adds the intended message.
    if cfe is None:
        raise TypeError("Please specify cfe!")
    if valid_resources is None:
        raise TypeError("Please specify valid_resources!")
    if transition_graph is None:
        raise TypeError("Please specify transition_graph")
    if prefix_of_activities is None:
        raise TypeError("Please specify prefix_of_activities")

    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()  # Get the counterfactual explanations dataframe from the object

    # === Verify the next activity
    indexes_to_drop = [
        i for i, suggested_next_activity in cf_examples_df[activity_column_name].items()
        if suggested_next_activity not in transition_graph[prefix_of_activities]
    ]
    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)

    # === Verify the associated resources
    # A set gives constant-time membership instead of scanning the list for every row.
    allowed_combos = set(valid_resources)
    resource_rows = cf_examples_df[ resource_columns_to_validate ].itertuples(index=True, name=None)
    indexes_to_drop = [row[0] for row in resource_rows if row[1:] not in allowed_combos]

    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)
    return cf_examples_df

In [38]:
# list( filter(lambda x: x[4] == "nl" and x[5] == "Netherlands", valid_resources) )

In [230]:
# NOTE(review): `cf_examples_df` is never assigned at notebook level in the visible cells (it is only a
# local inside validate_transition), so this cell fails on a fresh kernel. The captured output below
# shows columns of a different dataset, so it was presumably produced by an older run; confirm.
cfe.visualize_as_dataframe(show_only_changes=True)
print(f"Valid Counterfactual Set")
cf_examples_df

Query instance (original outcome : 41260204)


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Assigned,E_10,Org line C,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,41260204.0



Diverse Counterfactual set (new outcome: [0, 37134182])


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,-,-,V3_3,Org line G1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
1,-,-,E_6,Org line G1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
2,-,-,E_7,Org line V10,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
3,-,-,-,Org line H,V50 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
4,-,-,A2_4,Org line V11,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
5,-,Wait,-,Org line V2,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
6,-,-,A2_1,Org line V9,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Assigned,E_7,Org line V10,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
1,Accepted,Assigned,E_10,Org line H,V50 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
2,Accepted,Assigned,A2_4,Org line V11,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
3,Accepted,Wait,E_10,Org line V2,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
4,Accepted,Assigned,A2_1,Org line V9,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0


### Check Feature Importance score

In [None]:
# NOTE(review): `total_time_upper_bound` is not defined anywhere in the visible cells, so this cell
# raises NameError on a fresh kernel. The model above is a classifier, while `desired_range` is
# presumably meant for regression targets; confirm against the DiCE documentation.
imp = exp_genetic_iris.local_feature_importance(query_instances, total_CFs=10, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)
imp.local_importance

## Experiment with Multiple Queries

In [17]:
# NOTE(review): `i` is first assigned by the experiment loop further down, so this cell depends on
# out-of-order execution. The captured output shows a different dataset.
test_cases[i+1]

Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,1-643733486,1326813444,Accepted,In Progress,A2_1,Org line C,D6,Medium,PROD542,nl,Belgium,0.0,0.0,55044.0,Tuesday,9965027.0,0,0,0,0,0,0,0,0,0,0,0,0,0,9965027.0
1,1-643733486,1326813477,Accepted,In Progress,A2_1,Org line C,D6,Medium,PROD542,nl,Belgium,33.0,33.0,55077.0,Tuesday,9964994.0,1,0,0,0,0,0,0,0,0,0,0,0,0,9965027.0
2,1-643733486,1326813558,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Belgium,114.0,81.0,55158.0,Tuesday,9964913.0,2,0,0,0,0,0,0,0,0,0,0,0,0,9965027.0
3,1-643733486,1326977851,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD542,nl,Netherlands,164407.0,164293.0,46651.0,Thursday,9800620.0,2,1,0,0,0,0,0,0,0,0,0,0,0,9965027.0
4,1-643733486,1326977939,Queued,Awaiting Assignment,A2_1,Org line C,D6,Medium,PROD542,nl,Netherlands,164495.0,88.0,46739.0,Thursday,9800532.0,3,1,0,0,0,0,0,0,0,0,0,0,0,9965027.0
5,1-643733486,1326982060,Accepted,In Progress,A2_1,Org line C,D6,Medium,PROD542,nl,Belgium,168616.0,4121.0,50860.0,Thursday,9796411.0,3,2,0,0,0,0,0,0,0,0,0,0,0,9965027.0
6,1-643733486,1326982072,Accepted,Assigned,A2_1,Org line C,D6,Medium,PROD542,nl,Belgium,168628.0,12.0,50872.0,Thursday,9796399.0,4,2,0,0,0,0,0,0,0,0,0,0,0,9965027.0
7,1-643733486,1326983196,Accepted,In Progress,A2_1,Org line C,D6,Medium,PROD542,nl,Belgium,169752.0,1124.0,51996.0,Thursday,9795275.0,4,2,0,1,0,0,0,0,0,0,0,0,0,9965027.0


In [45]:
@timeout(120)  # Timeout unit seconds
def generate_cfe(query_instances, total_cfs=50):
    """Generate counterfactuals for `query_instances` with the notebook's DiCE explainer.

    Uses the globals `KPI`, `exp_genetic_iris` and `cols_to_vary`. The call is
    aborted by the `timeout` decorator after 120 seconds.

    Args:
        query_instances (pd.DataFrame): Rows to explain.
        total_cfs (int): Number of Counterfactual examples (CFEs) to produce via
            `generate_counterfactuals()`.

    Returns:
        cfe (dice_ml.counterfactual_explanations.CounterfactualExplanations): Dice counterfactual explanations object.

    Raises:
        NotImplementedError: if the global `KPI` is not "activity_occurrence".
    """
    # Previously an unsupported KPI fell through to `return cfe` with `cfe` unbound.
    if KPI != "activity_occurrence":
        raise NotImplementedError(f"generate_cfe does not support KPI={KPI!r}")

    # Activities DiCE may suggest; 'Back-Office Adjustment Requested' is left out on purpose.
    permitted_activities = ['Service closure Request with network responsibility',
                            'Service closure Request with BO responsibility',
                            'Pending Request for Reservation Closure', 'Pending Liquidation Request',
                            'Request created', 'Authorization Requested', 'Evaluating Request (NO registered letter)',
                            'Network Adjustment Requested', 'Evaluating Request (WITH registered letter)',
                            'Pending Request for Network Information']

    # total_cfs is now honoured; it used to be ignored in favour of a hard-coded 15.
    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=total_cfs, desired_class="opposite",
                                                    features_to_vary=cols_to_vary,
                                                    permitted_range={"ACTIVITY": permitted_activities})
    return cfe

In [None]:
# %%capture
# Run the experiment over the truncated test traces. Each list below collects the case ids
# (or (case id, result) pairs) that ended in the corresponding outcome.
cfe_before_validation = []
cfe_after_validation = []
cfe_not_found = []
cases_includes_new_data = []
cases_too_small = []
cases_zero_in_y = []
# NOTE: `i` is only incremented for traces that reach the end of the loop body; traces skipped
# via `continue` do not count towards the 20-iteration limit below.
i = 0
for df_test_trace in test_cases:

    query_case_id = get_case_id(df_test_trace)

    # Traces with 1 or 2 rows are skipped.
    if 0 < len(df_test_trace) <= 2:
        print("too small", i, df_test_trace[case_id_name].unique().item())
        cases_too_small.append( query_case_id )  # , multi=True
        continue

    X_test, y_test = prepare_df_for_ml(df_test_trace, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])

    # Check if y_test is 0 then don't generate CFE
    if y_test.iloc[-1] == 0:
        cases_zero_in_y.append( query_case_id )
        continue

    # Access the last row of the truncated trace to replicate the behavior of a running trace
    query_instances = X_test.iloc[-1:]

    try:
        cfe = generate_cfe( query_instances, total_cfs=TOTAL_CFS )
        cfe_before_validation.append( (query_case_id, cfe) )

        # NOTE(review): the transition graph was built with window_size=WINDOW_SIZE, but the prefix is
        # extracted with window_size=0 here; confirm this mismatch is intended.
        # NOTE(review): a KeyError from validate_transition (prefix missing from transition_graph) is
        # not handled below and would abort the whole loop.
        prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=0, activity_column_name=activity_column_name)
        cfe_df = validate_transition(cfe, prefix_of_activities=prefix_of_activities, transition_graph=transition_graph, valid_resources=valid_resources)

        if len(cfe_df) > 0:
            cfe_after_validation.append( (query_case_id, cfe_df) )

    except UserConfigValidationException:
        cfe_not_found.append( query_case_id )
    except TimeoutError as err:  # When function takes too long
        cfe_not_found.append( query_case_id )
        print("TimeoutError caught:", err)
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        cases_includes_new_data.append( query_case_id )
    # This error is seen occurring on when running lots of loops on the server
    # NOTE: the case is not recorded in any of the result lists when this happens.
    except AttributeError as e:
        print("AttributeError caught:", e)
    # except Exception as err:
    #     print(f"Broadest Exception handler invoked", err)

    # Placeholder: nothing is saved between these two messages.
    print(f"====== Start Saving the result ======")
    print(f"====== End Saving the result ======")

    # Stop after 20 processed traces.
    i+= 1
    if i == 20:
        break


In [47]:
# One-row summary: how many cases ended up in each outcome list of the experiment loop.
data = { "cfe_before_validation": [ len(cfe_before_validation) ],
         "cfe_after_validation": [ len(cfe_after_validation) ],
         "cfe_not_found": [ len(cfe_not_found) ],
         "cases_includes_new_data": [ len(cases_includes_new_data)],
         "cases_too_small": [ len(cases_too_small) ],
         "cases_zero_in_y": [ len(cases_zero_in_y)]}
df_result = pd.DataFrame(data)
df_result

Unnamed: 0,cfe_before_validation,cfe_after_validation,cfe_not_found,cases_includes_new_data,cases_too_small,cases_zero_in_y
0,2,0,18,0,0,2


In [387]:
# NOTE(review): `cfes_list` is not defined in any visible cell, so this cell fails on a fresh kernel.
# for _, cfe in cfes_list:
#     cfe.visualize_as_dataframe(show_only_changes=True)
cfes_list[9][1]

Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Resolved,V3_2,Org line C,G7 3rd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1188408.0
1,Accepted,In Progress,V3_2,Org line H,S45 2nd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1810092.0
2,Accepted,Wait,V3_2,Org line C,V33,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1833501.0
3,Accepted,In Progress,V3_2,Org line V11,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,2512714.0
4,Accepted,In Progress,V3_2,Org line C,L4 2nd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1681719.0
5,Accepted,In Progress,missing,Org line C,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1667908.0
6,Accepted,Wait - User,V3_2,Org line V8,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1819502.0


## Observation
Generating the counterfactuals usually takes no more than 2 minutes.

In [None]:
# Counterfactuals for the rows selected by the current sidx/eidx.
query_instances = X_train[sidx: eidx]
cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)
cfe.visualize_as_dataframe(show_only_changes=True)

# Counterfactuals for every training row, collected as (index label, cfe) pairs.
genetic_cfes = []

for idx in X_train.index:
    # Select the row as a one-row DataFrame. Unlike iterrows() + to_frame().transpose(),
    # this keeps the original dtype of every column instead of turning them all into object.
    query_instances = X_train.loc[[idx]]
    # query_instances = X_train[0:1]  # an interesting query`

    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)

    genetic_cfes.append( (idx, cfe) )

## Deprecated Code

### Implement Test-Code progress "save & load"

In [19]:
# Deprecated: progress checkpoint path derived from the results file name
# (text before the first "." of RESULTS_FILE_PATH_N_NAME, with a ".pkl" extension).
save_load_dir = "../" + RESULTS_FILE_PATH_N_NAME.split(".")[0]
save_load_path = save_load_dir + ".pkl"
# NOTE: this overwrites the loop counter `i` used by the experiment cells.
i = 2

# # === Pickle the progress counter so an interrupted run can be resumed
# with open( save_load_path, 'wb' ) as file:
#     pickle.dump(i, file)
# print(f"Progress Bar: {i}")

# # === Unpickle the progress counter
# with open( save_load_path, 'rb' ) as file:
#     i = pickle.load(file)
# print(f"Progress Bar: {i}")

Progress Bar: 200


In [22]:
# Scratch example: zip(*rows) transposes a list of equally long tuples.
a = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]
list( zip(*a) )

[(1, 4, 7), (2, 5, 8), (3, 6, 9)]

In [None]:
def validate_transition(cfe):
    """ Deprecated function.
    Works with transition graph which has single activity as key.

    WARNING: running this cell rebinds the name `validate_transition`, replacing the
    four-argument version defined earlier in the notebook.

    Reads the globals `current_step`, `transition_graph`, `valid_resources` and
    `activity_column_name`.

    Args:
        cfe: Dice counterfactual object.

    Returns:
        Copy of the first counterfactual frame, reduced to rows whose activity may
        follow the current activity and whose
        ("ACTIVITY", "Involved_ST_Function_Div") pair is in `valid_resources`.
    """
    candidates = cfe.cf_examples_list[0].final_cfs_df.copy()

    # expected_next_action = cfe.cf_examples_list[0].test_instance_df[activity_column_name].item()
    # print(f"Expected next action: {expected_next_action}")

    current_activity = current_step[activity_column_name].item()

    # === Verify the next activity
    unreachable_rows = [
        label
        for label, suggested_next_activity in candidates[activity_column_name].items()
        if suggested_next_activity not in transition_graph[current_activity]
    ]
    candidates = candidates.drop(unreachable_rows, axis='index').reset_index(drop=True)

    # === Verify the associated resources
    resource_pairs = candidates[ ["ACTIVITY", "Involved_ST_Function_Div"] ]
    unknown_resource_rows = [
        label
        for label, pair in resource_pairs.iterrows()
        if tuple(pair) not in valid_resources
    ]
    candidates = candidates.drop(unknown_resource_rows, axis='index').reset_index(drop=True)
    return candidates