In [24]:
from typing import Tuple, Any

import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.datasets import load_iris, fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from src.transition_system import transition_system

import pandas as pd
import os
import random
import utils
from math import ceil
from timeout_decorator import timeout, TimeoutError
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset: Bank Account Closure

## Setup Variables

In [2]:
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns= None
# Number of seconds in one hour / one day; divide a duration in seconds by these to convert it.
SECONDS_TO_HOURS = 60 * 60
SECONDS_TO_DAYS = 60 * 60 * 24
case_id_name = 'SR_Number'  # The case identifier column name.
start_date_name = 'Change_Date+Time'  # Start date column name. Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"  # Column name passed to transition_system() as the activity column.

## Load Data

In [3]:
data_dir = "../preprocessed_datasets/"
train_dataset_file = "train-set-cfe.csv"
test_dataset_file = "test-set-cfe.csv"
df_train = pd.read_csv(os.path.join(data_dir, train_dataset_file))
df_test = pd.read_csv(os.path.join(data_dir, test_dataset_file))

# === Load the Transition Graph
_, transition_graph = transition_system(df_train, case_id_name=case_id_name, activity_column_name=activity_column_name)

### Basic Preprocessing of the dataset

In [17]:
len(df_train)

10000

In [5]:
df_train = df_train[:10_000]  # 31_289
# df_test = df_test[: 19_041]
print(f"Rows in df_train: {len(df_train)}")
df_train.info()

Rows in df_train: 10000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 30 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   SR_Number                         10000 non-null  object 
 1   Change_Date+Time                  10000 non-null  int64  
 2   Status                            10000 non-null  object 
 3   ACTIVITY                          10000 non-null  object 
 4   Involved_ST_Function_Div          10000 non-null  object 
 5   Involved_Org_line_3               10000 non-null  object 
 6   Involved_ST                       10000 non-null  object 
 7   SR_Latest_Impact                  10000 non-null  object 
 8   Product                           10000 non-null  object 
 9   Country                           10000 non-null  object 
 10  Owner_Country                     10000 non-null  object 
 11  time_from_first                   10000 non-

### Prepare the Test Dataset

In [6]:
def get_test(df, case_id_name, seed=None):
    """
    Simulate running cases by cutting every trace in `df` down to a random prefix.

    Args:
        df (pd.DataFrame): event log with one row per event.
            (Rows of a case are assumed to already be in temporal order — TODO confirm.)
        case_id_name (str): column identifying the case (trace) a row belongs to.
        seed (int | None): optional seed for a private random generator, so that the
            cuts are reproducible. When None the module-level `random` state is used,
            exactly as before.
    Returns:
        list[pd.DataFrame]: one dataframe per case, in order of first appearance. Each
        holds the first 50%-70% (rounded up) of the case's rows with a fresh 0..n-1 index.

    Note:
        Rows whose case id is missing (NaN) are skipped by `groupby`; previously they
        produced an empty dataframe in the result list.
    """
    rng = random if seed is None else random.Random(seed)
    result_lst = []

    # One groupby pass instead of re-filtering the whole frame once per case id.
    # sort=False keeps the cases in order of first appearance, like `unique()` did.
    for _, df_trace in df.groupby(case_id_name, sort=False):
        # ceil enables cases with 1 row to pass through
        cut = ceil(len(df_trace) * rng.uniform(0.5, 0.7))
        result_lst.append(df_trace.iloc[:cut].reset_index(drop=True))
    return result_lst

test_cases = get_test(df_test, case_id_name)

In [7]:
# ### Features that can vary
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']
# cols_to_vary.extend(["ACTIVITY"])

# option 2:
cols_to_vary = ["ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST"]

outcome_name = "lead_time"
def prepare_df_for_ml(df, outcome_name, columns_to_remove=None, case_id_column=None) -> tuple[pd.DataFrame, pd.Series]:
    """
    Split an event-log dataframe into model features and target.

    :param pd.DataFrame df: event log; the caller's dataframe is not modified.
    :param str outcome_name: name of the target column.
    :param columns_to_remove: columns that are not needed for the ML model and are dropped
        before training. Defaults to ["Change_Date+Time", "time_remaining"].
    :param str case_id_column: case identifier column to drop. Defaults to the
        notebook-level `case_id_name`.
    :return: (X, y) — the feature dataframe and the target series.
    """
    if columns_to_remove is None:
        columns_to_remove = ["Change_Date+Time", "time_remaining"]
    if case_id_column is None:
        # Fall back to the notebook-level setting, as the original implementation did.
        case_id_column = case_id_name

    reduced = df.drop(columns=[case_id_column]).drop(columns=columns_to_remove)
    X = reduced.drop(columns=[outcome_name])
    y = reduced[outcome_name]
    return X, y

X_train, y_train = prepare_df_for_ml(df_train, outcome_name)
# X_test, y_test = prepare_df_for_ml(df_test, outcome_name)

continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "# ACTIVITY=In Progress", "# ACTIVITY=Awaiting Assignment",
                       "# ACTIVITY=Resolved", "# ACTIVITY=Assigned", "# ACTIVITY=Closed", "# ACTIVITY=Wait - User", "# ACTIVITY=Wait - Implementation", "# ACTIVITY=Wait",
                       "# ACTIVITY=Wait - Vendor", "# ACTIVITY=In Call", "# ACTIVITY=Wait - Customer", "# ACTIVITY=Unmatched", "# ACTIVITY=Cancelled"]
categorical_features = ["Status", "ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST", "SR_Latest_Impact", "Product", "Country", "Owner_Country",
                        "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
# Numeric features are standardised.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Categorical features are one-hot encoded; handle_unknown='ignore' makes categories that
# were not seen during fit encode as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Only the columns listed in continuous_features / categorical_features are routed to a transformer.
transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append the estimator to the preprocessing pipeline.
# Now we have a full prediction pipeline.
# NOTE: the final step is named 'classifier' but it is a RandomForestRegressor.
# NOTE(review): no random_state is set, so the fitted model differs between runs.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', RandomForestRegressor(n_jobs=7))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
# DiCE data interface: the training features and the outcome column in a single dataframe.
# (The `_iris` suffixes are historical names; they are kept because other cells reference them.)
d_iris = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
m_iris = dice_ml.Model(model=model, backend="sklearn", model_type='regressor')

# `method` is the single switch for the counterfactual search strategy: the explainer is
# built from it, and the `if method == "genetic"` branches in later cells test the same
# variable, so the two can no longer disagree.
method = "genetic"  # genetic, kdtree, random
# Method random does not support features_weights argument in generate_counterfactuals()
exp_genetic_iris = Dice(d_iris, m_iris, method=method)

In [8]:
def get_query_instance(sidx=14, eidx=16):
    """
    Slice two consecutive rows out of the notebook-level `X_train`.

    Args:
        sidx (int): position of the row that represents the current action.
        eidx (int): exclusive end position; must equal `sidx + 2`.
    Returns:
        tuple: (current_step, expected_next_step), each a row slice of `X_train`.
    """
    assert eidx - sidx == 2, "One row represents the current action and the next one represents the suggested action"
    current_rows = slice(sidx, sidx + 1)
    next_rows = slice(eidx - 1, eidx)
    return X_train[current_rows], X_train[next_rows]


def activity_n_resources(df, resources_columns=None):
    """
    Collect every distinct combination of values seen in the given columns.

    Args:
        df (pd.DataFrame):
        resources_columns (list): columns that contains the activity and resources.
    Returns:
        Set of tuples. A single element contains the activity and resources of a single row from the
        dataframe, in the order given by `resources_columns`.
    Raises:
        TypeError: if `resources_columns` is not given.
    """
    if resources_columns is None:
        raise TypeError("Please specify the columns that have resources")

    # itertuples(name=None) yields plain tuples directly; this avoids building one Series
    # per row the way a row-wise apply(tuple, axis='columns') does.
    return set(df[resources_columns].itertuples(index=False, name=None))

current_step, query_instances = get_query_instance(14, 16)

# valid_resources = activity_n_resources(df_train, ["ACTIVITY", "Involved_ST_Function_Div", "Involved_Org_line_3", "Involved_ST"])
valid_resources = activity_n_resources(df_train, ["ACTIVITY", "Involved_ST_Function_Div"])
# len(valid_resources)

## Experiment with Single Queries

In [9]:
sidx = 38
eidx = 48

X_train[sidx: eidx]
# df_train[sidx: eidx]

Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled
38,Completed,Resolved,V3_2,Org line C,S42,Medium,PROD453,se,Sweden,40628717.0,48.0,57459.0,Tuesday,18,6,0,4,0,3,3,4,0,0,0,0,0
39,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD706,nl,Belgium,0.0,0.0,55066.0,Thursday,0,0,0,0,0,0,0,0,0,0,0,0,0
40,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD706,nl,Belgium,6.0,6.0,55072.0,Thursday,1,0,0,0,0,0,0,0,0,0,0,0,0
41,Queued,Awaiting Assignment,missing,Org line V7n,V37 2nd,Medium,PROD706,nl,Belgium,109.0,103.0,55175.0,Thursday,2,0,0,0,0,0,0,0,0,0,0,0,0
42,Accepted,In Progress,missing,Org line V7n,V37 2nd,Medium,PROD706,nl,Netherlands,336532.0,336423.0,45998.0,Monday,2,1,0,0,0,0,0,0,0,0,0,0,0
43,Queued,Awaiting Assignment,A2_1,Org line C,D5,Medium,PROD706,nl,Netherlands,336576.0,44.0,46042.0,Monday,3,1,0,0,0,0,0,0,0,0,0,0,0
44,Accepted,In Progress,A2_1,Org line C,D5,Medium,PROD706,nl,Belgium,336626.0,50.0,46092.0,Monday,3,2,0,0,0,0,0,0,0,0,0,0,0
45,Queued,Awaiting Assignment,missing,Org line V7n,V32 2nd,Medium,PROD706,nl,Belgium,336938.0,312.0,46404.0,Monday,4,2,0,0,0,0,0,0,0,0,0,0,0
46,Accepted,In Progress,missing,Org line V7n,V32 2nd,Medium,PROD706,nl,Netherlands,5964591.0,5627653.0,58057.0,Wednesday,4,3,0,0,0,0,0,0,0,0,0,0,0
47,Queued,Awaiting Assignment,missing,Org line V7n,V32 2nd,Medium,PROD706,nl,Netherlands,30925908.0,24961317.0,49774.0,Friday,5,3,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Query a single training row: ask for counterfactuals whose predicted lead time is at most
# 90% of the row's actual lead time.
# NOTE: validate_transition() reads `current_step`, which was set by get_query_instance(14, 16)
# to row 14, i.e. the row right before this query row. If sidx changes, refresh current_step too.
sidx = 15
eidx = 16
total_time_upper_bound = int( y_train[sidx] * (90 / 100) )  # A percentage of the original total time of the trace
query_instances = X_train[sidx: eidx]
if method == "genetic":
    # Equal weight for every feature that is allowed to vary.
    feature_weights = {"ACTIVITY": 1.0, "Involved_ST_Function_Div": 1.0, "Involved_Org_line_3": 1.0, "Involved_ST": 1.0}
    # feature_weights = {"ACTIVITY": 1, "Involved_ST_Function_Div": 1}
    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=7, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary,
                                                    feature_weights=feature_weights)

else:
    # Same call without feature_weights for the other search methods.
    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=7, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)

cfe.visualize_as_dataframe(show_only_changes=True)

  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
  query_instance_df_dummies[col] = 0
 

Query instance (original outcome : 41260203)





Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Assigned,E_10,Org line C,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7.0,2.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,41260203.0



Diverse Counterfactual set (new outcome: [0, 37134182])


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,-,Resolved,V3_2,-,G273 3rd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42
0,-,Resolved,A2_1,-,D5,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42
0,-,Resolved,A2_1,-,G49,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42
0,-,Resolved,A2_1,-,V26,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42
0,-,Resolved,-,Org line G3,G46 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,34648118.1
0,-,Resolved,V3_2,-,G97,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42
0,-,Resolved,V3_2,-,N42,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,35054133.42


## Apply Transition System Validation to Single Query Results

In [18]:
def validate_transition(cfe, current_activity=None):
    """
    Keep only the counterfactuals that are feasible according to the training log.

    A counterfactual row survives when both hold:
      1. its activity is contained in `transition_graph[current_activity]`, and
      2. its (ACTIVITY, Involved_ST_Function_Div) pair is contained in `valid_resources`.

    Args:
        cfe: explanation object returned by `generate_counterfactuals`; only
            `cfe.cf_examples_list[0].final_cfs_df` is read.
        current_activity: activity the case is in before the suggested step. Defaults to the
            activity of the notebook-level `current_step`, which was the only behaviour before.
    Returns:
        pd.DataFrame: the surviving counterfactuals, re-indexed from 0.
    """
    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()

    if current_activity is None:
        current_activity = current_step[activity_column_name].item()

    # === Verify the next activity
    try:
        allowed_next_activities = transition_graph[current_activity]
    except KeyError:
        # The current activity has no entry in the transition graph, so no suggested
        # activity can be validated against it: every counterfactual is rejected.
        allowed_next_activities = ()
    activity_ok = [
        suggested in allowed_next_activities
        for suggested in cf_examples_df[activity_column_name]
    ]

    # === Verify the associated resources
    # These columns must match the ones `valid_resources` was built from.
    resource_columns = ["ACTIVITY", "Involved_ST_Function_Div"]
    resource_ok = [
        pair in valid_resources
        for pair in cf_examples_df[resource_columns].itertuples(index=False, name=None)
    ]

    # One boolean mask replaces the two collect-indexes-then-drop passes.
    keep = [a and r for a, r in zip(activity_ok, resource_ok)]
    return cf_examples_df.loc[keep].reset_index(drop=True)

In [371]:
cfe.cf_examples_list[0].test_instance_df

Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,In Progress,V3_2,Org line C,G97,Medium,PROD424,se,POLAND,1031939.0,429.0,35391.0,Monday,9,4,0,2,0,2,1,0,0,0,0,0,0,7098648.0


In [230]:
cfe.visualize_as_dataframe(show_only_changes=True)
print(f"Valid Counterfactual Set")
cf_examples_df

Query instance (original outcome : 41260204)


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Assigned,E_10,Org line C,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,41260204.0



Diverse Counterfactual set (new outcome: [0, 37134182])


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,-,-,V3_3,Org line G1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
1,-,-,E_6,Org line G1,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
2,-,-,E_7,Org line V10,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
3,-,-,-,Org line H,V50 2nd,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
4,-,-,A2_4,Org line V11,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
5,-,Wait,-,Org line V2,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0
6,-,-,A2_1,Org line V9,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,32859970.0


Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Assigned,E_7,Org line V10,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
1,Accepted,Assigned,E_10,Org line H,V50 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
2,Accepted,Assigned,A2_4,Org line V11,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
3,Accepted,Wait,E_10,Org line V2,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0
4,Accepted,Assigned,A2_1,Org line V9,G140 2nd,Medium,PROD453,se,Sweden,3617038.0,321053.0,24980.0,Monday,7,2,0,1,0,2,2,1,0,0,0,0,0,32859970.0


### Check Feature Importance score

In [47]:
imp = exp_genetic_iris.local_feature_importance(query_instances, total_CFs=10, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)
imp.local_importance

100%|██████████| 1/1 [00:00<00:00,  1.63it/s]


[{'Involved_ST': 1.0,
  'ACTIVITY': 0.3,
  'Involved_ST_Function_Div': 0.3,
  'Involved_Org_line_3': 0.1,
  'Status': 0.0,
  'SR_Latest_Impact': 0.0,
  'Product': 0.0,
  'Country': 0.0,
  'Owner_Country': 0.0,
  'weekday': 0.0,
  'time_from_first': 0.0,
  'time_from_previous_et': 0.0,
  'time_from_midnight': 0.0,
  '# ACTIVITY=In Progress': 0.0,
  '# ACTIVITY=Awaiting Assignment': 0.0,
  '# ACTIVITY=Resolved': 0.0,
  '# ACTIVITY=Assigned': 0.0,
  '# ACTIVITY=Closed': 0.0,
  '# ACTIVITY=Wait - User': 0.0,
  '# ACTIVITY=Wait - Implementation': 0.0,
  '# ACTIVITY=Wait': 0.0,
  '# ACTIVITY=Wait - Vendor': 0.0,
  '# ACTIVITY=In Call': 0.0,
  '# ACTIVITY=Wait - Customer': 0.0,
  '# ACTIVITY=Unmatched': 0.0,
  '# ACTIVITY=Cancelled': 0.0}]

## Experiment with Multiple Queries

In [46]:
test_cases[i+1]

Unnamed: 0,SR_Number,Change_Date+Time,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,time_remaining,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,1-737405220,1335902801,Queued,Awaiting Assignment,A2_3,Org line A2,G331 3rd,Medium,PROD5,se,0,0.0,0.0,72401.0,Tuesday,702537.0,0,0,0,0,0,0,0,0,0,0,0,0,0,702537.0


In [36]:
def get_case_id(df, case_id_name=case_id_name, multi=False):
    """
    Return the single case identifier contained in `df`.

    Args:
        df (pd.DataFrame): rows of one case.
        case_id_name (str): case identifier column; defaults to the notebook-level setting.
        multi (bool): accepted but not used.
    Returns:
        The one distinct value of the case identifier column. `.item()` fails when the
        column does not hold exactly one distinct value.
    """
    distinct_ids = df[case_id_name].unique()
    return distinct_ids.item()

@timeout(120)  # Abort the search after 120 seconds
def generate_cfe(query_instances, total_time_upper_bound):
    """
    Run the DiCE counterfactual search for `query_instances` under a time limit.

    The search targets an outcome in [0, total_time_upper_bound] and may only vary
    `cols_to_vary`. `proximity_weight`, `sparsity_weight` and `diversity_weight` are read
    from the notebook namespace and must be defined before this is called.

    Returns:
        The object returned by `exp_genetic_iris.generate_counterfactuals`.
    """
    shared_kwargs = dict(
        desired_range=[0, total_time_upper_bound],
        features_to_vary=cols_to_vary,
        proximity_weight=proximity_weight,
        sparsity_weight=sparsity_weight,
        diversity_weight=diversity_weight,
    )
    if method == "genetic":
        # Only this branch passes feature_weights; it also asks for 10 counterfactuals instead of 7.
        equal_weights = {"ACTIVITY": 1, "Involved_ST_Function_Div": 1, "Involved_Org_line_3": 1, "Involved_ST": 1}
        return exp_genetic_iris.generate_counterfactuals(
            query_instances, total_CFs=10, feature_weights=equal_weights, **shared_kwargs
        )
    return exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=7, **shared_kwargs)

In [None]:
%%capture
i = 0
cfes_list = []
cases_cfe_not_found = []
cases_includes_new_data = []
cases_too_small = []
for df_test_trace in test_cases:
    # i+= 1
    # if i == 20:
    #     break

    if 0 < len(df_test_trace) <= 2:
        print("too small", i, df_test_trace[case_id_name].unique().item())
        cases_too_small.append( get_case_id(df_test_trace, multi=True) )
        continue

    X_test, y_test = prepare_df_for_ml(df_test_trace, outcome_name)
    # Access the last row of the truncated trace to replicate the behavior of a running trace
    total_time_upper_bound = int( y_test.iloc[-1] * (90 / 100) )  # A percentage of the original total time of the trace
    query_instances = X_test.iloc[-1:]
    proximity_weight = 0.2
    sparsity_weight = 0.2
    diversity_weight = 5.0
    try:

        cfe = generate_cfe(query_instances, total_time_upper_bound)

        cfe = validate_transition(cfe)

        if len(cfe) > 0:
            cfes_list.append( (get_case_id(df_test_trace), cfe) )

    except UserConfigValidationException:
        cases_cfe_not_found.append(get_case_id(df_test_trace))
    except TimeoutError:  # When function takes too long
        cases_cfe_not_found.append(get_case_id(df_test_trace))
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        cases_includes_new_data.append(get_case_id(df_test_trace))



In [None]:
len(cfes_list)

In [387]:
# for _, cfe in cfes_list:
#     cfe.visualize_as_dataframe(show_only_changes=True)
cfes_list[9][1]

Unnamed: 0,Status,ACTIVITY,Involved_ST_Function_Div,Involved_Org_line_3,Involved_ST,SR_Latest_Impact,Product,Country,Owner_Country,time_from_first,time_from_previous_et,time_from_midnight,weekday,# ACTIVITY=In Progress,# ACTIVITY=Awaiting Assignment,# ACTIVITY=Resolved,# ACTIVITY=Assigned,# ACTIVITY=Closed,# ACTIVITY=Wait - User,# ACTIVITY=Wait - Implementation,# ACTIVITY=Wait,# ACTIVITY=Wait - Vendor,# ACTIVITY=In Call,# ACTIVITY=Wait - Customer,# ACTIVITY=Unmatched,# ACTIVITY=Cancelled,lead_time
0,Accepted,Resolved,V3_2,Org line C,G7 3rd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1188408.0
1,Accepted,In Progress,V3_2,Org line H,S45 2nd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1810092.0
2,Accepted,Wait,V3_2,Org line C,V33,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1833501.0
3,Accepted,In Progress,V3_2,Org line V11,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,2512714.0
4,Accepted,In Progress,V3_2,Org line C,L4 2nd,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1681719.0
5,Accepted,In Progress,missing,Org line C,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1667908.0
6,Accepted,Wait - User,V3_2,Org line V8,G49,Medium,PROD562,se,Sweden,600134.0,401.0,45412.0,Monday,6,7,0,0,0,1,0,0,0,0,0,0,0,1819502.0


## Observation
The output usually comes within 2 minutes

#### TODOS
- Check how many queries (rows) have valid CFEs
- How to prevent the CFE's ACTIVITY column from taking the value "Pending Liquidation request"
- Discover how DiCE handles constraints (for us: enforce them in a post-processing step)
- CE_OU - resource column, you can modify it
- Modify just the resource (col: CE_UO) & Activity (col: ACTIVITY)
- build a transition system to validate activity and then resource
- Do the above for total time case
- I
- Implement activity validation and resource validation to the above
-   To do this effectively, maybe figure out how DiCE post-processes the constraints and implement the validation in that layer

In [None]:
# Leftover experiment cell (never executed in the saved notebook).
# NOTE(review): the model is registered with model_type='regressor' and every other cell passes
# desired_range; desired_class="opposite" looks like a carry-over from a classification
# example — confirm it is valid here before running.
query_instances = X_train[sidx: eidx]
cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)
cfe.visualize_as_dataframe(show_only_changes=True)

# One counterfactual search per training row, collected as (row label, explanation) pairs.
genetic_cfes = []

for idx, query_instances in X_train.iterrows():
    # Turn the row Series back into a one-row dataframe.
    # NOTE(review): iterrows() does not preserve per-column dtypes, so this frame may be
    # all-object — verify that the pipeline and DiCE accept it.
    query_instances = query_instances.to_frame().transpose()
    # query_instances = X_train[0:1]  # an interesting query

    cfe = exp_genetic_iris.generate_counterfactuals(query_instances, total_CFs=5, desired_class="opposite", features_to_vary=cols_to_vary)

    genetic_cfes.append( (idx, cfe) )