In [1]:
import dice_ml
from dice_ml import Dice
from dice_ml.utils.exception import UserConfigValidationException

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing

from src.transition_system import transition_system, indexs_for_window, list_to_str
from src.function_store import StoreTestRun, extract_algo_name, generate_cfe, get_case_id, prepare_df_for_ml, \
    activity_n_resources, get_test_cases, get_prefix_of_activities, validate_transition

import pandas as pd
import os
import pickle
import random
import subprocess
from IPython.display import display
from math import ceil
import warnings
from wrapt_timeout_decorator import timeout

# from multiprocessing import Process
# from timeout_decorator import timeout, TimeoutError
# import signal
from typing import Tuple, Any, List, Union
from collections import Counter
import utils
from time import sleep
if "src" in os.getcwd():
    os.chdir("../")

# Suppress all warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

# Dataset: Bank Account Closure

## Setup Variables

In [8]:
# --- Display options ---------------------------------------------------------
pd.set_option("display.max_columns", None)   # show every column of wide frames

# --- Unit conversions --------------------------------------------------------
SECONDS_TO_HOURS = 3600
SECONDS_TO_DAYS = 24 * 3600

# --- Experiment parameters ---------------------------------------------------
KPI = "activity_occurrence"  # activity_occurrence, ...
WINDOW_SIZE = 3
REDUCED_KPI_TIME = 90
TOTAL_CFS = 15                       # Number of CFs the DiCE algorithm should produce
TRAIN_DATA_SIZE = 17_335             # 170_335
DICE_METHOD = "random"
RESULTS_FILE_PATH_N_NAME = "experiment_results/random-a01.csv"

# --- DiCE weights ------------------------------------------------------------
# TODO: store the model configuration in the pkl file as well and retrieve it from there.
proximity_weight, sparsity_weight, diversity_weight = 0.2, 0.2, 5.0

# --- Event-log column names --------------------------------------------------
case_id_name = "REQUEST_ID"  # The case identifier column name.
# start_date_name = 'Change_Date+Time'  # Maybe change to start_et (start event time)
activity_column_name = "ACTIVITY"

## Load Data

In [9]:
data_dir = "./preprocessed_datasets/"
train_dataset_file, test_dataset_file = "bank_acc_train.csv", "bank_acc_test.csv"
test_pickle_dataset_file = "bank_acc-test.pkl"

# The full event log is used for the transition system.
df = pd.read_csv("./data/completed.csv")

# Train and test splits both live in `data_dir`.
df_train, df_test = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in (train_dataset_file, test_dataset_file)
)

### Basic Preprocessing of the dataset

In [10]:
# Replace missing values in the full log with an explicit "missing" marker.
df = df.fillna(value="missing")

# Keep only the first TRAIN_DATA_SIZE training rows.  # 31_066
df_train = df_train.iloc[:TRAIN_DATA_SIZE]
# df_test = df_test[: 19_041]

print(
    f"Rows in df_train: {len(df_train):,}",
    f"Rows in df_test: {len(df_test):,}",
    sep="\n",
)
# df_train.info()

Rows in df_train: 17,335
Rows in df_test: 6,391


### Helper functions

In [11]:
# Columns checked together when validating counterfactuals: the activity plus its two resource columns.
# `valid_resources` is computed on the full log `df` and later handed to validate_transition
# in the multi-query loop. NOTE(review): threshold_percentage=90 is passed through to
# activity_n_resources unchanged — confirm its exact meaning against src/function_store.
resource_columns_to_validate = [activity_column_name, 'CE_UO', 'ROLE']
valid_resources = activity_n_resources(df, resource_columns_to_validate, threshold_percentage=90)
# len(valid_resources)

In [12]:
# # === Analysis of all the columns in the dataset
# variable_type_analysis(df, case_id_name=case_id_name, activity_name=activity_column_name)

### Prepare the Test Dataset

In [13]:
# === Load the standard (pickled) test set so that every parameter setting is evaluated on the same cases.
test_cases = get_test_cases(
    None,
    None,
    load_dataset=True,
    path_and_filename=os.path.join(data_dir, test_pickle_dataset_file),
)
print("Number of cases in the test set: {:,}".format(len(test_cases)))

# # === Pickle dataset for comparison of different methods
# with open(os.path.join(data_dir, test_pickle_dataset_file), 'wb') as file:
#     pickle.dump(test_cases, file)

Number of cases in the test set: 1096


In [14]:
# ### Features that can vary
# option 1:
# cols_to_vary = [col for col in df_train.columns if col[0] == '#']

# Features DiCE is allowed to change when generating counterfactuals.
cols_to_vary = ["ACTIVITY", "CE_UO", "ROLE"]

outcome_name = "Back-Office Adjustment Requested"

# Seed for the random forest so that re-running the notebook fits the same model.
RANDOM_SEED = 42

X_train, y_train = prepare_df_for_ml(df_train, case_id_name, outcome_name, columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])

# X_test, y_test = prepare_df_for_ml(df_test, outcome_name)

# Numeric inputs: the time features plus one "# ACTIVITY=<name>" column per activity.
continuous_features = ["time_from_first", "time_from_previous_et", "time_from_midnight", "activity_duration", '# ACTIVITY=Service closure Request with network responsibility',
                    '# ACTIVITY=Service closure Request with BO responsibility', '# ACTIVITY=Pending Request for Reservation Closure', '# ACTIVITY=Pending Liquidation Request',
                    '# ACTIVITY=Request completed with account closure', '# ACTIVITY=Request created', '# ACTIVITY=Authorization Requested',
                    '# ACTIVITY=Evaluating Request (NO registered letter)', '# ACTIVITY=Network Adjustment Requested', '# ACTIVITY=Pending Request for acquittance of heirs',
                    '# ACTIVITY=Request deleted', '# ACTIVITY=Back-Office Adjustment Requested', '# ACTIVITY=Evaluating Request (WITH registered letter)',
                    '# ACTIVITY=Request completed with customer recovery', '# ACTIVITY=Pending Request for Network Information',]
categorical_features = ["CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "weekday"]

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore': categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', RandomForestClassifier(n_jobs=7, random_state=RANDOM_SEED))])
model = clf.fit(X_train, y_train)

# ## Create DiCE model
# DiCE needs features and outcome in a single frame; every feature not listed as
# continuous is handled by DiCE as categorical.
data_model = dice_ml.Data(dataframe=pd.concat([X_train, y_train], axis="columns"),
                      continuous_features=continuous_features,
                      outcome_name=outcome_name)

# We provide the type of model as a parameter (model_type)
ml_backend = dice_ml.Model(model=model, backend="sklearn", model_type='classifier')
method = DICE_METHOD  # genetic, kdtree, random
explainer = Dice(data_model, ml_backend, method=method)  # Categorical features do not support features_weights argument in generate_counterfactuals()
# exp_genetic_iris = Dice(d_iris, m_iris, method="kdtree")

## Experiment with Single Queries

In [9]:
sidx, eidx = 612, 622

# Preview the identifying / categorical columns and the outcome label for rows sidx..eidx-1.
df_train[sidx:eidx].loc[:, ["REQUEST_ID", "CLOSURE_TYPE", "CLOSURE_REASON", "ACTIVITY", "CE_UO", "ROLE", "Back-Office Adjustment Requested"]]
# df_train[sidx: eidx]

Unnamed: 0,REQUEST_ID,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,Back-Office Adjustment Requested
612,20182001337,Bank Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,0
613,20182001337,Bank Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0
614,20182001337,Bank Recess,1 - Client lost,Pending Liquidation Request,BOC,BACK-OFFICE,0
615,20182001491,Client Recess,1 - Client lost,Request created,00877,APPLICANT,1
616,20182001491,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),00037,DIRECTOR,1
617,20182001491,Client Recess,1 - Client lost,Service closure Request with network responsib...,00037,APPLICANT,1
618,20182001491,Client Recess,1 - Client lost,Service closure Request with BO responsibility,BOC,BACK-OFFICE,1
619,20182001491,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,1
620,20182001491,Client Recess,1 - Client lost,Back-Office Adjustment Requested,BOC,BACK-OFFICE,0
621,20182001491,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,0


In [15]:
sidx = 619
eidx = 620

# Values DiCE may propose for ACTIVITY in a counterfactual.
# 'Back-Office Adjustment Requested' (the outcome activity itself) is deliberately not listed.
permitted_activities = ['Service closure Request with network responsibility',
                        'Service closure Request with BO responsibility',
                        'Pending Request for Reservation Closure', 'Pending Liquidation Request',
                        'Request created', 'Authorization Requested', 'Evaluating Request (NO registered letter)',
                        'Network Adjustment Requested', 'Evaluating Request (WITH registered letter)',
                        'Pending Request for Network Information']

# Single-row query: the row at position sidx of the training features.
query_instances = X_train[sidx: eidx]

# Use the shared TOTAL_CFS setting (same constant as the multi-query loop) instead of a
# separate literal, so both experiments request the same number of counterfactuals.
cfe = explainer.generate_counterfactuals(query_instances, total_CFs=TOTAL_CFS, desired_class="opposite",
                                         features_to_vary=cols_to_vary,
                                         permitted_range={"ACTIVITY": permitted_activities})

cfe.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  1.60it/s]

Query instance (original outcome : 1)





Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOC,BACK-OFFICE,26354849,810,45978,Friday,64,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1



Diverse Counterfactual set (new outcome: 0.0)


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,-,-,Evaluating Request (WITH registered letter),-,DIRECTOR,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
1,-,-,Network Adjustment Requested,00345,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
2,-,-,-,00191,DIRECTOR,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
3,-,-,-,00640,DIRECTOR,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
4,-,-,Service closure Request with BO responsibility,00009,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
5,-,-,Request created,00040,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
6,-,-,-,00255,APPLICANT,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
7,-,-,Authorization Requested,00277,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
8,-,-,-,00217,DIRECTOR,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0
9,-,-,Request created,00837,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0


## Apply Transition System Validation to the above Single Query Results

In [16]:
# === Build the transition graph from the full event log `df` with window_size=WINDOW_SIZE.
# Only the second return value of transition_system is kept; it is passed to
# validate_transition as `transition_graph` in the multi-query loop.
_, transition_graph = transition_system(df, case_id_name=case_id_name, activity_column_name=activity_column_name, window_size=WINDOW_SIZE)

### Check Feature Importance score

In [None]:
# imp = explainer.local_feature_importance(query_instances, total_CFs=10, desired_range=[0, total_time_upper_bound], features_to_vary=cols_to_vary)
# imp.local_importance

## Experiment with Multiple Queries

In [17]:
state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

# Resume from a previously saved run when one exists on disk; otherwise start at case 0.
cases_done = 0
if os.path.exists(save_load_path):
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]

state_obj.run_state["cases_done"]

1096

In [18]:
# %%capture
# Generate and validate counterfactuals for every test case not yet recorded in `state_obj`.
# Each case ends up in exactly one "outcome" bucket (plus cfe_before_validation /
# cfe_after_validation on the success path) and the state is saved after every case so
# an interrupted run can resume from `cases_done`.

# Debug cap: stop once this many cases have been recorded. Raise it for a full run.
MAX_CASES_DONE = 20

for df_test_trace in test_cases[cases_done:]:
    # print("Case number:", cases_done)
    # print(f"Case number is state_obj: {state_obj.run_state['cases_done']}")
    query_case_id = get_case_id(df_test_trace, case_id_name)

    # Traces with one or two events are recorded and skipped.
    if 0 < len(df_test_trace) <= 2:
        print("too small", cases_done, query_case_id)
        state_obj.add_cfe_to_results(("cases_too_small", query_case_id))
        cases_stored = state_obj.save_state()
        cases_done += 1
        continue

    # prepare_df_for_ml takes the case-id column name before the outcome name
    # (same argument layout as the training-time call), so pass both explicitly.
    X_test, y_test = prepare_df_for_ml(df_test_trace, case_id_name, outcome_name,
                                       columns_to_remove=["START_DATE", "END_DATE", "time_remaining"])

    # Check if y_test is 0 then don't generate CFE
    if y_test.iloc[-1] == 0:
        state_obj.add_cfe_to_results(("cases_zero_in_y", query_case_id))
        cases_stored = state_obj.save_state()
        cases_done += 1
        continue

    # Access the last row of the truncated trace to replicate the behavior of a running trace
    query_instances = X_test.iloc[-1:]

    try:
        cfe = generate_cfe(explainer, query_instances, None, total_cfs=TOTAL_CFS, features_to_vary=cols_to_vary, kpi=KPI,
                           proximity_weight=proximity_weight, sparsity_weight=sparsity_weight, diversity_weight=diversity_weight)

        # Record the raw DiCE result first; the validated subset is recorded separately below.
        state_obj.add_cfe_to_results(("cfe_before_validation", (query_case_id, cfe)))

        prefix_of_activities = get_prefix_of_activities(df_single_trace=df_test_trace, window_size=WINDOW_SIZE,
                                                        activity_column_name=activity_column_name)
        cfe_df = validate_transition(cfe, prefix_of_activities=prefix_of_activities, transition_graph=transition_graph,
                                     valid_resources=valid_resources, activity_column_name=activity_column_name,
                                     resource_columns_to_validate=resource_columns_to_validate)
        # Only keep cases for which at least one counterfactual survived validation.
        if len(cfe_df) > 0:
            state_obj.add_cfe_to_results(("cfe_after_validation", (query_case_id, cfe_df)))

        cases_stored = state_obj.save_state()

    except UserConfigValidationException:
        state_obj.add_cfe_to_results(("cfe_not_found", query_case_id))
        cases_stored = state_obj.save_state()
    except TimeoutError as err:  # When function takes too long
        print("TimeoutError caught:", err)
        state_obj.add_cfe_to_results(("cfe_not_found", query_case_id))
        cases_stored = state_obj.save_state()
    except ValueError:
        # print(f"Includes feature not found in training data: {get_case_id(df_test_trace)}")
        state_obj.add_cfe_to_results(("cases_includes_new_data", query_case_id))
        cases_stored = state_obj.save_state()
    # This error is seen occurring on when running lots of loops on the server
    except AttributeError as e:
        print("AttributeError caught:", e)
        state_obj.add_cfe_to_results(("exceptions", query_case_id))
        cases_stored = state_obj.save_state()
    # except Exception as err:
    #     print(f"Broadest Exception handler invoked", err)
    #     state_obj.add_cfe_to_results(("exceptions", query_case_id))
    #     cases_stored = state_obj.save_state()

    cases_done += 1
    # Just for a sanity check: the local counter must agree with what save_state() reports.
    assert cases_done == cases_stored
    if cases_done >= MAX_CASES_DONE:
        break


In [19]:
# Tabular summary of the stored run state (see the rendered table below for the reported columns).
state_obj.get_run_state_df()

Unnamed: 0,cfe_before_validation,cfe_after_validation,cfe_not_found,cases_includes_new_data,cases_too_small,cases_zero_in_y,exceptions,cases_done
0,647,296,423,0,2,24,0,1096


# Results:

### View Valid cases and their Counterfactual Examples (CFEs)
You can load saved configs and run them independently of the above code

In [20]:
# Copy a results pickle from the remote host into the local experiment_results/ folder.
pickle_file = "random-a03.pkl"
remote_source = f'labnum08:git_repos/explainable-prescriptive-analytics/experiment_results/{pickle_file}'
# Arguments are passed as a list (no shell), and stdout/stderr are captured as text.
result = subprocess.run(['scp', remote_source, 'experiment_results/'], capture_output=True, text=True)

# return code of 0 means the command executed successfully
if result.returncode != 0:
    print("There is an Error in the command")
    # Surface scp's own diagnostics; without them a failed copy cannot be debugged from the notebook.
    print(f"scp exited with code {result.returncode}: {result.stderr.strip()}")
else:
    print("successful")

successful


In [21]:
# Derive the results path from the pickle's file name. os.path.splitext strips only the
# final extension, so names that contain additional dots are not truncated.
RESULTS_FILE_PATH_N_NAME = f"experiment_results/{os.path.splitext(pickle_file)[0]}.csv"

state_obj = StoreTestRun(save_load_path=RESULTS_FILE_PATH_N_NAME)
save_load_path = state_obj.get_save_load_path()

# Load the saved run when it exists on disk; otherwise start from an empty state.
if os.path.exists(save_load_path):
    state_obj.load_state()
    cases_done = state_obj.run_state["cases_done"]
else:
    cases_done = 0

state_obj.run_state["cases_done"]

1096

In [22]:
def print_results():
    """Lazily yield ``(case_id, cfe_df)`` pairs from the validated results.

    Reads ``state_obj.run_state["cfe_after_validation"]`` when iteration starts and
    yields each stored two-item entry as a tuple. Despite its name, nothing is printed.
    """
    yield from (
        (case_id, cfe_df)
        for case_id, cfe_df in state_obj.run_state["cfe_after_validation"]
    )

generator = print_results()

In [24]:
# Each run of this cell consumes the next validated case from `generator`.
case_id, cfe_df = next(generator)

print("Original Test Case:")
# === Find test_case with case_id
for df_test_trace in test_cases:
    if get_case_id(df_test_trace, case_id_name) != case_id:
        continue
    display(df_test_trace)

print("Counterfactuals for the last row:")
cfe_df

Original Test Case:


Unnamed: 0,REQUEST_ID,START_DATE,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,END_DATE,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,time_remaining,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,20186007419,1530549548,Client Recess,1 - Client lost,Request created,1530549580,SB83,APPLICANT,0,0,59948,Monday,32,8617209,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,20186007419,1530549580,Client Recess,1 - Client lost,Evaluating Request (NO registered letter),1531732845,SB77,DIRECTOR,32,32,59980,Monday,1183265,8617177,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,20186007419,1531737088,Client Recess,1 - Client lost,Service closure Request with network responsib...,1531760961,SB77,APPLICANT,1187540,1187508,37888,Monday,23873,7429669,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
3,20186007419,1531760961,Client Recess,1 - Client lost,Service closure Request with BO responsibility,1535131054,BOC,BACK-OFFICE,1211413,23873,61761,Monday,3370093,7405796,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
4,20186007419,1535131054,Client Recess,1 - Client lost,Pending Request for Reservation Closure,1535131191,BOC,BACK-OFFICE,4581506,3370093,62254,Friday,137,4035703,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1


Counterfactuals for the last row:


Unnamed: 0,CLOSURE_TYPE,CLOSURE_REASON,ACTIVITY,CE_UO,ROLE,time_from_first,time_from_previous_et,time_from_midnight,weekday,activity_duration,# ACTIVITY=Service closure Request with network responsibility,# ACTIVITY=Service closure Request with BO responsibility,# ACTIVITY=Pending Request for Reservation Closure,# ACTIVITY=Pending Liquidation Request,# ACTIVITY=Request completed with account closure,# ACTIVITY=Request created,# ACTIVITY=Authorization Requested,# ACTIVITY=Evaluating Request (NO registered letter),# ACTIVITY=Network Adjustment Requested,# ACTIVITY=Pending Request for acquittance of heirs,# ACTIVITY=Request deleted,# ACTIVITY=Back-Office Adjustment Requested,# ACTIVITY=Evaluating Request (WITH registered letter),# ACTIVITY=Request completed with customer recovery,# ACTIVITY=Pending Request for Network Information,Back-Office Adjustment Requested
0,Client Recess,1 - Client lost,Pending Request for Reservation Closure,SDCRAP,BACK-OFFICE,4581506,3370093,62254,Friday,137,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
1,Client Recess,1 - Client lost,Pending Request for Reservation Closure,BOCCRF,BACK-OFFICE,4581506,3370093,62254,Friday,137,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0


## Deprecated Code

### Practice subprocess module

In [20]:
# Copy a results pickle from the remote host into the local experiment_results/ folder.
pickle_file = "random-a01.pkl"
remote_source = f'labnum08:git_repos/explainable-prescriptive-analytics/experiment_results/{pickle_file}'
# Arguments are passed as a list (no shell), and stdout/stderr are captured as text.
result = subprocess.run(['scp', remote_source, 'experiment_results/'], capture_output=True, text=True)

# return code of 0 means the command executed successfully
if result.returncode != 0:
    print("There is an Error in the command")
    # Surface scp's own diagnostics; without them a failed copy cannot be debugged from the notebook.
    print(f"scp exited with code {result.returncode}: {result.stderr.strip()}")
else:
    print("successful")

''

In [None]:
def get_query_instance(sidx=14, eidx=16):
    """Return two consecutive single-row slices of the global ``X_train``.

    Args:
        sidx: Position of the first row (the current step).
        eidx: Exclusive end position; must equal ``sidx + 2``.

    Returns:
        A pair ``(current_step, expected_next_step)`` where the first element is
        ``X_train[sidx:sidx + 1]`` and the second is ``X_train[eidx - 1:eidx]``.

    Raises:
        AssertionError: If the window does not span exactly two rows.
    """
    window_width = eidx - sidx
    assert window_width == 2, "One row represents the current action and the next one represents the suggested action"
    first_row = slice(sidx, sidx + 1)
    second_row = slice(eidx - 1, eidx)
    return X_train[first_row], X_train[second_row]
# current_step, query_instances = get_query_instance(14, 16)

In [None]:
def validate_transition(cfe):
    """ Deprecated function.
    Works with transition graph which has single activity as key.

    Filters the counterfactuals of the first query in two passes:
    1. drop rows whose suggested activity is not listed in
       ``transition_graph[current_activity]``;
    2. drop rows whose (activity, "Involved_ST_Function_Div") pair is not in
       ``valid_resources``.

    Relies on notebook globals: ``current_step``, ``activity_column_name``,
    ``transition_graph`` and ``valid_resources``.

    NOTE(review): running this cell rebinds the name ``validate_transition`` that is
    imported from ``src.function_store`` at the top of the notebook; the multi-query loop
    calls that import with keyword arguments this one-argument version does not accept.
    NOTE(review): ``current_step`` is not assigned anywhere in the visible cells (the call
    that would create it is commented out), and "Involved_ST_Function_Div" does not appear
    among the columns shown for this dataset — confirm before reviving this function.

    Args:
        cfe: Dice counterfactual object.

    Returns:
        A copy of the first query's counterfactual frame containing only the rows that
        passed both checks, re-indexed from 0.
    """
    cf_examples_df = cfe.cf_examples_list[0].final_cfs_df.copy()

    # expected_next_action = cfe.cf_examples_list[0].test_instance_df[activity_column_name].item()
    # print(f"Expected next action: {expected_next_action}")

    current_activity = current_step[activity_column_name].item()

    # === Verify the next activity
    indexes_to_drop = [
        idx
        for idx, suggested_next_activity in cf_examples_df[activity_column_name].items()
        if suggested_next_activity not in transition_graph[current_activity]
    ]
    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)

    # === Verify the associated resources
    # Use the configured activity column here as well, so both checks read the same column.
    resource_columns = [activity_column_name, "Involved_ST_Function_Div"]
    indexes_to_drop = [
        idx
        for idx, row in cf_examples_df[resource_columns].iterrows()
        if tuple(row) not in valid_resources
    ]
    cf_examples_df = cf_examples_df.drop(indexes_to_drop, axis='index').reset_index(drop=True)
    return cf_examples_df