# Import packages

In [1]:
import torch
import numpy as np
import pandas as pd
import os
import os.path as osp
from pathlib import Path
from collections import Counter

import matplotlib.pyplot as plt

from skpm import event_logs

from ppm.datasets import DatasetSchemas
from ppm.utils import parse_args, add_outcome_labels
from setup_experiment import setup_dataloaders, setup_model, extract_one_offer_cases

Disabling PyTorch because PyTorch >= 2.1 is required but found 2.0.1


# Load dataset 

In [2]:
# Single seed shared by the random number generators used in this notebook.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
# Seed NumPy's global generator too, so any NumPy-backed randomness drawn
# later in the notebook is repeatable after Restart Kernel -> Run All.
np.random.seed(RANDOM_SEED)

# Numerical feature columns substituted for the "all" shorthand when the
# parsed arguments request every continuous feature (see the config cell).
NUMERICAL_FEATURES = [
    "accumulated_time",
    "day_of_month",
    "day_of_week",
    "day_of_year",
    "hour_of_day",
    "min_of_hour",
    "month_of_year",
    "sec_of_min",
    "secs_within_day",
    "week_of_year",
]

In [3]:
# Project root. Defaults to the checkout location the notebook was written
# against; set the XAI_PPM_ROOT environment variable to run it elsewhere
# without editing this cell.
PROJECT_ROOT = Path(os.environ.get("XAI_PPM_ROOT", r'D:\PycharmProjects\xAI-PPM'))

# Kept as plain strings, the same type as the previously hard-coded values.
config_path = str(PROJECT_ROOT / 'configs' / 'explain_lstm_args_for_op.txt')
checkpoints_path = str(PROJECT_ROOT / 'persisted_models' / 'suffix' / 'BPI17_rnn_outcome_bpi17.pth')

args = parse_args(config_path=config_path)
# `vars` returns the attribute dict of `args` itself, so every key written to
# `config` below is also visible as an attribute on `args`.
config = vars(args)
config["log"] = args.dataset

# Expand the "all" shorthand into the explicit list of numerical features;
# any other value (including None) is passed through unchanged.
wants_all_numerical = (
    args.continuous_features is not None
    and "all" in args.continuous_features
)
config["continuous_features"] = (
    NUMERICAL_FEATURES if wants_all_numerical else args.continuous_features
)
config["checkpoint_path"] = checkpoints_path
config['batch_size'] = 128

In [4]:
# Resolve the event-log factory named in the configuration and call it.
event_log_factory = getattr(event_logs, config["log"])
log = event_log_factory()

# Outcome activity name -> integer label, handed to add_outcome_labels.
labels_dict = {"O_Accepted": 0, "O_Cancelled": 1, "O_Refused": 2}
schema_factory = getattr(DatasetSchemas, config["log"])
column_schema = schema_factory()
labeled_df = add_outcome_labels(log.dataframe, column_schema, labels_dict)

# Drop outcome code 2 (O_Refused) so the task becomes binary classification.
is_not_refused = labeled_df["outcome"] != 2
binary_labeled_df = labeled_df[is_not_refused]

train_loader, test_loader = setup_dataloaders(
    config,
    binary_labeled_df,
    log.unbiased_split_params,
)
model = setup_model(
    config,
    train_loader.dataset.log,
    model_name='outcome_predictor',
)

 [*] Loading checkpoint from D:\PycharmProjects\xAI-PPM\persisted_models\suffix\BPI17_rnn_outcome_bpi17.pth succeed!


In [5]:
# # a) Extracting sequences that are predicted to have "O_Cancelled" as the outcome of the given trace

# pred_cases_info  =  {'tp': {'ids': [],  'cases': [], 'y_pred': [], 'y_true': []},
#                      'tn': {'ids': [],  'cases': [], 'y_pred': [], 'y_true': []},
#                      'fp': {'ids': [],  'cases': [], 'y_pred': [], 'y_true': []},
#                      'fn': {'ids': [],  'cases': [], 'y_pred': [], 'y_true': []}
#                      }

# prefix_len = 15
# dataset_device = config['device']

# for ind, batch in enumerate(test_loader):
#     x_cat, x_num, y_cat, _ = batch # type: ignore
#     x_cat, x_num, y_cat = x_cat[:, :prefix_len, :], x_num[:, :prefix_len, :], y_cat[:, :prefix_len]
#     x_cat, x_num = (x_cat.to(dataset_device), x_num.to(dataset_device))

#     attention_mask = (x_cat[..., 0] != 0).long()
#     out, _ = model(x_cat=x_cat, x_num=x_num, attention_mask=attention_mask)
#     prediction = ((out.squeeze(1)) > 0.5).float()[0, -1].item()
    
#     case = np.concatenate([x_cat.numpy(), x_num.numpy()], axis=-1)
#     y_true = float(y_cat[0, -1].item())
#     sample_name = ''
#     # Extracting true positives and negatives
#     if prediction == y_true:
#         if prediction == 0:
#             sample_name = 'tn'
#         else:
#             sample_name = 'tp'
#     # Extracting misclassified cases
#     else:
#         if prediction == 1:
#             sample_name = 'fp'
#         else:
#             sample_name = 'fn'
#     pred_cases_info[sample_name]['ids'].append(ind)
#     pred_cases_info[sample_name]['cases'].append(case)
#     pred_cases_info[sample_name]['y_pred'].append(out.squeeze(1)[0, -1].item())
#     pred_cases_info[sample_name]['y_true'].append(y_true)

In [6]:
# print("The number of cases with correct 'O_Cancelled' outcome:", len(pred_cases_info['tp']['cases']))
# print("The number of cases with correct 'O_Accepted' outcome:", len(pred_cases_info['tn']['cases']))
# print("The number of false positive cases with the outcome 'O_Cancelled':", len(pred_cases_info['fp']['cases']))
# print("The number of false negative cases with the outcome 'O_Accepted':", len(pred_cases_info['fn']['cases']))

In [7]:
# # Extracting the ids of the predicted cases (tp/tn/fp/fn) that have only one offer event in the trace
# tp_one_offer_ids, tp_multi_offer_ids = extract_one_offer_cases([trace[0, :, 0] for trace in pred_cases_info['tp']['cases']])
# tn_one_offer_ids, tn_multi_offer_ids = extract_one_offer_cases([trace[0, :, 0] for trace in pred_cases_info['tn']['cases']])

# fp_one_offer_ids, fp_multi_offer_ids = extract_one_offer_cases([trace[0, :, 0] for trace in pred_cases_info['fp']['cases']])
# fn_one_offer_ids, fn_multi_offer_ids = extract_one_offer_cases([trace[0, :, 0] for trace in pred_cases_info['fn']['cases']])

# print(f"Number of cases with one offer: {len(tp_one_offer_ids)} (tp), {len(fp_one_offer_ids)} (fp)")
# print(f"Number of cases with more than one offer: {len(tp_multi_offer_ids)} (tp), {len(fp_multi_offer_ids)} (fp)")
# print("Number of offers per case, frequency: \n", 
#       "TP:", list(Counter(tp_multi_offer_ids.values()).items()), '\n', 
#       "FP:", list(Counter(fp_multi_offer_ids.values()).items()))

In [8]:
# explicands_num = 5
# tp_explicands  = [pred_cases_info['tp']['cases'][tp_one_offer_ids[i]] for i in range(explicands_num)]
# tp_predictions = [pred_cases_info['tp']['y_pred'][tp_one_offer_ids[i]] for i in range(explicands_num)]

# tn_explicands  = [pred_cases_info['tn']['cases'][tn_one_offer_ids[i]] for i in range(explicands_num)]
# tn_predictions = [pred_cases_info['tn']['y_pred'][tn_one_offer_ids[i]] for i in range(explicands_num)]

# fp_explicands  = [pred_cases_info['fp']['cases'][fp_one_offer_ids[i]] for i in range(explicands_num)]
# fp_predictions = [pred_cases_info['fp']['y_pred'][fp_one_offer_ids[i]] for i in range(explicands_num)]

# Generate prototype explanations

In [9]:
from global_xai.map import Explainer
from global_xai.map import ConceptProperties

In [10]:
# Feature description attached to the training log.
all_features = train_loader.dataset.log.features

# Total feature count (categorical + numerical); used as the explainer's input_dim.
n_categorical = len(all_features.categorical)
n_numerical = len(all_features.numerical)
total_features_num = n_categorical + n_numerical

In [11]:
# Length of every training trace, and the longest length among them.
trace_lens = list(map(len, train_loader.dataset.traces))
max_len = max(trace_lens)

In [12]:
# Project root. Defaults to the checkout location the notebook was written
# against; set the XAI_PPM_ROOT environment variable to write outputs elsewhere
# without editing this cell.
PROJECT_ROOT = Path(os.environ.get("XAI_PPM_ROOT", "D:/PycharmProjects/xAI-PPM"))
output_dir = PROJECT_ROOT / "output"
explainer_name = 'map_explainer'

# Each explainer gets its own sub-directory under the output directory.
output_dir_ex = output_dir / explainer_name
output_dir_ex.mkdir(parents=True, exist_ok=True)

n_concepts = 4
epochs = 10

exp = Explainer(input_dim=total_features_num,
                output_directory=output_dir_ex,
                n_concepts=n_concepts,
                # Latent size is tied to the number of concepts (5 per concept).
                latent_dim=n_concepts * 5,
                epochs=epochs,
                batch_size=32,
                # NOTE(review): 'hiddem_dim' looks like a typo for 'hidden_dim'.
                # Left unchanged because the key Explainer actually reads is
                # not visible from this notebook -- confirm against global_xai.map.
                kwargs={'num_layers': 2,
                        'dropout': 0.1,
                        'hiddem_dim': 64})

In [14]:
# Fit the explainer against the outcome classifier using the training loader.
# NOTE(review): the output recorded for this cell ends with
# "IndexError: index out of range in self"; the cause is inside code that is
# not visible in this notebook, so the call is left as written.
exp.fit_explainer(classifier=model, dataloader=train_loader)

torch.Size([128, 97, 1])
torch.Size([128, 97, 1])


IndexError: index out of range in self

In [None]:
# True when no "map.h5" file exists yet in the explainer output directory.
# NOTE(review): this flag is not read by any cell visible in this notebook.
map_file = Path(output_dir_ex / "map.h5")
fit_explainer = not map_file.exists()

# NOTE(review): `train` and `test` are not defined in any visible cell; on a
# fresh kernel the lines below would raise NameError. The data prepared above
# lives in `train_loader` / `test_loader` -- confirm the intended inputs.
train_inputs = train.X
X_concepts_kmeans, latent_centers = exp.get_concepts_kmeans(train_inputs)
concept_labels = model(X_concepts_kmeans)

test_encoder = exp.explainer.encoder
latent = test_encoder(test.X)

In [None]:
# Completeness & importance of the learned concepts.
# NOTE(review): `test` is not defined in any visible cell of this notebook --
# confirm where the test inputs (`test.X`) and targets (`test.y`) come from.
y_pred = model(test.X)

cp = ConceptProperties()
map_instance_concept = cp.get_closest_rec_concept_to_instance(
    test.X,
    latent.numpy(),
    latent_centers,
)

# Metrics are computed in the same order as they appear in the result row.
completeness = cp.get_completness(test.y, y_pred)
concept_repr = cp.KL_divergence_performance(test.X[:, :, 0], latent)
rec_concept_repr = cp.KL_divergence_performance(
    test.X[:, :, 0],
    map_instance_concept[:, :, 0],
)

# Single-row summary; the latent centers are wrapped in a list so they occupy
# one cell of the row.
summary_row = {
    "model": "MAP",
    "accuracy": completeness,
    "output_dir": output_dir,
    "n_concepts": n_concepts,
    "concept representability": concept_repr,
    "reconstructed concept representability": rec_concept_repr,
    "latent_centers": [latent_centers],
}
summary_df = pd.DataFrame(summary_row, index=[0])
summary_df.to_csv(output_dir / "completeness_importance_concept_map.csv")
    