In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
import numpy as np
import pm4py

In [2]:
# Load the raw Road Traffic Fine Management event log (XES) so individual
# cases can be inspected event by event further down.
rtfm_raw = pm4py.read_xes(
    "./Data/road_traffic/Road_Traffic_Fine_Management_Process.xes"
)

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 150370/150370 [00:38<00:00, 3952.85it/s]


In [3]:
# Load the per-case feature table; the first CSV column is used as the
# (temporary) row index.
dataset = pd.read_csv(
    "./Data/road_traffic/mined_rtfm_relabelled_confidences.csv",
    index_col=0,
)


In [4]:
# Index the feature table by case identifier so rows can be matched against
# the raw event log. The guard keeps the cell idempotent: once the column has
# been moved into the index, running the cell again would otherwise raise a
# KeyError because the column no longer exists.
if dataset.index.name != "case:concept:name":
    dataset = dataset.set_index("case:concept:name")

In [5]:
# Target: the "Class" column; features: every remaining column.
y = dataset["Class"]
X = dataset.drop(columns=["Class"])

In [6]:
# Encode the string class labels as integer codes.
le = LabelEncoder()
y_transformed = le.fit_transform(y)

# Lookup table: original label -> integer code assigned by the encoder.
le_name_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
cols = X.columns.tolist()

# Hold out 20% of the cases as the test set, stratified by class and with a
# fixed seed so the split is reproducible.
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    stratify=y_transformed,
    shuffle=True,  # shuffle the rows before splitting
    random_state=0,
)

In [7]:
# Split the remaining cases again: 20% of them become the validation set,
# stratified by class and with the same fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val,
    y_train_and_val,
    test_size=0.2,
    stratify=y_train_and_val,
    shuffle=True,  # shuffle the rows before splitting
    random_state=0,
)

In [8]:
# Display the integer codes the encoder assigns to its known classes.
le.transform(le.classes_)

array([0, 1, 2, 3])

In [9]:
def calculateBoundaryCasesThroughDistance(data, class1, class2, y, fill_value=-100):
    """Find the closest pair of cases that belong to two different classes.

    Missing values are replaced by ``fill_value``, duplicated feature vectors
    are dropped within each class, and the pairwise distance matrix between
    the two classes is computed with ``scipy.spatial.distance.cdist`` (default
    metric). The pair with the smallest distance is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per case. All columns must be numeric after
        filling, since they are passed to ``cdist``.
    class1, class2 : scalar
        Class codes to compare, as they appear in ``y``.
    y : array-like
        Class code of every row of ``data``; ``y == class`` is used as a
        boolean row mask on ``data``.
    fill_value : scalar, default -100
        Sentinel that replaces missing values before distances are computed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-row frames: the selected case of ``class1`` and the
        selected case of ``class2``, each keeping its original index label
        and its filled feature values.

    Raises
    ------
    ValueError
        If either class has no rows in ``data``.
    """
    filled = data.fillna(fill_value)

    # Identical feature vectors cannot change the minimum, so dropping them
    # only shrinks the distance matrix.
    data_class1 = filled[y == class1].drop_duplicates()
    data_class2 = filled[y == class2].drop_duplicates()

    if len(data_class1) == 0 or len(data_class2) == 0:
        raise ValueError(
            "Both classes need at least one case to compute a boundary pair "
            f"(class {class1!r}: {len(data_class1)} rows, "
            f"class {class2!r}: {len(data_class2)} rows)."
        )

    matrixDistances = cdist(data_class1, data_class2)

    # argmin works on the flattened matrix; unravel it back to
    # (row in class1, column in class2).
    row_pos, col_pos = np.unravel_index(
        np.argmin(matrixDistances), matrixDistances.shape
    )

    # Select by position: a label-based lookup would return several rows
    # whenever index labels are not unique.
    min_case_class1 = data_class1.iloc[[row_pos]]
    min_case_class2 = data_class2.iloc[[col_pos]]

    return (min_case_class1, min_case_class2)




In [10]:
# Display the label -> integer code lookup used to read the class codes below.
le_name_mapping

{'collected': 0, 'dismissed': 1, 'fully_paid': 2, 'unresolved': 3}

In [11]:
# Closest pair of training cases across the 'dismissed' / 'unresolved'
# boundary. The class codes are looked up by label instead of being
# hard-coded, so the call does not depend on the order in which the encoder
# numbered the classes.
boundaryCasesClass1vs3 = calculateBoundaryCasesThroughDistance(
    X_train,
    le_name_mapping["dismissed"],
    le_name_mapping["unresolved"],
    y_train,
)

In [None]:
# Dismissed (class 1) vs unresolved (class 3)

In [12]:
# Raw event trace of the 'dismissed' case from the closest pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"].eq(boundaryCasesClass1vs3[0].index[0])
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
247844,35.0,537.0,NIL,Create Fine,A,0.0,complete,2005-11-13 00:00:00+00:00,157.0,0.0,N87268,,,,,
247845,,,,Send Fine,,,complete,2006-02-15 00:00:00+00:00,,,N87268,11.0,,,,
247846,,,,Insert Fine Notification,,,complete,2006-03-02 00:00:00+00:00,,,N87268,,P,P,,
247847,,,,Insert Date Appeal to Prefecture,,,complete,2006-04-08 00:00:00+00:00,,,N87268,,,,,
247848,71.5,,,Add penalty,,,complete,2006-05-01 00:00:00+00:00,,,N87268,,,,,
247849,,,#,Send Appeal to Prefecture,,,complete,2006-05-18 00:00:00+00:00,,,N87268,,,,,


In [13]:
# Raw event trace of the 'unresolved' case from the closest pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"].eq(boundaryCasesClass1vs3[1].index[0])
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
524196,33.6,840.0,NIL,Create Fine,A,0.0,complete,2004-07-10 00:00:00+00:00,7.0,0.0,S94371,,,,,
524197,,,,Send Fine,,,complete,2005-03-10 00:00:00+00:00,,,S94371,16.6,,,,
524198,,,,Insert Fine Notification,,,complete,2005-03-18 00:00:00+00:00,,,S94371,,P,P,,
524199,,,,Insert Date Appeal to Prefecture,,,complete,2005-04-07 00:00:00+00:00,,,S94371,,,,,
524200,68.77,,,Add penalty,,,complete,2005-05-17 00:00:00+00:00,,,S94371,,,,,
524201,,,NIL,Send Appeal to Prefecture,,,complete,2005-05-25 00:00:00+00:00,,,S94371,,,,,


In [14]:
# Fully paid (class 2) vs unresolved (class 3)

In [15]:
# Closest pair of training cases across the 'fully_paid' / 'unresolved'
# boundary. The class codes are looked up by label instead of being
# hard-coded, so the call does not depend on the order in which the encoder
# numbered the classes.
boundaryCasesClass2vs3 = calculateBoundaryCasesThroughDistance(
    X_train,
    le_name_mapping["fully_paid"],
    le_name_mapping["unresolved"],
    y_train,
)

In [16]:
# Raw event trace of the 'fully_paid' case from the closest pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"].eq(boundaryCasesClass2vs3[0].index[0])
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
184302,33.6,546.0,NIL,Create Fine,A,0.0,complete,2004-05-27 00:00:00+00:00,157.0,0.0,N53197,,,,,
184303,,,,Payment,,33.6,complete,2004-06-01 00:00:00+00:00,,,N53197,,,,33.6,


In [17]:
# Raw event trace of the 'unresolved' case from the closest pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"].eq(boundaryCasesClass2vs3[1].index[0])
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
172184,33.6,536.0,NIL,Create Fine,A,0.0,complete,2003-01-04 00:00:00+00:00,157.0,0.0,N46597,,,,,
172185,,,,Payment,,32.0,complete,2003-01-15 00:00:00+00:00,,,N46597,,,,32.0,
