In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
import numpy as np
import pm4py

In [33]:
# Load the raw Road Traffic Fine Management event log from its XES file via pm4py.
# The recorded run parsed 150,370 traces in roughly 35 s, so avoid re-running needlessly.
# Later cells filter this log by "case:concept:name" to inspect individual cases.
rtfm_raw=pm4py.read_xes("./Data/road_traffic/RawData/Road_Traffic_Fine_Management_Process.xes")

parsing log, completed traces :: 100%|██████████| 150370/150370 [00:35<00:00, 4283.01it/s]


In [None]:
# Load the relabelled table from CSV; the first CSV column becomes the initial index.
# NOTE(review): this cell has no execution count in the saved notebook although later
# cells rely on `dataset` — confirm the notebook runs top to bottom on a fresh kernel.
dataset=pd.read_csv("./Data/road_traffic/mined_rtfm_relabelled_confidences.csv",index_col=0)


In [15]:
# Index the table by case identifier so rows can be matched against the raw event log.
# Guarded so the cell is idempotent: on a second run the column has already been moved
# into the index and an unconditional set_index would raise KeyError.
if "case:concept:name" in dataset.columns:
    dataset = dataset.set_index("case:concept:name")
elif dataset.index.name != "case:concept:name":
    # Neither a column nor the current index carries the case id: fail loudly.
    raise KeyError("case:concept:name")

In [16]:
# Separate the target column from the feature columns.
y = dataset["Class"]
X = dataset.drop(columns=["Class"])

# Report how many feature columns remain after removing the target.
print("No. of features:" + str(X.shape[1]))

No. of features:2189


In [17]:
# Report whether the feature matrix contains any missing value.
print("Is na? " + str(X.isnull().values.any()))

# Encode the class labels as integers and keep a label -> code lookup.
le = LabelEncoder()
y_transformed = le.fit_transform(y)
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

# Keep the feature names in their original column order.
cols = X.columns.to_list()

# Hold out 20% of the cases for testing; stratify on the encoded labels so both
# partitions keep the class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    stratify=y_transformed,
    shuffle=True,  # shuffle the rows before splitting
    random_state=0,
)

Is na? True


In [26]:
# Display the original class labels known to the fitted encoder.
le.classes_

array(['credit_collection', 'dismissed', 'paid_full', 'unresolved'],
      dtype=object)

In [25]:
# Display the integer code assigned to each label, in the same order as le.classes_.
le.transform(le.classes_)

array([0, 1, 2, 3])

In [36]:
def calculateBoundaryCasesThroughDistance(data, class1, class2, y, fill_value=-100):
    """Return the closest pair of cases belonging to two different classes.

    Missing values are replaced by ``fill_value``, duplicated feature rows are
    dropped within each class, and the pairwise distances between the two
    classes are computed with ``scipy.spatial.distance.cdist`` (default
    metric). The pair with the smallest distance is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per case. Must be numeric once missing values
        are filled, since it is passed to ``cdist``.
    class1, class2 : label
        Class values, compared against ``y`` with ``==``, that select the two
        groups of rows.
    y : array-like
        Class of every row of ``data``; ``y == classN`` is used as a boolean
        row mask on ``data``.
    fill_value : scalar, default -100
        Value substituted for missing entries before distances are computed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-row frames: the ``class1`` case and the ``class2`` case of
        the closest pair, with missing values already filled. When several
        pairs tie, the first one in row-major order of the distance matrix is
        returned.

    Raises
    ------
    ValueError
        If either class has no rows in ``data``.
    """
    data = data.fillna(fill_value)

    # Identical feature rows yield identical distances, so keep one per class.
    data_class1 = data[y == class1].drop_duplicates()
    data_class2 = data[y == class2].drop_duplicates()

    if len(data_class1) == 0 or len(data_class2) == 0:
        raise ValueError(
            "Cannot compute boundary cases: no rows found for class "
            + repr(class1 if len(data_class1) == 0 else class2)
        )

    # Rows index class1 cases, columns index class2 cases.
    matrixDistances = cdist(data_class1, data_class2)

    # argmin works on the flattened matrix; convert back to (row, column).
    pos_class1, pos_class2 = np.unravel_index(
        np.argmin(matrixDistances), matrixDistances.shape
    )

    # Select by position: the matrix coordinates are positional, and a
    # label-based lookup would return several rows if index labels repeat.
    min_case_class1 = data_class1.iloc[[pos_class1]]
    min_case_class2 = data_class2.iloc[[pos_class2]]

    return (min_case_class1, min_case_class2)




In [37]:
# Closest test-set pair between encoded classes 1 and 3; per the encoder output
# shown earlier these codes stand for 'dismissed' and 'unresolved'.
boundaryCasesClass1vs3 = calculateBoundaryCasesThroughDistance(
    data=X_test,
    class1=1,
    class2=3,
    y=y_test,
)

In [40]:
# Raw event-log rows of the class-1 case from the class 1 vs 3 boundary pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"] == boundaryCasesClass1vs3[0].index[0]
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
516599,33.6,843.0,NIL,Create Fine,A,0.0,complete,2003-08-07 00:00:00+00:00,158.0,0.0,S88125,,,,,
516600,,,,Send Fine,,,complete,2003-11-03 00:00:00+00:00,,,S88125,10.0,,,,
516601,,,,Insert Fine Notification,,,complete,2003-11-13 00:00:00+00:00,,,S88125,,P,N,,
516602,68.77,,,Add penalty,,,complete,2004-01-12 00:00:00+00:00,,,S88125,,,,,
516603,,,,Insert Date Appeal to Prefecture,,,complete,2004-01-12 00:00:00+00:00,,,S88125,,,,,
516604,,,#,Send Appeal to Prefecture,,,complete,2004-02-12 00:00:00+00:00,,,S88125,,,,,


In [41]:
# Raw event-log rows of the class-3 case from the class 1 vs 3 boundary pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"] == boundaryCasesClass1vs3[1].index[0]
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
50317,36.0,562.0,NIL,Create Fine,A,0.0,complete,2008-08-06 00:00:00+00:00,157.0,0.0,A34464,,,,,
50318,,,,Send Fine,,,complete,2008-10-21 00:00:00+00:00,,,A34464,13.5,,,,
50319,,,,Insert Fine Notification,,,complete,2008-10-29 00:00:00+00:00,,,A34464,,P,P,,
50320,74.0,,,Add penalty,,,complete,2008-12-28 00:00:00+00:00,,,A34464,,,,,
50321,,,,Insert Date Appeal to Prefecture,,,complete,2009-01-17 00:00:00+00:00,,,A34464,,,,,
50322,,,NIL,Send Appeal to Prefecture,,,complete,2009-02-09 00:00:00+00:00,,,A34464,,,,,


In [None]:
# Boundary cases between class 2 ('paid_full') and class 3 ('unresolved')

In [42]:
# Closest test-set pair between encoded classes 2 and 3; per the encoder output
# shown earlier these codes stand for 'paid_full' and 'unresolved'.
boundaryCasesClass2vs3 = calculateBoundaryCasesThroughDistance(
    data=X_test,
    class1=2,
    class2=3,
    y=y_test,
)

In [43]:
# Raw event-log rows of the class-2 case from the class 2 vs 3 boundary pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"] == boundaryCasesClass2vs3[0].index[0]
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
151921,32.8,537.0,NIL,Create Fine,A,0.0,complete,2001-11-09 00:00:00+00:00,157.0,0.0,N38303,,,,,
151922,,,,Payment,,32.8,complete,2001-11-13 00:00:00+00:00,,,N38303,,,,32.8,


In [44]:
# Raw event-log rows of the class-3 case from the class 2 vs 3 boundary pair.
rtfm_raw.loc[
    rtfm_raw["case:concept:name"] == boundaryCasesClass2vs3[1].index[0]
]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
109818,31.3,550.0,NIL,Create Fine,A,0.0,complete,2000-01-20 00:00:00+00:00,157.0,0.0,N22520,,,,,
109819,,,,Payment,,30.99,complete,2000-02-04 00:00:00+00:00,,,N22520,,,,30.99,
