In [1]:
import os

import pandas as pd
import pm4py
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn_extra.cluster import KMedoids

In [2]:
# Parse the raw Road Traffic Fine Management event log (XES format).
# The recorded progress bar below shows 150370 traces taking roughly a minute.
log = pm4py.read_xes(
    "./Data/road_traffic/RawData/Road_Traffic_Fine_Management_Process.xes"
)

parsing log, completed traces :: 100%|██████████| 150370/150370 [00:55<00:00, 2729.81it/s]


In [4]:
# Load the mined feature table and index it by case id, so that the index
# label of every row identifies the case it belongs to.
dataset = (
    pd.read_csv(
        "./Data/road_traffic/mined_rtfm_relabelled_confidences.csv",
        index_col=0,
    )
    .set_index("case:concept:name")
)


In [5]:
# Separate the target column from the feature columns.
y = dataset["Class"]
X = dataset.drop(columns=["Class"])
cols = X.columns.to_list()

# Quick sanity report: number of features and whether any value is missing.
print(f"No. of features:{len(X.columns)}")
print(f"Is na? {X.isnull().values.any()}")

# Encode the class labels as integers and keep the label -> integer mapping.
le = LabelEncoder()
y_transformed = le.fit_transform(y)
le_name_mapping = pd.Series(
    dict(zip(le.classes_, le.transform(le.classes_)))
)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    stratify=y_transformed,
    shuffle=True,
    random_state=0,
)

No. of features:2189
Is na? True


In [7]:
# Use KMedoids on each class separately to obtain one medoid (representative case) per class.

In [6]:
def returnMedoidClass(data_train_classes, log, dataClass, max_samples=25000, random_state=0):
    """Return the event-log rows of the medoid case of a single class.

    The rows of ``data_train_classes`` whose ``"Classes"`` value equals
    ``dataClass`` are clustered with a one-cluster ``KMedoids``. The index
    label of the resulting medoid row is then used as a case id to filter
    ``log``.

    Parameters
    ----------
    data_train_classes : pandas.DataFrame
        Feature table containing a ``"Classes"`` column. Its index labels are
        used as case ids, and every other column is passed to ``KMedoids``.
    log : pandas.DataFrame
        Event log with a ``"case:concept:name"`` column (in this notebook,
        the result of ``pm4py.read_xes``).
    dataClass
        Value of the ``"Classes"`` column selecting the class of interest.
    max_samples : int or None, default 25000
        Upper bound on the number of rows handed to ``KMedoids``. Larger
        classes are randomly down-sampled to this size; the original author
        notes that bigger inputs ran out of RAM. ``None`` disables the cap.
    random_state : int, default 0
        Seed used for both the down-sampling and ``KMedoids``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``log`` whose ``"case:concept:name"`` equals the index
        label of the medoid row.

    Raises
    ------
    ValueError
        If no row of ``data_train_classes`` belongs to ``dataClass``.
    """
    # Keep only the rows of the requested class; the label column itself must
    # not take part in the distance computation.
    class_rows = data_train_classes[data_train_classes["Classes"] == dataClass].drop(columns=["Classes"])

    # Fail early with an explicit message instead of a clustering error.
    if len(class_rows) == 0:
        raise ValueError(f"no rows with Classes == {dataClass!r} in data_train_classes")

    # Cap the input size (see ``max_samples``).
    if max_samples is not None and len(class_rows) > max_samples:
        class_rows = class_rows.sample(n=max_samples, random_state=random_state)

    # One cluster, so its single medoid is the representative of the class.
    clustering = KMedoids(n_clusters=1, random_state=random_state)
    clustering.fit(X=class_rows)

    # medoid_indices_ holds positions within the fitted data; translate the
    # position back to the index label, i.e. the case id.
    medoid_position = clustering.medoid_indices_[0]
    medoid_case_id = class_rows.index[medoid_position]

    # Return every event of that case from the original log.
    return log[log["case:concept:name"] == medoid_case_id]

In [7]:
# Replace missing feature values with the sentinel -100 so that KMedoids can
# be applied; the result is a separate copy, X_train itself is not modified.
X_train_with_classes = X_train.fillna(value=-100).copy()

In [8]:
# Attach the encoded training labels as a "Classes" column. y_train comes from
# the same train_test_split call as X_train, so the labels line up row by row.
X_train_with_classes["Classes"]=y_train

In [9]:
# Medoid case of encoded class 0, returned as its events from the raw log.
medoid_class0_isolated = returnMedoidClass(
    data_train_classes=X_train_with_classes,
    log=log,
    dataClass=0,
)

In [10]:
# Display the events of the class-0 medoid case as the cell output.
medoid_class0_isolated

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
58610,38.0,559.0,NIL,Create Fine,A,0.0,complete,2009-01-10 00:00:00+00:00,157.0,0.0,A39197,,,,,
58611,,,,Send Fine,,,complete,2009-05-12 00:00:00+00:00,,,A39197,13.5,,,,
58612,,,,Insert Fine Notification,,,complete,2009-05-30 00:00:00+00:00,,,A39197,,P,P,,
58613,77.5,,,Add penalty,,,complete,2009-07-29 00:00:00+00:00,,,A39197,,,,,
58614,,,,Send for Credit Collection,,,complete,2010-10-15 00:00:00+00:00,,,A39197,,,,,


In [11]:
# Medoid case of encoded class 1, returned as its events from the raw log.
medoid_class1_isolated = returnMedoidClass(
    data_train_classes=X_train_with_classes,
    log=log,
    dataClass=1,
)

In [13]:
# Display the events of the class-1 medoid case as the cell output.
medoid_class1_isolated

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
274349,24.0,557.0,NIL,Create Fine,A,0.0,complete,2011-09-20 00:00:00+00:00,7.0,0.0,P2492,,,,,
274350,,,,Send Fine,,,complete,2011-11-19 00:00:00+00:00,,,P2492,15.0,,,,
274351,,,,Insert Fine Notification,,,complete,2011-12-13 00:00:00+00:00,,,P2492,,P,P,,
274352,,,,Insert Date Appeal to Prefecture,,,complete,2012-01-18 00:00:00+00:00,,,P2492,,,,,
274353,47.0,,,Add penalty,,,complete,2012-02-11 00:00:00+00:00,,,P2492,,,,,
274354,,,#,Send Appeal to Prefecture,,,complete,2012-02-24 00:00:00+00:00,,,P2492,,,,,


In [14]:
# Medoid case of encoded class 2, returned as its events from the raw log.
medoid_class2_isolated = returnMedoidClass(
    data_train_classes=X_train_with_classes,
    log=log,
    dataClass=2,
)

In [16]:
# Display the events of the class-2 medoid case as the cell output.
medoid_class2_isolated

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
48006,36.0,563.0,NIL,Create Fine,A,0.0,complete,2008-08-10 00:00:00+00:00,157.0,0.0,A33342,,,,,
48007,,,,Payment,,36.0,complete,2008-08-28 00:00:00+00:00,,,A33342,,,,36.0,


In [17]:
# Medoid case of encoded class 3, returned as its events from the raw log.
medoid_class3_isolated = returnMedoidClass(
    data_train_classes=X_train_with_classes,
    log=log,
    dataClass=3,
)

In [18]:
# Display the events of the class-3 medoid case as the cell output.
medoid_class3_isolated

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
479249,32.8,839.0,NIL,Create Fine,A,0.0,complete,2001-08-13 00:00:00+00:00,7.0,0.0,S68043,,,,,
479250,,,,Send Fine,,,complete,2001-11-09 00:00:00+00:00,,,S68043,6.71,,,,


In [19]:
# Make sure the output folder exists before the first medoid is written:
# to_csv does not create missing directories and would fail otherwise.
os.makedirs("./results/Kmedoids", exist_ok=True)

# Persist the events of the class-0 medoid case.
medoid_class0_isolated.to_csv("./results/Kmedoids/medoid_class0.csv")

In [20]:
# Persist the events of the class-1 medoid case.
medoid_class1_isolated.to_csv(
    path_or_buf="./results/Kmedoids/medoid_class1.csv"
)

In [21]:
# Persist the events of the class-2 medoid case.
medoid_class2_isolated.to_csv(
    path_or_buf="./results/Kmedoids/medoid_class2.csv"
)

In [22]:
# Persist the events of the class-3 medoid case.
medoid_class3_isolated.to_csv(
    path_or_buf="./results/Kmedoids/medoid_class3.csv"
)