In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import cdist
import numpy as np
import pm4py

In [2]:
# Load the Sepsis event log from its XES file; later cells filter it by
# "case:concept:name" to inspect the activities of individual cases.
sepsis_log=pm4py.read_xes("./Data/sepsis/sepsis.xes")

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 1050/1050 [00:00<00:00, 1591.01it/s]


In [3]:
# Read the mined feature table (one row per case); the first CSV column is
# consumed as the initial row index.
dataset = pd.read_csv(
    "./Data/sepsis/mined_sepsis_confidences_SIRS2OrMore.csv",
    index_col=0,
)


In [4]:
# Index the rows by case id. set_index() consumes the column, so re-running this
# cell without the guard would raise a KeyError; the check keeps the cell idempotent.
if dataset.index.name != "case:concept:name":
    dataset = dataset.set_index("case:concept:name")

In [5]:
# Separate the target column ("Class") from the feature columns.
y = dataset["Class"]
X = dataset.drop(columns="Class")

print(f"No. of features:{len(X.columns)}")

No. of features:4704


In [6]:
# Report whether the feature matrix contains any missing value.
print("Is na?", X.isna().values.any())

# Encode the class labels as integers and keep the label -> code lookup.
le = LabelEncoder()
y_transformed = le.fit_transform(y)
le_name_mapping = {
    label: code
    for label, code in zip(le.classes_, le.transform(le.classes_))
}
cols = list(X.columns)

# First split: hold out 20% of all cases as the test set (stratified, seeded).
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_transformed,
)

# Second split: hold out 20% of the remaining cases as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_and_val,
    y_train_and_val,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=y_train_and_val,
)

Is na? True


In [9]:
def calculateBoundaryCasesThroughDistance(data, class1, class2, y, fill_value=-100):
    """Return the closest pair of cases that belong to two different classes.

    Missing feature values are replaced by ``fill_value``, duplicated rows are
    dropped within each class, and the pairwise distances between the two
    classes are computed with ``scipy.spatial.distance.cdist`` (default metric).
    The pair with the smallest distance is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per case.
    class1, class2 : scalar
        Class values (as they appear in ``y``) whose boundary is searched.
    y : array-like
        Class value of every row of ``data``; used as a boolean row mask.
    fill_value : scalar, default -100
        Value substituted for NaN before distances are computed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-row frames: the boundary case of ``class1`` and the
        boundary case of ``class2`` (with NaN already replaced).

    Raises
    ------
    ValueError
        If either class has no rows in ``data``.
    """
    data = data.fillna(fill_value)
    data_class1 = data[y == class1].drop_duplicates()
    data_class2 = data[y == class2].drop_duplicates()

    # Fail early with a readable message instead of an opaque argmin error.
    if len(data_class1) == 0 or len(data_class2) == 0:
        raise ValueError(
            "Both classes need at least one case to compute boundary cases "
            "(class1=%r has %d, class2=%r has %d)."
            % (class1, len(data_class1), class2, len(data_class2))
        )

    matrixDistances = cdist(data_class1, data_class2)

    # argmin works on the flattened matrix; convert it back to (row, column).
    min_idx = np.unravel_index(np.argmin(matrixDistances), matrixDistances.shape)

    # Rows of the matrix are class1 cases and columns are class2 cases. Select
    # by position: a label lookup would return several rows when index labels
    # are not unique.
    min_case_class1 = data_class1.iloc[[min_idx[0]]]
    min_case_class2 = data_class2.iloc[[min_idx[1]]]

    return (min_case_class1, min_case_class2)




In [10]:
# Closest pair of training cases between encoded class 0 and encoded class 1;
# element [0] is the class-0 case and element [1] is the class-1 case.
boundaryCasesClass0vs1=calculateBoundaryCasesThroughDistance(X_train, 0, 1, y_train)

In [11]:
# Class-0 boundary case (features with NaN already replaced by the fill value).
boundaryCasesClass0vs1[0]

Unnamed: 0_level_0,'Absence(Admission IC)','Absence(Admission NC)','Absence(CRP)','Absence(ER Registration)','Absence(ER Sepsis Triage)','Absence(ER Triage)','Absence(IV Antibiotics)','Absence(IV Liquid)','Absence(LacticAcid)','Absence(Leucocytes)',...,"'Succession(Return ER, ER Triage)'","'Succession(Return ER, IV Antibiotics)'","'Succession(Return ER, IV Liquid)'","'Succession(Return ER, LacticAcid)'","'Succession(Return ER, Leucocytes)'","'Succession(Return ER, Release A)'","'Succession(Return ER, Release B)'","'Succession(Return ER, Release C)'","'Succession(Return ER, Release D)'","'Succession(Return ER, Release E)'"
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AM,100.0,100.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,...,0.0,-100.0,-100.0,-100.0,0.0,-100.0,-100.0,-100.0,-100.0,-100.0


In [12]:
# NOTE(review): this repeats the previous cell exactly; presumably element [1]
# (the class-1 boundary case) was meant to be displayed here — confirm.
boundaryCasesClass0vs1[0]

Unnamed: 0_level_0,'Absence(Admission IC)','Absence(Admission NC)','Absence(CRP)','Absence(ER Registration)','Absence(ER Sepsis Triage)','Absence(ER Triage)','Absence(IV Antibiotics)','Absence(IV Liquid)','Absence(LacticAcid)','Absence(Leucocytes)',...,"'Succession(Return ER, ER Triage)'","'Succession(Return ER, IV Antibiotics)'","'Succession(Return ER, IV Liquid)'","'Succession(Return ER, LacticAcid)'","'Succession(Return ER, Leucocytes)'","'Succession(Return ER, Release A)'","'Succession(Return ER, Release B)'","'Succession(Return ER, Release C)'","'Succession(Return ER, Release D)'","'Succession(Return ER, Release E)'"
case:concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AM,100.0,100.0,0.0,0.0,0.0,0.0,100.0,100.0,100.0,0.0,...,0.0,-100.0,-100.0,-100.0,0.0,-100.0,-100.0,-100.0,-100.0,-100.0


In [13]:
# Original row (NaN not replaced, "Class" included) of the class-0 boundary case.
dataset.loc[boundaryCasesClass0vs1[0].index[0]]

'Absence(Admission IC)'                    100.0
'Absence(Admission NC)'                    100.0
'Absence(CRP)'                               0.0
'Absence(ER Registration)'                   0.0
'Absence(ER Sepsis Triage)'                  0.0
                                         ...    
'Succession(Return ER, Release B)'           NaN
'Succession(Return ER, Release C)'           NaN
'Succession(Return ER, Release D)'           NaN
'Succession(Return ER, Release E)'           NaN
Class                                 SIRS-False
Name: AM, Length: 4705, dtype: object

In [14]:
# Original row (NaN not replaced, "Class" included) of the class-1 boundary case.
dataset.loc[boundaryCasesClass0vs1[1].index[0]]

'Absence(Admission IC)'                   100.0
'Absence(Admission NC)'                   100.0
'Absence(CRP)'                              0.0
'Absence(ER Registration)'                  0.0
'Absence(ER Sepsis Triage)'                 0.0
                                        ...    
'Succession(Return ER, Release B)'          NaN
'Succession(Return ER, Release C)'          NaN
'Succession(Return ER, Release D)'          NaN
'Succession(Return ER, Release E)'          NaN
Class                                 SIRS-True
Name: MCA, Length: 4705, dtype: object

In [15]:
# Case id (the "case:concept:name" index label) of the class-0 boundary case.
boundaryCasesClass0vs1[0].index[0]

'AM'

In [16]:
# Activity names recorded in the event log for the class-0 boundary case.
sepsis_log.loc[
    sepsis_log["case:concept:name"] == boundaryCasesClass0vs1[0].index[0],
    "concept:name",
]

4671     ER Registration
4672           ER Triage
4673    ER Sepsis Triage
4674          Leucocytes
4675                 CRP
Name: concept:name, dtype: object

In [17]:
# Activity names recorded in the event log for the class-1 boundary case.
sepsis_log.loc[
    sepsis_log["case:concept:name"] == boundaryCasesClass0vs1[1].index[0],
    "concept:name",
]

11103     ER Registration
11104           ER Triage
11105    ER Sepsis Triage
11106          Leucocytes
11107                 CRP
Name: concept:name, dtype: object