# Segmentation of datasets

In [None]:
import pm4py

In [None]:
# Load the cleaned Prepaid Travel Cost (PTC) XES log and convert it to a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# `xes_filename` and `log` are scratch names that every dataset-loading cell rebinds;
# the result kept for the rest of the notebook is `df_PTC`.
xes_filename = '../datasets_cleaned/PrepaidTravelCost_cleaned.xes'
log = pm4py.read_xes(xes_filename)
df_PTC = pm4py.convert_to_dataframe(log)

In [None]:
# Prepaid Travel Cost (PTC): how often each resource, role and project value occurs
print("Frequency of some attributes for Prepaid Travel Cost (PTC):\n")

# Build the three value-count tables first (one per column of interest),
# binding them to the same notebook-level names as before.
resource_counts, role_counts, project_counts = (
    df_PTC[column_name].value_counts()
    for column_name in ('org:resource', 'org:role', 'case:Project')
)

# Then show them; the leading "\n" argument separates consecutive tables.
print(resource_counts)
print("\n", role_counts)
print("\n", project_counts)

In [None]:
# Load the cleaned Request For Payments (RFP) XES log and convert it to a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# `xes_filename` and `log` are scratch names that every dataset-loading cell rebinds;
# the result kept for the rest of the notebook is `df_RFP`.
xes_filename = '../datasets_cleaned/RequestForPayments_cleaned.xes'
log = pm4py.read_xes(xes_filename)
df_RFP = pm4py.convert_to_dataframe(log)

In [None]:
# Request For Payments (RFP): how often each resource, role and project value occurs
print("Frequency of some attributes for Request For Payments (RFP):\n")

# Build the three value-count tables first (one per column of interest),
# binding them to the same notebook-level names as before.
resource_counts, role_counts, project_counts = (
    df_RFP[column_name].value_counts()
    for column_name in ('org:resource', 'org:role', 'case:Project')
)

# Then show them; the leading "\n" argument separates consecutive tables.
print(resource_counts)
print("\n", role_counts)
print("\n", project_counts)

In [None]:
# Load the cleaned International Declarations (ID) XES log and convert it to a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# `xes_filename` and `log` are scratch names that every dataset-loading cell rebinds;
# the result kept for the rest of the notebook is `df_ID`.
xes_filename = '../datasets_cleaned/InternationalDeclarations_cleaned.xes'
log = pm4py.read_xes(xes_filename)
df_ID = pm4py.convert_to_dataframe(log)

In [None]:
# International Declarations (ID): how often each resource, role and project value occurs
print("Frequency of some attributes for International Declarations (ID):\n")

# Build the three value-count tables first (one per column of interest),
# binding them to the same notebook-level names as before.
# Note that the project column of this dataset is 'case:Permit ProjectNumber'.
resource_counts, role_counts, project_counts = (
    df_ID[column_name].value_counts()
    for column_name in ('org:resource', 'org:role', 'case:Permit ProjectNumber')
)

# Then show them; the leading "\n" argument separates consecutive tables.
print(resource_counts)
print("\n", role_counts)
print("\n", project_counts)

In [None]:
# Load the cleaned Permit Log (PL) XES log and convert it to a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# `xes_filename` and `log` are scratch names that every dataset-loading cell rebinds;
# the result kept for the rest of the notebook is `df_PL`.
xes_filename = '../datasets_cleaned/PermitLog_cleaned.xes'
log = pm4py.read_xes(xes_filename)
df_PL = pm4py.convert_to_dataframe(log)

In [None]:
# Permit Log (PL): how often each resource, role and project value occurs
print("Frequency of some attributes for Permit Log (PL):\n")

# Build the three value-count tables first (one per column of interest),
# binding them to the same notebook-level names as before.
# Note that the project column of this dataset is 'case:ProjectNumber'.
resource_counts, role_counts, project_counts = (
    df_PL[column_name].value_counts()
    for column_name in ('org:resource', 'org:role', 'case:ProjectNumber')
)

# Then show them; the leading "\n" argument separates consecutive tables.
print(resource_counts)
print("\n", role_counts)
print("\n", project_counts)

In [None]:
# Load the cleaned Domestic Declarations (DD) XES log and convert it to a DataFrame.
# The path is relative, so it is resolved against the current working directory.
# `xes_filename` and `log` are scratch names that every dataset-loading cell rebinds;
# the result kept for the rest of the notebook is `df_DD`.
xes_filename = '../datasets_cleaned/DomesticDeclarations_cleaned.xes'
log = pm4py.read_xes(xes_filename)
df_DD = pm4py.convert_to_dataframe(log)

In [None]:
# Frequency of some attributes for Domestic Declarations (DD)
# Only the resource and role columns are counted in this cell (no project column).
print("Frequency of some attributes for Domestic Declarations (DD):\n")

# Read from df_DD: the cell previously queried df_PL, so it repeated the
# Permit Log statistics under the Domestic Declarations heading.
resource_counts = df_DD['org:resource'].value_counts()
print(resource_counts)

role_counts = df_DD['org:role'].value_counts()
print("\n", role_counts)

In [None]:
# Map of roles to numeric values
role_to_numeric_map = {
    'UNDEFINED': 0,
    'EMPLOYEE': 1,
    'SUPERVISOR': 2,
    'ADMINISTRATION': 3,
    'BUDGET OWNER': 4,
    'DIRECTOR': 5,
    'PRE_APPROVER': 6,
}


# Function to create a numeric role column in the DataFrame
def create_numeric_role_column(df):
    """Return a copy of *df* with an integer ``org_role_numeric`` column.

    Each value of the ``org:role`` column is translated through
    ``role_to_numeric_map``. Roles that are not keys of the map (including
    missing values) receive ``-1`` so that they can be detected afterwards.

    Args:
        df: DataFrame that contains an ``org:role`` column.

    Returns:
        A new DataFrame; the input is left unmodified.

    Raises:
        KeyError: if *df* has no ``org:role`` column.
    """
    # Work on a copy of the DataFrame so the caller's object is never mutated
    df_copy = df.copy()

    # Translate every role in one pass; unknown roles come back as NaN.
    # The object cast keeps `.map` from returning a Categorical when the role
    # column happens to be categorical, which would make `fillna(-1)` fail.
    role_codes = df_copy['org:role'].astype(object).map(role_to_numeric_map)

    # Replace NaN with the -1 sentinel and force an integer dtype. Unlike a
    # masked `.loc` assignment of a Series, this needs no index re-alignment,
    # so it also works when the DataFrame index contains duplicate labels.
    df_copy['org_role_numeric'] = role_codes.fillna(-1).astype('int64')

    return df_copy

In [None]:
# Add the numeric role column to every dataset. Each name is rebound to the
# copy returned by create_numeric_role_column, one dataset at a time.
df_DD = create_numeric_role_column(df_DD)
df_ID = create_numeric_role_column(df_ID)
df_RFP = create_numeric_role_column(df_RFP)
df_PTC = create_numeric_role_column(df_PTC)
df_PL = create_numeric_role_column(df_PL)

# Sanity check: report, per dataset, how many rows still carry the -1
# placeholder, i.e. whose role was not found in the role map.
for dataset_label, dataset_frame in (
    ("Domestic Declarations", df_DD),
    ("International Declarations", df_ID),
    ("Request For Payments", df_RFP),
    ("Prepaid Travel Cost", df_PTC),
    ("Permit Log", df_PL),
):
    unmapped_row_count = (dataset_frame['org_role_numeric'] == -1).sum()
    print(f"Unmapped values in {dataset_label}: {unmapped_row_count} rows")

In [None]:
# Export the cleaned and segmented DataFrames to XES files.
#
# NOTE: the export code below is wrapped in a triple-quoted string, so it is
# NOT executed when this cell runs. To perform the export, remove the two
# `"""` delimiter lines. Each dataset would then be converted back to an
# event log and written to a `*_segmented.xes` file; the file names are
# relative paths, so they land in the current working directory.

"""
# Domestic Declarations
log = pm4py.convert_to_event_log(df_DD,
                                 case_id_key='case:id',
                                 activity_key='concept:name',
                                 timestamp_key='time:timestamp')
pm4py.write_xes(log, "DomesticDeclarations_segmented.xes")

# International Declarations
log = pm4py.convert_to_event_log(df_ID,
                                 case_id_key='case:id',
                                 activity_key='concept:name',
                                 timestamp_key='time:timestamp')
pm4py.write_xes(log, "InternationalDeclarations_segmented.xes")

# Request For Payments
log = pm4py.convert_to_event_log(df_RFP,
                                 case_id_key='case:id',
                                 activity_key='concept:name',
                                 timestamp_key='time:timestamp')
pm4py.write_xes(log, "RequestForPayments_segmented.xes")

# Prepaid Travel Cost
log = pm4py.convert_to_event_log(df_PTC,
                                 case_id_key='case:id',
                                 activity_key='concept:name',
                                 timestamp_key='time:timestamp')
pm4py.write_xes(log, "PrepaidTravelCost_segmented.xes")

# Permit Log
log = pm4py.convert_to_event_log(df_PL,
                                 case_id_key='case:id',
                                 activity_key='concept:name',
                                 timestamp_key='time:timestamp')
pm4py.write_xes(log, "PermitLog_segmented.xes")

"""
