#### 2. Do in-person submitted applications for car loans perform differently than digital submissions, especially when the requested amount is over €5,000?

In [1]:
import os
import sys
import zipfile
import pandas as pd
import pm4py
from logview.utils import LogViewBuilder
from logview.predicate import *
from filter_visualization import query_exploration_icicle, query_breakdown_pie
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualizer

In [2]:
# Load LogView

if not os.path.exists("logview"):
    !git clone https://github.com/fzerbato/logview.git
    print("Cloned logview.")
else:
    print("logview already cloned.")

logview_path = os.path.abspath("logview")

if logview_path not in sys.path:
    sys.path.append(logview_path)
    print(f"Added to sys.path: {logview_path}")

%pip install -r logview/requirements.txt

logview already cloned.
Added to sys.path: c:\Users\cshek\OneDrive\Bureaublad\Master-Thesis-Experiment\logview
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Program Files\Python310\python.exe -m pip install --upgrade pip' command.


In [3]:
# Load the BPI Challenge 2017 event log from CSV (extracting it from the
# zip archive first when only the archive is present) and convert it to the
# column layout pm4py expects.

csv_file = "BPI_Challenge_2017.csv"
zip_file = "BPI_Challenge_2017.zip"

if not os.path.exists(csv_file):
    # Without the CSV the archive is the only remaining source; fail early
    # when that is missing as well.
    if not os.path.exists(zip_file):
        raise FileNotFoundError(f"Both '{csv_file}' and '{zip_file}' not found")
    print(f"Extracting {csv_file} from {zip_file}...")
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extract(csv_file)

# Names of the raw CSV columns that identify case, timestamp and activity.
CASE_ID_COL = 'case'
TIMESTAMP_COL = 'time'
ACTIVITY_COL = 'event'

bpi_data = pd.read_csv(csv_file, sep=',', quotechar='"')
# Strip surrounding whitespace from the header names so the column
# constants above match exactly.
bpi_data.columns = bpi_data.columns.map(str.strip)
bpi_data[TIMESTAMP_COL] = pd.to_datetime(
    bpi_data[TIMESTAMP_COL],
    format='%Y/%m/%d %H:%M:%S.%f',
)
log = pm4py.format_dataframe(
    bpi_data,
    case_id=CASE_ID_COL,
    activity_key=ACTIVITY_COL,
    timestamp_key=TIMESTAMP_COL,
)

display(log)

Unnamed: 0,case,event,time,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,MonthlyCost,org:resource,Selected,...,Accepted,CreditScore,NumberOfTerms,EventOrigin,OfferedAmount,case:concept:name,concept:name,time:timestamp,@@index,@@case_index
0,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Create Application,2016-08-03 17:57:21.673000+00:00,0,0
1,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,COMPLETE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Application,,Application_1000086665,A_Submitted,2016-08-03 17:57:21.734000+00:00,1,0
2,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:57:21.963000+00:00,2,0
3,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,WITHDRAW,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Handle leads,2016-08-03 17:58:28.286000+00:00,3,0
4,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,SCHEDULE,New credit,"Other, see explanation",5000.0,,User_1,,...,,,,Workflow,,Application_1000086665,W_Complete application,2016-08-03 17:58:28.293000+00:00,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202262,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,RESUME,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:19:28.812000+00:00,1202262,31508
1202263,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,SUSPEND,New credit,Caravan / Camper,30000.0,,User_41,,...,,,,Workflow,,Application_999993812,W_Call incomplete files,2016-10-20 10:21:59.667000+00:00,1202263,31508
1202264,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Offer,,Application_999993812,O_Accepted,2016-10-24 08:24:30.056000+00:00,1202264,31508
1202265,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,COMPLETE,New credit,Caravan / Camper,30000.0,,User_68,,...,,,,Application,,Application_999993812,A_Pending,2016-10-24 08:24:30.059000+00:00,1202265,31508


In [4]:
# Build the LogView object from the formatted event log. Every query below is
# evaluated through it, and each query's source is the previous result set,
# so the three filters form a chain:
# log -> rs_Submitted -> rs_CarLoans -> rs_Over5K.
log_view = LogViewBuilder.build_log_view(log)

# Step 1: select on event == 'A_Submitted' from the full log.
# NOTE(review): this step was originally described as "applications that were
# submitted in-person". In the BPI Challenge 2017 log, A_Submitted is usually
# documented as the marker of an online (digital) submission -- confirm which
# channel this predicate actually selects before interpreting the charts.
# The second value returned by evaluate_query is discarded here (presumably
# the complement set -- verify against the LogView API).
query_1 = Query('Submitted', [EqToConstant('event', 'A_Submitted')])
rs_submitted, _ = log_view.evaluate_query('rs_Submitted', log, query_1)

# Step 2: from the Step 1 result set, keep only car loan applications
# (LoanGoal == 'Car').
query_2 = Query('CarLoans', [EqToConstant('LoanGoal', 'Car')])
rs_car_loans, _ = log_view.evaluate_query('rs_CarLoans', rs_submitted, query_2)

# Step 3: from the car loans, keep only applications whose RequestedAmount is
# strictly greater than 5000 (the EUR 5,000 threshold from the question).
query_3 = Query('Over5K', [GreaterThanConstant('RequestedAmount', 5000)])
rs_over_5k, _ = log_view.evaluate_query('rs_Over5K', rs_car_loans, query_3)

# Show the summary of evaluated queries (source log, query, result set) and
# their predicates. The tables appear to be printed by get_summary() itself;
# its return value is kept in `summary`.
summary = log_view.get_summary()

+----+--------------------+-----------+--------------+----------+
|    | source_log         | query     | result_set   | labels   |
|----+--------------------+-----------+--------------+----------|
|  0 | initial_source_log | Submitted | rs_Submitted | []       |
|  1 | rs_Submitted       | CarLoans  | rs_CarLoans  | []       |
|  2 | rs_CarLoans        | Over5K    | rs_Over5K    | []       |
+----+--------------------+-----------+--------------+----------+
+----+-----------+------------------------------+
|    | query     | predicates                   |
|----+-----------+------------------------------|
|  0 | Submitted | (event in { 'A_Submitted' }) |
|  1 | CarLoans  | (LoanGoal in { 'Car' })      |
|  2 | Over5K    | (RequestedAmount > 5000)     |
+----+-----------+------------------------------+


In [5]:
# Exploration icicle for result set 'rs_Over5K', metric: average case duration (seconds).
query_exploration_icicle(
    'rs_Over5K',
    log_view,
    metric='avg_case_duration_seconds',
    details=False,
)

In [6]:
# Breakdown pie for result set 'rs_Over5K', metric: average case duration (seconds).
query_breakdown_pie(
    'rs_Over5K',
    log_view,
    metric='avg_case_duration_seconds',
    details=False,
)

In [7]:
# Exploration icicle for result set 'rs_Over5K', metric: average events per case.
query_exploration_icicle(
    'rs_Over5K',
    log_view,
    metric='avg_events_per_case',
    details=False,
)

In [8]:
# Breakdown pie for result set 'rs_Over5K', metric: average events per case.
query_breakdown_pie(
    'rs_Over5K',
    log_view,
    metric='avg_events_per_case',
    details=False,
)

In [9]:
# Exploration icicle for result set 'rs_Over5K', metric: average time between events.
query_exploration_icicle(
    'rs_Over5K',
    log_view,
    metric='avg_time_between_events',
    details=False,
)

In [10]:
# Breakdown pie for result set 'rs_Over5K', metric: average time between events.
query_breakdown_pie(
    'rs_Over5K',
    log_view,
    metric='avg_time_between_events',
    details=False,
)