# Project of ana and leo

In [1]:
import pandas as pd
import statistics
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from function import Utils
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from pandas.api.types import is_numeric_dtype
import numpy as np
import h2o
from IPython.display import display
# NOTE(review): this suppresses every warning for the rest of the session,
# so deprecation or convergence warnings raised by later cells will not be shown.
warnings.filterwarnings('ignore')

## Data loading

In [2]:
# Read the sampled BPIC17 offer log; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="hmw/Sampled_BPIC17_offer_log.csv")

In [3]:
# Preview the first five rows of the raw log.
df.head(n=5)

Unnamed: 0,index,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) Accepted,(case) ApplicationID,(case) CreditScore,(case) FirstWithdrawalAmount,(case) MonthlyCost,(case) NumberOfTerms,(case) OfferedAmount,(case) Selected,Action,EventID,EventOrigin,OfferID,lifecycle:transition
0,0,Offer_247135719,O_Create Offer,User_17,2016/01/02 18:17:05.720,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,Created,Offer_247135719,Offer,,complete
1,1,Offer_247135719,O_Created,User_17,2016/01/02 18:17:08.762,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_124849367,Offer,Offer_247135719,complete
2,2,Offer_247135719,O_Sent (online only),User_17,2016/01/02 18:19:21.330,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_440662877,Offer,Offer_247135719,complete
3,3,Offer_247135719,O_Cancelled,User_17,2016/01/02 18:21:26.034,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_591416028,Offer,Offer_247135719,complete
4,4,Offer_941964966,O_Create Offer,User_17,2016/01/02 18:21:42.022,Variant 1,1,True,Application_196483749,0,4100.0,201.76,57,10000.0,False,Created,Offer_941964966,Offer,,complete


## Data preprocessing

In [4]:
# Map the raw log column names onto the short names used by the rest of the notebook.
# Named `column_renames` rather than `input` so the builtin input() is not shadowed.
column_renames = {
    "Case ID": "caseid",
    "Activity": "activity",
    "Resource": "resource",
    "Complete Timestamp": "ts",
    "(case) Accepted": "y",  # Needs to be 1 for each Case
}

# Change column names
df = df.rename(columns=column_renames)

# Parse the timestamp strings into datetime values
df["ts"] = pd.to_datetime(df["ts"])

# Derive the `dt` column from the frame via the project helper.
# TODO: this needs to be improved
df["dt"] = Utils.dt(df)

# The raw timestamp column is removed once `dt` has been computed
df = df.drop(columns="ts")

In [5]:
# Per-column report from the project helper; the output below lists one float per column.
# Presumably the percentage of missing values, judging by the helper's name — TODO confirm in function.Utils.
Utils.persantage_nan(df)

index                            0.000000
caseid                           0.000000
activity                         0.000000
resource                         0.000000
Variant                          0.000000
Variant index                    0.000000
y                                0.000000
(case) ApplicationID             0.000000
(case) CreditScore               0.000000
(case) FirstWithdrawalAmount     0.000000
(case) MonthlyCost               0.000000
(case) NumberOfTerms             0.000000
(case) OfferedAmount             0.000000
(case) Selected                  0.000000
Action                           0.000000
EventID                          0.000000
EventOrigin                      0.000000
OfferID                         22.312464
lifecycle:transition             0.000000
dt                               0.000000
dtype: float64

In [6]:
# Handle the missing values in "OfferID" with the project helper; the returned frame replaces `df`.
# NOTE(review): the helper's name suggests the nulls are predicted/imputed — confirm in function.Utils.
df = Utils.predict_null_value(colum="OfferID",df=df)

In [7]:
# Apply the project's NaN-threshold helper to `df`, then preview the result.
# NOTE(review): the threshold value and what happens to columns exceeding it are
# defined in function.Utils — confirm there.
df = Utils.prod_nan_with_treshold(df)
df.head()

Unnamed: 0,index,caseid,activity,resource,Variant,Variant index,y,(case) ApplicationID,(case) CreditScore,(case) FirstWithdrawalAmount,...,(case) NumberOfTerms,(case) OfferedAmount,(case) Selected,Action,EventID,EventOrigin,OfferID,lifecycle:transition,dt,OfferID_was_null
0,0,Offer_247135719,O_Create Offer,User_17,Variant 8,8,True,Application_196483749,0,10000.0,...,57,10000.0,False,Created,Offer_247135719,Offer,Offer_247135719,complete,0 days 00:00:03.042000,True
1,1,Offer_247135719,O_Created,User_17,Variant 8,8,True,Application_196483749,0,10000.0,...,57,10000.0,False,statechange,OfferState_124849367,Offer,Offer_247135719,complete,0 days 00:02:12.568000,False
2,2,Offer_247135719,O_Sent (online only),User_17,Variant 8,8,True,Application_196483749,0,10000.0,...,57,10000.0,False,statechange,OfferState_440662877,Offer,Offer_247135719,complete,0 days 00:02:04.704000,False
3,3,Offer_247135719,O_Cancelled,User_17,Variant 8,8,True,Application_196483749,0,10000.0,...,57,10000.0,False,statechange,OfferState_591416028,Offer,Offer_247135719,complete,0 days 00:02:04.704000,False
4,4,Offer_941964966,O_Create Offer,User_17,Variant 1,1,True,Application_196483749,0,4100.0,...,57,10000.0,False,Created,Offer_941964966,Offer,Offer_941964966,complete,0 days 00:00:01.551000,True


In [8]:
# Collapse the event log to one row per case: every other column becomes
# the list of that case's event values.
df_grouped = (
    df
    .groupby(["caseid"])
    .agg(list)
    .reset_index()  # turn the group key back into a regular `caseid` column
)

In [9]:
# Post-process the list-valued columns with the project helper.
# NOTE(review): presumably lists holding one repeated value are reduced to a scalar
# (the case-level columns appear as scalars in the later preview) — confirm in function.Utils.
df_grouped = Utils.reduce_list_columns(df_grouped)

In [10]:
# Show the columns that still have to be aggregated (project helper).
# NOTE(review): presumably these are the columns that are still list-valued — confirm.
Utils.display_columns_to_aggragate(df_grouped)

Unnamed: 0,activity,resource,Action,EventID,OfferID,dt,OfferID_was_null
0,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_20, User_20, User_20, User_117, User_115]","[Created, statechange, statechange, statechang...","[Offer_1000681710, OfferState_452902905, Offer...","[Offer_1935259954, Offer_1000681710, Offer_100...","[0 days 00:00:01.663000, 0 days 00:00:19.05600...","[True, False, False, False, False]"
1,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_2, User_2, User_2, User_113, User_30]","[Created, statechange, statechange, statechang...","[Offer_1001553250, OfferState_1604351174, Offe...","[Offer_533969936, Offer_1001553250, Offer_1001...","[0 days 00:00:01.188000, 0 days 00:00:15.47200...","[True, False, False, False, False]"
2,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_85, User_85, User_85, User_117, User_118]","[Created, statechange, statechange, statechang...","[Offer_1002136393, OfferState_1608457034, Offe...","[Offer_1935259954, Offer_1002136393, Offer_100...","[0 days 00:00:01.293000, 0 days 00:00:19.55600...","[True, False, False, False, False]"
3,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_49, User_49, User_49, User_113, User_102]","[Created, statechange, statechange, statechang...","[Offer_1002236598, OfferState_1780384890, Offe...","[Offer_1651128996, Offer_1002236598, Offer_100...","[0 days 00:00:01.269000, 0 days 00:00:11.34800...","[True, False, False, False, False]"
4,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_15, User_15, User_15, User_1]","[Created, statechange, statechange, statechange]","[Offer_1002530118, OfferState_1971804832, Offe...","[Offer_520979602, Offer_1002530118, Offer_1002...","[0 days 00:00:01.216000, 0 days 00:00:20.01800...","[True, False, False, False]"
...,...,...,...,...,...,...,...
4995,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_73, User_73, User_73, User_43]","[Created, statechange, statechange, statechange]","[Offer_993689039, OfferState_1192570219, Offer...","[Offer_1468908520, Offer_993689039, Offer_9936...","[0 days 00:00:02.062000, 0 days 00:00:15.96300...","[True, False, False, False]"
4996,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_28, User_28, User_28, User_1]","[Created, statechange, statechange, statechange]","[Offer_993800442, OfferState_1754231388, Offer...","[Offer_1562802810, Offer_993800442, Offer_9938...","[0 days 00:00:01.530000, 0 days 00:00:20.28100...","[True, False, False, False]"
4997,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_19, User_19, User_19, User_116, User_113]","[Created, statechange, statechange, statechang...","[Offer_99473283, OfferState_1945701229, OfferS...","[Offer_275638981, Offer_99473283, Offer_994732...","[0 days 00:00:01.283000, 0 days 00:04:24.72900...","[True, False, False, False, False]"
4998,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_25, User_25, User_25, User_119, User_102]","[Created, statechange, statechange, statechang...","[Offer_995784215, OfferState_1350464436, Offer...","[Offer_520979602, Offer_995784215, Offer_99578...","[0 days 00:00:02.593000, 0 days 00:00:15.82700...","[True, False, False, False, False]"


In [11]:
# Some columns have no importance in the process, so we drop them in a single call
df_grouped = df_grouped.drop(columns=['EventID', 'OfferID', 'OfferID_was_null'])

In [12]:
# Preview the first two grouped cases.
df_grouped.head(n=2)

Unnamed: 0,caseid,activity,resource,Variant,Variant index,y,(case) ApplicationID,(case) CreditScore,(case) FirstWithdrawalAmount,(case) MonthlyCost,(case) NumberOfTerms,(case) OfferedAmount,(case) Selected,Action,EventOrigin,lifecycle:transition,dt
0,Offer_1000681710,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_20, User_20, User_20, User_117, User_115]",Variant 2,2,True,Application_2131314372,956,14500.0,200.0,96,16000.0,True,"[Created, statechange, statechange, statechang...",Offer,complete,"[0 days 00:00:01.663000, 0 days 00:00:19.05600..."
1,Offer_1001553250,"[O_Create Offer, O_Created, O_Sent (mail and o...","[User_2, User_2, User_2, User_113, User_30]",Variant 2,2,False,Application_1607028451,0,2500.0,175.0,127,17500.0,True,"[Created, statechange, statechange, statechang...",Offer,complete,"[0 days 00:00:01.188000, 0 days 00:00:15.47200..."


In [13]:
# Build the case-level feature table from the grouped frame and the event-level frame (project helper).
# NOTE(review): the later preview shows per-resource / per-action fraction columns and
# avg/max/min `dt` columns — presumably produced here; confirm in function.Utils.
df_grouped_agg = Utils.aggregation_encoding(df_grouped,df)

In [14]:
# Remove the `dt` column from the aggregated frame (mutates the frame in place).
df_grouped_agg.drop(columns=["dt"], inplace=True)

In [15]:
# Preview the first five rows of the aggregated feature table.
df_grouped_agg.head(n=5)

Unnamed: 0,caseid,Variant,Variant index,y,(case) ApplicationID,(case) CreditScore,(case) FirstWithdrawalAmount,(case) MonthlyCost,(case) NumberOfTerms,(case) OfferedAmount,...,User_86,User_142,User_76,User_141,User_32,Created,statechange,avg_dt,max_dt,min_dt
0,Offer_1000681710,Variant 2,2,True,Application_2131314372,956,14500.0,200.0,96,16000.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.8,1 days 11:21:59.362200,7 days 04:11:16.866000,0 days 00:00:01.663000
1,Offer_1001553250,Variant 2,2,False,Application_1607028451,0,2500.0,175.0,127,17500.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.8,2 days 22:47:28.444400,6 days 13:57:17.486000,0 days 00:00:01.188000
2,Offer_1002136393,Variant 2,2,True,Application_638926349,969,6500.0,343.25,20,6500.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.8,5 days 22:58:29.630800,18 days 00:58:56.495000,0 days 00:00:01.293000
3,Offer_1002236598,Variant 3,3,False,Application_818693455,0,2500.0,210.0,127,21000.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.8,1 days 20:42:58.900800,3 days 03:26:35.948000,0 days 00:00:01.269000
4,Offer_1002530118,Variant 1,1,False,Application_1126987739,0,19750.0,210.12,120,20000.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.75,15 days 06:47:37.585000,30 days 13:35:04.553000,0 days 00:00:01.216000


## Logs encoding

i think: activity - onehot?, variant - one hot, action, onehot, event origin - onehot, liofecycle - one hot

In [16]:
import h2o
from h2o.automl import H2OAutoML

In [17]:
# Connect to an H2O instance; as the log below shows, a local server is started when none is running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.2+13-LTS-58, mixed mode, sharing)
  Starting server from C:\Users\l.marazzi\AppData\Local\Programs\Python\Python311\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\L9A1E~1.MAR\AppData\Local\Temp\tmpv5617e_f
  JVM stdout: C:\Users\L9A1E~1.MAR\AppData\Local\Temp\tmpv5617e_f\h2o_l_marazzi_started_from_python.out
  JVM stderr: C:\Users\L9A1E~1.MAR\AppData\Local\Temp\tmpv5617e_f\h2o_l_marazzi_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_l_marazzi_rjo86d
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.918 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [18]:
# Hold out 30% of the rows for testing. The split is seeded so that re-running
# the notebook yields the same train/test partition and comparable metrics.
X_train, X_test = train_test_split(df_grouped_agg, test_size=0.3, random_state=1)

# Convert both partitions to H2O frames (the target column `y` is still included).
df_h_train = h2o.H2OFrame(X_train)
df_h_test = h2o.H2OFrame(X_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [19]:
# Name of the response column
y = "y"

# Predictors are all training-frame columns except the response.
x = list(df_h_train.columns)
del x[x.index(y)]  # raises ValueError if the response column is missing

In [20]:
# Mark the response as categorical in BOTH frames. The original cell converted
# the training frame twice and never the test frame, so the held-out response
# kept a different type from the one the model was trained on.
df_h_train[y] = df_h_train[y].asfactor()
df_h_test[y] = df_h_test[y].asfactor()

In [None]:
# Train at most 20 AutoML models on the training frame with a fixed seed.
# NOTE(review): the log below reports caseid, EventOrigin, lifecycle:transition, avg_dt and
# max_dt being dropped as bad/constant columns, so those features are not used by the models.
aml = H2OAutoML(seed=1, max_models=20)
aml.train(training_frame=df_h_train, y=y, x=x)

AutoML progress: |
01:21:20.10: AutoML: XGBoost is not available; skipping it.
01:21:20.60: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

████
01:21:44.980: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]
01:21:50.222: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

█
01:21:56.936: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

█
01:21:59.975: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

██
01:22:03.460: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

█
01:22:06.999: _train param, Dropping bad and constant columns: [EventOrigin, avg_dt, lifecycle:transition, max_dt, caseid]

███
01:22:13.451: _train param, Dropping 

In [None]:
# Display the complete AutoML leaderboard (every row, not only the default preview).
lb = aml.leaderboard
lb.head(lb.nrows)

In [None]:
# Keep the leader model of the AutoML run for evaluation.
model = aml.leader

In [None]:
# Score the leader model on the held-out H2O frame and display the resulting metrics.
perf = model.model_performance(test_data=df_h_test)
perf

## Model selection

## Evaluation of results