# Load the dataset and prepares default attributes (id, timestamp, state)

In [1]:
# Load the dataset

import pandas as pd

incidents = pd.read_csv("VINST cases incidents.csv", sep=";", encoding="iso-8859-1")
incidents.fillna('?', inplace = True)
incidents.head(3)

Unnamed: 0,SR Number,Change Date+Time,Status,Sub Status,Involved ST Function Div,Involved Org line 3,Involved ST,SR Latest Impact,Product,Country,Owner Country,Owner First Name
0,1-364285768,2010-03-31T15:59:42+01:00,Accepted,In Progress,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic
1,1-364285768,2010-03-31T16:00:56+01:00,Accepted,In Progress,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic
2,1-364285768,2010-03-31T16:45:48+01:00,Queued,Awaiting Assignment,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Frederic


In [2]:
# SR Number is the case id

incidents["SR Number"].describe()

count           65533
unique           7554
top       1-687082195
freq              123
Name: SR Number, dtype: object

In [3]:
# Transform date time into timestamp

import datetime
import time

incidents['timestamp'] = incidents['Change Date+Time'].transform(lambda x: time.mktime(datetime.datetime.strptime(x[0:19], "%Y-%m-%dT%H:%M:%S").timetuple()))

In [4]:
incidents.head(5)

Unnamed: 0,SR Number,Change Date+Time,Status,Sub Status,Involved ST Function Div,Involved Org line 3,Involved ST,SR Latest Impact,Product,Country,Owner Country,Owner First Name,timestamp
0,1-364285768,2010-03-31T15:59:42+01:00,Accepted,In Progress,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0
1,1-364285768,2010-03-31T16:00:56+01:00,Accepted,In Progress,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0
2,1-364285768,2010-03-31T16:45:48+01:00,Queued,Awaiting Assignment,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Frederic,1270047000.0
3,1-364285768,2010-04-06T15:44:07+01:00,Accepted,In Progress,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Anne Claire,1270561000.0
4,1-364285768,2010-04-06T15:44:38+01:00,Queued,Awaiting Assignment,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,1270561000.0


In [6]:
# Renames case id and creates state using status and sub status (id and state are named by convention)
incidents.rename(index=str, columns={'SR Number': "id"}, inplace=True)
incidents['state'] = incidents['Status']+"-"+incidents['Sub Status']
incidents.drop(['Status', 'Sub Status'], axis = 1, inplace=True)
incidents.reset_index(drop = True, inplace=True)
incidents.head(5)

Unnamed: 0,id,Change Date+Time,Involved ST Function Div,Involved Org line 3,Involved ST,SR Latest Impact,Product,Country,Owner Country,Owner First Name,timestamp,state
0,1-364285768,2010-03-31T15:59:42+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0,Accepted-In Progress
1,1-364285768,2010-03-31T16:00:56+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0,Accepted-In Progress
2,1-364285768,2010-03-31T16:45:48+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Frederic,1270047000.0,Queued-Awaiting Assignment
3,1-364285768,2010-04-06T15:44:07+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Accepted-In Progress
4,1-364285768,2010-04-06T15:44:38+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Queued-Awaiting Assignment


# Creates ancilliary attributes

In [7]:
def add_next_state(df):
    df['next_state'] = ''
    df['next_time'] = 0
    df['next_dur'] = 0
    df['prev_time'] = 0
    df['elapsed_time_from_event'] = 0
    df['event_order'] = 0
    
    num_rows = len(df)
    event_order = 0

    for i in range(0, num_rows - 1):
        #print(str(i) + ' out of ' + str(num_rows))

        if df.at[i, 'id'] == df.at[i + 1, 'id']:
            df.at[i, 'next_state'] = df.at[i + 1, 'state']
            df.at[i, 'next_time'] = df.at[i + 1, 'timestamp']
            df.at[i, 'next_dur'] = df.at[i + 1, 'timestamp'] - df.at[i, 'timestamp']
            df.at[i, 'event_order'] = event_order
            df.at[i+1, 'prev_time'] = df.at[i, 'timestamp']
            df.at[i+1, 'elapsed_time_from_event'] = df.at[i+1, 'timestamp'] - df.at[i, 'timestamp']
            event_order = event_order + 1
        else:
            df.at[i, 'next_state'] = 99
            df.at[i, 'next_time'] = df.at[i, 'timestamp']
            df.at[i, 'next_dur'] = 0
            df.at[i, 'event_order'] = event_order            
            df.at[i+1, 'prev_time'] = 0
            df.at[i+1, 'elapsed_time_from_event'] = 0
            event_order = 0


    df.at[0, 'prev_time'] = df.at[0, 'timestamp']
    df.at[0, 'elapsed_time_from_event'] = 0
    df.at[num_rows-1, 'next_state'] = 99
    df.at[num_rows-1, 'next_time'] = df.at[num_rows-1, 'timestamp']
    df.at[num_rows-1, 'next_dur'] = 0
    df.at[num_rows-1, 'event_order'] = event_order

    return df


In [8]:
#  Adds several attributes
add_next_state(incidents)
incidents.head(30)

Unnamed: 0,id,Change Date+Time,Involved ST Function Div,Involved Org line 3,Involved ST,SR Latest Impact,Product,Country,Owner Country,Owner First Name,timestamp,state,next_state,next_time,next_dur,prev_time,elapsed_time_from_event,event_order
0,1-364285768,2010-03-31T15:59:42+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0,Accepted-In Progress,Accepted-In Progress,1270044056,74,1270043982,0,0
1,1-364285768,2010-03-31T16:00:56+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,1270044000.0,Accepted-In Progress,Queued-Awaiting Assignment,1270046748,2692,1270043982,74,1
2,1-364285768,2010-03-31T16:45:48+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Frederic,1270047000.0,Queued-Awaiting Assignment,Accepted-In Progress,1270561447,514699,1270044056,2692,2
3,1-364285768,2010-04-06T15:44:07+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Accepted-In Progress,Queued-Awaiting Assignment,1270561478,31,1270046748,514699,3
4,1-364285768,2010-04-06T15:44:38+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Queued-Awaiting Assignment,Accepted-In Progress,1270561487,9,1270561447,31,4
5,1-364285768,2010-04-06T15:44:47+01:00,A2_5,Org line A2,V13 2nd 3rd,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Accepted-In Progress,Completed-Resolved,1270561491,4,1270561478,9,5
6,1-364285768,2010-04-06T15:44:51+01:00,A2_5,Org line A2,V13 2nd 3rd,Medium,PROD582,fr,France,Anne Claire,1270561000.0,Completed-Resolved,Queued-Awaiting Assignment,1270561507,16,1270561487,4,6
7,1-364285768,2010-04-06T15:45:07+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,1270562000.0,Queued-Awaiting Assignment,Accepted-In Progress,1270720343,158836,1270561491,16,7
8,1-364285768,2010-04-08T11:52:23+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Eric,1270720000.0,Accepted-In Progress,Queued-Awaiting Assignment,1270720415,72,1270561507,158836,8
9,1-364285768,2010-04-08T11:53:35+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Eric,1270720000.0,Queued-Awaiting Assignment,Accepted-In Progress,1271750831,1030416,1270720343,72,9


In [9]:
# Checks durations are consistent
(incidents['next_dur'] >= 0).all()

True

In [10]:
import numpy as np
def add_start_end(df):
    id_group = df.groupby('id')
    g = id_group['timestamp'].agg([np.min, np.max])
    ev = id_group['event_order'].agg(np.max)
    for i in range(0,len(df)):
        id_case = df.at[i, 'id']
        df.at[i, 'start_case'] = g.at[id_case, 'amin']
        df.at[i, 'end_case'] = g.at[id_case, 'amax']
        df.at[i, 'total_events'] = ev.at[id_case] + 1
        
        
    return df


In [11]:
# Adds more attributes related to cases

incidents = add_start_end(incidents)
incidents.head(10)

Unnamed: 0,id,Change Date+Time,Involved ST Function Div,Involved Org line 3,Involved ST,SR Latest Impact,Product,Country,Owner Country,Owner First Name,...,state,next_state,next_time,next_dur,prev_time,elapsed_time_from_event,event_order,start_case,end_case,total_events
0,1-364285768,2010-03-31T15:59:42+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,...,Accepted-In Progress,Accepted-In Progress,1270044056,74,1270043982,0,0,1270044000.0,1336689000.0,17.0
1,1-364285768,2010-03-31T16:00:56+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Frederic,...,Accepted-In Progress,Queued-Awaiting Assignment,1270046748,2692,1270043982,74,1,1270044000.0,1336689000.0,17.0
2,1-364285768,2010-03-31T16:45:48+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Frederic,...,Queued-Awaiting Assignment,Accepted-In Progress,1270561447,514699,1270044056,2692,2,1270044000.0,1336689000.0,17.0
3,1-364285768,2010-04-06T15:44:07+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Anne Claire,...,Accepted-In Progress,Queued-Awaiting Assignment,1270561478,31,1270046748,514699,3,1270044000.0,1336689000.0,17.0
4,1-364285768,2010-04-06T15:44:38+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,...,Queued-Awaiting Assignment,Accepted-In Progress,1270561487,9,1270561447,31,4,1270044000.0,1336689000.0,17.0
5,1-364285768,2010-04-06T15:44:47+01:00,A2_5,Org line A2,V13 2nd 3rd,Medium,PROD582,fr,France,Anne Claire,...,Accepted-In Progress,Completed-Resolved,1270561491,4,1270561478,9,5,1270044000.0,1336689000.0,17.0
6,1-364285768,2010-04-06T15:44:51+01:00,A2_5,Org line A2,V13 2nd 3rd,Medium,PROD582,fr,France,Anne Claire,...,Completed-Resolved,Queued-Awaiting Assignment,1270561507,16,1270561487,4,6,1270044000.0,1336689000.0,17.0
7,1-364285768,2010-04-06T15:45:07+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Anne Claire,...,Queued-Awaiting Assignment,Accepted-In Progress,1270720343,158836,1270561491,16,7,1270044000.0,1336689000.0,17.0
8,1-364285768,2010-04-08T11:52:23+01:00,A2_4,Org line A2,V30,Medium,PROD582,fr,France,Eric,...,Accepted-In Progress,Queued-Awaiting Assignment,1270720415,72,1270561507,158836,8,1270044000.0,1336689000.0,17.0
9,1-364285768,2010-04-08T11:53:35+01:00,A2_5,Org line A2,V5 3rd,Medium,PROD582,fr,France,Eric,...,Queued-Awaiting Assignment,Accepted-In Progress,1271750831,1030416,1270720343,72,9,1270044000.0,1336689000.0,17.0


In [12]:
# Finally, computes time-related attributes
incidents['remaining_time'] = incidents['end_case'] - incidents['timestamp']
incidents['time_from_start'] = incidents['timestamp'] - incidents['start_case']
incidents['total_time'] = incidents['end_case'] - incidents['start_case']

# Starts encoding

In [13]:
def index_encoding(df, col, default, window = None):
    if window is None:
        hist_len = np.max(df['event_order'])
    else:
        hist_len = window

    for k in range(0, hist_len):
        df[col + '_' + str(k)] = default

    history = df.groupby('id').apply(lambda x: x[col].values)
    
    num_rows = len(df)
    for i in range(0, num_rows):        
        id_number = df.at[i, 'id']
        prefix = history[id_number]
        event_order = df.at[i, 'event_order']
        start_point = max(0, event_order + 1 - hist_len)
        for k in range(start_point, event_order + 1):
            df.at[i, col + '_' + str(k - start_point)] = prefix[k]
    
    return df, hist_len


def frequency_encoding(df, col):
    possible_values = df[col].unique()

    for k in possible_values:
        df[col + '_' + str(k)] = 0

    history = df.groupby('id').apply(lambda x: x[col].values)    
    num_rows = len(df)
    
    for i in range(0, num_rows):
        id_number = df.at[i, 'id']
        trace = history[id_number]
        event_order = df.at[i, 'event_order']
        prefix = trace[:event_order+1]
        unique, counts = np.unique(prefix, return_counts=True)
        for j in range(0, len(unique)):
            df.at[i, col+'_'+str(unique[j])] = counts[j]

    return df

In [14]:
# These are the columns of the dataset

incidents.columns

Index(['id', 'Change Date+Time', 'Involved ST Function Div',
       'Involved Org line 3', 'Involved ST', 'SR Latest Impact', 'Product',
       'Country', 'Owner Country', 'Owner First Name', 'timestamp', 'state',
       'next_state', 'next_time', 'next_dur', 'prev_time',
       'elapsed_time_from_event', 'event_order', 'start_case', 'end_case',
       'total_events', 'remaining_time', 'time_from_start', 'total_time'],
      dtype='object')

In [15]:
# We chose those that we encode using a window of 2

index_columns = ['Involved ST Function Div','Involved Org line 3', 'Involved ST', 'SR Latest Impact', 'Product',
       'Country', 'Owner Country', 'Owner First Name', 'state']

for col in index_columns:
    incidents, hist_len = index_encoding(incidents, col, default = '?', window=2)
    

In [16]:
# And the same for integers
index_int_columns = ['elapsed_time_from_event', 'time_from_start', 'event_order']
for col in index_int_columns:
    incidents, hist_len = index_encoding(incidents, col, default = 0, window=2)

In [17]:
# These are the columns that we have now

incidents.columns

Index(['id', 'Change Date+Time', 'Involved ST Function Div',
       'Involved Org line 3', 'Involved ST', 'SR Latest Impact', 'Product',
       'Country', 'Owner Country', 'Owner First Name', 'timestamp', 'state',
       'next_state', 'next_time', 'next_dur', 'prev_time',
       'elapsed_time_from_event', 'event_order', 'start_case', 'end_case',
       'total_events', 'remaining_time', 'time_from_start', 'total_time',
       'Involved ST Function Div_0', 'Involved ST Function Div_1',
       'Involved Org line 3_0', 'Involved Org line 3_1', 'Involved ST_0',
       'Involved ST_1', 'SR Latest Impact_0', 'SR Latest Impact_1',
       'Product_0', 'Product_1', 'Country_0', 'Country_1', 'Owner Country_0',
       'Owner Country_1', 'Owner First Name_0', 'Owner First Name_1',
       'state_0', 'state_1', 'elapsed_time_from_event_0',
       'elapsed_time_from_event_1', 'time_from_start_0', 'time_from_start_1',
       'event_order_0', 'event_order_1'],
      dtype='object')

In [18]:
# Remove the attributes that are not going to be used for learning

incidents_X = incidents.drop(['Change Date+Time', 'Involved ST Function Div',
       'Involved Org line 3', 'Involved ST', 'SR Latest Impact', 'Product',
       'Country', 'Owner Country', 'Owner First Name', 'timestamp', 'state',
       'next_state', 'next_time', 'next_dur', 'prev_time',
       'elapsed_time_from_event', 'event_order', 'start_case', 'end_case',
       'remaining_time', 'time_from_start'], axis=1)

In [19]:
# Choose the target attribute (kind of)

incidents_Y = incidents['remaining_time']

# Splits into train, validation and test using case id

In [20]:
# And the attribute (id) that is used to split the dataset in train, validation and test
incidents_group = incidents['id']

In [21]:
# Split into train, validation, test

from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils import safe_indexing
from itertools import chain


def group_train_test_split(X, Y, group, test_size=0.2):
    train, test = next(GroupShuffleSplit(test_size=test_size).split(X, Y, group))
    arrays = [X, Y, group]

    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))

X_train, X_test, Y_train, Y_test, group_train, group_test = group_train_test_split(incidents_X, incidents_Y, incidents_group, test_size = 0.4)
X_val, X_test, Y_val, Y_test, group_val, group_test = group_train_test_split(X_test, Y_test, group_test, test_size=0.5)

X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, X_val.shape, Y_val.shape

((38555, 27), (38555,), (13756, 27), (13756,), (13222, 27), (13222,))

In [22]:
# Quick check that case ids are not in two slots
np.asarray([item in group_test.unique() for item in group_train.unique()]).any()

False

In [23]:
X_val.columns

Index(['id', 'total_events', 'total_time', 'Involved ST Function Div_0',
       'Involved ST Function Div_1', 'Involved Org line 3_0',
       'Involved Org line 3_1', 'Involved ST_0', 'Involved ST_1',
       'SR Latest Impact_0', 'SR Latest Impact_1', 'Product_0', 'Product_1',
       'Country_0', 'Country_1', 'Owner Country_0', 'Owner Country_1',
       'Owner First Name_0', 'Owner First Name_1', 'state_0', 'state_1',
       'elapsed_time_from_event_0', 'elapsed_time_from_event_1',
       'time_from_start_0', 'time_from_start_1', 'event_order_0',
       'event_order_1'],
      dtype='object')

# Prepare datasets for learning algorithm

In [24]:
# Converts them into dictionaries in order to use one-hot-encoding
total_events_time = ['total_events', 'total_time', 'id']
all_dict = incidents_X.drop(total_events_time, axis=1).to_dict('record')
train_dict = X_train.drop(total_events_time, axis=1).to_dict('record')
val_dict = X_val.drop(total_events_time, axis=1).to_dict('record')
test_dict = X_test.drop(total_events_time, axis=1).to_dict('record')



In [25]:
X_train

Unnamed: 0,id,total_events,total_time,Involved ST Function Div_0,Involved ST Function Div_1,Involved Org line 3_0,Involved Org line 3_1,Involved ST_0,Involved ST_1,SR Latest Impact_0,...,Owner First Name_0,Owner First Name_1,state_0,state_1,elapsed_time_from_event_0,elapsed_time_from_event_1,time_from_start_0,time_from_start_1,event_order_0,event_order_1
0,1-364285768,17.0,66644793.0,A2_4,?,Org line A2,?,V30,?,Medium,...,Frederic,?,Accepted-In Progress,?,0,0,0,0,0,0
1,1-364285768,17.0,66644793.0,A2_4,A2_4,Org line A2,Org line A2,V30,V30,Medium,...,Frederic,Frederic,Accepted-In Progress,Accepted-In Progress,0,74,0,74,0,1
2,1-364285768,17.0,66644793.0,A2_4,A2_5,Org line A2,Org line A2,V30,V5 3rd,Medium,...,Frederic,Frederic,Accepted-In Progress,Queued-Awaiting Assignment,74,2692,74,2766,1,2
3,1-364285768,17.0,66644793.0,A2_5,A2_5,Org line A2,Org line A2,V5 3rd,V5 3rd,Medium,...,Frederic,Anne Claire,Queued-Awaiting Assignment,Accepted-In Progress,2692,514699,2766,517465,2,3
4,1-364285768,17.0,66644793.0,A2_5,A2_4,Org line A2,Org line A2,V5 3rd,V30,Medium,...,Anne Claire,Anne Claire,Accepted-In Progress,Queued-Awaiting Assignment,514699,31,517465,517496,3,4
5,1-364285768,17.0,66644793.0,A2_4,A2_5,Org line A2,Org line A2,V30,V13 2nd 3rd,Medium,...,Anne Claire,Anne Claire,Queued-Awaiting Assignment,Accepted-In Progress,31,9,517496,517505,4,5
6,1-364285768,17.0,66644793.0,A2_5,A2_5,Org line A2,Org line A2,V13 2nd 3rd,V13 2nd 3rd,Medium,...,Anne Claire,Anne Claire,Accepted-In Progress,Completed-Resolved,9,4,517505,517509,5,6
7,1-364285768,17.0,66644793.0,A2_5,A2_4,Org line A2,Org line A2,V13 2nd 3rd,V30,Medium,...,Anne Claire,Anne Claire,Completed-Resolved,Queued-Awaiting Assignment,4,16,517509,517525,6,7
8,1-364285768,17.0,66644793.0,A2_4,A2_4,Org line A2,Org line A2,V30,V30,Medium,...,Anne Claire,Eric,Queued-Awaiting Assignment,Accepted-In Progress,16,158836,517525,676361,7,8
9,1-364285768,17.0,66644793.0,A2_4,A2_5,Org line A2,Org line A2,V30,V5 3rd,Medium,...,Eric,Eric,Accepted-In Progress,Queued-Awaiting Assignment,158836,72,676361,676433,8,9


In [26]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
vec.fit(all_dict)
vec.get_feature_names()

['Country_0=0',
 'Country_0=SE',
 'Country_0=au',
 'Country_0=be',
 'Country_0=br',
 'Country_0=ca',
 'Country_0=cl',
 'Country_0=cn',
 'Country_0=de',
 'Country_0=fr',
 'Country_0=gb',
 'Country_0=in',
 'Country_0=jp',
 'Country_0=kr',
 'Country_0=my',
 'Country_0=nl',
 'Country_0=pe',
 'Country_0=pl',
 'Country_0=ru',
 'Country_0=se',
 'Country_0=th',
 'Country_0=tr',
 'Country_0=us',
 'Country_1=0',
 'Country_1=?',
 'Country_1=SE',
 'Country_1=au',
 'Country_1=be',
 'Country_1=br',
 'Country_1=ca',
 'Country_1=cl',
 'Country_1=cn',
 'Country_1=de',
 'Country_1=fr',
 'Country_1=gb',
 'Country_1=in',
 'Country_1=jp',
 'Country_1=kr',
 'Country_1=my',
 'Country_1=nl',
 'Country_1=pe',
 'Country_1=pl',
 'Country_1=ru',
 'Country_1=se',
 'Country_1=th',
 'Country_1=tr',
 'Country_1=us',
 'Involved Org line 3_0=Org line A2',
 'Involved Org line 3_0=Org line B',
 'Involved Org line 3_0=Org line C',
 'Involved Org line 3_0=Org line D',
 'Involved Org line 3_0=Org line E',
 'Involved Org lin

In [27]:
train_onehot = vec.transform(train_dict)
train_onehot.shape

(38555, 5683)

In [28]:
val_onehot = vec.transform(val_dict)
test_onehot = vec.transform(test_dict)

In [29]:
Y_train.describe()

count    3.855500e+04
mean     1.314628e+06
std      3.494099e+06
min      0.000000e+00
25%      4.847800e+04
50%      6.531550e+05
75%      1.160618e+06
max      6.664479e+07
Name: remaining_time, dtype: float64

In [30]:
# Choose a threshold for the classification task
Y_threshold = 1036800
(Y_train < Y_threshold).describe()

count     38555
unique        2
top        True
freq      27421
Name: remaining_time, dtype: object

# Trains classifier and evaluates them

In [37]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50,min_samples_leaf=2,random_state=0)
clf.fit(train_onehot, Y_train < Y_threshold)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [38]:
from sklearn.metrics import classification_report
print(classification_report(Y_train < Y_threshold, clf.predict(train_onehot)))
print(classification_report(Y_val < Y_threshold, clf.predict(val_onehot)))
print(classification_report(Y_test < Y_threshold, clf.predict(test_onehot)))

             precision    recall  f1-score   support

      False       0.95      0.57      0.71     11134
       True       0.85      0.99      0.91     27421

avg / total       0.88      0.87      0.86     38555

             precision    recall  f1-score   support

      False       0.66      0.25      0.36      3873
       True       0.75      0.95      0.84      9349

avg / total       0.73      0.74      0.70     13222

             precision    recall  f1-score   support

      False       0.73      0.27      0.40      4206
       True       0.75      0.96      0.84      9550

avg / total       0.74      0.75      0.70     13756



In [39]:
# Now, we are going to build estimators for total_time and total_events

from sklearn.ensemble import RandomForestRegressor

time_regressor = RandomForestRegressor(n_estimators=50,min_samples_leaf=2,random_state=0,n_jobs=-1)
time_regressor.fit(train_onehot, X_train['total_time'])

adv_regressor = RandomForestRegressor(n_estimators=50,min_samples_leaf=2,random_state=0,n_jobs=-1)
adv_regressor.fit(train_onehot, X_train['total_events'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

# Creates output for computing reliability

In [119]:
def build_output(val_onehot, clf, time_regressor, adv_regressor, actual, adv, time):
    val_predict = clf.predict_proba(val_onehot)    
    time_pred_val = time_regressor.predict(val_onehot)
    adv_pred_val = adv_regressor.predict(val_onehot)  
    
    output_val = pd.DataFrame(data = {'est_time': time_pred_val, 'est_adv':adv_pred_val, 'proba1': val_predict.transpose()[1], 'prediction': clf.predict(val_onehot), 'actual': actual, 'adv':adv, 'time':time})
    output_val['proba_full'] = output_val['proba1']
    output_val.loc[output_val['prediction'] == 0, 'proba_full'] = (1 - output_val['proba1'])
    output_val['adv_perc'] = np.minimum(1, output_val['adv']/round(output_val['est_adv'],0))
    output_val['time_perc'] = np.minimum(1, output_val['time']/output_val['est_time'])
    
    return output_val

In [120]:
output_val = build_output(val_onehot, clf, time_regressor, adv_regressor, Y_val < Y_threshold, X_val['event_order_1']+1, X_val['time_from_start_1'])
output_test = build_output(test_onehot, clf, time_regressor, adv_regressor, Y_test < Y_threshold, X_test['event_order_1']+1, X_test['time_from_start_1'])
output_val

Unnamed: 0,actual,adv,est_adv,est_time,prediction,proba1,time,proba_full,adv_perc,time_perc
477,False,1,17.934700,1.882660e+07,False,0.310170,0,0.689830,0.055556,0.000000e+00
478,False,2,16.896732,1.796530e+07,False,0.266750,7,0.733250,0.117647,3.896399e-07
479,False,3,17.896699,1.776727e+07,False,0.176332,129,0.823668,0.166667,7.260540e-06
480,False,4,19.041647,1.703669e+07,False,0.132450,51774,0.867550,0.210526,3.038971e-03
481,False,5,17.398989,2.384922e+07,False,0.127220,52909,0.872780,0.294118,2.218479e-03
482,False,6,16.817060,2.371035e+07,False,0.141091,55907,0.858909,0.352941,2.357915e-03
483,False,7,20.636094,1.178091e+07,False,0.151620,56177,0.848380,0.333333,4.768475e-03
484,False,8,19.096023,2.016603e+07,False,0.187487,505796,0.812513,0.421053,2.508158e-02
485,False,9,14.355275,2.395269e+07,False,0.148598,21078539,0.851402,0.642857,8.800070e-01
486,True,10,14.631340,2.501784e+07,False,0.381396,25452844,0.618604,0.666667,1.000000e+00


# Computes error for reliability

In [349]:
from scipy.stats.stats import pearsonr


def errormetric(output_val, step):
    results = []
    count = []
    mid_point = []
    lower = np.round(np.arange(0.0, 1.0, step), 2)

    for i in range(len(lower)):
        if i < len(lower)-1:
            output_interval = output_val[(output_val['reliability'] >= lower[i]) & (output_val['reliability'] < lower[i+1])]
        else:
            output_interval = output_val[(output_val['reliability'] >= lower[i])]

        mid_point.append((lower[i]+step/2))
        correct = np.sum(output_interval['prediction'] == output_interval['actual'])
        incorrect = len(output_interval) - correct
        if correct + incorrect > 0:
            results.append(correct/(correct + incorrect))
        else:
            results.append(float('NaN'))
        count.append(len(output_interval))
        
    results = np.asarray(results)
    mid_point = np.asarray(mid_point)
    
    errors = abs(results - mid_point)
    nonweighted_error = np.nanmean(errors)
    total = sum(count)
    weighted_error = np.nansum((np.asarray(count)/total)*errors)

    corr,pvalue = pearsonr(results[~np.isnan(results)], mid_point[~np.isnan(results)])

    return {'results': results, 'count': count, 'mid_point': mid_point, 'nwe': nonweighted_error, 'we': weighted_error, 'corr': corr, 'pvalue': pvalue}


In [357]:
# Some tests for output_val

#output_val['reliability'] = np.exp(0.3 * (np.log(output_val.adv_perc))  + 0 * (np.log(output_val.time_perc)) + 0.7 * np.log(output_val.proba_full))

#output_val['reliability'] = 0.4 * np.minimum(0,(np.log(output_val.adv_perc)+1))  + 0 * (np.log(output_val.time_perc+1)) + 0.6 * output_val.proba_full
#output_val['reliability'] = np.cbrt(output_val.adv_perc * output_val.time_perc * np.log(output_val.proba_full+1))

#output_val['reliability'] = 0.5 * (np.exp(output_val.adv_perc)-1)/(np.exp(1)-1) + 0 * output_val.time_perc + 0.5 * output_val.proba_full
output_val['reliability'] = 0.3 * output_val.adv_perc + 0 * output_val.time_perc + 0.7 * output_val.proba_full


errormetric(output_val, 0.1), np.mean(output_val['reliability']), np.std(output_val['reliability'])


({'corr': 0.9796613937287924,
  'count': [0, 0, 0, 49, 1620, 3731, 3545, 1884, 1112, 1281],
  'mid_point': array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]),
  'nwe': 0.09783439152756232,
  'pvalue': 0.00011207490601894113,
  'results': array([       nan,        nan,        nan, 0.44897959, 0.50246914,
         0.61404449, 0.77658674, 0.91507431, 0.98471223, 0.99297424]),
  'we': 0.0978218121312963},
 0.6573696425341878,
 0.14648338301256972)

In [358]:
# Baseline for output_val

output_val['reliability'] = 0 * output_val.adv_perc + 0 * output_val.time_perc + 1 * output_val.proba_full
errormetric(output_val, 0.1), np.mean(output_val['reliability']), np.std(output_val['reliability'])

({'corr': 0.9792554959342935,
  'count': [0, 0, 0, 0, 0, 2784, 3509, 3108, 2341, 1480],
  'mid_point': array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]),
  'nwe': 0.05622727969503084,
  'pvalue': 0.0035754641757538286,
  'results': array([       nan,        nan,        nan,        nan,        nan,
         0.4996408 , 0.65346252, 0.83429858, 0.95044853, 0.99256757]),
  'we': 0.0538874602934503},
 0.7203526030278976,
 0.12913208708960577)

In [416]:
# Some tests for output_test

output_test['reliability'] = 0.2 * (np.log(output_test.adv_perc*10)/np.log(10)) + 0 * output_test.time_perc + 0.8 * output_test.proba_full
output_test['reliability'] = 0 * (np.log(output_test.adv_perc*10)/np.log(10)) + 0 * output_test.time_perc + 1 * output_test.proba_full

                                        
e = errormetric(output_test, 0.1)

In [419]:

e

{'corr': 0.9898278924028118,
 'count': [0, 0, 0, 0, 0, 489, 927, 903, 934, 3299],
 'mid_point': array([0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]),
 'nwe': 0.10872511626941787,
 'pvalue': 0.001229664150643457,
 'results': array([       nan,        nan,        nan,        nan,        nan,
        0.41513292, 0.54692557, 0.61461794, 0.72912206, 0.90057593]),
 'we': 0.08542429792429795}

# Searches all possible weights to check which one is better

In [561]:
from scipy.stats import iqr

def search_weights(output_val, weight_step=10, error_step=0.1, transform_adv=(lambda x: x), transform_time=(lambda x: x)):
    values = range(0, 110, weight_step)
    results = []
    
    for i in values:
        for j in values:
            for k in values:
                if i + j + k == 100:
                    output_val['reliability'] = i/100 * transform_adv(output_val['adv_perc']) + j/100 * transform_time(output_val['time_perc']) + k/100 * output_val['proba_full']
                    error = errormetric(output_val, 0.1)
                    results.append(np.concatenate(([i/100, j/100, k/100, error['nwe'], error['we'], error['corr'], error['pvalue'], np.mean(output_val['reliability']), np.std(output_val['reliability'])], error['count'], error['results'])))
                
    return results

def show_best_results(results):
    results = np.asarray(results)
    return results[np.argmin(results[:, 3])], results[np.argmin(results[:, 4])], results[np.argmin(results[:, 4])]

def show_statistically_significant(results):
    results = np.asarray(results)
    return results[(results[:,6] <= 0.05) & (results[:,5] > 0)]

def display_results(results):
    results_ds = pd.DataFrame(results, columns=['w_adv', 'w_time', 'w_prob', 'non-weighted error', 'weighted error', 'correlation', 'pvalue', 'mean', 'std']+['c'+str(i) for i in range(0, 100, 10)]+['r'+str(i) for i in range(0, 100, 10)])
    return results_ds

def store_results(results, path):
    results_ds = display_results(results)
    results_ds.to_csv(path)
    return results_ds

In [526]:
#output_val_false = output_val[output_val['prediction'] == False]
#results = search_weights(output_val, transform_time=(lambda x: np.log((x+0.00000001)*100000000)/np.log(100000000)))#, transform_adv=(lambda x: np.log(x*10)/np.log(10)))
#results = search_weights(output_val, transform_adv=(lambda x: np.log((x)*100)/np.log(100)))#, transform_adv=(lambda x: np.log(x*10)/np.log(10)))
results = search_weights(output_val, transform_time=(lambda x: np.log((x+0.00000001)*100000000)/np.log(100000000)), transform_adv=(lambda x: np.log(x*100)/np.log(100)))


results

[array([0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 5.62272797e-02,
        5.38874603e-02, 9.79255496e-01, 3.57546418e-03, 7.20352603e-01,
        1.29132087e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 2.78400000e+03, 3.50900000e+03,
        3.10800000e+03, 2.34100000e+03, 1.48000000e+03,            nan,
                   nan,            nan,            nan,            nan,
        4.99640805e-01, 6.53462525e-01, 8.34298584e-01, 9.50448526e-01,
        9.92567568e-01]),
 array([0.00000000e+00, 1.00000000e-01, 9.00000000e-01, 5.37131054e-02,
        5.89925881e-02, 9.80105008e-01, 5.89778740e-04, 7.18684326e-01,
        1.22200520e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 8.30000000e+01, 2.36300000e+03, 4.08400000e+03,
        3.24500000e+03, 2.13000000e+03, 1.31700000e+03,            nan,
                   nan,            nan,            nan, 4.45783133e-01,
        4.87515870e-01, 6.58667973e-01

In [527]:
output_val['adv_perc'].describe()

count    13222.000000
mean         0.510409
std          0.271136
min          0.024390
25%          0.285714
50%          0.500000
75%          0.708333
max          1.000000
Name: adv_perc, dtype: float64

In [528]:
np.log(0.5*100)/np.log(100)

0.8494850021680093

In [529]:
(0.0000001)*10000000

1.0

In [562]:
display_results(np.stack(show_best_results(results)))

Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.5,0.0,0.5,0.039088,0.049244,0.98388,0.000388,0.767046,0.112709,0.0,...,,,,,0.44086,0.532383,0.590846,0.704786,0.910036,0.993371
1,0.3,0.1,0.6,0.122707,0.039775,0.400582,0.373174,0.744102,0.117752,0.0,...,,,,1.0,0.445087,0.537183,0.604161,0.761537,0.941465,0.992376
2,0.3,0.1,0.6,0.122707,0.039775,0.400582,0.373174,0.744102,0.117752,0.0,...,,,,1.0,0.445087,0.537183,0.604161,0.761537,0.941465,0.992376


In [532]:
display_results(show_statistically_significant(results))

Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.0,0.0,1.0,0.056227,0.053887,0.979255,0.003575,0.720353,0.129132,0.0,...,,,,,,0.499641,0.653463,0.834299,0.950449,0.992568
1,0.0,0.1,0.9,0.053713,0.058993,0.980105,0.000590,0.718684,0.122201,0.0,...,,,,,0.445783,0.487516,0.658668,0.843760,0.961502,0.991648
2,0.0,0.2,0.8,0.043842,0.042845,0.985520,0.000313,0.717016,0.123568,0.0,...,,,,,0.469880,0.537660,0.649753,0.827776,0.961957,0.990854
3,0.0,0.3,0.7,0.073781,0.057435,0.965875,0.000406,0.715348,0.132978,0.0,...,,,,0.430000,0.600000,0.624669,0.619592,0.792586,0.949136,0.989668
4,0.0,0.4,0.6,0.091427,0.054712,0.840275,0.017934,0.713679,0.148913,0.0,...,,,,0.483146,0.773645,0.600287,0.622407,0.754830,0.913901,0.986586
5,0.0,0.5,0.5,0.135684,0.063931,0.705062,0.050789,0.712011,0.169545,0.0,...,,,0.485030,0.725683,0.823420,0.553191,0.662828,0.711259,0.862269,0.984309
6,0.0,0.6,0.4,0.147985,0.090924,0.468057,0.242144,0.710343,0.193375,0.0,...,,,0.617718,0.917391,0.488506,0.590564,0.687029,0.694828,0.803571,0.981073
7,0.0,0.7,0.3,0.178372,0.118106,0.620145,0.074805,0.708675,0.219364,0.0,...,,0.483146,0.838095,0.600000,0.580986,0.636719,0.698392,0.715832,0.730594,0.964439
8,0.0,0.8,0.2,0.181254,0.146733,0.490743,0.216915,0.707006,0.246830,0.0,...,,0.754467,,0.604167,0.622120,0.669725,0.717636,0.721058,0.682884,0.914143
9,0.0,0.9,0.1,0.270724,0.177099,-0.104002,0.790030,0.705338,0.275332,1511.0,...,0.754467,,1.000000,0.602151,0.675541,0.684169,0.723967,0.721566,0.676443,0.855772


In [563]:
results_test = search_weights(output_test)
display_results(np.stack(show_best_results(results_test)))

Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.2,0.0,0.8,0.019619,0.018068,0.995974,2.4e-05,0.759588,0.159394,0.0,...,,,,,0.420881,0.560122,0.623903,0.726754,0.867474,0.961658
1,0.2,0.0,0.8,0.019619,0.018068,0.995974,2.4e-05,0.759588,0.159394,0.0,...,,,,,0.420881,0.560122,0.623903,0.726754,0.867474,0.961658
2,0.2,0.0,0.8,0.019619,0.018068,0.995974,2.4e-05,0.759588,0.159394,0.0,...,,,,,0.420881,0.560122,0.623903,0.726754,0.867474,0.961658


In [50]:
display_results(show_statistically_significant(results_test))

Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue
0,0.0,0.0,1.0,0.024747,0.024535,0.993003,7.018220e-04
1,0.0,0.1,0.9,0.055906,0.053984,0.989188,1.747149e-04
2,0.0,0.2,0.8,0.098652,0.102145,0.976536,8.193672e-04
3,0.0,0.3,0.7,0.138798,0.152842,0.976394,1.623731e-04
4,0.0,0.4,0.6,0.160344,0.202472,0.954141,8.438631e-04
5,0.0,0.5,0.5,0.189589,0.254660,0.933389,7.024594e-04
6,0.0,0.6,0.4,0.195886,0.301999,0.883907,3.578963e-03
7,0.0,0.7,0.3,0.225660,0.355452,0.898959,9.768575e-04
8,0.0,0.8,0.2,0.230550,0.402573,0.908935,6.857597e-04
9,0.0,0.9,0.1,0.270379,0.464038,0.877753,8.409280e-04


In [568]:
from sklearn.dummy import DummyRegressor

event_constant = DummyRegressor(strategy='quantile', quantile=0.90)
time_constant = DummyRegressor(strategy='quantile', quantile=0.90)
event_constant.fit(train_onehot, X_train['total_events'])
time_constant.fit(train_onehot, X_train['total_time'])

output_val_dummy = build_output(val_onehot, clf, time_constant, event_constant, Y_val < Y_threshold, X_val['event_order_1']+1, X_val['time_from_start_1']) 

results_dummy = search_weights(output_val_dummy)
show_best_results(results_dummy)

(array([0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.51127268e-02,
        4.86235063e-02, 9.65913440e-01, 7.51580554e-03, 8.88495620e-01,
        1.35319559e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.65000000e+02, 7.64000000e+02,
        1.07600000e+03, 1.93900000e+03, 9.07800000e+03,            nan,
                   nan,            nan,            nan,            nan,
        5.12328767e-01, 7.17277487e-01, 8.52230483e-01, 9.35018051e-01,
        9.83366380e-01]),
 array([0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.51127268e-02,
        4.86235063e-02, 9.65913440e-01, 7.51580554e-03, 8.88495620e-01,
        1.35319559e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.65000000e+02, 7.64000000e+02,
        1.07600000e+03, 1.93900000e+03, 9.07800000e+03,            nan,
                   nan,            nan,            nan,            nan,
        5.12328767e-01, 7.17277487e-01

In [52]:
store_results(show_statistically_significant(results_dummy), 'remaining-val-dummy.csv')

Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue
0,0.0,0.0,1.0,0.033494,0.033588,0.993099,0.000687
1,0.0,0.1,0.9,0.059564,0.060142,0.98655,0.00027
2,0.0,0.2,0.8,0.106753,0.120973,0.962189,0.002117
3,0.0,0.3,0.7,0.141908,0.186333,0.92485,0.002855
4,0.1,0.0,0.9,0.049738,0.04609,0.985008,0.000335
5,0.1,0.1,0.8,0.095788,0.106671,0.974866,0.00094
6,0.1,0.2,0.7,0.136789,0.170852,0.950856,0.001001
7,0.1,0.3,0.6,0.151179,0.234987,0.827116,0.021696
8,0.2,0.0,0.8,0.078859,0.091718,0.975615,0.000885
9,0.2,0.1,0.7,0.121758,0.155945,0.944914,0.001328


In [570]:
from sklearn.model_selection import GroupKFold

total_events_time = ['total_events', 'total_time', 'id']
all_dict = incidents_X.drop(total_events_time, axis=1).to_dict('record')
vec = DictVectorizer()
vec.fit(all_dict)

group_kfold = GroupKFold(n_splits=10)
i = 0

allresults = []
advtrans_results = []
timetrans_results = []
both_results = []


for train_index, test_index in group_kfold.split(incidents_X, incidents_Y, incidents_group):
    X_train, X_test = safe_indexing(incidents_X, train_index), safe_indexing(incidents_X, test_index)
    Y_train, Y_test = safe_indexing(incidents_Y, train_index), safe_indexing(incidents_Y, test_index)

    # Converts them into dictionaries in order to use one-hot-encoding
    train_dict = X_train.drop(total_events_time, axis=1).to_dict('record')
    test_dict = X_test.drop(total_events_time, axis=1).to_dict('record')
    
    train_onehot = vec.transform(train_dict)
    test_onehot = vec.transform(test_dict)
    print('Iteration ' + str(i)+' starting building models')
    clf = RandomForestClassifier(random_state=0)
    clf.fit(train_onehot, Y_train < Y_threshold)
    print('Iteration ' + str(i)+' classifier ready')

    time_regressor = RandomForestRegressor(random_state=0,n_jobs=-1)
    time_regressor.fit(train_onehot, X_train['total_time'])
    print('Iteration ' + str(i)+' time regressor ready')

    adv_regressor = RandomForestRegressor(random_state=0,n_jobs=-1)
    adv_regressor.fit(train_onehot, X_train['total_events'])
    print('Iteration ' + str(i)+' adv regressor ready')

    output_test = build_output(test_onehot, clf, time_regressor, adv_regressor, Y_test < Y_threshold, X_test['event_order_1']+1, X_test['time_from_start_1'])
    print('Iteration ' + str(i)+' finished building models')

    transform_adv=(lambda x: np.log(x*100)/np.log(100))
    transform_time=(lambda x: np.log((x+0.0000001)*10000000)/np.log(10000000))
    
    results = search_weights(output_test)
    df = store_results(results, 'bpi2013-normal-'+str(i)+'.csv')
    allresults.append(df)
    
    results = search_weights(output_test,  transform_adv = transform_adv)
    df = store_results(results, 'bpi2013-adv-'+str(i)+'.csv')
    advtrans_results.append(df)

    results = search_weights(output_test, transform_time = transform_time )
    df = store_results(results, 'bpi2013-time-'+str(i)+'.csv')
    timetrans_results.append(df)

    results = search_weights(output_test, transform_time=transform_time, transform_adv=transform_adv)
    df = store_results(results, 'bpi2013-both-'+str(i)+'.csv')
    both_results.append(df)

    i = i + 1

for res, name in [(allresults, 'normal'), (advtrans_results, 'adv'), (timetrans_results, 'time'), (both_results, 'both')]:
    current = pd.concat(res)
    current = current.groupby(current.index).mean()        

    current.to_csv("bpi2013-"+name+".csv")




Iteration 0 starting building models
Iteration 0 classifier ready
Iteration 0 time regressor ready
Iteration 0 adv regressor ready
Iteration 0 finished building models
Iteration 1 starting building models
Iteration 1 classifier ready
Iteration 1 time regressor ready
Iteration 1 adv regressor ready
Iteration 1 finished building models
Iteration 2 starting building models
Iteration 2 classifier ready
Iteration 2 time regressor ready
Iteration 2 adv regressor ready
Iteration 2 finished building models
Iteration 3 starting building models
Iteration 3 classifier ready
Iteration 3 time regressor ready
Iteration 3 adv regressor ready
Iteration 3 finished building models
Iteration 4 starting building models
Iteration 4 classifier ready
Iteration 4 time regressor ready
Iteration 4 adv regressor ready
Iteration 4 finished building models
Iteration 5 starting building models
Iteration 5 classifier ready
Iteration 5 time regressor ready
Iteration 5 adv regressor ready
Iteration 5 finished building

In [571]:
for name in ('normal', 'adv', 'time', 'both'):
    print(name)
    normal = pd.read_csv("bpi2013-"+name+".csv", index_col=0)
    display(display_results(np.stack(show_best_results(np.asarray(normal)))))

normal


Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911
1,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911
2,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911


adv


Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.1,0.1,0.8,0.026374,0.023973,0.984353,0.000478,0.762311,0.153359,0.0,...,,,,,0.485168,0.54838,0.614281,0.719138,0.871389,0.961925
1,0.1,0.1,0.8,0.026374,0.023973,0.984353,0.000478,0.762311,0.153359,0.0,...,,,,,0.485168,0.54838,0.614281,0.719138,0.871389,0.961925
2,0.1,0.1,0.8,0.026374,0.023973,0.984353,0.000478,0.762311,0.153359,0.0,...,,,,,0.485168,0.54838,0.614281,0.719138,0.871389,0.961925


time


Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911
1,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911
2,0.2,0.0,0.8,0.024304,0.020065,0.985735,0.000532,0.761694,0.159049,0.0,...,,,,,0.489174,0.548882,0.62246,0.717316,0.857892,0.956911


both


Unnamed: 0,w_adv,w_time,w_prob,non-weighted error,weighted error,correlation,pvalue,mean,std,c0,...,r0,r10,r20,r30,r40,r50,r60,r70,r80,r90
0,0.0,0.2,0.8,0.04333,0.037837,0.968852,0.001725,0.790942,0.15016,0.0,...,,,,,0.514098,0.529185,0.600708,0.68656,0.811853,0.93083
1,0.0,0.3,0.7,0.061214,0.036106,0.937058,0.002242,0.77872,0.153351,0.0,...,,,,0.499019,0.563378,0.578086,0.607799,0.71057,0.810677,0.933426
2,0.0,0.3,0.7,0.061214,0.036106,0.937058,0.002242,0.77872,0.153351,0.0,...,,,,0.499019,0.563378,0.578086,0.607799,0.71057,0.810677,0.933426


In [569]:
def build_output_true(val_onehot, clf, time_pred_val, adv_pred_val, actual, adv, time):
    val_predict = clf.predict_proba(val_onehot)    
    
    output_val = pd.DataFrame(data = {'est_time': time_pred_val, 'est_adv':adv_pred_val, 'proba1': val_predict.transpose()[1], 'prediction': clf.predict(val_onehot), 'actual': actual, 'adv':adv, 'time':time})
    output_val['proba_full'] = output_val['proba1']
    output_val.loc[output_val['prediction'] == 0, 'proba_full'] = (1 - output_val['proba1'])
    output_val['adv_perc'] = np.minimum(1, output_val['adv']/round(output_val['est_adv'],0))
    output_val['time_perc'] = np.minimum(1, output_val['time']/output_val['est_time'])
    
    return output_val

output_val_true = build_output_true(val_onehot, clf, X_val['total_time'], X_val['total_events'], Y_val < Y_threshold, X_val['event_order_1']+1, X_val['time_from_start_1'])
results_true = search_weights(output_val_true)
store_results(results_true, 'bpi2013-true.csv')
show_best_results(results_true)


(array([0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.51127268e-02,
        4.86235063e-02, 9.65913440e-01, 7.51580554e-03, 8.88495620e-01,
        1.35319559e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.65000000e+02, 7.64000000e+02,
        1.07600000e+03, 1.93900000e+03, 9.07800000e+03,            nan,
                   nan,            nan,            nan,            nan,
        5.12328767e-01, 7.17277487e-01, 8.52230483e-01, 9.35018051e-01,
        9.83366380e-01]),
 array([0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 6.51127268e-02,
        4.86235063e-02, 9.65913440e-01, 7.51580554e-03, 8.88495620e-01,
        1.35319559e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 3.65000000e+02, 7.64000000e+02,
        1.07600000e+03, 1.93900000e+03, 9.07800000e+03,            nan,
                   nan,            nan,            nan,            nan,
        5.12328767e-01, 7.17277487e-01