In [1]:
import pandas as pd
from collections import Counter
import numpy as np

# Load the click log. The file is ';'-separated; undecodable bytes are ignored,
# malformed rows are skipped, and only the columns listed below are read.
df = pd.read_csv(
    "BPI2016_Clicks_Logged_In.csv",
    sep=";",
    encoding_errors="ignore",
    on_bad_lines="skip",
    usecols=[
        "CustomerID", "AgeCategory", "Gender", "Office_U", "Office_W",
        "SessionID", "IPID", "TIMESTAMP", "VHOST", "URL_FILE", "PAGE_NAME",
        "REF_URL_category", "page_load_error", "page_action_detail", "tip",
        "service_detail", "xps_info",
    ],
)

  exec(code_obj, self.user_global_ns, self.user_ns)


Convert the TIMESTAMP column to datetime and sort the DataFrame by SessionID and TIMESTAMP.

In [2]:
# Parse TIMESTAMP into datetimes, then order the rows by session and, within a
# session, by time, so that consecutive rows are consecutive clicks.
df = (
    df.assign(TIMESTAMP=pd.to_datetime(df["TIMESTAMP"], infer_datetime_format=True))
    .sort_values(by=["SessionID", "TIMESTAMP"])
)

In [3]:
# For every click, store the URL of the next click in the same session.
# The last click of a session has no successor and gets NaN.
df["Consecutive_1"] = df.groupby("SessionID")["URL_FILE"].shift(-1)

# Count how often each (current URL, next URL) pair occurs and lay the counts out
# as one row per pair: columns level_0 (current), level_1 (next) and value (count).
# Missing entries (e.g. the NaN successor of a session's last click) become "end_session".
df_for_pivot_1 = (
    pd.DataFrame(Counter(zip(df["URL_FILE"], df["Consecutive_1"])), index=["value"])
    .T.reset_index()
    .fillna("end_session")
)
df_for_pivot_1

Unnamed: 0,level_0,level_1,value
0,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,123143
1,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,2442
2,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,21692
3,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,241943
4,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,end_session,53821
...,...,...,...
15942,/xpsimage/wdo215419,/xpsimage/wdo215419,3
15943,/xpsimage/wdo213133,/xpsimage/wdo215419,1
15944,/xpsimage/wdo215419,/xpsimage/wdo211812,1
15945,/portal/page/portal/werk_nl/werknemer/sollicit...,/werk_nl/werknemer/solliciteren/solliciteren-w...,1


In [4]:
def make_pivot(df, index_names, column_names):
    """
    Build a row-normalised transition table from pair counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``value`` column holding the count of each pair, plus
        the column(s) named by ``index_names`` and ``column_names``.
    index_names : str or list of str
        Column(s) whose values become the rows (the current activity).
    column_names : str or list of str
        Column(s) whose values become the columns (the activity that follows).

    Returns
    -------
    pandas.DataFrame
        One row per index value and one column per column value. Each cell is the
        count of that pair divided by the total count of its row, i.e. the
        probability that the row activity is succeeded by the column activity.
        Pairs that never occur get 0.
    """
    # Counts of a repeated (index, column) pair must be added together; the
    # pivot_table default (mean) would average them and distort the probabilities.
    pair_counts = df.pivot_table(
        index=index_names, columns=column_names, values='value', aggfunc='sum'
    )

    # Divide every row by its own total so that each row sums to 1.
    row_totals = pair_counts.sum(axis=1)
    probabilities = pair_counts.div(row_totals, axis=0)

    # Combinations absent from the input are NaN after pivoting -> probability 0.
    return probabilities.fillna(0)

In [5]:
# Transition probabilities: rows are the current URL, columns the URL that follows
df_chances_1 = make_pivot(df_for_pivot_1, "level_0", "level_1")

# Probability that a URL is the first one visited in a session: count the first
# URL of every session and divide by the total. The assignment aligns on the row labels.
first_page_counts = df.groupby("SessionID").nth(0)["URL_FILE"].value_counts()
df_chances_1["start_session_chance"] = first_page_counts / first_page_counts.sum()

# URLs that never open a session have no entry in the counts -> probability 0
df_chances_1 = df_chances_1.fillna(0)

In [6]:
# Same object as df_chances_1 (an alias, not a copy)
df_chances_total = df_chances_1

# Start probabilities: keep only the URLs that actually open at least one session
start_chances = df_chances_total.loc[
    df_chances_total["start_session_chance"] > 0, "start_session_chance"
]

# Transition probabilities without the start column. The column is removed by name
# rather than by position, and drop() returns an independent frame, so later edits
# to df_chances_rest neither touch df_chances_total nor raise SettingWithCopyWarning.
df_chances_rest = df_chances_total.drop(columns="start_session_chance")

In [7]:
# Show the transition table, including the start_session_chance column
df_chances_total

level_1,/,//,//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,//werk_nl/werknemer/contact,//werk_nl/werknemer/contact/email_uw_vraag,//werk_nl/werknemer/contact/vestiging-zoeken,//werk_nl/werknemer/home,//werk_nl/werknemer/over-werk-nl/handleiding,//werk_nl/werknemer/over-werk-nl/handleiding/cv-plaatsen,//werk_nl/werknemer/over-werk-nl/handleiding/sollicitatie-wijziging,...,/xpsitem/wdo_013407,/xpsitem/wdo_013827,/xpsitem/wdo_014041,/zoeken_portlet/ajax/addVacatureToGereageerd,/zoeken_portlet/ajax/zoekAantalIndicatief,/zoeken_portlet/ajax/zoekBeroep,/zoeken_portlet_wg/ajax/selectContactpersoon,/zoeken_portlet_wg/ajax/selectVacature,end_session,start_session_chance
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.073834,0.000850
//,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.035714,0.000035
//portal/page/portal/werk_nl/werknemer/solliciteren/solliciteren-werk-zoeken/sollicitatiebrief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000003
//werk_nl/werknemer/contact,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
//werk_nl/werknemer/contact/email_uw_vraag,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.058824,0.000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/zoeken_portlet/ajax/addVacatureToGereageerd,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017588,0.002513,0.005025,0.0,0.0,0.055276,0.000002
/zoeken_portlet/ajax/zoekAantalIndicatief,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000107,0.537184,0.051269,0.0,0.0,0.007539,0.000161
/zoeken_portlet/ajax/zoekBeroep,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.441592,0.489910,0.0,0.0,0.003979,0.000097
/zoeken_portlet_wg/ajax/selectContactpersoon,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000


In [8]:
# URL_FILE values used as row/column labels of the transition table when the
# probabilities between specific pages are adjusted.
url_diensten_overzicht = "/portal/page/portal/home/diensten/overzicht"
url_werknemer_home = "/werk_nl/werknemer/home"
url_aanvragen_ww = "/portal/page/portal/home/diensten/aanvragen-ww"
# NOTE(review): url_image is not referenced in the visible part of the notebook - confirm it is still needed
url_image = "/xpsimage/wdo212395"

In [9]:
def change_probability(prob_matrix, url_1, url_2, increase):
    """
    Raise the transition probability url_1 -> url_2 and renormalise that row.

    ``increase`` is added to the cell (url_1, url_2), after which the whole url_1
    row is divided by its new sum so that it sums to 1 again. The old and new
    values of the cell are printed.

    Parameters
    ----------
    prob_matrix : pandas.DataFrame
        Transition table, rows = current URL, columns = next URL.
    url_1 : label
        Row label (source URL).
    url_2 : label
        Column label (target URL).
    increase : float
        Amount added to the cell before renormalising.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``prob_matrix`` itself is left untouched.

    Raises
    ------
    KeyError
        If ``url_1`` is not a row label or ``url_2`` is not a column label.
    """
    print(url_1)
    print(url_2)
    original_value = prob_matrix.loc[url_1, url_2]
    print("Original probability is: ", original_value)

    # Work on a copy: writing into the caller's frame mutates shared state and,
    # when that frame is itself a slice, triggers SettingWithCopyWarning.
    adjusted = prob_matrix.copy()
    adjusted.loc[url_1, url_2] += increase

    # Renormalise the row so its probabilities sum to 1 again
    adjusted.loc[url_1, :] = adjusted.loc[url_1, :] / adjusted.loc[url_1, :].sum()

    new_value = adjusted.loc[url_1, url_2]
    print("New probability is: ", new_value)
    print()

    return adjusted

# Boost four transitions around the aanvragen-ww page, one (source, target) pair at a time.
# NOTE(review): each call renormalises the whole source row, so the second boost on the
# aanvragen-ww row scales the first boosted transition back down - confirm this is intended.
for source_url, target_url in [
    (url_aanvragen_ww, url_diensten_overzicht),
    (url_aanvragen_ww, url_werknemer_home),
    (url_diensten_overzicht, url_aanvragen_ww),
    (url_werknemer_home, url_aanvragen_ww),
]:
    df_chances_rest = change_probability(df_chances_rest, source_url, target_url, 25)

/portal/page/portal/home/diensten/aanvragen-ww
/portal/page/portal/home/diensten/overzicht
Original probability is:  0.010399780134866548
New probability is:  0.9619384530821102

/portal/page/portal/home/diensten/aanvragen-ww
/werk_nl/werknemer/home
Original probability is:  0.0009927021075617243
New probability is:  0.9615766423887523

/portal/page/portal/home/diensten/overzicht
/portal/page/portal/home/diensten/aanvragen-ww
Original probability is:  0.8371161548731643
New probability is:  0.9937352367258909

/werk_nl/werknemer/home
/portal/page/portal/home/diensten/aanvragen-ww
Original probability is:  0.0012063988907308066
New probability is:  0.9615848614957976



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [10]:
def transform_df_to_dict(df):
    """
    Convert a DataFrame into a nested dict ``{row label: {column label: value}}``.

    Only entries with a value greater than 0 are kept, so each inner dict lists
    just the columns that are positive for that row.
    """
    return {
        row_label: {column: value for column, value in row.items() if value > 0}
        for row_label, row in df.T.to_dict().items()
    }

# Per-URL lookup of the successors with a positive probability
df_dict = transform_df_to_dict(df_chances_rest)

In [11]:
def markov_chain(amount, start_chances, df_dict, target_url="/portal/page/portal/home/diensten/aanvragen-ww"):
    """
    Generate sessions with a first-order Markov chain, keeping those that visit ``target_url``.

    A session starts at a URL drawn from ``start_chances`` and repeatedly moves to a
    successor drawn from ``df_dict`` until "end_session" is drawn. Sessions that do not
    contain ``target_url`` are discarded. Progress is printed each time the number
    of kept sessions reaches a new multiple of 250.

    Parameters
    ----------
    amount : int
        Number of sessions to return.
    start_chances : pandas.Series
        Start probability per URL (index = URL); values must sum to 1.
    df_dict : dict
        ``{url: {next_url: probability}}``; every inner dict must sum to 1 and every
        reachable URL needs an entry.
    target_url : str, optional
        URL a session must contain to be kept.

    Returns
    -------
    list of list
        Exactly ``amount`` sessions, each a list of URLs ending in "end_session".

    Notes
    -----
    Uses the global ``np.random`` state, so results are only reproducible if the
    caller seeds it. The loop does not terminate if ``target_url`` is unreachable.
    """
    start_pages = np.array(start_chances.index)
    start_probs = np.array(start_chances.values)

    kept_sessions = []
    last_reported = None

    # '<' rather than '<=': the loop stops as soon as `amount` sessions are kept,
    # instead of generating one session too many.
    while len(kept_sessions) < amount:
        # Take the drawn element out of the 1-element array so the comparisons and
        # dict look-ups below operate on a plain scalar.
        activity = np.random.choice(a=start_pages, size=1, p=start_probs)[0]
        activity_list = [activity]

        while activity != "end_session":
            row_activity = df_dict[activity]
            activity = np.random.choice(
                a=list(row_activity.keys()), size=1, p=list(row_activity.values())
            )[0]
            activity_list.append(activity)

        if target_url in activity_list:
            kept_sessions.append(activity_list)

        # Report each multiple of 250 once, even while discarded sessions leave the count unchanged
        if len(kept_sessions) % 250 == 0 and len(kept_sessions) != last_reported:
            print(len(kept_sessions))
            last_reported = len(kept_sessions)

    return kept_sessions

# Generate the sessions; no random seed is set in the visible code, so the result differs per run
markov_result = markov_chain(2000, start_chances, df_dict)

0
250
500
750
1000
1250
1500
1750
2000


In [12]:
def markov_result_to_df(list_markov_result, name_index, name_csv):
    """
    Flatten generated sessions into a (SessionID, URL_FILE) table and write it to CSV.

    Parameters
    ----------
    list_markov_result : list of list
        Generated sessions; each session is a list of URLs. "end_session" entries
        are left out of the output.
    name_index : str
        Prefix of the generated session ids; the position of the session in
        ``list_markov_result`` is appended to it.
    name_csv : str
        Path of the CSV file to write. The directory is not created here.
    """
    # Read from the argument, not from a notebook-level variable, so the function
    # writes whatever result it is handed.
    rows = [
        (name_index + str(session_nbr), activity)
        for session_nbr, session in enumerate(list_markov_result)
        for activity in session
        if activity != "end_session"
    ]

    df_generated = pd.DataFrame(rows, columns=['SessionID', 'URL_FILE'])
    df_generated.to_csv(name_csv)
    
# Write the generated sessions to CSV; session ids get the prefix "generated_plus_25_".
# The target directory is not created by this call.
markov_result_to_df(markov_result, "generated_plus_25_", "generated_data/Pattern_3/generated_anomalies_plus_25_2k.csv")