In [2]:
import pandas as pd
from collections import Counter
import numpy as np

In [3]:
base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",
                        usecols=[ 'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
                                  'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
                                  'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
                                  'service_detail', 'xps_info'])
base_df.rename(columns={'URL_FILE':'Activity'}, inplace=True)
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])

  base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",


In [4]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  df = df.copy()
  #Create consecutive column with the consecutive activity and count how many time each pair occurs
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)

  #Count all the pairs and safe in new DataFrame
  df_for_pivot_1 = pd.DataFrame(Counter(list(zip(df['Activity'], df['Consecutive_1'],))), index=["value"]).T.reset_index()

  df_for_pivot_1.fillna("end_session", inplace=True)
  return df_for_pivot_1
  
def make_pivot(df, index_names, column_names):
  df = df.copy()
  """
  Functions that takes in a DataFrame and returns a pivot table with all the chances
  
  Create pivot table where chances are calculated that each row is succeded by activity that is represented in the column
  """
  df_chances = df.pivot_table(index=index_names, columns=column_names, values='value')
  
  #Calculate what the probabilities are by summing the row and dividing all the values in the row by total sum of the row
  df_chances["total_row_count"] = df_chances.sum(axis=1)
  df_chances = df_chances.div(df_chances["total_row_count"], axis=0)
  
  df_chances.fillna(0, inplace=True)
  df_chances.drop("total_row_count", axis=1, inplace=True)

  return df_chances

  
# Create df for pivot
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

#Calculate the probability of each activity that it is the first activity performed in the session
df_chances_1["start_session_chance"] = base_df.groupby("SessionID").nth(0)["Activity"].value_counts() / base_df.groupby("SessionID").nth(0)["Activity"].value_counts().sum()
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1 
df_chances_total

#Split the start probabilities from the normal DataFrame
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

#Create df with all chances except the starting chance.
final_df = df_chances_total.iloc[:, :-1]
final_df
probability_matrix = final_df.copy()


In [5]:
def find_paths_of_len_3(_df:pd.DataFrame):
  df = _df.copy()
  col = df.columns.values
  row  = df.index.values
  s  = row.size
  res = []
  while s > 0:
    current_row = df.iloc[row.size - s]
    i = 0
    while i < current_row.size:
      current_col = df.columns[i]
      chance = current_row.iloc[i]
      if chance > 0:
        a = current_row.name
        b = current_col
        
        if (df.index == b).any() :
          _r = df.loc[b]
          index = 0
          for r in _r:
            if r > 0:
              res.append({'a':a,'b':b,'c':col.item(index),"a-b-c":r*chance,"a-b":chance,"b-c":r})  
            index += 1        
            
      i += 1
    s -= 1

  return res

In [6]:
res = find_paths_of_len_3(probability_matrix)
res.sort(key=lambda x: x['a-b-c'], reverse=True)
len(res)

1362220

In [7]:
# transform to dataframe
paths_df = pd.DataFrame(res)
# remove rows with a-b == 1
paths_df = paths_df[paths_df['a-b'] != 1]
# remove rows where a == b and where b == c and where a == c
paths_df = paths_df[paths_df['a'] != paths_df['b']]
paths_df = paths_df[paths_df['b'] != paths_df['c']]
paths_df = paths_df[paths_df['a'] != paths_df['c']]
# remove a row if the activity in a occurs only once in column a
paths_df = paths_df[paths_df['a'] != paths_df['a'].value_counts().index[0]]
# remove a row if the activity in c occurs only once in column c
paths_df = paths_df[paths_df['c'] != paths_df['c'].value_counts().index[0]]

In [8]:
paths_df

Unnamed: 0,a,b,c,a-b-c,a-b,b-c
65,/werk_nl/werkgever/meerweten/ontslag/ontslagpr...,/werk_nl/werkgever/meerweten/ontslag/ontslagve...,/werk_nl/werkgever/home,5.000000e-01,0.500000,1.000000
75,/werk_nl/werknemer/solliciteren/europa/land/no...,/werk_nl/werknemer/solliciteren/europa/land/le...,/werk_nl/werknemer/solliciteren/europa/land/zw...,5.000000e-01,0.500000,1.000000
76,/werk_nl/werknemer/solliciteren/europa/land/oo...,/werk_nl/werknemer/solliciteren/europa/land/ts...,/werk_nl/werknemer/solliciteren/europa/land/sl...,5.000000e-01,0.500000,1.000000
80,/xpsimage/wdo211383,/xpsimage/wdo_014540,/xpsimage/wdo_011751,5.000000e-01,0.500000,1.000000
81,/xpsimage/wdo_011538,/xpsimage/wdo_011537,/xpsimage/wdo_013846,5.000000e-01,0.500000,1.000000
...,...,...,...,...,...,...
1362215,/portal/page/portal/home/diensten/aanvragen-ww,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/xpsimage/wdo_011687,4.785496e-11,0.000028,0.000002
1362216,/portal/page/portal/home/diensten/aanvragen-ww,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/xpsimage/wdo_012890,4.785496e-11,0.000028,0.000002
1362217,/portal/page/portal/home/diensten/aanvragen-ww,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/xpsitem/ptl1716476,4.785496e-11,0.000028,0.000002
1362218,/portal/page/portal/home/diensten/aanvragen-ww,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/xpsitem/wdo_008957,4.785496e-11,0.000028,0.000002


In [44]:
sorted = paths_df.sort_values(by=['a-b-c','a','c'], ascending=False)
tester = sorted.copy()

In [46]:
# create column that counts how many times the combination of the values in columns a and c occurs
tester = tester.sort_values(by=['a','c'], ascending=False)
tester


Unnamed: 0,a,b,c,a-b-c,a-b,b-c
741365,/zoeken_portlet/ajax/zoekBeroep,/werk_nl/werkgever/cvs_zoeken,/zoeken_portlet_wg/ajax/selectVacature,7.070286e-07,0.000284,0.002488
601066,/zoeken_portlet/ajax/zoekBeroep,/werk_nl/werkgever/cvs_zoeken,/zoeken_portlet_wg/ajax/selectContactpersoon,1.767571e-06,0.000284,0.006219
27695,/zoeken_portlet/ajax/zoekBeroep,/werk_nl/werknemer/vacatures,/zoeken_portlet/ajax/zoekAantalIndicatief,3.049503e-03,0.039839,0.076546
272496,/zoeken_portlet/ajax/zoekBeroep,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/zoeken_portlet/ajax/zoekAantalIndicatief,3.176599e-05,0.010374,0.003062
344404,/zoeken_portlet/ajax/zoekBeroep,/werk_nl/werkgever/cvs_zoeken,/zoeken_portlet/ajax/zoekAantalIndicatief,1.484760e-05,0.000284,0.052239
...,...,...,...,...,...,...
1061762,/,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,//portal/page/portal/werk_nl/werknemer/sollici...,1.178980e-07,0.016839,0.000007
1182578,/,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,//portal/page/portal/werk_nl/werknemer/sollici...,5.024011e-08,0.047927,0.000001
714819,/,/werk_nl/werknemer/home,//,8.335844e-07,0.161917,0.000005
1080732,/,/werk_nl/werknemer/mijn_werkmap/doorgeven/wijz...,//,1.046991e-07,0.006477,0.000016


In [10]:
a_b_c = 0.1
to_be_replaced = sorted[sorted['a-b-c'] > a_b_c]
to_be_replaced

Unnamed: 0,a,b,c,a-b-c,a-b,b-c
81,/xpsimage/wdo_011538,/xpsimage/wdo_011537,/xpsimage/wdo_013846,0.500000,0.500000,1.000000
80,/xpsimage/wdo211383,/xpsimage/wdo_014540,/xpsimage/wdo_011751,0.500000,0.500000,1.000000
76,/werk_nl/werknemer/solliciteren/europa/land/oo...,/werk_nl/werknemer/solliciteren/europa/land/ts...,/werk_nl/werknemer/solliciteren/europa/land/sl...,0.500000,0.500000,1.000000
75,/werk_nl/werknemer/solliciteren/europa/land/no...,/werk_nl/werknemer/solliciteren/europa/land/le...,/werk_nl/werknemer/solliciteren/europa/land/zw...,0.500000,0.500000,1.000000
65,/werk_nl/werkgever/meerweten/ontslag/ontslagpr...,/werk_nl/werkgever/meerweten/ontslag/ontslagve...,/werk_nl/werkgever/home,0.500000,0.500000,1.000000
...,...,...,...,...,...,...
810,/werk_nl/werkgever/meerweten/arbeidsrecht/loon,/xpsimage/wdo211830,/werk_nl/werknemer/mijn_werkmap,0.104317,0.333333,0.312950
813,/xdocs/css/AMI/PIE.htc,/werk_nl/arbeidsmarktinformatie/sector-beroep/...,/werk_nl/arbeidsmarktinformatie/sector-beroep/...,0.104167,0.333333,0.312500
812,/werk_nl/arbeidsmarktinformatie/sector-beroep/...,/werk_nl/arbeidsmarktinformatie/publicaties,/werk_nl/werknemer/home,0.104167,0.333333,0.312500
821,/portal/page/portal/werk_nl/werknemer/sollicit...,/portal/page/portal/werk_nl/werknemer/sollicit...,/xpsimage/WDO211980,0.102691,0.331522,0.309756


In [11]:
# for each row in to_be_replaced
# count how many rows in sorted have the same a and c
def count(a, c):
  return len(sorted[(sorted['a'] == a) & (sorted['c'] == c)])

to_be_replaced = to_be_replaced.assign(count=sorted.apply(lambda x: count(x['a'], x['c']), axis=1))
# remove rows where count == 1
to_be_replaced = to_be_replaced[to_be_replaced['count'] != 1]
# create a new column where all possibilities for b are saved in a list
to_be_replaced = to_be_replaced.assign(b_list=to_be_replaced.apply(lambda x: sorted[(sorted['a'] == x['a']) & (sorted['c'] == x['c'])]['b'].tolist(), axis=1))


KeyboardInterrupt: 

In [119]:
to_be_replaced.sort_values(by=['count'], ascending=False)
# remove rows where count > 2
to_be_replaced = to_be_replaced[to_be_replaced['count'] <= 2]
# remove the value from b_list that is the same as b
to_be_replaced = to_be_replaced.assign(b_list=to_be_replaced.apply(lambda x: [i for i in x['b_list'] if i != x['b']], axis=1))
to_be_replaced

Unnamed: 0,a,b,c,a-b-c,a-b,b-c,count,b_list,b_list_len
65,/werk_nl/werkgever/meerweten/ontslag/ontslagpr...,/werk_nl/werkgever/meerweten/ontslag/ontslagve...,/werk_nl/werkgever/home,0.5,0.5,1.0,2,[/werk_nl/werknemer/uitkering-aanvragen/ww],2
149,/werk_nl/werknemer/contact/werkplein/routebesc...,/xpsitem/ptl1716628,/werk_nl/werknemer/home,0.333333,0.333333,1.0,2,[/werk_nl/werknemer/ontslag/wederzijds-goedvin...,2
136,/portal/page/portal/werk_nl/werknemer/ontslag/...,/werk_nl/werkgever/meerweten/ontslag/redenen/b...,/xpsimage/wdo211832,0.333333,0.333333,1.0,2,[/werk_nl/werknemer/mijn_werkmap],2
248,/werk_nl/werkgever/wervingsadvies/werkgeversse...,/werk_nl/werkgever/wervingsadvies/werkgeversse...,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,0.25,0.5,0.5,2,[/werk_nl/werknemer/home],2
349,/werk_nl/arbeidsmarktinformatie/publicaties/th...,/werk_nl/arbeidsmarktinformatie/publicaties/th...,/werk_nl/werknemer/home,0.2,0.2,1.0,2,[/werk_nl/arbeidsmarktinformatie/home],2
379,/werk_nl/werkgever/over,/werk_nl/werknemer/werkmap,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,0.1839,0.5,0.3678,2,[/werk_nl/werkgever/home],2
437,/werk_nl/werkgever/direct_naar/WerkgeversServi...,/werk_nl/werkgever/direct_naar/formulieren_dow...,/werk_nl/werkgever/home,0.166667,0.5,0.333333,2,[/werk_nl/werkgever/meerweten/werving],2
435,/werk_nl/arbeidsmarktinformatie/sector/Zorg,/werk_nl/arbeidsmarktinformatie/sector/welzijn...,/werk_nl/arbeidsmarktinformatie/publicaties,0.166667,0.333333,0.5,2,[/werk_nl/arbeidsmarktinformatie/regio],2
434,/werk_nl/arbeidsmarktinformatie/sector-beroep/...,/werk_nl/arbeidsmarktinformatie/sector-beroep/...,/werk_nl/werknemer/home,0.166667,0.333333,0.5,2,[/werk_nl/arbeidsmarktinformatie/home],2
426,/portal/page/portal/werk_nl/werknemer/ontslag/...,/werk_nl/werkgever/meerweten/ontslag/opzegtermijn,/werk_nl/werknemer/home,0.166667,0.333333,0.5,2,[/werk_nl/werknemer/ontslag],2
