In [1]:
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
# Load the logged-in click log, keep only the columns used below, expose the
# visited URL as "Activity", parse the timestamps and order the clicks
# chronologically within every session.
base_df = (
    pd.read_csv(
        "BPI2016_Clicks_Logged_In.csv",
        encoding_errors="ignore",
        on_bad_lines='skip',
        sep=";",
        usecols=[
            'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
            'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
            'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
            'service_detail', 'xps_info',
        ],
    )
    .rename(columns={'URL_FILE': 'Activity'})
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"], infer_datetime_format=True))
    .sort_values(["SessionID", "TIMESTAMP"])
)

  base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",


In [3]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each activity is directly followed by each other activity.

  The follow-up of a click is the next "Activity" inside the same "SessionID"
  (rows are taken in their current order). The last click of a session has no
  follow-up; that missing value is reported as "end_session".

  Returns a DataFrame with one row per observed pair and the columns
  "level_0" (activity), "level_1" (follow-up) and "value" (occurrences).
  The input frame is not modified.
  """
  # follow-up activity inside the same session (missing for the last click)
  follow_up = df.groupby("SessionID")["Activity"].shift(periods=-1)

  # tally every (activity, follow-up) pair
  pair_counts = Counter(zip(df["Activity"], follow_up))

  # one column per pair -> transpose so every pair becomes a row
  pairs_as_rows = pd.DataFrame(pair_counts, index=["value"]).T.reset_index()

  return pairs_as_rows.fillna("end_session")
  
def make_pivot(df, index_names, column_names):
  """
  Turn a long table of counts into a row-normalised probability table.

  Parameters
  ----------
  df : DataFrame with a numeric "value" column holding counts, plus the
       columns named by `index_names` and `column_names`.
  index_names : column name(s) that become the rows of the result.
  column_names : column name(s) that become the columns of the result.

  Returns
  -------
  DataFrame in which every cell is the share of its row total, i.e. the
  chance that the row entry is succeeded by the column entry. Combinations
  that never occur are 0.
  """
  # Counts must be added up, not averaged (pivot_table's default aggfunc is
  # 'mean'), so that several input rows for the same cell are all counted.
  counts = df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc='sum')

  # Keep the row totals in a separate Series instead of a helper column, so
  # they can never clash with a real column label.
  row_totals = counts.sum(axis=1)

  # Missing combinations (and rows whose total is 0) end up as NaN -> 0.
  return counts.div(row_totals, axis=0).fillna(0)

  
# Count the (activity, next activity) pairs and normalise them per activity
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Probability that an activity is the first one performed in a session:
# take the first row of every session once and normalise its value counts.
first_activity_per_session = base_df.groupby("SessionID").nth(0)["Activity"]
df_chances_1["start_session_chance"] = first_activity_per_session.value_counts(normalize=True)
# activities that never open a session have no entry -> chance 0
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1

# Split the start probabilities from the transition table
start_chances = df_chances_total.loc[df_chances_total["start_session_chance"] > 0, "start_session_chance"]

# All chances except the starting chance; the column is selected by name so
# the result does not depend on it being the last column.
final_df = df_chances_total.drop(columns="start_session_chance")
probability_matrix = final_df.copy()


In [4]:
# For every activity count how many other activities can be performed directly
# before and directly after it. The activity itself is not counted.
def check_activities(df):
  """
  Count the possible direct predecessors and successors of every activity.

  `df` is read as a transition table: the row label is the current activity,
  the column label is the next one and a value > 0 means the transition can
  happen. This is the same orientation that `activities_before` and
  `find_skipping_patterns` use.

  Returns a new DataFrame (default integer index) with the columns
    "index"  - the row label of `df`,
    "before" - number of other rows with a value > 0 in this activity's column
               (NaN when the activity has no column in `df`),
    "after"  - number of other columns with a value > 0 in this activity's row.
  The input frame is not modified.
  """
  # `df > 0` is a fresh boolean frame, so editing it leaves `df` untouched
  possible = df > 0

  # a transition from an activity to itself is not "another" activity
  for label in possible.index.intersection(possible.columns):
    possible.at[label, label] = False

  successors = possible.sum(axis=1)
  # column counts are keyed by column label; line them up with the rows
  predecessors = possible.sum(axis=0).reindex(possible.index)

  return pd.DataFrame({
    "index": possible.index,
    "before": predecessors.to_numpy(),
    "after": successors.to_numpy(),
  })

In [5]:
# Activities of primary interest. find_skipping_patterns only reports a pattern
# a -> b -> c when a or c is in this list.
# NOTE(review): presumably the ten most frequent URLs in the log; the ranking
# itself is not computed in the visible cells - confirm where it came from.
top_10_activities = ['/werk_nl/werknemer/mijn_werkmap/doorgeven/taken',
                     '/werk_nl/werknemer/mijn_werkmap/werk-zoeken/vacatures_bij_mijn_cv',
                     '/werk_nl/werknemer/mijn_werkmap/werk-zoeken/mijn_cv',
                     '/werk_nl/werknemer/home',
                     '/werk_nl/werknemer/mijn_werkmap/werk-zoeken/vacatures_zoeken',
                     '/werk_nl/werknemer/mijn_werkmap/postvak/mijn_berichten',
                     '/portal/page/portal/home/diensten/aanvragen-ww',
                     '/werk_nl/werknemer/mijn_werkmap',
                     '/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn_sollicitaties',
                     '/werk_nl/werknemer/werkmap']

In [6]:
# URLs to skip: short names for the pages that are candidates for being the
# skipped middle step of a pattern.
# NOTE(review): none of these four names is referenced in the visible cells -
# confirm they are still needed.
home = "/werk_nl/werknemer/home"
taken = '/werk_nl/werknemer/mijn_werkmap/doorgeven/taken'
werkmap = '/werk_nl/werknemer/mijn_werkmap'
postvak = '/werk_nl/werknemer/mijn_werkmap/postvak/mijn_berichten'


In [7]:
# List every activity that can be performed directly before the url to skip,
# ordered by the probability of moving on to that url (highest first).
def activities_before(_df, activity):
  """
  Return the column `activity` of `_df`, restricted to the entries that lie
  strictly between 0 and 1 and sorted from the largest to the smallest value.

  `_df` is not modified; the result is a Series indexed by the row labels
  that survive the filter.
  """
  into_activity = _df.loc[:, activity]
  # exact 0 and exact 1 (and anything outside that range) are left out
  strictly_between = (into_activity > 0) & (into_activity < 1)
  return into_activity[strictly_between].sort_values(ascending=False)

In [8]:
# A skipping pattern means that going from a to b to c is more likely than
# going from a to c directly, where a, b and c are three different activities.
def find_skipping_patterns(df: pd.DataFrame, focus_activities=None):
  """
  Find all triples a -> b -> c in which the detour via b beats the direct step.

  Parameters
  ----------
  df : transition table; `df.loc[x, y]` is the probability of going from x
       (row) directly to y (column).
  focus_activities : optional iterable of activities. A pattern is only
       reported when a or c is one of them. Defaults to the notebook-level
       `top_10_activities`, looked up when the function is called.

  Returns
  -------
  list of dicts with the keys "a", "b", "c", "a-b", "b-c", "a-c" and "a-b-c"
  (the product of "a-b" and "b-c"). A triple is reported when
    * a, b and c are pairwise different and b is not 'end_session',
    * the probabilities a->b, b->c and a->c are all greater than 0,
    * (a->b) * (b->c) is greater than a->c.
  """
  if focus_activities is None:
    focus_activities = top_10_activities
  focus = set(focus_activities)
  known_rows = set(df.index)

  res = []
  for _a, row_a in df.iterrows():
    for _b, value_A_to_B in row_a.items():
      # Labels are compared directly: activity names are URLs, and using them
      # as regular expressions would misread characters such as '.', '+', '('.
      if _b == _a or _b == 'end_session' or not value_A_to_B > 0:
        continue
      # b needs outgoing transitions of its own to be a middle step
      if _b not in known_rows:
        continue

      row_b = df.loc[_b, :]
      for _c, value_B_to_C in row_b.items():
        if _c == _a or _c == _b or not value_B_to_C > 0:
          continue
        if _a not in focus and _c not in focus:
          continue

        value_A_to_C = row_a.loc[_c]
        value_via_B = value_A_to_B * value_B_to_C
        if value_A_to_C > 0 and value_via_B > value_A_to_C:
          res.append({"a":_a,"b":_b,"c":_c, "a-b":value_A_to_B, "b-c":value_B_to_C,'a-c':value_A_to_C,'a-b-c':value_via_B})

  return res

In [9]:
# Spot check: one -> two, two -> tree and the direct step one -> tree
one = '/werk_nl/werknemer/werkmap'
two = '/werk_nl/werknemer/mijn_werkmap/doorgeven/taken'
tree = '/werk_nl/werknemer/home'
check, check1, check2 = (
  probability_matrix.loc[one, two],
  probability_matrix.loc[two, tree],
  probability_matrix.loc[one, tree],
)
print(check, check1, check2, sep="\n")

0.3678003609132098
0.06493998916825476
0.04185268781007356


In [10]:
# Spot check: one -> two, two -> tree and the direct step one -> tree
one = '/werk_nl/werknemer/werkmap'
two = '/werk_nl/werknemer/mijn_werkmap/postvak/mijn_berichten'
tree = '/werk_nl/werknemer/home'
check, check1, check2 = (
  probability_matrix.loc[one, two],
  probability_matrix.loc[two, tree],
  probability_matrix.loc[one, tree],
)
print(check, check1, check2, sep="\n")

0.13404007571451435
0.08468268848360888
0.04185268781007356


In [11]:
# Spot check: one -> two, two -> tree and the direct step one -> tree
one = '/werk_nl/werknemer/mijn_werkmap/postvak/mijn_berichten'
two = '/werk_nl/werknemer/mijn_werkmap/doorgeven/taken'
tree = '/werk_nl/werknemer/home'
check, check1, check2 = (
  probability_matrix.loc[one, two],
  probability_matrix.loc[two, tree],
  probability_matrix.loc[one, tree],
)
print(check, check1, check2, sep="\n")

0.11682480162112001
0.06493998916825476
0.08468268848360888


In [26]:
# Helper functions
def drie_create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each run of three consecutive activities occurs.

  For every click the next and the next-but-one "Activity" inside the same
  "SessionID" are looked up (rows are taken in their current order). Steps
  that fall past the end of the session are reported as "end_session".

  Returns a DataFrame with one row per observed triple and the columns
  "level_0", "level_1", "level_2" (the three steps) and "value" (occurrences).
  The input frame is not modified.
  """
  per_session = df.groupby("SessionID")["Activity"]
  # first and second follow-up inside the same session
  first_next = per_session.shift(periods=-1)
  second_next = per_session.shift(periods=-2)

  # tally every (activity, follow-up, second follow-up) triple
  triple_counts = Counter(zip(df["Activity"], first_next, second_next))

  # one column per triple -> transpose so every triple becomes a row
  triples_as_rows = pd.DataFrame(triple_counts, index=["value"]).T.reset_index()

  return triples_as_rows.fillna("end_session")


# Remove every click whose activity equals the activity of the click right
# before it in the same session.
def remove_same(df:pd.DataFrame):
  """
  Return a copy of `df` without immediate repeats of an activity.

  A row is dropped when its "Activity" equals the "Activity" of the previous
  row of the same "SessionID" (rows are taken in their current order). The
  first row of a session is always kept. The helper columns "Consecutive_1"
  and "same" are not part of the result.
  """
  flagged = df.assign(
    # activity of the previous click in the same session
    Consecutive_1=df.groupby("SessionID")["Activity"].shift(periods=+1),
    same=lambda frame: frame["Activity"] == frame["Consecutive_1"],
  )
  kept = flagged[~flagged["same"]]
  return kept.drop(columns=['same','Consecutive_1'])

In [30]:
# Collapse immediate repeats and keep only the columns
# ['SessionID', 'Activity', 'TIMESTAMP'] for the three-step analysis
lo = remove_same(base_df).loc[:, ['SessionID','Activity','TIMESTAMP']]

In [31]:
# Show the reduced click log (SessionID, Activity, TIMESTAMP) as the cell output.
lo

Unnamed: 0,SessionID,Activity,TIMESTAMP
3273278,46,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,2015-11-06 08:07:22.780
5333642,46,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,2015-11-06 08:07:40.767
3733243,46,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,2015-11-06 08:07:51.390
5904405,46,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,2015-11-06 08:08:06.003
1071622,92,/werk_nl/werknemer/mijn_werkmap,2015-10-14 14:59:37.753
...,...,...,...
1400504,55314390,/werk_nl/werknemer/home,2016-02-28 09:26:57.507
2213018,55314605,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,2016-02-27 10:13:43.503
3084818,55314605,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,2016-02-27 10:13:51.350
1613546,55314751,/werk_nl/werknemer/werkmap,2016-02-28 08:17:15.947


In [32]:
# Count all three-step sequences and show the ten most frequent ones
drie = drie_create_df_for_pivot(lo)
drie.sort_values('value', ascending=False).iloc[:10]

Unnamed: 0,level_0,level_1,level_2,value
10,/werk_nl/werknemer/home,end_session,end_session,264166
11,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,end_session,end_session,132024
27,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,end_session,101641
46,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,53870
3,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,end_session,end_session,53821
56,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,end_session,end_session,46432
95,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/werk_nl/werknemer/home,end_session,39002
100,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,/werk_nl/werknemer/home,end_session,25820
447,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,22083
57,/werk_nl/werknemer/home,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,21265


In [33]:
# Flag the triples in which two of the three steps are the same activity
drie['a=b'] = drie['level_0'].eq(drie['level_1'])
drie['a=c'] = drie['level_0'].eq(drie['level_2'])
drie['b=c'] = drie['level_1'].eq(drie['level_2'])

In [34]:
# keep only the triples whose three steps are pairwise different
# (no a=b, no a=c, no b=c) and drop the helper flags again
_drie = drie.loc[~(drie['a=b'] | drie['a=c'] | drie['b=c'])].drop(columns=['a=b','a=c','b=c'])


In [36]:
# Drop the triples whose third step is 'end_session', then show the remaining
# triples from the most to the least frequent. (A sorted preview used to be
# computed before the filter, but its result was discarded: only the last
# expression of a cell is displayed.)
_drie = _drie[_drie['level_2'] != 'end_session']
_drie.sort_values(by='value', ascending=False)

Unnamed: 0,level_0,level_1,level_2,value
62,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,17976
315,/werk_nl/werknemer/home,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,17633
474,/werk_nl/werknemer/werkmap,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,15372
75,/werk_nl/werknemer/werkmap,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,14121
346,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/home,13611
...,...,...,...,...
43044,/werk_nl/werknemer/mijn_werkmap/doorgeven/take...,/werk_nl/werknemer/uitkering-aanvragen,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,1
43045,/werk_nl/werknemer/uitkering-aanvragen,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/inschrijving/m...,1
43046,/werk_nl/werknemer/mijn_werkmap/meer/mijn_agenda,/werk_nl/werknemer/mijn_werkmap/inschrijving/m...,/xpsimage/wdo211818,1
43048,/werk_nl/werknemer/mijn_werkmap/inschrijving/m...,/xpsimage/wdo211812,/werk_nl/werknemer/solliciteren,1


In [272]:
# Build a frame from the 'level_0' column of _drie with a column MultiIndex
# made of the (level_1, level_2) pairs.
# NOTE(review): `.T` on a single column is presumably a no-op, and the data
# carries the name 'level_0', which is not one of the requested column tuples -
# verify that the result contains the intended values and not only NaN.
# NOTE(review): one column per remaining triple may make this frame very wide.
df = pd.DataFrame(data=_drie['level_0'].T, columns=pd.MultiIndex.from_tuples(zip(_drie['level_1'],_drie['level_2'])))