In [2]:
import pandas as pd
from collections import Counter
import numpy as np

# Load only the click-log columns needed for the transition analysis and
# rename the URL column to the process-mining term "Activity".
base_df = (
    pd.read_csv(
        "BPI2016_Clicks_Logged_In.csv",
        sep=";",
        usecols=['SessionID','TIMESTAMP','URL_FILE'],
        encoding_errors="ignore",
        on_bad_lines='skip',
    )
    .rename(columns={'URL_FILE':'Activity'})
)

# Parse the timestamps so that sorting below is chronological rather than lexical
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)

# Order the clicks by session and time; later steps rely on this row order
base_df = base_df.sort_values(by=["SessionID", "TIMESTAMP"])

In [19]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each activity is directly followed by another within a session.

  Parameters
  ----------
  df : DataFrame with at least the columns "SessionID" and "Activity". Rows are
       taken in their existing order; the function does not sort them.

  Returns
  -------
  DataFrame with one row per distinct (activity, following activity) pair and
  the columns "level_0" (activity), "level_1" (following activity, or
  "end_session" when the activity is the last row of its session) and "value"
  (number of occurrences). Pairs appear in order of first occurrence. An empty
  input yields an empty frame with the same three columns. The input frame is
  not modified.
  """
  # Pair every activity with the next one of the same session; the last row of
  # each session has no successor and gets a missing value. Positional arrays
  # are used so the pairing does not depend on the frame's index labels.
  transitions = pd.DataFrame({
    "level_0": df["Activity"].to_numpy(),
    "level_1": df.groupby("SessionID")["Activity"].shift(periods=-1).to_numpy(),
  })

  # Label the missing values before counting so that all of them fall into one
  # group instead of relying on how individual NaN objects hash.
  transitions = transitions.fillna("end_session")

  # sort=False keeps the pairs in order of first appearance
  pair_counts = transitions.groupby(["level_0", "level_1"], sort=False).size()
  return pair_counts.reset_index(name="value")
  
def make_pivot(df, index_names, column_names):
  """
  Turn a table of pair counts into a table of row-wise probabilities.

  Parameters
  ----------
  df : DataFrame holding a numeric "value" column (the counts) together with
       the columns named by index_names and column_names.
  index_names : column name(s) whose values become the rows of the result.
  column_names : column name(s) whose values become the columns of the result.

  Returns
  -------
  DataFrame in which each cell is that cell's count divided by the total count
  of its row, i.e. the chance that the row's activity is succeeded by the
  column's activity. Combinations that never occur are 0. The input frame is
  not modified.
  """
  # Counts have to be added up; the pivot_table default (mean) would average
  # them whenever the same pair appears in more than one input row.
  counts = df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc='sum')

  # Divide every row by its own total to obtain probabilities
  row_totals = counts.sum(axis=1)
  chances = counts.div(row_totals, axis=0)

  # Pairs that were never observed are missing after the pivot; treat them as chance 0
  return chances.fillna(0)

  
# Count the (activity, next activity) pairs and convert them to row-wise chances
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Share of sessions that open with each activity (first row of every session)
first_activity_counts = base_df.groupby("SessionID").nth(0)["Activity"].value_counts()
df_chances_1["start_session_chance"] = first_activity_counts / first_activity_counts.sum()

# Activities that never open a session have no entry in the counts above; give them chance 0
df_chances_1 = df_chances_1.fillna(0)

df_chances_total = df_chances_1

# Start chances on their own, restricted to activities that open at least one session
start_column = df_chances_total["start_session_chance"]
start_chances = start_column[start_column > 0]

# Transition chances only: "start_session_chance" was appended last, so cut off the final column
final_df = df_chances_total.iloc[:, :-1]
test_df = final_df.copy()


In [20]:
def find_paths(_df:pd.DataFrame,len:int = 3):
  """
  List every chain of three activities whose two steps both have a chance above 0.

  Parameters
  ----------
  _df : table of chances; the row label is the current activity, the column
        label the following activity.
  len : accepted but not used by the body; the chains always consist of three
        activities.

  Returns
  -------
  list of [first, second, third, combined chance, first-step chance,
  second-step chance] lists, where the combined chance is the product of the
  two step chances. A second activity that has no row of its own in _df
  produces no entries.
  """
  table = _df.copy()
  third_labels = table.columns.values
  found = []

  for row_pos in range(table.index.values.size):
    first_row = table.iloc[row_pos]
    first = first_row.name

    for col_pos in range(first_row.size):
      first_step = first_row.iloc[col_pos]
      if not first_step > 0:
        continue

      second = table.columns[col_pos]
      # The chain can only be extended when the second activity has its own row
      if not (table.index == second).any():
        continue

      for third_pos, second_step in enumerate(table.loc[second]):
        if second_step > 0:
          found.append([first, second, third_labels.item(third_pos),
                        second_step*first_step, first_step, second_step])

  return found


# Collect every three-activity chain with non-zero chance from the transition table
paths = find_paths(test_df)

In [21]:
# Show the chains; each entry is [first activity, second activity, third activity,
# combined chance, first-step chance, second-step chance]
paths

[['/',
  '/portal/pls/portal/PORTAL.wwsbr_javascript.page_js',
  '//werk_nl/werknemer/solliciteren/ontwikkel-uzelf/tips/hulp',
  2.3659119407575653e-07,
  0.0012953367875647669,
  0.00018264840182648402],
 ['/',
  '/portal/pls/portal/PORTAL.wwsbr_javascript.page_js',
  '//werk_nl/werknemer/solliciteren/ontwikkel-uzelf/tips/kwaliteiten',
  2.3659119407575653e-07,
  0.0012953367875647669,
  0.00018264840182648402],
 ['/',
  '/portal/pls/portal/PORTAL.wwsbr_javascript.page_js',
  '//werk_nl/werknemer/solliciteren/ontwikkel-uzelf/tips/netwerken',
  2.3659119407575653e-07,
  0.0012953367875647669,
  0.00018264840182648402],
 ['/',
  '/portal/pls/portal/PORTAL.wwsbr_javascript.page_js',
  '//werk_nl/werknemer/solliciteren/ontwikkel-uzelf/tips/vacatures',
  2.3659119407575653e-07,
  0.0012953367875647669,
  0.00018264840182648402],
 ['/',
  '/portal/pls/portal/PORTAL.wwsbr_javascript.page_js',
  '/ewerkmap/readdetails',
  2.3659119407575653e-07,
  0.0012953367875647669,
  0.000182648401826484