In [43]:
import pandas as pd
from collections import Counter
import numpy as np

In [44]:
# Read only the three columns the analysis needs; the click column URL_FILE is exposed as "Activity".
base_df = (
    pd.read_csv(
        "BPI2016_Clicks_Logged_In.csv",
        sep=";",
        usecols=['SessionID','TIMESTAMP','URL_FILE'],
        encoding_errors="ignore",
        on_bad_lines='skip',
    )
    .rename(columns={'URL_FILE':'Activity'})
)

# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 (format inference is the default there).
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)

# Chronological order inside every session; the successor logic further down relies on it.
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])

In [45]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each activity is directly followed by another activity in the same session.

  Parameters
  ----------
  df : DataFrame with (at least) the columns "SessionID" and "Activity". Rows are expected
       to already be in chronological order within each session; the input is not modified.

  Returns
  -------
  DataFrame with one row per observed (activity, next activity) pair and the columns
  "level_0" (activity), "level_1" (next activity, or "end_session" when the activity was
  the last one of its session) and "value" (number of times the pair occurs).
  """
  # Successor of every event inside its own session; the last event of a session has none.
  # The "end_session" marker is filled in *before* counting so that NaN never has to act as
  # a counting key (NaN != NaN, so counting NaN keys only works by object identity).
  next_activity = df.groupby("SessionID")["Activity"].shift(periods=-1).fillna("end_session")

  # to_numpy() sidesteps index alignment, so duplicate index labels in `df` are harmless.
  pairs = pd.DataFrame({"level_0": df["Activity"].to_numpy(), "level_1": next_activity.to_numpy()})

  # sort=False keeps the pairs in order of first appearance. Rows whose own Activity is
  # missing are left out instead of being relabelled as an "end_session" source.
  return pairs.groupby(["level_0", "level_1"], sort=False).size().reset_index(name="value")
  
def make_pivot(df, index_names, column_names):
  """
  Turn a table of transition counts into a row-normalised probability matrix.

  Parameters
  ----------
  df : DataFrame holding the counts in a column named 'value'.
  index_names : column(s) of `df` that become the rows of the matrix (the "from" side).
  column_names : column(s) of `df` that become the columns of the matrix (the "to" side).

  Returns
  -------
  DataFrame in which every cell is the share of its row total, i.e. the chance that the
  row's activity is succeeded by the column's activity. Combinations that were never
  observed are 0.
  """
  # aggfunc='sum': should the same (row, column) pair appear on several input rows, its
  # counts are added up (the pivot_table default would average them).
  counts = df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc='sum')

  # Keep the row totals in their own Series rather than as a helper column, so a real
  # column label can never clash with it.
  row_totals = counts.sum(axis=1)

  # Unobserved combinations (NaN) and rows with a zero total both end up as probability 0.
  return counts.div(row_totals, axis=0).fillna(0)

  
# Transition counts -> transition probabilities
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Share of sessions that open with each activity; activities that never open a session get 0
first_activities = base_df.groupby("SessionID").nth(0)["Activity"]
df_chances_1["start_session_chance"] = first_activities.value_counts(normalize=True)
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1

# Start probabilities on their own, restricted to activities that actually open a session
start_chances = df_chances_total.loc[df_chances_total["start_session_chance"] > 0, "start_session_chance"]

# Everything except the last column (the start probability appended above)
final_df = df_chances_total.iloc[:, :-1]
probability_matrix = final_df.copy()


In [111]:
def create_test_df():
  """Return a small hand-written matrix (rows A-D, columns B-E) for trying out the helpers."""
  row_labels = ['A','B','C','D']
  column_values = {
    'B': [0.8,0,0,0],
    'C': [0.2,1,0.8,0],
    'D': [0,0,0.2,0],
    'E': [0,0,0,1],
  }
  return pd.DataFrame(data=column_values, index=row_labels)

# Build the small hand-written matrix and keep it around as `test_df`.
test_df = create_test_df()  
test_df
# NOTE(review): `remove_self_loops` is not defined in the visible part of the notebook;
# the two lines below are kept as a disabled experiment.
# test = remove_self_loops(test_df)
# test

Unnamed: 0,B,C,D,E
A,0.8,0.2,0.0,0
B,0.0,1.0,0.0,0
C,0.0,0.0,1.0,0
D,0.0,0.0,0.0,1


In [48]:
# Find transitions whose probability is above a threshold but below 1, and whose source and target activities differ
def transition_no_circles(df:pd.DataFrame, threshold:float = 0.81):
  """
  List likely, but not certain, transitions between two different activities.

  Parameters
  ----------
  df : transition matrix; row labels are the "from" activities, column labels the "to"
       activities, cells the transition probabilities.
  threshold : a transition is reported only when its probability is strictly greater
              than this value (and strictly smaller than 1).

  Returns
  -------
  list of dicts with the keys "from", "to" and "probability". Within one "from" activity
  the entries are ordered by descending probability.
  """
  res = []
  # Walk over the rows (the sources) rather than the columns: a column label that has no
  # row of its own would make a row lookup fail, and a source that never shows up as a
  # column would otherwise be skipped.
  for source, row in df.iterrows():
    # The terminal marker is never treated as a source.
    if source == 'end_session':
      continue
    for target, value in row.sort_values(ascending=False).items():
      if threshold < value < 1 and source != target:
        res.append({"from":source, "to":target, "probability":value})
  return res

# test = transition_no_circles(probability_matrix, 0.2)
# test.sort(key=lambda x: x["probability"], reverse=True)
# test


In [176]:
# A skipping pattern a -> b -> c exists when the detour via b (P(a->b) * P(b->c)) is more likely than the direct step a -> c, the direct step is still possible (P(a->c) > 0), and a, b and c are all different
def find_skipping_patterns(df:pd.DataFrame):
  """
  Find triples (a, b, c) of distinct activities where going a -> b -> c is more likely
  than going a -> c directly, while the direct step a -> c is still possible.

  Parameters
  ----------
  df : transition matrix; row labels are the "from" activities, column labels the "to"
       activities, cells the transition probabilities. The input is not modified.

  Returns
  -------
  list of dicts with the keys "a", "b", "c", "a-b" (P(a->b)), "b-c" (P(b->c)),
  "a-c" (P(a->c)) and "a-b-c" (P(a->b) * P(b->c)).
  """
  res = []
  for a, row_a in df.iterrows():
    # Possible direct successors of a, without the self-loop. Labels are compared for
    # exact equality (not through a regular expression), so activity names may contain
    # characters such as '.', '?' or '(' without affecting which labels are excluded.
    direct = row_a[(row_a > 0) & (row_a.index != a)]

    for b, p_ab in direct.items():
      # The intermediate step must have outgoing transitions of its own: the terminal
      # marker has none, and neither has a column label without a matching row.
      if b == 'end_session' or b not in df.index:
        continue
      row_b = df.loc[b, :]

      for c, p_ac in direct.items():
        if c == b:
          continue
        p_bc = row_b.loc[c]
        if p_bc > 0 and p_ab * p_bc > p_ac:
          res.append({"a":a,"b":b,"c":c, "a-b":p_ab, "b-c":p_bc,'a-c':p_ac,'a-b-c':p_ab*p_bc})

  return res

# Rank the patterns by how much more likely the detour a -> b -> c is than the direct step a -> c
skipping_patterns = sorted(
  find_skipping_patterns(probability_matrix),
  key=lambda pattern: pattern["a-b-c"] - pattern['a-c'],
  reverse=True,
)


In [177]:
# Tabular view of the detected patterns, built from the list of pattern dicts.
skipping_df = pd.DataFrame(skipping_patterns)

In [179]:
# skipping_df.sort_values(by="a-b-c", ascending=True, inplace=True)
# Show the list of pattern dicts (it was sorted in the cell that built it).
# NOTE(review): this renders every pattern; `skipping_df.head()` would give a shorter preview.
skipping_patterns

[{'a': '/shared/timeout.htm',
  'b': '/werk_nl/werknemer/home',
  'c': 'end_session',
  'a-b': 0.7053471964352024,
  'b-c': 0.45332797918747403,
  'a-c': 0.10193093204604531,
  'a-b-c': 0.3197536191855206},
 {'a': '/ajax/dwr/call/plaincall/JAddressFinder.getAddress.dwr',
  'b': '/werk_nl/werknemer/mijn_werkmap/doorgeven/wijziging_doorgeven',
  'c': 'end_session',
  'a-b': 0.6666666666666666,
  'b-c': 0.26068541868735856,
  'a-c': 0.007936507936507936,
  'a-b-c': 0.1737902791249057},
 {'a': '/xpsimage/wdo_013817',
  'b': '/werk_nl/werknemer/home',
  'c': 'end_session',
  'a-b': 0.45454545454545453,
  'b-c': 0.45332797918747403,
  'a-c': 0.09090909090909091,
  'a-b-c': 0.20605817235794274},
 {'a': '/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-werk',
  'b': '/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-werk/mijn-inschrijving',
  'c': '/werk_nl/werknemer/mijn_werkmap/inschrijving/mijn_bewijs_van_inschrijving',
  'a-b': 0.2558139534883721,
  'b-c': 0.4298245614035088,
  'a-c': 0.003322259