In [2]:
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
# Load the click log of logged-in customers and bring it into event-log shape:
#   * only the listed columns are parsed from the semicolon-separated file,
#   * undecodable bytes are ignored and malformed lines are skipped,
#   * URL_FILE is used as the activity label of an event,
#   * events are ordered chronologically inside each session.
# NOTE(review): infer_datetime_format is deprecated in pandas >= 2.0 - confirm the target
# pandas version before removing it.
base_df = (
    pd.read_csv(
        "BPI2016_Clicks_Logged_In.csv",
        sep=";",
        encoding_errors="ignore",
        on_bad_lines='skip',
        usecols=[
            'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
            'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
            'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
            'service_detail', 'xps_info',
        ],
    )
    .rename(columns={'URL_FILE': 'Activity'})
    .assign(TIMESTAMP=lambda frame: pd.to_datetime(frame["TIMESTAMP"], infer_datetime_format=True))
    .sort_values(["SessionID", "TIMESTAMP"])
)

  base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",


In [4]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  """
  Count how often each activity is directly followed by each other activity in a session.

  Parameters
  ----------
  df : pd.DataFrame
      Event log with at least the columns "SessionID" and "Activity". The successor of an
      event is the next row of the same session in the given row order, so the rows are
      expected to be sorted chronologically (TODO confirm with caller). The frame is not
      modified.

  Returns
  -------
  pd.DataFrame
      One row per distinct (activity, next activity) pair with the columns
      "level_0" (activity), "level_1" (next activity) and "value" (number of occurrences).
      The last event of a session has "end_session" as its next activity.
  """
  # The successor is looked up per session, so the last event of a session gets a missing value.
  next_activity = df.groupby("SessionID")["Activity"].shift(periods=-1)

  # Replace missing values *before* counting: NaN never compares equal to itself, so distinct
  # NaN objects used as dictionary keys could split one pair over several entries.
  # A missing Activity value itself is labelled "end_session" as well.
  current_activity = df["Activity"].fillna("end_session")
  next_activity = next_activity.fillna("end_session")

  # Count all the pairs and save them in a new DataFrame with explicitly named columns,
  # so that an empty log still yields the expected (empty) table.
  pair_counts = Counter(zip(current_activity, next_activity))
  rows = [(current, following, count) for (current, following), count in pair_counts.items()]
  df_for_pivot_1 = pd.DataFrame(rows, columns=["level_0", "level_1", "value"])
  df_for_pivot_1["value"] = df_for_pivot_1["value"].astype("int64")
  return df_for_pivot_1
  
def make_pivot(df, index_names, column_names):
  """
  Turn a table of transition counts into a table of row-wise transition probabilities.

  Parameters
  ----------
  df : pd.DataFrame
      Long-format table with a numeric "value" column (the counts) plus the columns
      named by `index_names` and `column_names`. The frame is not modified.
  index_names : str or list of str
      Column(s) of `df` that become the rows of the pivot table.
  column_names : str or list of str
      Column(s) of `df` that become the columns of the pivot table.

  Returns
  -------
  pd.DataFrame
      Pivot table in which every cell is the count of that (row, column) combination
      divided by the total count of its row. Combinations that do not occur are 0.
  """
  # Counts of a duplicated (row, column) combination belong together, so they are summed;
  # the pivot_table default would average them.
  counts = df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc="sum")

  # Calculate the probabilities by dividing every value in a row by the total sum of that row
  row_totals = counts.sum(axis=1)
  df_chances = counts.div(row_totals, axis=0)

  # Combinations that never occur (and rows with a total of 0) end up as NaN -> probability 0
  return df_chances.fillna(0)

  
# Count the consecutive activity pairs and turn the counts into row-wise transition probabilities
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Calculate for each activity the probability that it is the first activity of a session:
# take the first row of every session (in the current row order of base_df) and normalise
# the counts of their activities so that they sum to 1.
first_activities = base_df.groupby("SessionID").nth(0)["Activity"]
df_chances_1["start_session_chance"] = first_activities.value_counts(normalize=True)
# Activities that never start a session have no entry in the counts -> probability 0
df_chances_1 = df_chances_1.fillna(0)

df_chances_total = df_chances_1

# Split the start probabilities from the normal DataFrame (only activities that start a session)
start_chances = df_chances_total.loc[df_chances_total["start_session_chance"] > 0, "start_session_chance"]

# Create df with all chances except the starting chance.
# The column is removed by name instead of by position.
final_df = df_chances_total.drop(columns="start_session_chance")
probability_matrix = final_df.copy()


In [5]:
# Thresholds of the swapping-pattern detection
anomalous_factor = 20
min_normal_probability = 0.3
# Text form of both thresholds with the decimal point removed (e.g. 0.3 -> "03")
anomalous_factor_string = "".join(str(anomalous_factor).split("."))
min_normal_probability_string = "".join(str(min_normal_probability).split("."))

In [6]:
# A swapping pattern is a transition A -> B whose probability is more than anomalous_factor times
# the probability of the reversed transition B -> A.
# This function finds all swapping patterns and returns them in a new DataFrame.
def find_swapping_patterns(df: pd.DataFrame, anomalous_factor: float, min_normal_probability: float) -> pd.DataFrame:
  """
  Find all swapping patterns in a transition-probability matrix.

  Parameters
  ----------
  df : pd.DataFrame
      Square-style probability matrix: the row label is the activity, the column label is
      the following activity and the cell is the probability of that transition.
      The names of the two axes do not matter. The frame is not modified.
  anomalous_factor : float
      A transition qualifies only if its probability is strictly greater than
      `anomalous_factor` times the probability of the reversed transition.
  min_normal_probability : float
      A transition qualifies only if its probability is at least this value.

  Returns
  -------
  pd.DataFrame
      One row per swapping pattern with the columns "Activity", "Activity_2", "Probability",
      "Swapped_Probability" and "Swapping_Pattern" (always True). Only pairs for which both
      directions have a non-zero probability are considered.
  """
  # Reshape the matrix to one row per (Activity, Activity_2) transition. The axes are named
  # explicitly so the result does not depend on how the caller named the index and columns.
  transitions = (
      df.rename_axis(index="Activity", columns="Activity_2")
        .stack()
        .rename("Probability")
        .reset_index()
  )

  # Ignore self-transitions and transitions that never occur
  is_other_activity = transitions["Activity"] != transitions["Activity_2"]
  occurs = transitions["Probability"] != 0
  transitions = transitions[is_other_activity & occurs]

  # Look up the probability of the reversed transition by joining the table with a copy of
  # itself in which the two activity columns are swapped
  reversed_transitions = transitions.rename(
      columns={"Activity": "Activity_2", "Activity_2": "Activity", "Probability": "Swapped_Probability"}
  )
  paired = transitions.merge(reversed_transitions, on=["Activity", "Activity_2"])

  # A swapping pattern is found if the probability is:
  #   1. more than anomalous_factor times bigger than the reversed probability and
  #   2. at least the min_normal_probability
  is_swapping_pattern = (
      (paired["Probability"] > paired["Swapped_Probability"] * anomalous_factor)
      & (paired["Probability"] >= min_normal_probability)
  )

  # Only keep the rows where the swapping pattern is found
  return paired.assign(Swapping_Pattern=is_swapping_pattern)[is_swapping_pattern]
  

In [7]:
# Detect the swapping patterns with the configured thresholds and rename the activity columns
# to 'first' (Activity) and 'second' (Activity_2)
swapping_patterns = (
    find_swapping_patterns(probability_matrix, anomalous_factor, min_normal_probability)
    .rename(columns={'Activity':'first','Activity_2':'second'})
)
swapping_patterns

Unnamed: 0,first,second,Probability,Swapped_Probability,Swapping_Pattern
5,/,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,0.375648,2.359499e-05,True
32,//werk_nl/werknemer/home,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,0.370370,5.487206e-07,True
36,//werk_nl/werknemer/over-werk-nl/handleiding/s...,/werk_nl/werknemer/home,1.000000,1.716072e-06,True
38,//werk_nl/werknemer/solliciteren/eigen-bedrijf...,/xpsimage/WDO211817,0.619048,1.706485e-03,True
137,//werk_nl/werknemer/solliciteren/test/kansverk...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,0.333333,3.787664e-06,True
...,...,...,...,...,...
8453,/xpsitem/wdo212424,/werk_nl/werknemer/home,0.300000,3.432145e-06,True
8454,/xpsitem/wdo_007730,/werk_nl/werknemer/home,1.000000,1.716072e-06,True
8492,/zoeken_portlet/ajax/addVacatureToGereageerd,/werk_nl/werknemer/vacatures,0.311558,3.226018e-03,True
8542,/zoeken_portlet_wg/ajax/selectContactpersoon,/werk_nl/werkgever/cvs_zoeken,1.000000,6.218905e-03,True


In [17]:
# Function to find the sessions in which a swapping pattern occurs in reversed order.
def find_anomalous_sessions(df:pd.DataFrame,swapping_patterns:pd.DataFrame) -> np.ndarray:
  """
  Return the IDs of all sessions that contain a reversed swapping pattern.

  A session is anomalous if one of its events with activity `second` is directly followed,
  within the same session, by an event with activity `first` for some row of
  `swapping_patterns`.

  Parameters
  ----------
  df : pd.DataFrame
      Event log with at least the columns "SessionID" and "Activity". The successor of an
      event is the next row of the same session in the given row order. Not modified.
  swapping_patterns : pd.DataFrame
      Swapping patterns with at least the columns "first" and "second". Not modified.

  Returns
  -------
  np.ndarray
      The unique SessionID values of the anomalous sessions, as returned by
      `Series.unique()` (a NumPy array for NumPy-backed columns).
  """
  # Pair every event with the next activity of the same session; only the columns that are
  # needed for the lookup are carried along.
  transitions = df[["SessionID", "Activity"]].assign(
      Consecutive_Activity=df.groupby("SessionID")["Activity"].shift(periods=-1)
  )

  # Each pattern only needs to be matched once
  pattern_pairs = swapping_patterns[["first", "second"]].drop_duplicates()

  # Keep the transitions that equal a pattern in reversed order (second -> first).
  # The last event of a session has no next activity and therefore never matches.
  reversed_matches = transitions.merge(
      pattern_pairs,
      left_on=["Activity", "Consecutive_Activity"],
      right_on=["second", "first"],
      how="inner",
  )

  # All SessionIDs that have at least one anomaly
  return reversed_matches["SessionID"].unique()

In [18]:
# IDs of all sessions that contain at least one reversed swapping pattern
find_anomalous_sessionIDs = find_anomalous_sessions(df=base_df, swapping_patterns=swapping_patterns)
# Number of anomalous sessions (shown as the cell output)
len(find_anomalous_sessionIDs)

14739

In [None]:
# create a new column in the base_df that is True if the sessionID is in the anomalous_sessionID list
base_df = base_df.assign(Anomalous = base_df["SessionID"].isin(find_anomalous_sessionIDs))
# save to csv; the file name consists of the two thresholds.
# The target folder is created first because to_csv does not create missing directories.
output_path = Path(f"labeled_data/swapped/{anomalous_factor_string}_{min_normal_probability_string}.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
base_df.to_csv(output_path)
