In [3]:
import pandas as pd
from collections import Counter
import numpy as np

In [4]:
base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",
                        usecols=[ 'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
                                  'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'URL_FILE', 'PAGE_NAME',
                                  'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
                                  'service_detail', 'xps_info'])
base_df.rename(columns={'URL_FILE':'Activity'}, inplace=True)
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])

  base_df = pd.read_csv("BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",


In [5]:
# Helper functions
def create_df_for_pivot(df: pd.DataFrame):
  df = df.copy()
  #Create consecutive column with the consecutive activity and count how many time each pair occurs
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)

  #Count all the pairs and safe in new DataFrame
  df_for_pivot_1 = pd.DataFrame(Counter(list(zip(df['Activity'], df['Consecutive_1'],))), index=["value"]).T.reset_index()

  df_for_pivot_1.fillna("end_session", inplace=True)
  return df_for_pivot_1
  
def make_pivot(df, index_names, column_names):
  df = df.copy()
  """
  Functions that takes in a DataFrame and returns a pivot table with all the chances
  
  Create pivot table where chances are calculated that each row is succeded by activity that is represented in the column
  """
  df_chances = df.pivot_table(index=index_names, columns=column_names, values='value')
  
  #Calculate what the probabilities are by summing the row and dividing all the values in the row by total sum of the row
  df_chances["total_row_count"] = df_chances.sum(axis=1)
  df_chances = df_chances.div(df_chances["total_row_count"], axis=0)
  
  df_chances.fillna(0, inplace=True)
  df_chances.drop("total_row_count", axis=1, inplace=True)

  return df_chances

  
# Create df for pivot
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

#Calculate the probability of each activity that it is the first activity performed in the session
df_chances_1["start_session_chance"] = base_df.groupby("SessionID").nth(0)["Activity"].value_counts() / base_df.groupby("SessionID").nth(0)["Activity"].value_counts().sum()
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1 
df_chances_total

#Split the start probabilities from the normal DataFrame
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

#Create df with all chances except the starting chance.
final_df = df_chances_total.iloc[:, :-1]
final_df
probability_matrix = final_df.copy()


In [7]:
# A skipping pattern means that there is a high probability to go from a to b to c and a smaller probability to go from a to c directly where a,b and c are not the same
def find_skipping_patterns(df:pd.DataFrame):
  df = df.copy()
  res = []
  for _a,valueRowA in df.iterrows():
    # filter the row to not include the row itself
    rowA = valueRowA.filter(regex='^(?!'+_a+'$).*$')
    # find the highest value in the row and get the index
    for _b, value_A_to_B in rowA.items():
      if value_A_to_B > 0:
      # remove the index from the row to not make sure we do not generate a skipping pattern with the same activity twice
        rowWithoutIndex = rowA.filter(regex='^(?!'+_b+'$).*$')
        # remove zeros from the row
        finalRowB = rowWithoutIndex[rowWithoutIndex > 0]
        # check there is a probability to go from the row of the highest index to a item in rowNotHighest
        # if there is append to res
        if _b != 'end_session':
          rowB = df.loc[_b,:]
          for _c, value_B_to_C in rowB.items():
            if value_B_to_C > 0 and _c in finalRowB.index.values:
              res.append({"a":_a,"b":_b,"c":_c, "a-b":value_A_to_B, "b-c":value_B_to_C,'a-c':df.loc[_a,_c],'a-b-c':value_A_to_B*value_B_to_C})
      
  return res

skipping_patterns = find_skipping_patterns(probability_matrix)
skipping_patterns.sort(key=lambda x: x["a-b-c"] - x['a-c'], reverse=True)


In [8]:
factor_1 = 0.08
factor_1_string = (str(factor_1).replace('.', ''))
factor_2 = 5
factor_2_string = (str(factor_2).replace('.', ''))

print(len(skipping_patterns))
# filter the skipping patters to only include the ones where the probability of a-b is min factor_1 and the a-b * b-c is factor_2 times higher than a-c
final_skipping_patterns = [x for x in skipping_patterns if x["a-b"] > factor_1 and x["a-b"] * x["b-c"] > x['a-c'] * factor_2] 
print(len(final_skipping_patterns))
final_skipping_patterns_df = pd.DataFrame(final_skipping_patterns)
# final_skipping_patterns_df.to_csv(f"gen_patterns/skipping/skipping_patterns_{factor_1_string}_{factor_2_string}.csv", index=False)

403396
105


In [9]:
skipping_df = pd.DataFrame(final_skipping_patterns)

In [13]:
skipping_df

Unnamed: 0,a,b,c,a-b,b-c,a-c,a-b-c
0,/ajax/dwr/call/plaincall/JAddressFinder.getAdd...,/werk_nl/werknemer/mijn_werkmap/doorgeven/wijz...,end_session,0.666667,0.260685,0.007937,0.173790
1,/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-...,/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-...,/werk_nl/werknemer/mijn_werkmap/inschrijving/m...,0.255814,0.429825,0.003322,0.109955
2,/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-...,/werk_nl/werknemer/mijn_werkmap/mijn-weg-naar-...,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/mi...,0.265781,0.352436,0.009967,0.093671
3,/werk_nl/werknemer/solliciteren/ontwikkel-uzelf,/portal/page/portal/werk_nl/werknemer/sollicit...,/werk_nl/werknemer/solliciteren/ontwikkel-uzel...,0.221088,0.350685,0.001701,0.077532
4,/werk_nl/werknemer/uitkering-aanvragen,/portal/page/portal/werk_nl/werknemer/uitkerin...,/portal/page/portal/home/diensten/aanvragen-ww,0.171661,0.464428,0.004622,0.079724
...,...,...,...,...,...,...,...
100,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/home,/werk_nl/werknemer/solliciteren/ontwikkel-uzel...,0.091857,0.000592,0.000005,0.000054
101,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,/werk_nl/werknemer/home,/werk_nl/werknemer/over-werk-nl/handleiding/va...,0.082469,0.000659,0.000007,0.000054
102,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/werk_nl/werknemer/mijn_werkmap/inschrijving/m...,0.151873,0.000345,0.000007,0.000052
103,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,//werk_nl/werknemer/solliciteren/ontwikkel-uze...,0.151873,0.000276,0.000007,0.000042


In [10]:
# Function to find the skipping patterns in a session.
def find_anomalous_sessions(sessions_df:pd.DataFrame,skipping_patterns:pd.DataFrame) -> list[int]:

  # Create a new column with the consecutive activity
  sessions_df["Consecutive_Activity"] = sessions_df.groupby("SessionID")["Activity"].shift(periods=-1)  

  merged = pd.merge(sessions_df, skipping_patterns, left_on=['Activity','Consecutive_Activity'], right_on=['a','c'], how='inner',indicator='Anomaly')
  merged['Anomaly'] = np.where(merged.Anomaly == 'both', True, False)
  
  anomalous_session = merged[merged['Anomaly'] == True]
  # create a list with all SessionIDs that have an anomaly
  anomaly_sessions = anomalous_session['SessionID'].unique()
  
  return anomaly_sessions

In [11]:
sessions = find_anomalous_sessions(base_df,skipping_df)
len(sessions)

# 5478504
# 1434495

348

In [12]:
# create a new column in the base_df to indicate if the session is anomalous
# if a session is anomalous it will have a True in the column else False
labeled_df = base_df.copy()
labeled_df["anomaly"] = base_df["SessionID"].apply(lambda x: x in sessions)

KeyboardInterrupt: 

In [None]:
labeled_df.to_csv(f"labeled_data/skipped/{factor_1_string}_{factor_2_string}.csv")