In [5]:
import pandas as pd
from collections import Counter
import numpy as np

In [6]:
# Load the logged-in click log; only the session id, the timestamp and the
# visited page are needed for the transition analysis.
# (Alternative input without repeated clicks: "../no_repeated.csv", comma-separated.)
# NOTE(review): encoding_errors="ignore" and on_bad_lines="skip" silently drop
# undecodable bytes and malformed rows — confirm that this data loss is acceptable.
base_df = pd.read_csv(
    "../BPI2016_Clicks_Logged_In.csv",
    encoding_errors="ignore",
    on_bad_lines="skip",
    sep=";",
    usecols=["SessionID", "TIMESTAMP", "URL_FILE"],
)
base_df = base_df.rename(columns={"URL_FILE": "Activity"})
# `infer_datetime_format` was dropped: it is deprecated since pandas 2.0, where
# consistent format inference became the default, and only produced a warning.
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"])
# Chronological order inside every session is what all shift-based helpers rely on.
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])
base_df.head()

Unnamed: 0,SessionID,TIMESTAMP,Activity
3273278,46,2015-11-06 08:07:22.780,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken
5333642,46,2015-11-06 08:07:40.767,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...
3733243,46,2015-11-06 08:07:51.390,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...
5904405,46,2015-11-06 08:08:06.003,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...
5573282,46,2015-11-06 08:08:19.343,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...


In [7]:
# Helper functions
def create_df_for_pivot(_df: pd.DataFrame):
  """
  Count how often each activity is directly followed by each other activity.

  The successor of a row is the next row of the same "SessionID" (rows are
  expected to be ordered already). The last row of a session has no successor
  and is paired with the label "end_session".

  Parameters:
    _df: frame with at least the columns "SessionID" and "Activity". It is not modified.

  Returns:
    A frame with one row per distinct pair, in order of first appearance, and
    the columns "level_0" (activity), "level_1" (next activity) and "value"
    (number of occurrences). An empty input gives an empty frame with the
    same three columns.
  """
  columns = ["level_0", "level_1", "value"]
  if _df.empty:
    # Keep the schema stable so that make_pivot can still address the columns.
    return pd.DataFrame(columns=columns)

  current = _df["Activity"].astype(object)
  successor = _df.groupby("SessionID")["Activity"].shift(periods=-1).astype(object)
  # Replace missing values *before* counting: NaN makes an unreliable dict key
  # because NaN != NaN. Missing activities get the same label as before.
  current = current.where(current.notna(), "end_session")
  successor = successor.where(successor.notna(), "end_session")

  pair_counts = Counter(zip(current, successor))
  return pd.DataFrame(
    [(first, second, count) for (first, second), count in pair_counts.items()],
    columns=columns,
  )
  
  
def make_pivot(_df, index_names, column_names):
  """
  Turn pair counts into a table of row-wise probabilities.

  Each cell holds the chance that the activity of the row is succeeded by the
  activity of the column, i.e. the pair count divided by the row total.

  Parameters:
    _df: frame with a numeric "value" column holding the counts. It is not modified.
    index_names: column(s) of _df used as rows of the table.
    column_names: column(s) of _df used as columns of the table.

  Returns:
    A frame of probabilities; combinations that never occur are 0.
  """
  # aggfunc="sum": should a pair be listed more than once its counts are added
  # up (the pivot_table default would average them). For unique pairs the
  # result is the same as before.
  counts = _df.pivot_table(index=index_names, columns=column_names, values='value', aggfunc='sum')

  # Keep the row totals in a separate Series instead of a temporary column, so
  # they can never collide with a real column of the table.
  row_totals = counts.sum(axis=1)
  df_chances = counts.div(row_totals, axis=0).fillna(0)

  return df_chances

def calculate_paths_in_sessions(_df:pd.DataFrame,paths:list[str]) -> tuple[int,int]:
  """
  Count how often a path occurs and in how many sessions it occurs.

  A path is a series of transitions; a transition is two consecutive
  activities of one session that are not the same. Rows whose activity equals
  the next activity of the same session are therefore dropped first, so a run
  of repeated clicks counts as a single step.

  Parameters:
    _df: frame with the columns "SessionID" and "Activity", ordered within
      each session. It is not modified.
    paths: the activities of the path, in order.

  Returns:
    (total, sessions): the number of times the path is found and the number
    of distinct sessions in which it is found, both as plain ints.
  """
  if _df.empty:
    return 0, 0

  successor = _df.groupby("SessionID")["Activity"].shift(periods=-1)
  # Only transitions are of interest: drop rows that are followed by the same activity.
  transitions = _df.loc[_df["Activity"] != successor, ["SessionID", "Activity"]]
  per_session = transitions.groupby("SessionID")["Activity"]

  # Vectorised match: one boolean mask per path position instead of a Python
  # callback per row. A row matches when the activity `offset` steps later in
  # the same session equals paths[offset] for every offset.
  found = np.ones(len(transitions), dtype=bool)
  for offset, expected in enumerate(paths):
    if offset == 0:
      candidate = transitions["Activity"]
    else:
      candidate = per_session.shift(periods=-offset)
    found &= (candidate == expected).to_numpy(dtype=bool)

  total = int(found.sum())
  sessions = int(transitions.loc[found, "SessionID"].nunique())
  return total,sessions


def all_paths_len_x(_df:pd.DataFrame,len):
  """
  List every path of `len` consecutive activities with its count and probability.

  Rows that are followed by the same activity within their session are
  dropped first, so only real transitions form a path. Positions that run
  past the end of a session are missing values and such incomplete paths are
  not counted by the grouping.

  Returns a frame with the columns Activity_0 .. Activity_{len-1}, "count"
  and "probability" (count divided by the sum of all counts).
  """
  # Next activity inside the same session; used only to filter out repetitions.
  next_activity = _df.groupby("SessionID")["Activity"].shift(periods=-1)
  transitions = _df[_df["Activity"] != next_activity]

  # One extra column per further step of the path.
  later_steps = {}
  for step in range(1, len):
    later_steps[f"Activity_{step}"] = transitions.groupby("SessionID")["Activity"].shift(periods=-step)
  paths = transitions.assign(**later_steps).rename(columns={'Activity':'Activity_0'})

  # Count identical paths.
  path_columns = [f"Activity_{step}" for step in range(len)]
  result = paths.groupby(path_columns).size().reset_index(name='count')
  result['probability'] = result['count'] / result['count'].sum()
  return result
  
  

def activity_prob_df(_df:pd.DataFrame) -> pd.DataFrame:
  """
  Count how often each activity occurs and which share of all rows it has.

  Parameters:
    _df: frame with an "Activity" column. Other columns are ignored, so the
      function no longer requires "SessionID" or "TIMESTAMP" to be present.
      It is not modified.

  Returns:
    A frame with the columns "Activity", "Count" (number of rows with that
    activity) and "Probability" (Count as a percentage of all rows that have
    an activity), sorted by "Probability" in descending order.
  """
  # size() counts rows per activity directly. Counting the non-null values of
  # another column instead would under-count rows where that column is missing.
  final_df = _df.groupby('Activity').size().reset_index(name='Count')
  total_amount_of_activities = final_df['Count'].sum()
  # NOTE: despite the column name this is a percentage (0-100), not a 0-1 probability.
  final_df['Probability'] = final_df['Count'] / total_amount_of_activities * 100

  return final_df.sort_values('Probability', ascending=False)
  


In [8]:
# Count every (activity, next activity) pair and convert the counts into
# row-wise transition probabilities.
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

# Chance of each activity being the first one of a session, appended as the
# last column; activities that never open a session get 0.
first_activity_counts = base_df.groupby("SessionID").nth(0)["Activity"].value_counts()
df_chances_1["start_session_chance"] = first_activity_counts / first_activity_counts.sum()
df_chances_1 = df_chances_1.fillna(0)

df_chances_total = df_chances_1

# Start probabilities on their own, restricted to activities that do open a session.
start_column = df_chances_total["start_session_chance"]
start_chances = start_column[start_column > 0]

# Everything except the start chance, which is the last column.
final_df = df_chances_total.iloc[:, :-1]
probability_matrix = final_df.copy()

In [9]:
# All paths of length 2 (single transitions) with their count and probability.
paths_len_2 = all_paths_len_x(base_df,2)
# Second, independent table for the other side of the merge. Copying gives the
# same content as calling all_paths_len_x again, without repeating the full pass over the log.
paths_len_2_2 = paths_len_2.copy()


In [10]:
# Left side of the merge: the length-2 paths ordered from most to least frequent.
work_df_1 = paths_len_2.sort_values('count', ascending=False)
# Right side of the merge: an independent copy of the second path table.
work_df_2 = paths_len_2_2.copy()

In [11]:
# Put every transition A -> B next to its reverse B -> A: the left pair
# (Activity_0, Activity_1) is matched with the right pair (Activity_1, Activity_0).
# The *_x columns describe the transition, the *_y columns its reverse;
# transitions whose reverse never occurs are dropped by the inner join.
merged = work_df_1.merge(
    work_df_2,
    how='inner',
    left_on=['Activity_0', 'Activity_1'],
    right_on=['Activity_1', 'Activity_0'],
)
merged

Unnamed: 0,Activity_0_x,Activity_1_x,count_x,probability_x,Activity_0_y,Activity_1_y,count_y,probability_y
0,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,123143,5.336082e-02,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,95763,4.149641e-02
1,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,118348,5.128303e-02,/werk_nl/werknemer/home,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,89957,3.898053e-02
2,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,95763,4.149641e-02,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,123143,5.336082e-02
3,/werk_nl/werknemer/home,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,89957,3.898053e-02,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/home,118348,5.128303e-02
4,/werk_nl/werknemer/werkmap,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,66648,2.888018e-02,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/werk_nl/werknemer/werkmap,4314,1.869360e-03
...,...,...,...,...,...,...,...,...
8539,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,/werk_nl/werknemer/uitkering-aanvragen/uwv-die...,1,4.333240e-07,/werk_nl/werknemer/uitkering-aanvragen/uwv-die...,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,1,4.333240e-07
8540,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,/werk_nl/werknemer/solliciteren/solliciteren-w...,1,4.333240e-07,/werk_nl/werknemer/solliciteren/solliciteren-w...,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,2,8.666480e-07
8541,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,/werk_nl/werknemer/over-werk-nl,1,4.333240e-07,/werk_nl/werknemer/over-werk-nl,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,1,4.333240e-07
8542,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,/werk_nl/ignore_request_page,1,4.333240e-07,/werk_nl/ignore_request_page,/werk_nl/werknemer/mijn_werkmap/tip-van-de-week,3,1.299972e-06


In [12]:
# The right-hand activity columns only repeat the left-hand ones in swapped
# order, and the probabilities are not used below; keep the activities and both counts.
# NOTE: running this cell twice fails because the columns are already gone.
redundant_columns = ['Activity_0_y', 'Activity_1_y', 'probability_x', 'probability_y']
merged = merged.drop(columns=redundant_columns)


In [13]:
# A transition is kept as a pattern when it occurs more than this many times
# as often as its reverse.
MIN_DIRECTION_RATIO = 40

# count_x: occurrences of the transition, count_y: occurrences of its reverse.
merged['dif'] = merged['count_x'] - merged['count_y']
merged['fac'] = merged['count_x'] / merged['count_y']
# (The former `merged.sort_values('fac', ...)` and `len(merged_df)` statements
# were removed: their results were discarded, so they had no effect.)
merged_df = merged[merged['fac'] > MIN_DIRECTION_RATIO]
merged_df

Unnamed: 0,Activity_0_x,Activity_1_x,count_x,count_y,dif,fac
88,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,4330,8,4322,541.25
97,/shared/timeout.htm,/werk_nl/werknemer/home,3799,43,3756,88.348837
179,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,1556,3,1553,518.666667
222,/werk_nl/werknemer/uitkering-aanvragen,/portal/page/portal/werk_nl/werknemer/uitkerin...,1077,10,1067,107.7
232,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,1021,5,1016,204.2
237,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.eot,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,979,1,978,979.0
314,/portal/page/portal/werk_nl/werknemer/uitkerin...,/portal/page/portal/home/diensten/aanvragen-ww,581,14,567,41.5
366,/werk_nl/werknemer/mijn_werkmap/doorgeven/wijz...,/werk_nl/werknemer/home,433,1,432,433.0
371,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap,422,4,418,105.5
379,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,401,2,399,200.5


In [14]:
# Independent copy of the selected pairs, so later changes cannot touch `merged_df`.
# NOTE(review): "patters_df" looks like a typo for "patterns_df"; the name is
# left as is because later cells refer to it.
patters_df = merged_df.copy()

In [15]:
# Show the selected transition patterns.
patters_df

Unnamed: 0,Activity_0_x,Activity_1_x,count_x,count_y,dif,fac
88,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,4330,8,4322,541.25
97,/shared/timeout.htm,/werk_nl/werknemer/home,3799,43,3756,88.348837
179,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,1556,3,1553,518.666667
222,/werk_nl/werknemer/uitkering-aanvragen,/portal/page/portal/werk_nl/werknemer/uitkerin...,1077,10,1067,107.7
232,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/werk-zoeken/va...,1021,5,1016,204.2
237,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.eot,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,979,1,978,979.0
314,/portal/page/portal/werk_nl/werknemer/uitkerin...,/portal/page/portal/home/diensten/aanvragen-ww,581,14,567,41.5
366,/werk_nl/werknemer/mijn_werkmap/doorgeven/wijz...,/werk_nl/werknemer/home,433,1,432,433.0
371,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap,422,4,418,105.5
379,/xdocs/ux-frontend/fonts/uwvsanlig-webfont.woff,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,401,2,399,200.5


In [16]:
def label_sessions(_df:pd.DataFrame, _patterns:pd.DataFrame) -> pd.DataFrame:
  """
  Find the transitions of the log that are the reverse of a given pattern.

  Parameters:
    _df: frame with the columns "SessionID" and "Activity", ordered within
      each session. It is not modified.
    _patterns: frame with the columns "Activity_0_x" and "Activity_1_x"
      describing a transition Activity_0_x -> Activity_1_x. It is not modified.

  Returns:
    One row per log transition whose (Activity, consecutive_1) equals
    (Activity_1_x, Activity_0_x) of a pattern, i.e. the pattern in swapped
    order. The row carries the log columns, "consecutive_1", "consecutive_2",
    the pattern columns and a boolean "Anomaly" column. Because the join is
    an inner join, "Anomaly" is True on every returned row.
  """
  # Next activity in the same session; 'end' marks the last click of a session.
  next_activity = _df.groupby('SessionID')['Activity'].shift(periods=-1).fillna('end')
  df = _df.assign(consecutive_1=next_activity)
  # Drop rows that are followed by the same activity. The explicit copy makes
  # the column assignment below act on an independent frame instead of a slice
  # of `df` (which raised SettingWithCopyWarning).
  df = df[df['Activity'] != df['consecutive_1']].copy()
  # Activity two steps ahead among the remaining rows of the session.
  df['consecutive_2'] = df.groupby('SessionID')['Activity'].shift(periods=-2).fillna('end')

  merged = pd.merge(df, _patterns, left_on=['Activity','consecutive_1'], right_on=['Activity_1_x','Activity_0_x'], how='inner',indicator='Anomaly')
  # The indicator column holds 'both' for rows present on both sides of the join.
  merged['Anomaly'] = merged['Anomaly'] == 'both'

  return merged

In [17]:
# Every transition of the log that matches one of the selected patterns in swapped order.
sessions = label_sessions(base_df, patters_df)
# Number of matching transitions (rows), not yet the number of sessions.
len(sessions)

161

In [18]:
# Distinct sessions that contain at least one matching transition.
sess = sessions['SessionID'].unique()
len(sess)

159

In [19]:
# Flag every click that belongs to one of the matching sessions. `isin` tests
# the whole column at once instead of scanning `sess` once per row.
labeled_df = base_df.copy()
labeled_df["anomaly"] = labeled_df["SessionID"].isin(sess)

In [24]:
import random


def label_80(ids:list[int], fraction:float=0.8, seed=None) -> list[int]:
  """
  Draw a random subset of the given ids without replacement.

  Parameters:
    ids: the ids to draw from.
    fraction: share of the ids to draw, between 0 and 1. The subset has
      int(fraction * len(ids)) elements; the default 0.8 keeps the original behaviour.
    seed: optional seed for a reproducible draw. With None (default) the
      shared generator of the `random` module is used, as before.

  Returns:
    The drawn ids as a new list.

  Raises:
    ValueError: if fraction is outside [0, 1].
  """
  if not 0 <= fraction <= 1:
    raise ValueError("fraction must be between 0 and 1")
  # A private generator keeps a seeded draw from disturbing the module-level state.
  generator = random if seed is None else random.Random(seed)
  sample = generator.sample(ids,k=int(fraction*len(ids)))
  return sample

# Draw the subset of matching sessions that will carry the anomaly label.
# NOTE(review): the draw is random, so the selection differs between runs
# unless the random generator is seeded — confirm whether that is intended.
sample = label_80(list(sess))

# Flag every click whose session was drawn. `isin` tests the whole column at
# once instead of scanning the list once per row.
labeled_sample = base_df.copy()
labeled_sample["anomaly"] = labeled_sample["SessionID"].isin(sample)
# The target directory has to exist already; to_csv does not create it.
labeled_sample.to_csv("./labeled_data/Swapped_Sample.csv",index=False)


In [None]:
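# Optional export (disabled): uncomment the lines below to save the fully labelled log and the selected patterns as CSV.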
# labeled_df.to_csv(f"./labeled_data/Swapped.csv",index=False)
# patters_df.to_csv(f"./gen_patterns/Swapped.csv",index=False)