In [1]:
import pandas as pd
from collections import Counter
import numpy as np

In [2]:
# base_df = pd.read_csv("../no_repeated.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",)
base_df = pd.read_csv("../BPI2016_Clicks_Logged_In.csv", encoding_errors="ignore", on_bad_lines='skip', sep=";",usecols=['SessionID', 'TIMESTAMP', 'URL_FILE', ])
base_df.rename(columns={'URL_FILE':'Activity'}, inplace=True)
base_df["TIMESTAMP"] = pd.to_datetime(base_df["TIMESTAMP"], infer_datetime_format=True)
base_df = base_df.sort_values(["SessionID", "TIMESTAMP"])
base_df.head()

Unnamed: 0,SessionID,TIMESTAMP,Activity
3273278,46,2015-11-06 08:07:22.780,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken
5333642,46,2015-11-06 08:07:40.767,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...
3733243,46,2015-11-06 08:07:51.390,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...
5904405,46,2015-11-06 08:08:06.003,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...
5573282,46,2015-11-06 08:08:19.343,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...


In [3]:
# Helper functions
def create_df_for_pivot(_df: pd.DataFrame):
  df = _df.copy()
  #Create consecutive column with the consecutive activity and count how many time each pair occurs
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)

  #Count all the pairs and safe in new DataFrame
  df_for_pivot_1 = pd.DataFrame(Counter(list(zip(df['Activity'], df['Consecutive_1'],))), index=["value"]).T.reset_index()

  df_for_pivot_1.fillna("end_session", inplace=True)
  return df_for_pivot_1
  
  
def make_pivot(_df, index_names, column_names):
  df = _df.copy()
  """
  Functions that takes in a DataFrame and returns a pivot table with all the chances
  
  Create pivot table where chances are calculated that each row is succeded by activity that is represented in the column
  """
  df_chances = df.pivot_table(index=index_names, columns=column_names, values='value')
  
  #Calculate what the probabilities are by summing the row and dividing all the values in the row by total sum of the row
  df_chances["total_row_count"] = df_chances.sum(axis=1)
  df_chances = df_chances.div(df_chances["total_row_count"], axis=0)
  
  df_chances.fillna(0, inplace=True)
  df_chances.drop("total_row_count", axis=1, inplace=True)

  return df_chances

def calculate_paths_in_sessions(_df:pd.DataFrame,paths:list[str]) -> tuple[int,int]:
  # this function calculates in how many sessions a path is present
  # a path is a series of transitions
  # a transition is two consecutive activities where the 2 activities are not the same
  # we return a tuple where the first value is the total amount of times the path is found and
  # the second value is the total amount of sessions where the path is found
  # group the dataframe by session id
  df = _df.copy()
  # create a new column that contains the consecutive activity
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)
  # if the consecutive activity is the same as the current activity, remove the row
  # this is done because we only want to count transitions
  df = df[df["Activity"] != df["Consecutive_1"]]
  # remove the consecutive activity column
  df = df.drop("Consecutive_1", axis=1)
  # get the length of the path
  len_path = len(paths)
  # create a column for len_path - 1 consecutive activities
  df = df.assign(**{f"Activity_{i}": df.groupby("SessionID")["Activity"].shift(periods=-i) for i in range(1, len_path)})
  # change name of Activiy column to Activity_0
  df.rename(columns={'Activity':'Activity_0'}, inplace=True)
  # create a column "Found" that contains True if the path is found in the session
  df['Found'] = df.apply(lambda row: all(row[f"Activity_{i}"] == paths[i] for i in range(len_path)), axis=1)
  total = df['Found'].sum()
  sessions = df[df['Found'] == True]['SessionID'].nunique()
  return total,sessions


def all_paths_len_x(_df:pd.DataFrame,len):
  df = _df.copy()
  df["Consecutive_1"] = df.groupby("SessionID")["Activity"].shift(periods=-1)
  # if the consecutive activity is the same as the current activity, remove the row
  # this is done because we only want to count transitions
  df = df[df["Activity"] != df["Consecutive_1"]]
  # remove the consecutive activity column
  df = df.drop("Consecutive_1", axis=1)
  # create a column for len_path - 1 consecutive activities
  df = df.assign(**{f"Activity_{i}": df.groupby("SessionID")["Activity"].shift(periods=-i) for i in range(1, len)})
  # change name of Activiy column to Activity_0
  df.rename(columns={'Activity':'Activity_0'}, inplace=True)
  # create a column that counts the amount of times the path is found
  df = df.groupby([f"Activity_{i}" for i in range(len)]).size().reset_index(name='count')
  total_count = df['count'].sum()
  df['probability'] = df['count'] / total_count
  return df
  
  

def activity_prob_df(_df:pd.DataFrame) -> pd.DataFrame:
  # group the dataframe by activity and count the number of times each activity occurs
  df = _df.copy()
  total_amount_of_activities = df['Activity'].count()
  # create df where each row contains the activity with columns: Activity, Count, Probability
  final_df = df.groupby('Activity').count().reset_index()
  # Drop column Timestamp
  final_df = final_df.drop('TIMESTAMP', axis=1)
  # change SessionID column name to Count
  final_df.rename(columns={'SessionID':'Count'}, inplace=True)
  # create Probability column
  final_df = final_df.assign(Probability=final_df['Count']/total_amount_of_activities * 100)
  
  return final_df.sort_values('Probability', ascending=False)
  


In [4]:
# Create df for pivot
df_for_pivot = create_df_for_pivot(base_df)
df_chances_1 = make_pivot(df_for_pivot, "level_0", "level_1")

#Calculate the probability of each activity that it is the first activity performed in the session
df_chances_1["start_session_chance"] = base_df.groupby("SessionID").nth(0)["Activity"].value_counts() / base_df.groupby("SessionID").nth(0)["Activity"].value_counts().sum()
df_chances_1.fillna(0, inplace=True)

df_chances_total = df_chances_1 
df_chances_total

#Split the start probabilities from the normal DataFrame
start_chances = df_chances_total["start_session_chance"][df_chances_total["start_session_chance"] > 0]

#Create df with all chances except the starting chance.
final_df = df_chances_total.iloc[:, :-1]
final_df
probability_matrix = final_df.copy()

In [5]:
paths_len_3 = all_paths_len_x(base_df,3)


In [6]:
work_df = paths_len_3.copy()

In [7]:
# Create a column a->c
work_df["a->c"] = work_df["Activity_0"] + "->" + work_df["Activity_2"]
work_df.rename(columns={'count':'a-b-c'}, inplace=True)
# count how many times a->c occurs
work_df['a-c-count'] = work_df.groupby('a->c')['a->c'].transform('count')
work_df = work_df[work_df['Activity_0'] != work_df['Activity_2']]
work_df = work_df.sort_values(['a-c-count','a-b-c'], ascending=False)
#  create df maybe patterns with the rows where a-b-c < 5 and a-c-count > 100
maybe_patterns = work_df[(work_df['a-b-c'] < 2) & (work_df['a-c-count'] > 120)]

In [8]:
#  get the unique values of a->c from maybe_patterns
a_c_uni = maybe_patterns['a->c'].unique()
a_c_df = pd.DataFrame(a_c_uni, columns=['a->c'])

# create column replace that contains a list of the 3 highest count of a->c from work_df
maybe_patterns['replace'] = maybe_patterns['a->c'].apply(lambda x: work_df[work_df['a->c'] == x].head(3)['Activity_1'].tolist())
maybe_patterns
# get unique values of a->c from maybe_patterns
uni_a_c = maybe_patterns['a->c'].unique()
uni_a_c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maybe_patterns['replace'] = maybe_patterns['a->c'].apply(lambda x: work_df[work_df['a->c'] == x].head(3)['Activity_1'].tolist())


array(['/werk_nl/werknemer/mijn_werkmap/doorgeven/taken->/werk_nl/werknemer/home',
       '/werk_nl/werknemer/mijn_werkmap/postvak/mijn_berichten->/werk_nl/werknemer/home'],
      dtype=object)

In [9]:
patterns = maybe_patterns.copy()
# drop columns a-b-c probability a-c-count a->c
patterns = patterns.drop(['a-b-c','probability','a-c-count','a->c'], axis=1)
uniq_a = patterns['Activity_2'].unique()
patterns

Unnamed: 0,Activity_0,Activity_1,Activity_2,replace
23186,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,//werk_nl/werknemer/home,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/mij...
23315,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/contact,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/mij...
23318,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/eintake/...,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/mij...
23360,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/over-wer...,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/mij...
23365,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,/portal/page/portal/werk_nl/werknemer/over-wer...,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/mij...
...,...,...,...,...
34383,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsimage/wdo_014521,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/tak...
34407,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsitem/wdo212424,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/tak...
34409,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/xpsitem/wdo_013142,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/tak...
34418,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,/zoeken_portlet/ajax/addVacatureToGereageerd,/werk_nl/werknemer/home,[/werk_nl/werknemer/mijn_werkmap/doorgeven/tak...


A session is anomalous if there is a transition Activity_0 -> Activity_1 -> Activity_2

In [10]:
def label_sessions(_df:pd.DataFrame, _patterns:pd.DataFrame) -> list[int]:
  df = _df.copy()
  patterns = _patterns.copy()
  # create consecutive_1 and consecutive_2 columns
  df['consecutive_1'] = df.groupby('SessionID')['Activity'].shift(periods=-1).fillna('end')
  # drop rows where Activity == consecutive_1
  df = df[df['Activity'] != df['consecutive_1']]
  df['consecutive_2'] = df.groupby('SessionID')['Activity'].shift(periods=-2).fillna('end')
  
  merged = pd.merge(df, patterns, left_on=['Activity','consecutive_1','consecutive_2'], right_on=['Activity_0','Activity_1','Activity_2'], how='inner',indicator='Anomaly')
  merged['Anomaly'] = np.where(merged.Anomaly == 'both', True, False)
  
  anomalous_session = merged[merged['Anomaly'] == True]
  # create a list with all SessionIDs that have an anomaly
  anomaly_sessions = anomalous_session['SessionID'].unique()
  
  return anomaly_sessions

In [11]:
anomaly_session_id = label_sessions(base_df, patterns)

print(anomaly_session_id)

[ 1563214  1563748  2009843  2164924  3857794  5083131  5261268  5706837
  7104669  7467278  7661136  9066187  9353495  9422897 10087017 10427261
 11162910 11196944 11422582 12100185 12484676 14428843 14685509 14845137
 15963234 16102568 16644247 16680330 17135049 17307904 17933910 18133772
 20541787 20770933 21314355 22129304 22189735 22776487 23704865 24124045
 25356592 28738492 29296715 33395489 33788706 35156887 35660983 35677805
 36382646 36699905 37529855 37565367 37581361 38270377 38339524 38377127
 38890920 39132798 39502182 39524098 40761809 40777068 41021741 41162060
 41496946 41680367 41940114 42830571 42897415 43299910 45083723 46198574
 46336030 46528348 46658644 46951123 47471031 47615131 47855937 48337625
 49161707 49355033 50276849 50557986 50596927 51539690 52144141 52413316
 52794414 53071927 53723406]


In [12]:
len(anomaly_session_id)

91

In [13]:
import random


def label_80(ids:list[int]) -> list[int]:
  sample = random.sample(ids,k=int(0.8*len(ids)))
  return sample

sample = label_80(list(anomaly_session_id))
len(sample)
  
labeled_sample = base_df.copy()
labeled_sample["anomaly"] = labeled_sample["SessionID"].apply(lambda x: x in sample)
print('done')
labeled_sample.to_csv(f"./labeled_data/Replaced_Sample.csv",index=False)

done


In [14]:
# # create a new column in the base_df to indicate if the session is anomalous
# # if a session is anomalous it will have a True in the column else False
# labeled_df = base_df.copy()
# labeled_df["anomaly"] = base_df["SessionID"].apply(lambda x: x in anomaly_session_id)

In [15]:
# # save in csv
# labeled_df.to_csv(f"./labeled_data/Replaced.csv",index=False)
# patterns.to_csv(f"./gen_patterns/Replaced.csv",index=False)