In [102]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None
import warnings
# Swap the module attribute so every call made through ``warnings.warn`` is silenced.
# NOTE(review): this hides all such warnings, including deprecation notices —
# confirm that blanket suppression is really wanted.
warnings.warn = warn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import tree
from sklearn import tree
# import metrics
from sklearn import metrics

In [103]:
# Load the labelled event log, keeping only the columns listed in `usecols`.
# Undecodable bytes are ignored and malformed rows are skipped instead of raising.
df_raw = pd.read_csv(
    "labeled_data/skipped/015_5.csv",
    sep=",",
    encoding_errors="ignore",
    on_bad_lines="skip",
    usecols=[
        "CustomerID", "AgeCategory", "Gender", "Office_U", "Office_W",
        "SessionID", "IPID", "TIMESTAMP", "VHOST", "Activity", "PAGE_NAME",
        "REF_URL_category", "page_load_error", "page_action_detail", "tip",
        "service_detail", "xps_info", "anomaly",
    ],
)
# Parse the timestamps, then order the events chronologically within each session.
df_raw["TIMESTAMP"] = pd.to_datetime(df_raw["TIMESTAMP"], infer_datetime_format=True)
df_raw = df_raw.sort_values(["SessionID", "TIMESTAMP"]).copy()

In [104]:
# Count the distinct sessions that have at least one row flagged anomalous,
# and the distinct sessions that have at least one row flagged normal.
count_anomaly_raw = df_raw.loc[df_raw["anomaly"] == True, "SessionID"].nunique()
count_normal_raw = df_raw.loc[df_raw["anomaly"] == False, "SessionID"].nunique()
print(f'Amount of anomalous sessions in the dataset:   {count_anomaly_raw}')
print(f'Amount of normal sessions in the dataset:      {count_normal_raw}')
# Despite its label, this is the anomalous-to-normal ratio, not the anomalous
# share of all sessions.
distribution = count_anomaly_raw / count_normal_raw
print(f'Distribution:                                  {distribution * 100} %')

Amount of anomalous sessions in the dataset:   277
Amount of normal sessions in the dataset:      659993
Distribution:                                  0.04197014210756781 %


In [105]:
# Number of sessions kept for the experiment.
nr_of_sessions_used = 50000
# Keep the rows belonging to the first `nr_of_sessions_used` distinct SessionIDs, in the
# order they appear in df_raw. NOTE: this is a deterministic head of the data, not a
# random sample.
df_ = df_raw[df_raw["SessionID"].isin(df_raw["SessionID"].unique()[:nr_of_sessions_used])]

# Only use the columns that are needed -> SessionID, Activity, anomaly
df_50k = df_[["SessionID", "Activity", "anomaly"]]

In [106]:
# Same session counts as for the full dataset, now on the reduced frame df_50k.
count_anomaly = df_50k.loc[df_50k["anomaly"] == True, "SessionID"].nunique()
count_normal = df_50k.loc[df_50k["anomaly"] == False, "SessionID"].nunique()
print(f'Amount of anomalous sessions in the sampled dataset:   {count_anomaly}')
print(f'Amount of normal sessions in the sampled dataset:      {count_normal}')
# Anomalous-to-normal ratio (not the anomalous share of all sessions).
distribution = count_anomaly / count_normal
print(f'Distribution:                                          {distribution * 100} %')

Amount of anomalous sessions in the sampled dataset:   18
Amount of normal sessions in the sampled dataset:      49982
Distribution:                                          0.03601296466728022 %


Helper Functions

In [107]:
# Encode activity labels as integer codes.
def encode_activities(_df):
    """Fit a ``LabelEncoder`` on the given labels and return their encoded form.

    Parameters
    ----------
    _df : collection of activity labels that provides ``.copy()``
        (this notebook passes the list built by ``activities_list``).

    Returns
    -------
    The result of ``LabelEncoder.transform`` on the labels, aligned
    position-by-position with the input.

    NOTE(review): ``LabelEncoder`` numbers classes starting at 0, and
    ``df_for_model`` pads short sessions with 0, so the activity coded 0 cannot
    be told apart from padding — confirm whether an offset is needed.
    """
    labels = _df.copy()
    encoder = LabelEncoder().fit(labels)
    return encoder.transform(labels)

# Collect all distinct activities that occur in a frame.
def activities_list(_df):
  """Return the distinct values of the ``"Activity"`` column as a list.

  Parameters
  ----------
  _df : pandas.DataFrame with an ``"Activity"`` column. It is only read, never
      modified, so no defensive copy of the (potentially large) frame is made.

  Returns
  -------
  list
      The unique activity values, in order of first appearance.

  Raises
  ------
  KeyError
      If ``_df`` has no ``"Activity"`` column.
  """
  # list(...) keeps the same element objects that iterating the array would yield.
  return list(_df["Activity"].unique())

# Build a lookup that maps each activity to its encoded (integer label) value.
def create_dict(l1, l2):
  """Pair ``l1[i]`` (key) with ``l2[i]`` (value) for every position in ``l1``.

  ``l2`` must be at least as long as ``l1``; extra trailing items in ``l2`` are
  ignored. When a key occurs more than once, the last occurrence wins.
  """
  return {l1[pos]: l2[pos] for pos in range(len(l1))}

# Collapse an event log into one row per session.
def transform_session_to_sequence(_df):
  """Return a frame with columns ``SessionID``, ``Activity`` and ``anomaly``.

  ``Activity`` holds, per session, the list of that session's activities in row
  order. The ``anomaly`` value is joined back from the distinct
  (SessionID, anomaly) pairs of the input, so a session that carries more than
  one distinct anomaly value yields one output row per value.
  """
  events = _df.copy()
  # One list of activities per SessionID (groupby orders the session keys).
  sequences = events.groupby("SessionID")["Activity"].apply(list).reset_index()
  # Distinct (SessionID, anomaly) pairs supply the label for each session.
  labels = events[["SessionID", "anomaly"]].drop_duplicates()
  return sequences.merge(labels, on="SessionID")

# Build the model-ready frame: one column per sequence position.
def df_for_model(_df,Activity_col):
  """Expand the list-valued column ``Activity_col`` into positional columns.

  Each list is spread over columns ``0, 1, 2, ...``; sessions shorter than the
  longest one are padded, and every missing value in the result is replaced by
  0. The remaining columns of ``_df`` are kept and ``Activity_col`` is dropped.
  ``_df`` itself is left unchanged.
  """
  positions = pd.DataFrame(_df[Activity_col].values.tolist(), index=_df.index)
  combined = pd.concat([_df, positions], axis=1).drop(columns=[Activity_col])
  return combined.fillna(0)
  
# Combine the first real sessions with the first generated sessions.
def concat_df(df_gen, amount_real, amount_generated, _df1=None):
  """Stack rows of the first real sessions on top of the first generated ones.

  Parameters
  ----------
  df_gen : pandas.DataFrame of generated events with a ``"SessionID"`` column.
  amount_real : int
      How many distinct SessionIDs (in order of appearance) to keep from ``_df1``.
  amount_generated : int
      How many distinct SessionIDs (in order of appearance) to keep from ``df_gen``.
  _df1 : pandas.DataFrame, optional
      Real events with a ``"SessionID"`` column. When omitted, the notebook-level
      ``df_raw`` is used.

  Returns
  -------
  pandas.DataFrame
      Selected real rows followed by selected generated rows, with a fresh
      0..n-1 index. Neither input is modified.
  """
  if _df1 is None:
    # Look the global up at call time: a default of `df_raw` in the signature
    # would be frozen when the function is defined and go stale if df_raw is
    # reloaded in a later cell.
    _df1 = df_raw
  real_ids = _df1["SessionID"].unique()[:amount_real]
  generated_ids = df_gen["SessionID"].unique()[:amount_generated]
  # Boolean indexing already returns new frames, so no defensive copy is needed.
  real_rows = _df1[_df1["SessionID"].isin(real_ids)]
  generated_rows = df_gen[df_gen["SessionID"].isin(generated_ids)]
  return pd.concat([real_rows, generated_rows], ignore_index=True)

GLOBALS

In [108]:
# Global lookup tables shared by the cells below.
# Every distinct activity value found in df_raw.
ALL_ACTIVITIES = activities_list(df_raw)
# Encoded value of each activity, aligned position-by-position with ALL_ACTIVITIES.
ENCODED_ACTIVITIES = encode_activities(ALL_ACTIVITIES)
# Mapping: activity -> encoded value.
DICT = create_dict(ALL_ACTIVITIES, ENCODED_ACTIVITIES)
# Sanity check. It only prints on success: a length mismatch produces no output and
# does not stop the notebook.
if len(ALL_ACTIVITIES) == len(ENCODED_ACTIVITIES) == len(DICT):
  print("All lists are the same length")

All lists are the same length


Step-by-step plan:

1. df -> transform_session_to_sequence
2. Transform Activities to encoded Activities
3. Transform the list in the Activity column to multiple columns with fillna(0)

In [109]:
# Step 1: collapse the reduced event log into one row per session with its activity list.
base_data = transform_session_to_sequence(df_50k)

In [110]:
# Step 2: replace every activity in each session's list with its encoded value from DICT.
# NOTE(review): the column is overwritten in place, so re-running this cell on the
# already-encoded lists will presumably fail with KeyError — re-run the previous cell first.
base_data['Activity'] = base_data['Activity'].apply(lambda x: [DICT[i] for i in x])

In [111]:
# Step 3: spread each encoded list over one column per position; missing values become 0.
base_data = df_for_model(base_data, "Activity")

In [112]:
# Display the model-ready frame.
base_data

Unnamed: 0,SessionID,anomaly,0,1,2,3,4,5,6,7,...,355,356,357,358,359,360,361,362,363,364
0,46,False,372,371.0,392.0,391.0,391.0,391.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92,False,369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,358,False,391,372.0,401.0,391.0,391.0,391.0,391.0,369.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,420,False,372,372.0,372.0,372.0,372.0,372.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,447,False,391,391.0,391.0,391.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,5486344,False,372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49996,5486405,False,372,367.0,367.0,367.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49997,5486408,False,367,391.0,371.0,372.0,372.0,372.0,372.0,372.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49998,5486558,False,542,391.0,391.0,391.0,391.0,391.0,391.0,391.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Functions to run the model

In [113]:
# Split a model-ready frame into train and test partitions.
def split_data(_df, test_size=0.2, random_state=42, stratify=False):
  """Separate features from the ``"anomaly"`` label and split into train/test.

  Parameters
  ----------
  _df : pandas.DataFrame containing an ``"anomaly"`` column; every other column
      is used as a feature. The frame is not modified.
      NOTE(review): this includes ``SessionID`` when it is present, so the model
      can learn from the identifier itself — confirm that this is intended.
  test_size : float or int, default 0.2
      Forwarded to ``train_test_split``.
  random_state : int or None, default 42
      Forwarded to ``train_test_split`` for a reproducible shuffle.
  stratify : bool, default False
      When True, the split preserves the label proportions (``stratify=y``).
      This helps when anomalies are rare, because a plain random split may
      leave very few of them in one partition. The default keeps the original
      unstratified behaviour.

  Returns
  -------
  tuple
      ``(X_train, X_test, y_train, y_test)``.
  """
  # `drop` and column selection return new objects, so no defensive copy is needed.
  X = _df.drop(columns=["anomaly"])
  y = _df["anomaly"]
  X_train, X_test, y_train, y_test = train_test_split(
      X, y,
      test_size=test_size,
      random_state=random_state,
      stratify=y if stratify else None,
  )
  return X_train, X_test, y_train, y_test

In [114]:
# Baseline train/test partitions built from the real sessions in base_data only.
X_train, X_test, y_train, y_test = split_data(base_data)

In [115]:
# Encode the anomaly labels as integers. The encoder is fitted on the training
# labels only and then applied to both partitions.
Le = LabelEncoder().fit(y_train)
y_train_transformed = Le.transform(y_train)
y_test_transformed = Le.transform(y_test)
# Show the encoded training labels as the cell output.
y_train_transformed

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

Run the model

In [116]:
# Locations of the generated-session CSV files to evaluate.
# Un-comment a file name to include it in the run.
base_path = "gen_sessions/skipped/"
gen_sessions_paths = [
  base_path + file_name
  for file_name in (
    # '5_1000.csv',
    # '10_1000.csv',
    # '15_1000.csv',
    # '25_1000.csv',
    # '50_1000.csv',
    # '80_1000.csv',
    '100_1000.csv',
  )
]

In [119]:
# For every generated-session file: build a model-ready frame of generated anomalies,
# then take growing slices of it. The training and evaluation steps further down are
# commented out, so this loop currently only prints each slice.
for sessions in gen_sessions_paths:
    # Result collectors, reset for each file; nothing in the active code fills them.
    amount_anomalies_list = []
    precision_score_list = []
    recall_score_list = []
    
    # build the dataFrame
    cvs = pd.read_csv(sessions)
    # Every generated session is labelled as an anomaly.
    cvs['anomaly'] = True
    gen_df_seq = transform_session_to_sequence(cvs)
    # Encode activities with the lookup built from df_raw; an activity that is not a
    # key of DICT raises KeyError here.
    gen_df_seq['Activity'] = gen_df_seq['Activity'].apply(lambda x: [DICT[i] for i in x])
    ready_df = df_for_model(gen_df_seq, "Activity")
    # change the sessionID to something useable
    # first remove everything before the last _
    ready_df['SessionID'] = ready_df['SessionID'].apply(lambda x: x.split('_')[-1])
    ready_df['SessionID'] = ready_df['SessionID'].astype(int)
    # NOTE(review): magic offset — presumably chosen so generated IDs cannot collide
    # with real SessionIDs; confirm and consider a named constant.
    ready_df['SessionID'] = ready_df['SessionID'] + 99999124
    
    
    # Number of generated anomalies to use in each experiment.
    for amount_gen in [10, 20, 50, 65, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]:
        # Not used by the active code below.
        amount_real = 0
        # First `amount_gen` rows; a slice beyond the frame's length just returns fewer rows.
        df_gen = ready_df[:amount_gen]
        print(df_gen)
        # NOTE(review) for the disabled code below: split_data keeps SessionID as a feature
        # and the generated sessions get offset IDs, so a tree could separate generated from
        # real rows by SessionID alone — confirm before re-enabling. The X_test column fixes
        # also modify X_test in place, which would carry over between iterations.
        #Add the generated anomalies to the training dataset
        # X_train_extra = pd.concat([X_train, df_gen.drop(columns=["anomaly"])]).fillna(0)
        # y_train_extra = pd.concat([y_train, df_gen["anomaly"]])
        
        # #transform y values to binary class
        # Le = LabelEncoder()
        # Le.fit(y_train_extra)
        # y_train_transformed = Le.transform(y_train)
        # y_test_transformed = Le.transform(y_test)
        # y_train_extra_transformed = Le.transform(y_train_extra)
        
        # #Make sure that both dataframes have the same columns
        # for column_name in X_train_extra.columns:
        #     if column_name not in X_test.columns:
        #         X_test[column_name] = 0

        # #Make sure that both dataframes have the same columns
        # for column_name in X_test.columns:
        #     if column_name not in X_train_extra.columns:
        #         X_test.drop(columns=[column_name], inplace=True)
                
        # clf = tree.DecisionTreeClassifier()
        # clf = clf.fit(X_train_extra, y_train_extra_transformed)
        # y_test_predict = clf.predict(X_test)
                
        # print(f'Session: {sessions}')
        # print(f'Amount of injected anomalies: {amount_gen}')
        # print("Accuracy:",metrics.accuracy_score(y_test, y_test_predict))
        # balanced_accuracy_score = metrics.balanced_accuracy_score(y_test, y_test_predict)
        # print("Balanced accuracy:", balanced_accuracy_score)
    
       
        


   SessionID  anomaly    0      1      2      3      4      5      6      7  \
0   99999124     True  542  391.0  391.0  391.0  391.0  401.0  401.0  391.0   
1   99999125     True  367   33.0  374.0  374.0  374.0  374.0  374.0  374.0   
2   99999134     True  367   33.0  372.0  371.0  367.0    0.0    0.0    0.0   
3   99999224     True  367  541.0  541.0  541.0  541.0  541.0  778.0   37.0   
4   99999225     True  372  372.0  372.0  372.0  372.0  367.0   33.0   33.0   
5   99999226     True  367   33.0   33.0  372.0  372.0  372.0  372.0  372.0   
6   99999227     True  367   33.0  372.0  372.0  371.0  365.0  365.0  365.0   
7   99999228     True  367  398.0  398.0  398.0  398.0  398.0  372.0  372.0   
8   99999229     True  398  398.0  398.0  398.0  398.0  367.0   33.0   33.0   
9   99999230     True  369  372.0  372.0  372.0  367.0   33.0    0.0    0.0   

   ...   96   97   98   99  100  101  102  103  104  105  
0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
1  ...  0.0 