In [41]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# imprt tree
from sklearn import tree
# import metrics
from sklearn import metrics

In [42]:
df_raw = pd.read_csv("labeled_data/skipped/015_5.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",
                    usecols=[ 'CustomerID', 'AgeCategory', 'Gender', 'Office_U', 'Office_W',
                    'SessionID', 'IPID', 'TIMESTAMP', 'VHOST', 'Activity', 'PAGE_NAME',
                    'REF_URL_category', 'page_load_error', 'page_action_detail', 'tip',
                    'service_detail', 'xps_info','anomaly'])
df_raw["TIMESTAMP"] = pd.to_datetime(df_raw["TIMESTAMP"], infer_datetime_format=True)
df_raw = df_raw.sort_values(by=["SessionID", "TIMESTAMP"]).copy()

  df_raw = pd.read_csv("labeled_data/skipped/015_5.csv", encoding_errors="ignore", on_bad_lines='skip', sep=",",


In [43]:
df_raw.head()

Unnamed: 0,CustomerID,AgeCategory,Gender,Office_U,Office_W,SessionID,IPID,TIMESTAMP,VHOST,Activity,PAGE_NAME,REF_URL_category,page_load_error,page_action_detail,tip,service_detail,xps_info,anomaly
0,1837760,50-65,M,247,247,46,1419591,2015-11-06 08:07:22.780,digid.werk.nl,/werk_nl/werknemer/mijn_werkmap/doorgeven/taken,taken,,0,,,,,False
1,1837760,50-65,M,247,247,46,1419591,2015-11-06 08:07:40.767,digid.werk.nl,/werk_nl/werknemer/mijn_werkmap/doorgeven/mijn...,mijn_sollicitaties,,0,,,,,False
2,1837760,50-65,M,247,247,46,1419591,2015-11-06 08:07:51.390,digid.werk.nl,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_d...,mijn_documenten,,0,,,,,False
3,1837760,50-65,M,247,247,46,1419591,2015-11-06 08:08:06.003,digid.werk.nl,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,mijn_berichten,,0,,,,,False
4,1837760,50-65,M,247,247,46,1419591,2015-11-06 08:08:19.343,digid.werk.nl,/werk_nl/werknemer/mijn_werkmap/postvak/mijn_b...,mijn_berichten,,0,,,,,False


Dataset 

In [44]:
# count the number of unique SessionID where anomaly == True
count_anomaly_raw = df_raw[df_raw["anomaly"] == True]["SessionID"].nunique()
count_normal_raw = df_raw[df_raw["anomaly"] == False]["SessionID"].nunique()
print(f'Amount of anomalous sessions in the dataset:   {count_anomaly_raw}' )
print(f'Amount of normal sessions in the dataset:      {count_normal_raw}')
distribution =  count_anomaly_raw / count_normal_raw 
print(f'Distribution:                                  {distribution * 100} %')

Amount of anomalous sessions in the dataset:   277
Amount of normal sessions in the dataset:      659993
Distribution:                                  0.04197014210756781 %


To make the calculations work on my laptop I need to take a sample of the dataset.

For now I take the first 50.000 sessions

In [45]:
nr_of_sessions_used = 50000
# get a random sample of the sessions
df_ = df_raw[df_raw["SessionID"].isin(df_raw["SessionID"].unique()[:nr_of_sessions_used])]

# Only use the columns that are needed -> SessionID, Activity, anomaly
df_50k = df_[["SessionID", "Activity", "anomaly"]]


In [46]:
# count the number of unique SessionID where anomaly == True
count_anomaly = df_50k[df_50k["anomaly"] == True]["SessionID"].nunique()
count_normal = df_50k[df_50k["anomaly"] == False]["SessionID"].nunique()
print(f'Amount of anomalous sessions in the sampled dataset:   {count_anomaly}')
print(f'Amount of normal sessions in the sampled dataset:      {count_normal}')
distribution =  count_anomaly / count_normal 
print(f'Distribution:                                          {distribution * 100} %')

Amount of anomalous sessions in the sampled dataset:   18
Amount of normal sessions in the sampled dataset:      49982
Distribution:                                          0.03601296466728022 %


Helper functions

In [47]:
# function to encode the activities
def encode_activities(_df):
    df = _df.copy()
    le = LabelEncoder()
    le.fit(df["Activity"])
    df["Activity"] = le.transform(df["Activity"])
    return df

In [48]:
# function to use one hot encoding on a sequence of activities
def oneHot_encoder(_sequence):
  sequence = np.array(_sequence)
  oneHot_encoder = OneHotEncoder(sparse=False)
  sequence = sequence.reshape(len(sequence), 1)
  oneHot_encoded = oneHot_encoder.fit_transform(sequence)
  return oneHot_encoded


In [49]:
seq = encode_activities(df_50k)

# count the amount unique activities
count_activities_without_encoding = df_50k["Activity"].nunique()
count_activities = seq["Activity"].nunique()
print(f'Amount of unique activities without encoding: {count_activities_without_encoding}')
print(f'Amount of unique activities with encoding:    {count_activities}')
if count_activities_without_encoding == count_activities:
  print("The encoding is correct")
else:
  print("The encoding is not correct")
seq.head()

Amount of unique activities without encoding: 430
Amount of unique activities with encoding:    430
The encoding is correct


Unnamed: 0,SessionID,Activity,anomaly
0,46,163,False
1,46,162,False
2,46,182,False
3,46,181,False
4,46,181,False


In [50]:
# function to create a list of al the activities
def activities_list(_df):
  df = _df.copy()
  activities = []
  for i in df["Activity"].unique():
    activities.append(i)
  return activities

act = activities_list(seq)

In [51]:
# function to create a dictionary with the activities as keys and the one hot encoded activities as values
def create_dict(_activities):
  dic = {}
  activities = _activities.copy()
  encoded = oneHot_encoder(activities)
  for i in range(len(activities)):
    dic[activities[i]] = encoded[i]
  return dic

In [52]:
encoded_dict = create_dict(act)
len(encoded_dict)

430

In [53]:
# function to transform each session into a sequence of activities
def transform_session_to_sequence(_df):
  df = _df.copy()
  df_ses = df.groupby("SessionID")["Activity"].apply(list).reset_index()
  # merge the anomaly column
  df_ses = df_ses.merge(df[["SessionID", "anomaly"]].drop_duplicates(), on="SessionID")
  return df_ses

In [54]:
sessions_sequenced_df = transform_session_to_sequence(seq)


# count the number of anomaly == True
count_encoded_sequences = sessions_sequenced_df['anomaly'].value_counts()[True]
if count_encoded_sequences == count_anomaly:
  print("The encoding is correct")
else:
  print("The encoding is not correct")
sessions_sequenced_df

The encoding is correct


Unnamed: 0,SessionID,Activity,anomaly
0,46,"[163, 162, 182, 181, 181, 181]",False
1,92,[161],False
2,358,"[181, 163, 190, 181, 181, 181, 181, 161, 159]",False
3,420,"[163, 163, 163, 163, 163, 163]",False
4,447,"[181, 181, 181, 181]",False
...,...,...,...
49995,5486344,[163],False
49996,5486405,"[163, 159, 159, 159]",False
49997,5486408,"[159, 181, 162, 163, 163, 163, 163, 163, 163, ...",False
49998,5486558,"[297, 181, 181, 181, 181, 181, 181, 181, 181, ...",False


In [55]:
# function to transform the sequences into one hot encoded sequences
sessions_sequenced_df['Activities_Encoded'] = sessions_sequenced_df['Activity']

# sessions_sequenced_df['Activities_Encoded'] = sessions_sequenced_df['Activity'].apply(lambda x: [encoded_dict[i] for i in x])    

In [56]:

final_df = sessions_sequenced_df.copy()
# Drop columns that are not needed anymore: Activity
final_df = final_df.drop(columns=["Activity"])


In [57]:
final_df

Unnamed: 0,SessionID,anomaly,Activities_Encoded
0,46,False,"[163, 162, 182, 181, 181, 181]"
1,92,False,[161]
2,358,False,"[181, 163, 190, 181, 181, 181, 181, 161, 159]"
3,420,False,"[163, 163, 163, 163, 163, 163]"
4,447,False,"[181, 181, 181, 181]"
...,...,...,...
49995,5486344,False,[163]
49996,5486405,False,"[163, 159, 159, 159]"
49997,5486408,False,"[159, 181, 162, 163, 163, 163, 163, 163, 163, ..."
49998,5486558,False,"[297, 181, 181, 181, 181, 181, 181, 181, 181, ..."


In [58]:
# transform the list to multiple columns
final_df_with_col = pd.DataFrame(final_df.Activities_Encoded.values.tolist(), index= final_df.index)

In [59]:
# concat the columns to the dataframe
df_for_model = pd.concat([final_df, final_df_with_col], axis=1)
# drop the Activities_Encoded column
df_for_model = df_for_model.drop(columns=["Activities_Encoded"])
df_for_model.fillna(0, inplace=True)

In [60]:
# Function to split the data into train and test data
def split_data(_df):
  df = _df.copy()
  X = df.drop(columns=["anomaly"])
  y = df["anomaly"]
  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
  return X_train, X_test, y_train, y_test

In [61]:
X_train, X_test, y_train, y_test = split_data(df_for_model)

In [62]:
Le = LabelEncoder()
Le.fit(y_train)
y_train_transformed = Le.transform(y_train)
y_test_transformed = Le.transform(y_test)
y_train_transformed

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [63]:
#Make sure that both dataframes have the same columns
for column_name in X_train.columns:
    if column_name not in X_test.columns:
        X_test[column_name] = 0

  #Make sure that both dataframes have the same columns
for column_name in X_test.columns:
    if column_name not in X_train.columns:
        X_test.drop(columns=[column_name], inplace=True)

In [64]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)



In [65]:
y_test_predict = clf.predict(X_test)



In [66]:
print("Accuracy:",metrics.accuracy_score(y_test, y_test_predict))

Accuracy: 0.9988


In [67]:
balanced_accuracy_score = metrics.balanced_accuracy_score(y_test, y_test_predict)
print("Balanced accuracy:", balanced_accuracy_score)

Balanced accuracy: 0.49954986495948783
