## **PROBLEM**

The Problem of Anomaly Detection is divided into two parts:

1. Given a network, we need a program to find anomalies in that particular network, based on some previous known data [Training Data] - **SUPERVISED LEARNING**

2. Given a network, we need a program to find anomalies in that particular network, without any previous known data [Testing Data] - **UNSUPERVISED LEARNING**

We will use the following dataset for this problem: https://www.kaggle.com/datasets/sampadab17/network-intrusion-detection/code

**Importing Libraries**

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree  import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import RFE
import itertools
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import plotly.express as px
import plotly.graph_objects as go

### **Reading Data**

In [2]:
train=pd.read_csv(r'E:\Programming\Machine Learning\Network-Anomaly-Detection-main\archive\Train_data.csv')

In [3]:
test=pd.read_csv(r'E:\Programming\Machine Learning\Network-Anomaly-Detection-main\archive\Test_data.csv')

In [4]:
train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,...,255,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0
1,0,tcp,private,REJ,0,0,0,0,0,0,...,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,29,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71


### **Data Preprocessing**

In [6]:
# Compare columns in train and test
print('Columns in train but not in test:', set(train.columns)-set(test.columns))
print('Columns in test but not in train:', set(test.columns)-set(train.columns))

Columns in train but not in test: {'class'}
Columns in test but not in train: set()


**Checking Null Values**

In [7]:
for i in train.columns:
    # Check Any NA Values
    if train[i].isnull().sum() > 0:
        print(i, train[i].isnull().sum())

In [8]:
for i  in test.columns:
    # Check Any NA Values
    if test[i].isnull().sum() > 0:
        print(i, test[i].isnull().sum())

No Null Values in Train Dataset as well as Test Dataset

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25192 entries, 0 to 25191
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     25192 non-null  int64  
 1   protocol_type                25192 non-null  object 
 2   service                      25192 non-null  object 
 3   flag                         25192 non-null  object 
 4   src_bytes                    25192 non-null  int64  
 5   dst_bytes                    25192 non-null  int64  
 6   land                         25192 non-null  int64  
 7   wrong_fragment               25192 non-null  int64  
 8   urgent                       25192 non-null  int64  
 9   hot                          25192 non-null  int64  
 10  num_failed_logins            25192 non-null  int64  
 11  logged_in                    25192 non-null  int64  
 12  num_compromised              25192 non-null  int64  
 13  root_shell      

**Further Processing**

In [10]:
# Get Numeric Columns
numeric_columns = train.select_dtypes(include=[np.number]).columns
other_columns = train.columns.difference(numeric_columns)

In [11]:
len(numeric_columns), len(other_columns)

(38, 4)

In [12]:
print("NUMERIC COLUMNS:", numeric_columns)
print("OTHER COLUMNS:", other_columns)

NUMERIC COLUMNS: Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')
OTHER COLUMNS: Index(['class', 'flag', 'protocol_type', 'service'], dtype='object')


In [13]:
output_column = 'class'
other_columns = other_columns.difference([output_column])

In [14]:
numeric_columns, other_columns, output_column

(Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
        'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'],
       dtype='object'),
 Index(['flag', 'protocol_type', 'service'], dtype='object'),
 'class')

**One Hot Encoding**

In [15]:
# Number of Distinct Values in Each Column
for i in other_columns:
    print(i, " -> Number of Distinct Values:", train[i].nunique())

flag  -> Number of Distinct Values: 11
protocol_type  -> Number of Distinct Values: 3
service  -> Number of Distinct Values: 66


In [16]:
train = pd.get_dummies(train, columns=other_columns, drop_first=True)

In [17]:
train.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,service_systat,service_telnet,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois
0,0,491,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,0,146,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,0,232,8153,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
4,0,199,420,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False


In [18]:
train.columns

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       ...
       'service_systat', 'service_telnet', 'service_tim_i', 'service_time',
       'service_urh_i', 'service_urp_i', 'service_uucp', 'service_uucp_path',
       'service_vmnet', 'service_whois'],
      dtype='object', length=116)

**TRAIN TEST SPLIT**

In [19]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('class', axis=1), train['class'], test_size=0.2, random_state=42, stratify=train['class'])

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20153, 115), (5039, 115), (20153,), (5039,))

**NORMALISATION**

In [21]:
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [22]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20153, 115), (5039, 115), (20153,), (5039,))

In [23]:
X_train[0]

array([0.00000000e+00, 7.72839861e-07, 2.47913911e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [24]:
y_train[0]

'normal'

**Visualizing in 3-Dimensions**

In [25]:
# Visualize the Data in 3 dimensions using TSNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3, random_state=42)
X_train_3d_tsne = tsne.fit_transform(X_train)

X_train_3d_tsne.shape

(20153, 3)

In [26]:
colors = ['red' if i == 'normal' else 'blue' for i in y_train]

In [27]:
# Visualize the Data in 3 dimensions via Plotly using TSNE, Mark y_train[i] = 'normal' as red and y_train[i] = 'anomaly' as blue
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=X_train_3d_tsne[:, 0], y=X_train_3d_tsne[:, 1], z=X_train_3d_tsne[:, 2], mode='markers', marker=dict(color=colors)))
fig.show()

Clear Pattern Seems to be Visible.

In [28]:
# Visualize the Data in 2 dimensions using TSNE
tsne = TSNE(n_components=2, random_state=42)
X_train_2d_tsne = tsne.fit_transform(X_train)

X_train_2d_tsne.shape

(20153, 2)

In [29]:
# Visualize the Data in 2 dimensions via Plotly using TSNE, Mark y_train[i] = 'normal' as red and y_train[i] = 'anomaly' as blue
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_train_2d_tsne[:, 0], y=X_train_2d_tsne[:, 1], mode='markers', marker=dict(color=colors)))
fig.show()

### **UNSUPERVISED TECHNIQUES BASED ON ALL FEATURES**

We will use 3 techniques for anomaly detection [**Out of which 2 Dont work for this problem**]:

1. **Isolation Forest** : It is an unsupervised learning algorithm that belongs to the ensemble decision trees family. It is based on the Decision Tree algorithm. In these methods, the decision tree is used to isolate each observation. This method is highly useful for detecting anomalies in the data - $Wont$ $work$ as in our data, anomalies are not in minority.

2. **Local Outlier Factor** : The LOF algorithm is an unsupervised outlier detection method which computes the local density deviation of a given data point with respect to its neighbors. It considers as outlier samples that have a substantially lower density than their neighbors - $Wont$ $work$ as in our data, anomalies are not in minority.

3. **DBSCAN** : DBSCAN is a density-based clustering non-parametric algorithm: given a set of points in some space, it groups together points that are closely packed together (points with many nearby neighbors), marking as outliers points that lie alone in low-density regions (whose nearest neighbors are too far away) - $Can$ $work$ as it is based on density and can separate anomalies from normal data.

### **ISOLAION FOREST**

**Spoiler Alert:** Poor Output Below

In [30]:
y_train.value_counts()

class
normal     10759
anomaly     9394
Name: count, dtype: int64

In [31]:
y_test.value_counts()

class
normal     2690
anomaly    2349
Name: count, dtype: int64

In [32]:
# ISOLATION FOREST
from sklearn.ensemble import IsolationForest
isolation_forest = IsolationForest(n_estimators=100, random_state=42)

In [33]:
isolation_forest.fit(X_train)

In [34]:
y_pred_train = isolation_forest.predict(X_train)
y_pred_test = isolation_forest.predict(X_test)

In [35]:
y_pred_train.shape, y_pred_test.shape, y_train.shape, y_test.shape

((20153,), (5039,), (20153,), (5039,))

In [36]:
np.unique(y_pred_train, return_counts=True)

(array([-1,  1]), array([   16, 20137], dtype=int64))

In [37]:
# Anomaly Detection
# -1 -> Anomaly
# 1 -> Normal
y_pred_train = np.where(y_pred_train == -1, 'anomaly', 'normal')
y_pred_test = np.where(y_pred_test == -1, 'anomaly', 'normal')

In [38]:
# Classification Report
from sklearn.metrics import classification_report

In [39]:
print("TRAIN CLASSIFICATION REPORT")
print(classification_report(y_train, y_pred_train))

TRAIN CLASSIFICATION REPORT


              precision    recall  f1-score   support

     anomaly       0.38      0.00      0.00      9394
      normal       0.53      1.00      0.70     10759

    accuracy                           0.53     20153
   macro avg       0.45      0.50      0.35     20153
weighted avg       0.46      0.53      0.37     20153



In [40]:
# Classification Report
print("TEST CLASSIFICATION REPORT")
print(classification_report(y_test, y_pred_test))

TEST CLASSIFICATION REPORT
              precision    recall  f1-score   support

     anomaly       0.33      0.00      0.00      2349
      normal       0.53      1.00      0.70      2690

    accuracy                           0.53      5039
   macro avg       0.43      0.50      0.35      5039
weighted avg       0.44      0.53      0.37      5039



In [41]:
# Adjusted Rand Index
from sklearn.metrics import adjusted_rand_score

adjusted_rand_score(y_train, y_pred_train), adjusted_rand_score(y_test, y_pred_test)

(-3.939612269318777e-05, -8.635531018342895e-05)

**POOR OUTPUT**

### **LOCAL OUTLIER FACTOR**

In [42]:
# LOF
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor(n_neighbors=20)

In [43]:
lof.fit(X_train)

In [44]:
y_pred_train = lof.fit_predict(X_train)
y_pred_test = lof.fit_predict(X_test)

In [45]:
# Unique Values
np.unique(y_pred_train, return_counts=True)

(array([-1,  1]), array([ 3019, 17134], dtype=int64))

In [46]:
# Adjusted Rand Index
adjusted_rand_score(y_train, y_pred_train), adjusted_rand_score(y_test, y_pred_test)

(-0.0008591460201811297, 0.01846259698804876)

In [47]:
# Anomaly Detection
# -1 -> Anomaly
# 1 -> Normal
y_pred_train = np.where(y_pred_train == -1, 'anomaly', 'normal')
y_pred_test = np.where(y_pred_test == -1, 'anomaly', 'normal')

In [48]:
# Classification Report
print("TRAIN CLASSIFICATION REPORT")
print(classification_report(y_train, y_pred_train))

TRAIN CLASSIFICATION REPORT


              precision    recall  f1-score   support

     anomaly       0.45      0.14      0.22      9394
      normal       0.53      0.85      0.65     10759

    accuracy                           0.52     20153
   macro avg       0.49      0.50      0.44     20153
weighted avg       0.49      0.52      0.45     20153



In [49]:
# Classification Report
print("TEST CLASSIFICATION REPORT")
print(classification_report(y_test, y_pred_test))

TEST CLASSIFICATION REPORT
              precision    recall  f1-score   support

     anomaly       0.58      0.30      0.39      2349
      normal       0.57      0.81      0.67      2690

    accuracy                           0.57      5039
   macro avg       0.57      0.55      0.53      5039
weighted avg       0.57      0.57      0.54      5039



**Still POOR**

### **DBSCAN**

In [50]:
# DBSCAN
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.5, min_samples=5)

In [51]:
dbscan.fit(X_train)

In [52]:
y_pred_train = dbscan.fit_predict(X_train)
y_pred_test = dbscan.fit_predict(X_test)

In [53]:
# Unique Values
y_pred_train.shape, y_pred_test.shape

((20153,), (5039,))

In [54]:
y_pred_train

array([  0,  -1,   1, ...,   2, 114, 140], dtype=int64)

In [55]:
# Adjusted Rand Index
adjusted_rand_score(y_train, y_pred_train), adjusted_rand_score(y_test, y_pred_test)

(0.17912447916225435, 0.16924404315876476)

**Best of the 3** as we thought

#### **Possible Issue:**

We are treating the problem as anomaly detection, but it is not as 'anomaly' class is not in minority [not a real anomaly] - We should instead take out important features and do **Binary Classification** to get high accuracy on this dataset.

We are using the entire feature set for anomaly detection, which is not a good idea. We should use only the important features for anomaly detection.

To Find Important Features, we can do:

1. **Feature Importance** : We can use Random Forest to find the important features.

2. **PCA** : We can use PCA to find the important features.

3. **Autoencoders** : We can use Autoencoders to find the important features.

Also, Need to remove intercorrelated features.

In [56]:
NUMBER_IMPORTANT_FEATURES_TO_GET = 50

#### **Random Forest Important Features**

In [57]:
# Find Important Features using Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

In [58]:
random_forest.feature_importances_

array([4.61322736e-03, 1.66919989e-02, 1.45022339e-01, 2.76623620e-05,
       3.75151179e-03, 5.72945601e-05, 8.59240216e-03, 1.77062563e-04,
       5.55588968e-02, 6.51197180e-03, 1.47678801e-04, 4.41925599e-05,
       4.74856928e-04, 2.05856272e-04, 2.38696098e-05, 1.11677097e-04,
       0.00000000e+00, 0.00000000e+00, 8.18602778e-04, 3.13015079e-02,
       1.48596560e-02, 2.00618314e-02, 2.67451789e-02, 1.35120099e-02,
       7.16089187e-03, 5.65915116e-02, 4.30468265e-02, 8.02186267e-03,
       2.96222195e-02, 3.17245640e-02, 6.21373530e-02, 4.79103025e-02,
       3.67355564e-02, 2.07742175e-02, 2.42195391e-02, 4.35032421e-02,
       2.12551676e-02, 1.35812789e-02, 1.44065851e-03, 2.78700775e-04,
       3.69029908e-05, 1.94404115e-03, 1.71282668e-02, 4.11033858e-04,
       6.61864843e-05, 1.95287573e-06, 6.17475565e-02, 9.10828306e-05,
       1.48616495e-02, 1.08034785e-02, 1.02824942e-04, 0.00000000e+00,
       1.41682647e-04, 8.42988646e-10, 3.29815007e-10, 3.36594852e-09,
      

In [59]:
# Pick top - 10
important_features_rfe = random_forest.feature_importances_
important_features_rfe = important_features_rfe.argsort()[::-1][:NUMBER_IMPORTANT_FEATURES_TO_GET]

In [60]:
important_features_rfe, random_forest.feature_importances_[important_features_rfe]

(array([  2,  30,  46,  25,   8,  31,  35,  26,  32,  29,  19,  28,  22,
         71,  34,  36,  33,  21,  62,  42,   1,  48,  20,  95,  37,  23,
         49,  60,  63,   6,  27,  24,   9,  68,   0,   4, 100,  90,  41,
        110,  38,  67,  18, 106,  12,  43,  39,  13,   7,  66], dtype=int64),
 array([0.14502234, 0.06213735, 0.06174756, 0.05659151, 0.0555589 ,
        0.0479103 , 0.04350324, 0.04304683, 0.03673556, 0.03172456,
        0.03130151, 0.02962222, 0.02674518, 0.02661243, 0.02421954,
        0.02125517, 0.02077422, 0.02006183, 0.01849569, 0.01712827,
        0.016692  , 0.01486165, 0.01485966, 0.01440865, 0.01358128,
        0.01351201, 0.01080348, 0.01052371, 0.00979846, 0.0085924 ,
        0.00802186, 0.00716089, 0.00651197, 0.00608485, 0.00461323,
        0.00375151, 0.00288796, 0.00230502, 0.00194404, 0.00160745,
        0.00144066, 0.00094517, 0.0008186 , 0.00055656, 0.00047486,
        0.00041103, 0.0002787 , 0.00020586, 0.00017706, 0.00017608]))

In [61]:
X_train_rfe = X_train[:, important_features_rfe]
X_test_rfe = X_test[:, important_features_rfe]

In [62]:
X_train_rfe.shape, X_test_rfe.shape

((20153, 50), (5039, 50))

#### **PCA Important Features**

In [63]:
# Use PCA for find important features
from sklearn.decomposition import PCA

pca = PCA(n_components=NUMBER_IMPORTANT_FEATURES_TO_GET)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [64]:
X_train_pca.shape

(20153, 50)

In [65]:
# Find Important Features using PCA
pca.explained_variance_ratio_

array([0.42012571, 0.15893223, 0.10702746, 0.04838125, 0.03903922,
       0.02824248, 0.02105838, 0.01691001, 0.01497873, 0.01398697,
       0.01146688, 0.00961659, 0.0068889 , 0.00656151, 0.00593801,
       0.00539958, 0.00465044, 0.00399669, 0.00379658, 0.00343807,
       0.00326391, 0.00293212, 0.00237436, 0.00222494, 0.00203948,
       0.00188685, 0.00180428, 0.00172368, 0.00160871, 0.00149745,
       0.00148639, 0.00140387, 0.00137489, 0.00136717, 0.00133428,
       0.00130895, 0.00126805, 0.0012259 , 0.00118322, 0.00115968,
       0.00111428, 0.00110984, 0.00108755, 0.00107766, 0.00106366,
       0.001056  , 0.00104548, 0.00103382, 0.00101196, 0.00100281])

In [66]:
X_train_pca.shape, X_test_pca.shape

((20153, 50), (5039, 50))

#### **Autoencoders Important Features**

In [67]:
X_train.shape, X_test.shape

((20153, 115), (5039, 115))

In [68]:
# Autoencoder Batch Normalization
from keras.models import Model
from keras.layers import Input, Dense, BatchNormalization

input_layer = Input(shape=(X_train.shape[1],))
encoded = Dense(100, activation='relu')(input_layer)
encoded = BatchNormalization()(encoded)
encoded = Dense(50, activation='relu')(encoded)
encoded = BatchNormalization()(encoded)
encoded = Dense(25, activation='relu')(encoded)
encoded = BatchNormalization()(encoded)
decoded = Dense(50, activation='relu')(encoded)
decoded = BatchNormalization()(decoded)
decoded = Dense(100, activation='relu')(decoded)
decoded = BatchNormalization()(decoded)
decoded = Dense(X_train.shape[1], activation='sigmoid')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

In [69]:
autoencoder.fit(X_train, X_train, epochs=100, batch_size=1024, validation_data=(X_test, X_test))

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 0.2617 - val_loss: 0.2321
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.2177 - val_loss: 0.2065
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.1860 - val_loss: 0.1612
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.1393 - val_loss: 0.0954
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0840 - val_loss: 0.0546
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0482 - val_loss: 0.0394
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0322 - val_loss: 0.0333
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0244 - val_loss: 0.0296
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x14480913750>

In [70]:
encoder = Model(input_layer, encoded)

X_train_autoencoder = encoder.predict(X_train)
X_test_autoencoder = encoder.predict(X_test)

[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 613us/step
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 549us/step


In [71]:
X_train_autoencoder.shape, X_test_autoencoder.shape

((20153, 25), (5039, 25))

**NOTE:** We can also try anomaly detection using autoencoders - Reconstruction Error > Threshold => Anomaly as less inputs for anomalies => Model will not be able to reconstruct properly. But it is wont work as 'anomaly' class is not minority [real anomaly].

### PLAN AHEAD:

**Maybe Above 3 Method with Important Features** : We can use the above 3 methods with important features to get better results - Might not work as we saw above our problem is not really anomaly detection in the data but it is actually **Binary Classification Problem**

**Binary Classification** : We can use the important features to do binary classification and get high accuracy on this dataset - Use Random Forest, XGBoost, AdaBoost, ANN, Voting Classifier, etc.



### $$**BINARY-CLASSIFICATION**$$

#### **Random Forest**

In [72]:
# Random Forest Binary Classification based on all 3 feature sets
random_forest_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_pca = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_autoencoder = RandomForestClassifier(n_estimators=100, random_state=42)

In [73]:
random_forest_rfe.fit(X_train_rfe, y_train)
random_forest_pca.fit(X_train_pca, y_train)
random_forest_autoencoder.fit(X_train_autoencoder, y_train)

In [74]:
y_pred_train_rfe = random_forest_rfe.predict(X_train_rfe)
y_pred_test_rfe = random_forest_rfe.predict(X_test_rfe)

y_pred_train_pca = random_forest_pca.predict(X_train_pca)
y_pred_test_pca = random_forest_pca.predict(X_test_pca)

y_pred_train_autoencoder = random_forest_autoencoder.predict(X_train_autoencoder)
y_pred_test_autoencoder = random_forest_autoencoder.predict(X_test_autoencoder)

In [75]:
# Classification Report
print("TRAIN CLASSIFICATION REPORT")
print("RFE")
print(classification_report(y_train, y_pred_train_rfe))
print("PCA")
print(classification_report(y_train, y_pred_train_pca))
print("AUTOENCODER")
print(classification_report(y_train, y_pred_train_autoencoder))

TRAIN CLASSIFICATION REPORT
RFE
              precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00      9394
      normal       1.00      1.00      1.00     10759

    accuracy                           1.00     20153
   macro avg       1.00      1.00      1.00     20153
weighted avg       1.00      1.00      1.00     20153

PCA
              precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00      9394
      normal       1.00      1.00      1.00     10759

    accuracy                           1.00     20153
   macro avg       1.00      1.00      1.00     20153
weighted avg       1.00      1.00      1.00     20153

AUTOENCODER
              precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00      9394
      normal       1.00      1.00      1.00     10759

    accuracy                           1.00     20153
   macro avg       1.00      1.00      1.00     20153
weighted avg       1.00   

In [76]:
# Classification Report
print("TEST CLASSIFICATION REPORT")
print("RFE")
print(classification_report(y_test, y_pred_test_rfe))
print("PCA")
print(classification_report(y_test, y_pred_test_pca))
print("AUTOENCODER")
print(classification_report(y_test, y_pred_test_autoencoder))

TEST CLASSIFICATION REPORT
RFE
              precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00      2349
      normal       1.00      1.00      1.00      2690

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039

PCA
              precision    recall  f1-score   support

     anomaly       1.00      0.99      0.99      2349
      normal       0.99      1.00      0.99      2690

    accuracy                           0.99      5039
   macro avg       0.99      0.99      0.99      5039
weighted avg       0.99      0.99      0.99      5039

AUTOENCODER
              precision    recall  f1-score   support

     anomaly       1.00      0.99      0.99      2349
      normal       0.99      1.00      0.99      2690

    accuracy                           0.99      5039
   macro avg       0.99      0.99      0.99      5039
weighted avg       0.99    

In [77]:
# Accuracy
from sklearn.metrics import accuracy_score
rfe_accuracy = accuracy_score(y_test, y_pred_test_rfe)
pca_accuracy = accuracy_score(y_test, y_pred_test_pca)
autoencoder_accuracy = accuracy_score(y_test, y_pred_test_autoencoder)

In [78]:
print("RFE Features Random Forest ACCURACY: {}%".format(round(rfe_accuracy*100, 2)))
print("PCA Features Random Forest ACCURACY: {}%".format(round(pca_accuracy*100, 2)))
print("Autoencoder Features Random Forest ACCURACY: {}%".format(round(autoencoder_accuracy*100, 2)))

RFE Features Random Forest ACCURACY: 99.74%
PCA Features Random Forest ACCURACY: 99.46%
Autoencoder Features Random Forest ACCURACY: 99.42%


Random Forest Seems to be Giving an Amazing Accuracy of $99.74%$ on Test Data. Let's try to increase it by using a neural network.

**Artificial Neural Network**

In [79]:
# Input Shapes
X_train.shape, X_train_rfe.shape, X_train_pca.shape, X_train_autoencoder.shape

((20153, 115), (20153, 50), (20153, 50), (20153, 25))

In [80]:
# ANN-1
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Input

model_rfe = Sequential()
model_rfe.add(Input(shape=(X_train_rfe.shape[1],)))
model_rfe.add(Dense(50, activation='relu'))
model_rfe.add(BatchNormalization())
model_rfe.add(Dense(25, activation='relu'))
model_rfe.add(BatchNormalization())
model_rfe.add(Dense(1, activation='sigmoid'))

model_rfe.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [81]:
# ANN-2
model_pca = Sequential()
model_pca.add(Input(shape=(X_train_pca.shape[1],)))
model_pca.add(Dense(50, activation='relu'))
model_pca.add(BatchNormalization())
model_pca.add(Dense(25, activation='relu'))
model_pca.add(BatchNormalization())
model_pca.add(Dense(1, activation='sigmoid'))

model_pca.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [82]:
# ANN-3
model_autoencoder = Sequential()
model_autoencoder.add(Input(shape=(X_train_autoencoder.shape[1],)))
model_autoencoder.add(Dense(50, activation='relu'))
model_autoencoder.add(BatchNormalization())
model_autoencoder.add(Dense(25, activation='relu'))
model_autoencoder.add(BatchNormalization())
model_autoencoder.add(Dense(1, activation='sigmoid'))

model_autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
y_train = np.where(y_train == 'normal', 1, 0)
y_test = np.where(y_test == 'normal', 1, 0)

In [84]:
# Train Models
model_rfe.fit(X_train_rfe, y_train, epochs=100, batch_size=1024, validation_data=(X_test_rfe, y_test))

Epoch 1/100


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8591 - loss: 0.3768 - val_accuracy: 0.8996 - val_loss: 0.4654
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9593 - loss: 0.1307 - val_accuracy: 0.9208 - val_loss: 0.3952
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9712 - loss: 0.0930 - val_accuracy: 0.9262 - val_loss: 0.3512
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9723 - loss: 0.0824 - val_accuracy: 0.9411 - val_loss: 0.3214
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9796 - loss: 0.0684 - val_accuracy: 0.9649 - val_loss: 0.2955
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9805 - loss: 0.0602 - val_accuracy: 0.9673 - val_loss: 0.2742
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x14481b8c290>

In [85]:
# Train Models
model_pca.fit(X_train_pca, y_train, epochs=100, batch_size=1024, validation_data=(X_test_pca, y_test))

Epoch 1/100


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6791 - loss: 0.6088 - val_accuracy: 0.8845 - val_loss: 0.5747
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9553 - loss: 0.1728 - val_accuracy: 0.9587 - val_loss: 0.4836
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9691 - loss: 0.1120 - val_accuracy: 0.9667 - val_loss: 0.4283
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9736 - loss: 0.0920 - val_accuracy: 0.9692 - val_loss: 0.3849
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9768 - loss: 0.0828 - val_accuracy: 0.9708 - val_loss: 0.3482
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9778 - loss: 0.0756 - val_accuracy: 0.9726 - val_loss: 0.3158
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x144826a0250>

In [86]:
# Train Models
model_autoencoder.fit(X_train_autoencoder, y_train, epochs=100, batch_size=2048, validation_data=(X_test_autoencoder, y_test))

Epoch 1/100


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.6305 - loss: 0.6773 - val_accuracy: 0.7331 - val_loss: 0.5501
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9156 - loss: 0.2951 - val_accuracy: 0.8273 - val_loss: 0.4191
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9498 - loss: 0.2023 - val_accuracy: 0.9053 - val_loss: 0.3488
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9536 - loss: 0.1648 - val_accuracy: 0.9246 - val_loss: 0.3051
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9580 - loss: 0.1421 - val_accuracy: 0.9349 - val_loss: 0.2746
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9635 - loss: 0.1298 - val_accuracy: 0.9480 - val_loss: 0.2507
Epoch 7/100
[1m10/10[0m [32m━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x144823d3750>

In [87]:
# Predictions
y_pred_test_rfe = model_rfe.predict(X_test_rfe)
y_pred_test_pca = model_pca.predict(X_test_pca)
y_pred_test_autoencoder = model_autoencoder.predict(X_test_autoencoder)

[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 942us/step
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733us/step
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step


In [89]:
y_pred_test_rfe = np.where(y_pred_test_rfe > 0.5, 1, 0)
y_pred_test_pca = np.where(y_pred_test_pca > 0.5, 1, 0)
y_pred_test_autoencoder = np.where(y_pred_test_autoencoder > 0.5, 1, 0)

In [90]:
# Accuracy
rfe_accuracy = accuracy_score(y_test, y_pred_test_rfe)
pca_accuracy = accuracy_score(y_test, y_pred_test_pca)
autoencoder_accuracy = accuracy_score(y_test, y_pred_test_autoencoder)

In [92]:
print("RFE Features ANN ACCURACY: {}%".format(round(rfe_accuracy*100, 2)))
print("PCA Features ANN ACCURACY: {}%".format(round(pca_accuracy*100, 2)))
print("Autoencoder Features ANN ACCURACY: {}%".format(round(autoencoder_accuracy*100, 2)))

RFE Features ANN ACCURACY: 99.52%
PCA Features ANN ACCURACY: 99.38%
Autoencoder Features ANN ACCURACY: 98.91%


Seems like we might need to improve the model by using some other techniques, as it has reached 99.5 but not beating the Random Forest Model.

Let's use more Decision Tree Based Models to get better results only on RFE features as they are giving best results.

## Better Visualised Anomaly Detection with Synthetic Data

We will generate the random traffic data with anomalies using below function.

In [2]:
import synthetic_data_generator as sdg

In [42]:
num_nodes = 3

In [43]:
data = np.array(sdg.generate_data(num_nodes=num_nodes, average_bytes_per_edge=20, count=10000, multiplier=5))

In [44]:
data.shape

(10000, 3)

Data Size = $NC2$ where $N$ is the number of nodes in the graph

Display using Plotly: A Graph with N Nodes with edges between them and edge weight defined by data array

Data Array = [[edge_weight_1, edge_weight_2, edge_weight_3, ... edge_weight_NC2] for all graphs]

#### **Lets Visualize our First Network Traffic Data**

In [45]:
# Create a Graph
import networkx as nx
G = nx.DiGraph()
# Add Nodes
G.add_nodes_from(range(1, num_nodes+1))
# Add Edges
edges = list(itertools.combinations(range(1, num_nodes+1), 2))
for i in range(len(edges)):
    G.add_edge(edges[i][0], edges[i][1], weight=data[0][i])

In [46]:
# Draw the Directed Graph with circular layout where node size = sum of weights of outgoing edges + sum of weights of incoming edges
node_sizes = []
for i in range(1, num_nodes+1):
    incoming_edges = G.in_edges(i, data=True)
    incoming_edges_sum = sum([edge[2]['weight'] for edge in incoming_edges])
    outgoing_edges = G.out_edges(i, data=True)
    outgoing_edges_sum = sum([edge[2]['weight'] for edge in outgoing_edges])
    node_sizes.append(incoming_edges_sum + outgoing_edges_sum)

In [47]:
data[0]

array([40.        , 30.84632715, 28.93072449])

In [48]:
pos = nx.circular_layout(G)

# Using Plotly
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)
    
edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')

node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

# Node Trace: Color of Node = Node Size As Well
node_trace = go.Scatter(x=node_x, y=node_y, mode='markers', hoverinfo='text', marker=dict(showscale=True, colorscale='YlGnBu', size=node_sizes, colorbar=dict(thickness=15, title='Node Data Throughput', xanchor='left', titleside='right'), line_width=2))

node_text = []
for node in G.nodes():
    node_text.append(f"Total Data Transfer: {round(node_sizes[node-1], 2)}")

node_trace.marker.color = node_sizes
node_trace.text = node_text

fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout(showlegend=False, hovermode='closest', margin=dict(b=0, l=0, r=0, t=0), xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

fig.show()

Looks Good!

**Finding Anomalies**

In [49]:
# Import Isolation Forest
# from sklearn.ensemble import IsolationForest

In [50]:
# Let's Find Anomalies in the Data array using Isolation Forest
# isolation_forest = IsolationForest(n_estimators=200, random_state=42)
# isolation_forest.fit(data)
# y_pred = isolation_forest.predict(data)

# np.unique(y_pred, return_counts=True)

(array([-1,  1]), array([7966, 2034], dtype=int64))

In [64]:
# Anomaly Detection using Quantile
data_mean = np.mean(data)
data_std = np.std(data)
threshold = data_mean + 2*data_std

anomalies = data > threshold
anomaly_indices = np.where(anomalies == True)

In [65]:
anomaly_indices[0]

array([  19,   19,   19, ..., 9983, 9983, 9983], dtype=int64)

In [66]:
anomalies = anomaly_indices[0]

In [69]:
anomaly_data = data[anomalies]
normal_data = np.delete(data, anomalies, axis=0)

In [70]:
anomaly_data.shape, normal_data.shape

((1467, 3), (9511, 3))

In [71]:
# Display some Anomalies and Normal Data
anomaly_data[:5]

array([[76.570911  , 75.16001773, 79.78717769],
       [76.570911  , 75.16001773, 79.78717769],
       [76.570911  , 75.16001773, 79.78717769],
       [75.15057825, 77.51743507, 76.32111162],
       [75.15057825, 77.51743507, 76.32111162]])

In [72]:
normal_data[:5]

array([[40.        , 30.84632715, 28.93072449],
       [21.82087998, 30.7061938 , 40.        ],
       [21.62403934, 12.48738915, 40.        ],
       [25.68745832,  7.78671305,  2.14062082],
       [27.33509324, 13.25650089, 10.0039895 ]])

In [86]:
# Function to Display Graphs
def display_graph(G, existingFigure=None, row=1, col=1):
    pos = nx.circular_layout(G)
    node_sizes = []
    for i in range(1, G.number_of_nodes()+1):
        incoming_edges = G.in_edges(i, data=True)
        incoming_edges_sum = sum([edge[2]['weight'] for edge in incoming_edges])
        outgoing_edges = G.out_edges(i, data=True)
        outgoing_edges_sum = sum([edge[2]['weight'] for edge in outgoing_edges])
        node_sizes.append(incoming_edges_sum + outgoing_edges_sum)

    # Make Node Sizes atleast 0
    node_sizes = np.array(node_sizes)
    if np.min(node_sizes) < 0:
        node_sizes = node_sizes - np.min(node_sizes)
    
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)
    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')
    if existingFigure is not None:
        existingFigure.add_trace(edge_trace, row=row, col=col)
    node_x = []
    node_y = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
    node_trace = go.Scatter(x=node_x, y=node_y, mode='markers', hoverinfo='text', marker=dict(showscale=True, colorscale='YlGnBu', size=node_sizes, colorbar=dict(thickness=15, title='Node Data Throughput', xanchor='left', titleside='right'), line_width=2))
    node_text = []
    for node in G.nodes():
        node_text.append(f"Total Data Transfer: {round(node_sizes[node-1], 2)} B")
    node_trace.marker.color = node_sizes
    node_trace.text = node_text
    if existingFigure is not None:
        existingFigure.add_trace(node_trace, row=row, col=col)
    fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout(showlegend=False, hovermode='closest', margin=dict(b=0, l=0, r=0, t=0), xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))
    return fig

In [87]:
# Display First Anomaly
G = nx.DiGraph()
G.add_nodes_from(range(1, num_nodes+1))
edges = list(itertools.combinations(range(1, num_nodes+1), 2))
for i in range(len(edges)):
    G.add_edge(edges[i][0], edges[i][1], weight=anomaly_data[0][i])

In [88]:
fig = display_graph(G)
fig.show()

In [89]:
# Display First Normal Data
G = nx.DiGraph()
G.add_nodes_from(range(1, num_nodes+1))
edges = list(itertools.combinations(range(1, num_nodes+1), 2))
for i in range(len(edges)):
    G.add_edge(edges[i][0], edges[i][1], weight=normal_data[0][i])

In [90]:
display_graph(G)

In [91]:
# Display First 5 Anomalies vs Normal Data using Subplots
from plotly.subplots import make_subplots
fig = make_subplots(rows=5, cols=2, subplot_titles=['Anomaly', 'Normal']*5)

In [92]:
for i in range(5):
    rand_anomaly = np.random.randint(0, anomaly_data.shape[0])
    random_normal = np.random.randint(0, normal_data.shape[0])
    G = nx.DiGraph()
    G.add_nodes_from(range(1, num_nodes+1))
    edges = list(itertools.combinations(range(1, num_nodes+1), 2))
    for j in range(len(edges)):
        G.add_edge(edges[j][0], edges[j][1], weight=anomaly_data[rand_anomaly][j])
    display_graph(G, fig, row=i+1, col=1)
    G = nx.DiGraph()
    G.add_nodes_from(range(1, num_nodes+1))
    edges = list(itertools.combinations(range(1, num_nodes+1), 2))
    for j in range(len(edges)):
        G.add_edge(edges[j][0], edges[j][1], weight=normal_data[random_normal][j])
    display_graph(G, fig, row=i+1, col=2)

In [93]:
fig.update_layout(height=2000, width=1000, title_text="Anomalies vs Normal Data")
fig.show()

In [94]:
# Visualise data array in 3d using plotly
fig = go.Figure(data=[go.Scatter3d(x=anomaly_data[:, 0], y=anomaly_data[:, 1], z=anomaly_data[:, 2], mode='markers', marker=dict(size=4, color='red', opacity=0.8))])
fig.add_trace(go.Scatter3d(x=normal_data[:, 0], y=normal_data[:, 1], z=normal_data[:, 2], mode='markers', marker=dict(size=4, color='blue', opacity=0.8)))
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig.show()

In [96]:
# Visualise data array in a Histogram of 1st Column
fig = go.Figure()
fig.add_trace(go.Histogram(x=anomaly_data[:, 0], name='Anomaly', marker_color='red'))
fig.add_trace(go.Histogram(x=normal_data[:, 0], name='Normal', marker_color='blue'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.show()

In [107]:
# 3D Frequency Heatmap
assert data.shape[1] >= 2

# Create a DataFrame
df = pd.DataFrame(data, columns=['x', 'y', 'z'])
df['freq'] = 1

# Group By x, y, z and Count
df = df.groupby(['x', 'y', 'z']).count().reset_index()

# Create a 3D Scatter Plot
fig = px.scatter_3d(df, x='x', y='y', z='z', size='freq', color='freq', opacity=0.7, title='3D Frequency Heatmap')

fig.show()
