In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from joblib import Parallel, delayed


In [3]:
# List of CSV file names to merge
file_names = ['Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv', 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv', 'Friday-WorkingHours-Morning.pcap_ISCX.csv', 'Monday-WorkingHours.pcap_ISCX.csv', 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv', 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv', 'Tuesday-WorkingHours.pcap_ISCX.csv', 'Wednesday-workingHours.pcap_ISCX.csv']

# Create an empty DataFrame to store merged data
data = pd.DataFrame()

# Merge CSV files
for filename in file_names:
    df = pd.read_csv(filename)
    data = pd.concat([data, df], ignore_index=True)



In [5]:
data.columns

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [7]:
dups = data[data.duplicated()]
print(f'Number of duplicates: {len(dups)}')

data.drop_duplicates(inplace = True)
data.shape



Number of duplicates: 308381


(2522362, 79)

In [9]:
# Renaming the columns by removing leading/trailing whitespace
col_names = {col: col.strip() for col in data.columns}
data.rename(columns = col_names, inplace = True)

In [11]:
missing_val = data.isna().sum()
print(missing_val.loc[missing_val > 0])

Flow Bytes/s    353
dtype: int64


In [13]:
# Checking for infinity values
numeric_cols = data.select_dtypes(include = np.number).columns
inf_count = np.isinf(data[numeric_cols]).sum()
print(inf_count[inf_count > 0])

Flow Bytes/s      1211
Flow Packets/s    1564
dtype: int64


In [15]:
# Replacing any infinite values (positive or negative) with NaN (not a number)
print(f'Initial missing values: {data.isna().sum().sum()}')

data.replace([np.inf, -np.inf], np.nan, inplace = True)

print(f'Missing values after processing infinite values: {data.isna().sum().sum()}')

Initial missing values: 353
Missing values after processing infinite values: 3128


In [17]:
missing = data.isna().sum()
print(missing.loc[missing > 0])

Flow Bytes/s      1564
Flow Packets/s    1564
dtype: int64


In [19]:
# Filling missing values with median
med_flow_bytes = data['Flow Bytes/s'].median()
med_flow_packets = data['Flow Packets/s'].median()
data.fillna({'Flow Bytes/s':med_flow_bytes}, inplace = True)
data.fillna({'Flow Packets/s':med_flow_packets}, inplace = True)

In [21]:
print('Number of \'Flow Bytes/s\' missing values:', data['Flow Bytes/s'].isna().sum())
print('Number of \'Flow Packets/s\' missing values:', data['Flow Packets/s'].isna().sum())

Number of 'Flow Bytes/s' missing values: 0
Number of 'Flow Packets/s' missing values: 0


In [23]:
data['Label'].unique()

array(['BENIGN', 'DDoS', 'PortScan', 'Bot', 'Infiltration',
       'Web Attack ï¿½ Brute Force', 'Web Attack ï¿½ XSS',
       'Web Attack ï¿½ Sql Injection', 'FTP-Patator', 'SSH-Patator',
       'DoS slowloris', 'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye',
       'Heartbleed'], dtype=object)

In [25]:
# Types of attacks & normal instances (BENIGN)
data['Label'].value_counts()

Label
BENIGN                        2096484
DoS Hulk                       172849
DDoS                           128016
PortScan                        90819
DoS GoldenEye                   10286
FTP-Patator                      5933
DoS slowloris                    5385
DoS Slowhttptest                 5228
SSH-Patator                      3219
Bot                              1953
Web Attack ï¿½ Brute Force         1470
Web Attack ï¿½ XSS                  652
Infiltration                       36
Web Attack ï¿½ Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64

In [27]:
attack_map = {
    'BENIGN': 'BENIGN',
    'DDoS': 'DDoS',
    'DoS Hulk': 'DoS',
    'DoS GoldenEye': 'DoS',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'PortScan': 'Port Scan',
    'FTP-Patator': 'Brute Force',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Bot',
    'Web Attack ï¿½ Brute Force': 'Web Attack',
    'Web Attack ï¿½ XSS': 'Web Attack',
    'Web Attack ï¿½ Sql Injection': 'Web Attack',
    'Infiltration': 'Infiltration',
    'Heartbleed': 'Heartbleed'
}

# Creating a new column 'Attack Type' in the DataFrame based on the attack_map dictionary
data['Attack Type'] = data['Label'].map(attack_map)

In [29]:
data['Attack Type'].value_counts()

Attack Type
BENIGN          2096484
DoS              193748
DDoS             128016
Port Scan         90819
Brute Force        9152
Web Attack         2143
Bot                1953
Infiltration         36
Heartbleed           11
Name: count, dtype: int64

In [35]:
data.drop('Attack Type', axis = 1, inplace = True)

In [37]:
data.columns

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [39]:

# Selecting key network-related features for graph construction
selected_features = ["Destination Port", "Flow Duration", "Total Fwd Packets", "Total Backward Packets","Flow Bytes/s", "Flow Packets/s", "SYN Flag Count", "ACK Flag Count", "FIN Flag Count"]

Analysis_data = data[selected_features]
Analysis_data.describe

<bound method NDFrame.describe of          Destination Port  Flow Duration  Total Fwd Packets  \
0                   54865              3                  2   
1                   55054            109                  1   
2                   55055             52                  1   
3                   46236             34                  1   
4                   54863              3                  2   
...                   ...            ...                ...   
2830738                53          32215                  4   
2830739                53            324                  2   
2830740             58030             82                  2   
2830741                53        1048635                  6   
2830742                53          94939                  4   

         Total Backward Packets  Flow Bytes/s  Flow Packets/s  SYN Flag Count  \
0                             0  4.000000e+06   666666.666700               0   
1                             1  1.100917e+05  

In [43]:

# Sample data to reduce processing time
subset_data = Analysis_data.sample(n=5000)

# Create a directed graph (Network Flow Graph)
G = nx.DiGraph()

# Add edges based on network traffic patterns
for _, row in subset_data.iterrows():
    src = f"SrcPort_{row['Destination Port']}"
    dst = f"Flow_{row['Flow Duration']}"
    weight = row['Flow Bytes/s']
    
    G.add_edge(src, dst, weight=weight)

# Convert to sparse matrix
from scipy.sparse import csr_matrix
adj_matrix = nx.to_scipy_sparse_array(G, weight='weight', dtype=float)


# Compute centrality measures in parallel
def compute_centrality(G, method):
    if method == "betweenness":
        return nx.betweenness_centrality(G, k=100)
    elif method == "closeness":
        return nx.closeness_centrality(G, wf_improved=True)
    elif method == "pagerank":
        return nx.pagerank(G, max_iter=50)

results = Parallel(n_jobs=3)(delayed(compute_centrality)(G, method) for method in ["betweenness", "closeness", "pagerank"])

betweenness, closeness, pagerank = results

# Identify potential attack nodes (high centrality values)
thresh = 0.01  # Threshold to detect anomalies
attack_nodes = [node for node, score in pagerank.items() if score > thresh]

print("Potential Anomalous Nodes (High PageRank):", attack_nodes)


Potential Anomalous Nodes (High PageRank): ['Flow_3.0']


In [45]:
unique_sources = Analysis_data['Destination Port'].nunique()
unique_destinations = Analysis_data['Flow Duration'].nunique()
print("Unique Source Nodes:", unique_sources)
print("Unique Destination Nodes:", unique_destinations)
print("Estimated Total Nodes:", unique_sources + unique_destinations)


Unique Source Nodes: 53805
Unique Destination Nodes: 1050899
Estimated Total Nodes: 1104704


In [49]:
pip install pyvis

Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading jsonpickle-4.0.2-py3-none-any.whl.metadata (8.2 kB)
Downloading pyvis-0.3.2-py3-none-any.whl (756 kB)
   ---------------------------------------- 0.0/756.0 kB ? eta -:--:--
   -- ------------------------------------- 41.0/756.0 kB 2.0 MB/s eta 0:00:01
   ---- ---------------------------------- 92.2/756.0 kB 880.9 kB/s eta 0:00:01
   ----------- ---------------------------- 225.3/756.0 kB 1.5 MB/s eta 0:00:01
   -------------------------------- ------- 614.4/756.0 kB 3.0 MB/s eta 0:00:01
   ------------------------------------ --- 686.1/756.0 kB 3.3 MB/s eta 0:00:01
   ---------------------------------------  747.5/756.0 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 756.0/756.0 kB 2.3 MB/s eta 0:00:00
Downloading jsonpickle-4.0.2-py3-none-any.whl (46 kB)
   ---------------------------------------- 0.0/46.3 kB ? eta -:--:--
   -------

In [53]:
# Create a subgraph with the first 5000 nodes
sample_nodes = list(G.nodes)[:5000]
subG = G.subgraph(sample_nodes)

net = Network(notebook=True, width="100%", height="800px", bgcolor="#222222", font_color="white")
net.from_nx(subG)
net.show("provenance_graph.html")  # Opens in browser


provenance_graph.html


In [55]:
centrality = nx.degree_centrality(G)
sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]  # Top 10 influential nodes


[('SrcPort_53.0', 0.274748322147651),
 ('SrcPort_80.0', 0.22839765100671142),
 ('SrcPort_443.0', 0.1950503355704698),
 ('Flow_3.0', 0.024328859060402684),
 ('Flow_4.0', 0.013003355704697987),
 ('SrcPort_123.0', 0.009228187919463088),
 ('Flow_1.0', 0.006711409395973154),
 ('Flow_48.0', 0.004614093959731544),
 ('SrcPort_21.0', 0.004614093959731544),
 ('Flow_2.0', 0.004614093959731544)]

In [57]:
betweenness = nx.betweenness_centrality(G)
sorted(betweenness.items(), key=lambda x: x[1], reverse=True)[:10]  # Top influential nodes


[('SrcPort_53.0', 0.0),
 ('Flow_75574.0', 0.0),
 ('SrcPort_60343.0', 0.0),
 ('Flow_21573431.0', 0.0),
 ('Flow_61748.0', 0.0),
 ('SrcPort_443.0', 0.0),
 ('Flow_5590616.0', 0.0),
 ('SrcPort_80.0', 0.0),
 ('Flow_115931856.0', 0.0),
 ('Flow_31296.0', 0.0)]

In [59]:
import networkx as nx

# Degree Centrality (Number of connections per node)
degree_centrality = nx.degree_centrality(G)

# Betweenness Centrality (Nodes acting as 'bridges' between clusters)
betweenness_centrality = nx.betweenness_centrality(G)

# Closeness Centrality (Nodes that quickly reach others)
closeness_centrality = nx.closeness_centrality(G)

# PageRank (Identifies influential nodes)
pagerank = nx.pagerank(G)

# Sort nodes by highest centrality scores
top_degree = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
top_betweenness = sorted(betweenness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
top_closeness = sorted(closeness_centrality.items(), key=lambda x: x[1], reverse=True)[:10]
top_pagerank = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)[:10]

# Display top 10 nodes for each measure
print("Top 10 Nodes by Degree Centrality:", top_degree)
print("Top 10 Nodes by Betweenness Centrality:", top_betweenness)
print("Top 10 Nodes by Closeness Centrality:", top_closeness)
print("Top 10 Nodes by PageRank:", top_pagerank)


Top 10 Nodes by Degree Centrality: [('SrcPort_53.0', 0.274748322147651), ('SrcPort_80.0', 0.22839765100671142), ('SrcPort_443.0', 0.1950503355704698), ('Flow_3.0', 0.024328859060402684), ('Flow_4.0', 0.013003355704697987), ('SrcPort_123.0', 0.009228187919463088), ('Flow_1.0', 0.006711409395973154), ('Flow_48.0', 0.004614093959731544), ('SrcPort_21.0', 0.004614093959731544), ('Flow_2.0', 0.004614093959731544)]
Top 10 Nodes by Betweenness Centrality: [('SrcPort_53.0', 0.0), ('Flow_75574.0', 0.0), ('SrcPort_60343.0', 0.0), ('Flow_21573431.0', 0.0), ('Flow_61748.0', 0.0), ('SrcPort_443.0', 0.0), ('Flow_5590616.0', 0.0), ('SrcPort_80.0', 0.0), ('Flow_115931856.0', 0.0), ('Flow_31296.0', 0.0)]
Top 10 Nodes by Closeness Centrality: [('Flow_3.0', 0.024328859060402684), ('Flow_4.0', 0.013003355704697987), ('Flow_1.0', 0.006711409395973154), ('Flow_48.0', 0.004614093959731544), ('Flow_2.0', 0.004614093959731544), ('Flow_54.0', 0.003984899328859061), ('Flow_49.0', 0.0037751677852348995), ('Flow_5

In [61]:
import numpy as np

# Convert centrality values to numpy arrays
degree_vals = np.array(list(degree_centrality.values()))
betweenness_vals = np.array(list(betweenness_centrality.values()))
closeness_vals = np.array(list(closeness_centrality.values()))
pagerank_vals = np.array(list(pagerank.values()))

# Compute mean and standard deviation
def detect_anomalies(values, threshold=3):
    mean = np.mean(values)
    std = np.std(values)
    anomalies = [node for node, val in zip(G.nodes, values) if (val - mean) / std > threshold]
    return anomalies

# Identify anomalous nodes
degree_anomalies = detect_anomalies(degree_vals)
betweenness_anomalies = detect_anomalies(betweenness_vals)
closeness_anomalies = detect_anomalies(closeness_vals)
pagerank_anomalies = detect_anomalies(pagerank_vals)

print("ðŸš¨ Degree-Based Anomalous Nodes:", degree_anomalies)
print("ðŸš¨ Betweenness-Based Anomalous Nodes:", betweenness_anomalies)
print("ðŸš¨ Closeness-Based Anomalous Nodes:", closeness_anomalies)
print("ðŸš¨ PageRank-Based Anomalous Nodes:", pagerank_anomalies)


ðŸš¨ Degree-Based Anomalous Nodes: ['SrcPort_53.0', 'SrcPort_443.0', 'SrcPort_80.0', 'Flow_3.0']
ðŸš¨ Betweenness-Based Anomalous Nodes: []
ðŸš¨ Closeness-Based Anomalous Nodes: ['Flow_43.0', 'Flow_3.0', 'Flow_64.0', 'Flow_48.0', 'Flow_58.0', 'Flow_49.0', 'Flow_55.0', 'Flow_53.0', 'Flow_50.0', 'Flow_72.0', 'Flow_97.0', 'Flow_66.0', 'Flow_1.0', 'Flow_29.0', 'Flow_4.0', 'Flow_57.0', 'Flow_2.0', 'Flow_70.0', 'Flow_16.0', 'Flow_45.0', 'Flow_51.0', 'Flow_75.0', 'Flow_47.0', 'Flow_26.0', 'Flow_54.0', 'Flow_46.0', 'Flow_62.0', 'Flow_63.0', 'Flow_60.0', 'Flow_56.0', 'Flow_52.0', 'Flow_24.0', 'Flow_83.0', 'Flow_34.0', 'Flow_17.0', 'Flow_65.0']
ðŸš¨ PageRank-Based Anomalous Nodes: ['Flow_3.0', 'Flow_64.0', 'Flow_48.0', 'Flow_58.0', 'Flow_49.0', 'Flow_38.0', 'Flow_55.0', 'Flow_53.0', 'Flow_50.0', 'Flow_72.0', 'Flow_97.0', 'Flow_66.0', 'Flow_1.0', 'Flow_29.0', 'Flow_4.0', 'Flow_57.0', 'Flow_2.0', 'Flow_70.0', 'Flow_45.0', 'Flow_51.0', 'Flow_75.0', 'Flow_47.0', 'Flow_26.0', 'Flow_54.0', 'Flow_46.0'

  anomalies = [node for node, val in zip(G.nodes, values) if (val - mean) / std > threshold]


In [63]:
from pyvis.network import Network

net = Network(notebook=True, width="100%", height="800px", bgcolor="#222222", font_color="white")

# Add nodes and color anomalies
for node in G.nodes:
    color = "white"
    if node in degree_anomalies or node in betweenness_anomalies or node in pagerank_anomalies:
        color = "red"  # Mark anomalies in red
    net.add_node(node, label=node, color=color)

# Add edges
for edge in G.edges:
    net.add_edge(edge[0], edge[1])

# Save and view
net.show("intrusion_detection_graph.html")


intrusion_detection_graph.html
