In [1]:
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# `datasets` is a list of dataset descriptions, each containing the path, the key column names, and the suitable complex-network features
from src.data.dataset_info import datasets


In [2]:
# Pick the dataset description to process (entry 5 of the imported `datasets` list).
dataset = datasets[5]
name = dataset.name
print("dataset: {}".format(name))

# Input partition, preprocessed output, and graph export paths, all derived from the dataset name.
path = "./datasets/partitions/{}.pkl".format(name)
new_path = "./datasets/preprocessed/{}.pkl".format(name)
graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)
# NOTE(review): unpickling can execute arbitrary code; only load partition files from a trusted source.
df = pd.read_pickle(path)

dataset: partition_3


In [3]:
# Treat +/-infinity as missing, then discard every record that has at least one missing value.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df.dropna(axis=0, how='any', inplace=True)

# Records that differ only by timestamp or flow id are considered duplicates; keep the first one.
ignored_cols = {dataset.timestamp_col, dataset.flow_id_col}
dedup_cols = [col for col in df.columns if col not in ignored_cols]
df.drop_duplicates(subset=dedup_cols, keep="first", inplace=True)

In [4]:
# Summarize the cleaned dataset (size, class balance, attack types) and store it as JSON.
total_count = len(df)

num_benign = len(df[df['Label'] == 0])
num_attack = len(df[df['Label'] == 1])

# An empty frame would otherwise raise ZeroDivisionError; report 0.0 percent instead.
percentage_of_benign = ((num_benign * 100)/total_count) if total_count else 0.0
percentage_of_attack = ((num_attack * 100)/total_count) if total_count else 0.0

properties = {
    "name": dataset.name,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": percentage_of_benign,
    "num_attack": num_attack,
    "percentage_of_attack_records": percentage_of_attack,
    # .tolist() yields plain Python objects, which json can always serialize
    # (list(ndarray) may leave NumPy scalars behind).
    "attacks": df["Attack"].unique().tolist(),
}

filename = ('./datasets_properties/{}.json'.format(dataset.name))
# Serialize before opening so a serialization error cannot truncate an existing file,
# and let the context manager close the handle even if the write fails.
serialized_properties = json.dumps(properties)
with open(filename, 'w') as outfile:
    outfile.write(serialized_properties)

In [5]:
# Directed graph with one edge per distinct (source IP, destination IP) pair found in the records.
directed_graph = nx.DiGraph()
G = nx.from_pandas_edgelist(df, dataset.src_ip_col, dataset.dst_ip_col, create_using=directed_graph)

In [6]:
# Drop nodes without any incident edge; list(...) snapshots the isolates before G is mutated.
G.remove_nodes_from(list(nx.isolates(G)))

In [7]:
import igraph as ig
# Mirror the NetworkX graph in igraph and partition it with Infomap.
G1 = ig.Graph.from_networkx(G)
part = G1.community_infomap()

# Each cluster lists igraph vertex indices; translate them back to the original
# NetworkX node names kept in the '_nx_name' vertex attribute.
communities = [
    [G1.vs[vertex_id]['_nx_name'] for vertex_id in members]
    for members in part
]

print(f"==>> number of communities: {len(communities)}")
for com in communities:
    print(f"==>> com: {len(com)}")

==>> number of communities: 127
==>> com: 47
==>> com: 68679
==>> com: 89
==>> com: 21
==>> com: 474
==>> com: 664
==>> com: 123
==>> com: 28
==>> com: 95
==>> com: 1077
==>> com: 3217
==>> com: 67
==>> com: 73
==>> com: 164
==>> com: 25
==>> com: 41
==>> com: 101
==>> com: 2
==>> com: 169
==>> com: 56
==>> com: 4
==>> com: 231
==>> com: 26
==>> com: 7
==>> com: 254
==>> com: 89
==>> com: 41
==>> com: 11
==>> com: 15
==>> com: 14
==>> com: 44
==>> com: 2
==>> com: 94
==>> com: 34
==>> com: 3
==>> com: 73
==>> com: 49
==>> com: 5
==>> com: 3
==>> com: 15
==>> com: 125
==>> com: 2
==>> com: 2
==>> com: 71
==>> com: 76
==>> com: 4
==>> com: 5
==>> com: 97
==>> com: 32
==>> com: 5
==>> com: 7
==>> com: 85
==>> com: 5
==>> com: 2
==>> com: 4
==>> com: 6
==>> com: 2
==>> com: 7
==>> com: 14
==>> com: 48
==>> com: 15
==>> com: 51
==>> com: 5
==>> com: 2
==>> com: 6
==>> com: 2
==>> com: 34
==>> com: 4
==>> com: 2
==>> com: 22
==>> com: 11
==>> com: 11
==>> com: 16
==>> com: 6
==>> com: 7
==>>

In [8]:
# Start a fresh properties dict for graph-level statistics (written to its own JSON file later).
properties = {
    "number_of_nodes": G.number_of_nodes(),
    "number_of_edges": G.number_of_edges(),
}

# G.degree() yields (node, degree) pairs; only the degree values are needed here.
degrees = [node_degree for _node, node_degree in G.degree()]
properties["max_degree"] = max(degrees)
properties["avg_degree"] = sum(degrees) / len(degrees)

In [9]:
# Record the transitivity that NetworkX reports for G.
properties["transitivity"] = nx.transitivity(G)


In [10]:
# Record the density that NetworkX reports for G.
properties["density"] =  nx.density(G)


In [11]:
# `communities` is a list of node lists, one per detected community.

# Step 1: look-up table from node to the index of the community containing it
node_to_community = {
    node: community_index
    for community_index, community in enumerate(communities)
    for node in community
}

# Step 2: count the edges whose two endpoints sit in different communities
inter_cluster_edges = sum(
    1
    for u, v in G.edges()
    if node_to_community[u] != node_to_community[v]
)

# Mixing parameter = share of edges that cross community boundaries
properties["mixing_parameter"] = inter_cluster_edges / G.number_of_edges()

In [12]:
# Modularity of the detected community partition on G.
properties["modularity"] = nx.community.modularity(G, communities)

In [13]:
# Persist the graph-level statistics next to the dataset-level ones, under a "graph_" prefix.
filename = ('./datasets_properties/{}.json'.format("graph_" + name))
# Serialize first so a serialization error cannot truncate an existing file; the
# context manager closes the handle even if the write fails.
serialized_properties = json.dumps(properties)
with open(filename, 'w') as outfile:
    outfile.write(serialized_properties)

# Display the statistics as the cell output.
properties

{'number_of_nodes': 77195,
 'number_of_edges': 115175,
 'max_degree': 24550,
 'avg_degree': 2.9840015545048253,
 'transitivity': 0.030208756030479282,
 'density': 1.9327937109780718e-05,
 'mixing_parameter': 0.01784241371825483,
 'modularity': 0.14314440370159914}

In [14]:
# Label every node with the index of its community and store it on the graph as "new_community".
community_labels = {
    node: community_id
    for community_id, members in enumerate(communities)
    for node in members
}

nx.set_node_attributes(G, community_labels, "new_community")

In [15]:
# Build the intra- and inter-community graphs, used to calculate the local and global variants of each centrality
from src.network.network_features import separate_graph

intra_graph, inter_graph = separate_graph(G, communities)

In [16]:
from src.network.network_features import cal_betweenness_centrality

# Whole-graph betweenness, computed only when the dataset lists this measure.
if "betweenness" in dataset.cn_measures:
    betweenness_scores = cal_betweenness_centrality(G)
    nx.set_node_attributes(G, betweenness_scores, "betweenness")
    print("calculated")

calculated


In [17]:
# Betweenness on the intra-community graph, stored on G as the "local" variant.
if "local_betweenness" in dataset.cn_measures:
    local_betweenness_scores = cal_betweenness_centrality(intra_graph)
    nx.set_node_attributes(G, local_betweenness_scores, "local_betweenness")
    print("calculated")

In [18]:
# Betweenness on the inter-community graph, stored on G as the "global" variant.
if "global_betweenness" in dataset.cn_measures:
    global_betweenness_scores = cal_betweenness_centrality(inter_graph)
    nx.set_node_attributes(G, global_betweenness_scores, "global_betweenness")
    print("calculated")

calculated


In [19]:
# Whole-graph degree centrality, computed only when the dataset lists this measure.
if "degree" in dataset.cn_measures:
    degree_scores = nx.degree_centrality(G)
    nx.set_node_attributes(G, degree_scores, "degree")
    print("calculated")

calculated


In [20]:
# Degree centrality on the intra-community graph, stored on G as the "local" variant.
if "local_degree" in dataset.cn_measures:
    local_degree_scores = nx.degree_centrality(intra_graph)
    nx.set_node_attributes(G, local_degree_scores, "local_degree")
    print("calculated")

In [21]:
# Degree centrality on the inter-community graph, stored on G as the "global" variant.
if "global_degree" in dataset.cn_measures:
    global_degree_scores = nx.degree_centrality(inter_graph)
    nx.set_node_attributes(G, global_degree_scores, "global_degree")
    print("calculated")

calculated


In [22]:
# Whole-graph eigenvector centrality with a raised iteration cap of 600.
if "eigenvector" in dataset.cn_measures:
    eigenvector_scores = nx.eigenvector_centrality(G, max_iter=600)
    nx.set_node_attributes(G, eigenvector_scores, "eigenvector")
    print("calculated")


calculated


In [23]:
# Eigenvector centrality on the intra-community graph, stored on G as the "local" variant.
if "local_eigenvector" in dataset.cn_measures:
    # Use the same iteration cap as the whole-graph eigenvector computation; with the
    # library default (100) the power iteration can stop early and raise
    # PowerIterationFailedConvergence.
    local_eigenvector_scores = nx.eigenvector_centrality(intra_graph, max_iter=600)
    nx.set_node_attributes(G, local_eigenvector_scores, "local_eigenvector")
    print("calculated")

In [24]:
# Eigenvector centrality on the inter-community graph, stored on G as the "global" variant.
if "global_eigenvector" in dataset.cn_measures:
    # Use the same iteration cap as the whole-graph eigenvector computation; with the
    # library default (100) the power iteration can stop early and raise
    # PowerIterationFailedConvergence.
    global_eigenvector_scores = nx.eigenvector_centrality(inter_graph, max_iter=600)
    nx.set_node_attributes(G, global_eigenvector_scores, "global_eigenvector")
    print("calculated")

In [25]:
# Whole-graph closeness centrality, computed only when the dataset lists this measure.
if "closeness" in dataset.cn_measures:
    closeness_scores = nx.closeness_centrality(G)
    nx.set_node_attributes(G, closeness_scores, "closeness")
    print("calculated")

calculated


In [26]:
# Closeness centrality on the intra-community graph, stored on G as the "local" variant.
if "local_closeness" in dataset.cn_measures:
    local_closeness_scores = nx.closeness_centrality(intra_graph)
    nx.set_node_attributes(G, local_closeness_scores, "local_closeness")
    print("calculated")

In [27]:
# Closeness centrality on the inter-community graph, stored on G as the "global" variant.
if "global_closeness" in dataset.cn_measures:
    global_closeness_scores = nx.closeness_centrality(inter_graph)
    nx.set_node_attributes(G, global_closeness_scores, "global_closeness")
    print("calculated")

In [28]:
# Whole-graph PageRank with damping factor 0.85.
if "pagerank" in dataset.cn_measures:
    pagerank_scores = nx.pagerank(G, alpha=0.85)
    nx.set_node_attributes(G, pagerank_scores, "pagerank")
    print("calculated")

calculated


In [29]:
# PageRank (damping 0.85) on the intra-community graph, stored on G as the "local" variant.
if "local_pagerank" in dataset.cn_measures:
    local_pagerank_scores = nx.pagerank(intra_graph, alpha=0.85)
    nx.set_node_attributes(G, local_pagerank_scores, "local_pagerank")
    print("calculated")

In [30]:
# PageRank (damping 0.85) on the inter-community graph, stored on G as the "global" variant.
if "global_pagerank" in dataset.cn_measures:
    global_pagerank_scores = nx.pagerank(inter_graph, alpha=0.85)
    nx.set_node_attributes(G, global_pagerank_scores, "global_pagerank")
    print("calculated")

calculated


In [31]:
from src.network.network_features import cal_k_core

# k-core values from the project helper, attached to G when the dataset lists this measure.
if "k_core" in dataset.cn_measures:
    k_core_scores = cal_k_core(G)
    nx.set_node_attributes(G, k_core_scores, "k_core")
    print("calculated")

calculated


In [32]:
from src.network.network_features import cal_k_truss
# k-truss values from the project helper, attached to G when the dataset lists this measure.
if "k_truss" in dataset.cn_measures:
    k_truss_scores = cal_k_truss(G)
    nx.set_node_attributes(G, k_truss_scores, "k_truss")
    print("calculated")

calculated


In [33]:
from src.network.CommCentralityCode import comm_centreality

# Community-based centrality from the project helper, fed with the node -> community labels.
if "Comm" in dataset.cn_measures:
    comm_scores = comm_centreality(G, community_labels)
    nx.set_node_attributes(G, comm_scores, "Comm")
    print("calculated")

In [34]:
from src.network.modularity_vitality import modularity_vitality

# Modularity vitality is computed on the igraph graph and its Infomap partition,
# then attached to the NetworkX graph.
if "mv" in dataset.cn_measures:
    mv_scores = modularity_vitality(G1, part)
    nx.set_node_attributes(G, mv_scores, "mv")
    print("calculated")

calculated


In [35]:
# Export the graph, including the node attributes set above, to the GEXF file at graph_path.
nx.write_gexf(G, graph_path)

In [36]:
# Collect, per requested measure, the {node: score} mapping stored on the graph.
features_dicts = {}
for measure in dataset.cn_measures:
    features_dicts[measure] = nx.get_node_attributes(G, measure)
    print(f"==>> features_dicts: {measure , len(features_dicts[measure])}")

# Each network feature is named "<src|dst>_<measure>": the first three characters pick
# the endpoint column and everything after the separator is the measure to look up.
for feature in dataset.network_features:
    endpoint, measure = feature[:3], feature[4:]
    if endpoint == "src":
        ip_col = dataset.src_ip_col
    elif endpoint == "dst":
        ip_col = dataset.dst_ip_col
    else:
        # Features without a src/dst prefix are not node-derived; leave them alone.
        continue

    scores = features_dicts[measure]
    # Map the single IP column instead of applying a lambda to every full row
    # (axis=1), which builds a Series per record. IPs missing from the graph get -1.
    df[feature] = df[ip_col].map(lambda ip, scores=scores: scores.get(ip, -1))

==>> features_dicts: ('betweenness', 77195)
==>> features_dicts: ('global_betweenness', 77195)
==>> features_dicts: ('degree', 77195)
==>> features_dicts: ('global_degree', 77195)
==>> features_dicts: ('eigenvector', 77195)
==>> features_dicts: ('closeness', 77195)
==>> features_dicts: ('pagerank', 77195)
==>> features_dicts: ('global_pagerank', 77195)
==>> features_dicts: ('k_core', 77195)
==>> features_dicts: ('k_truss', 77195)
==>> features_dicts: ('mv', 77195)


In [37]:
# Preview the first rows, including the newly added src_*/dst_* network feature columns.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,src_pagerank,dst_pagerank,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
4013820,192.168.1.36-192.168.1.1-47392-53-17,192.168.1.36,47392.0,192.168.1.1,53.0,17.0,27/04/2019 03:28:20 pm,6525.0,2.0,2.0,...,6.3e-05,4.6e-05,3.2e-05,3e-05,1.0,1.0,0.583333,0.666667,0.000412,0.000271
4013821,141.189.192.168-1.195.192.168-0-0-0,141.189.192.168,0.0,1.195.192.168,0.0,0.0,30/04/2019 12:29:04 am,2.0,2.0,0.0,...,5e-06,0.019318,1.3e-05,0.00033,0.444444,0.666667,0.25,0.416667,-3e-06,-0.004418
4013822,18.194.169.124-192.168.1.32-80-44722-6,18.194.169.124,80.0,192.168.1.32,44722.0,6.0,27/04/2019 03:52:52 pm,721437.0,5.0,5.0,...,1.5e-05,5.8e-05,1.3e-05,3.2e-05,0.888889,1.0,0.166667,0.583333,0.000119,0.000456
4013823,176.28.50.165-192.168.1.39-80-55972-6,176.28.50.165,80.0,192.168.1.39,55972.0,6.0,27/04/2019 06:06:28 pm,2289.0,1.0,1.0,...,1.5e-05,7.1e-05,1.3e-05,2.1e-05,0.888889,1.0,0.166667,0.666667,0.000119,0.000479
4013824,192.168.1.195-192.168.1.79-60700-9197-6,192.168.1.195,60700.0,192.168.1.79,9197.0,6.0,27/04/2019 06:05:15 pm,22093.0,4.0,13.0,...,0.000444,1.7e-05,0.000102,1.6e-05,1.0,0.666667,1.0,0.5,0.002097,9.1e-05


In [38]:
# Persist the enriched DataFrame to the preprocessed-data path.
pd.to_pickle(df, new_path)

In [39]:
#from unfied_features import create_unified_features

In [2]:
# Re-select the dataset and rebuild the paths so this part can run without the cells above.
# NOTE(review): the execution counter restarts here, so this cell was run in a separate
# kernel session; it still needs `datasets` and `nx` from the first import cell.
dataset = datasets[5]
name = dataset.name
print("dataset: {}".format(name))

new_path = "./datasets/preprocessed/{}.pkl".format(name)
graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)
# Reload the graph from the GEXF file at graph_path.
# NOTE(review): node ids and attribute types may differ after a GEXF round-trip — confirm.
G = nx.read_gexf(graph_path)

dataset: partition_3


In [41]:
"""network4_features = create_unified_features(G, dataset)
print(network4_features.shape)
print(network4_features[:5])  # Slicing to get the first 5 rows

(77195, 15)
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.29543747e-05
   0.00000000e+00  0.00000000e+00  3.69547853e-13  0.00000000e+00
   5.03761292e-06  0.00000000e+00  1.26880411e-05  1.11111111e-01
   1.66666667e-01  0.00000000e+00  1.64462121e-05]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.59087494e-05
   0.00000000e+00  0.00000000e+00  3.69547853e-13  0.00000000e+00
   5.03761292e-06  0.00000000e+00  1.26880411e-05  2.22222222e-01
   1.66666667e-01  0.00000000e+00 -5.98429096e-07]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.29543747e-05
   0.00000000e+00  0.00000000e+00  3.69547853e-13  0.00000000e+00
   5.03761292e-06  0.00000000e+00  1.26880411e-05  1.11111111e-01
   1.66666667e-01  0.00000000e+00  1.59856786e-05]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  9.06806228e-04
   0.00000000e+00  0.00000000e+00  2.84921395e-10  9.06806228e-04
   2.75355286e-04  0.00000000e+00  1.26880411e-05  1.11111111e-01
   1.66666667e-01  0.00000000e+00  1.033015

In [4]:
from autoencoder import extract_features, build_autoencoder
import numpy as np
from tensorflow.keras import layers, Model

In [5]:
# Build the feature matrix from the graph for the dataset's measures.
# NOTE(review): presumably one row per node and one column per measure — confirm in autoencoder.extract_features.
network4_features = extract_features(G, dataset.cn_measures)

In [6]:
# Quick look at the extracted feature matrix.
print(network4_features)

[[ 0.00000000e+00  0.00000000e+00  1.29543747e-05 ...  1.11111111e-01
   1.66666667e-01  1.64462121e-05]
 [ 0.00000000e+00  0.00000000e+00  2.59087494e-05 ...  2.22222222e-01
   1.66666667e-01 -5.98429096e-07]
 [ 0.00000000e+00  0.00000000e+00  1.29543747e-05 ...  1.11111111e-01
   1.66666667e-01  1.59856786e-05]
 ...
 [ 0.00000000e+00  0.00000000e+00  1.29543747e-05 ...  1.11111111e-01
   1.66666667e-01  1.63993973e-05]
 [ 0.00000000e+00  0.00000000e+00  3.88631241e-05 ...  3.33333333e-01
   2.50000000e-01 -1.77077864e-06]
 [ 0.00000000e+00  0.00000000e+00  1.29543747e-05 ...  1.11111111e-01
   1.66666667e-01  1.63638151e-05]]


In [47]:
# Take the input dimension from the feature matrix itself, so the model always matches
# the data actually passed to fit() even if the number of extracted columns differs
# from len(dataset.cn_measures).
input_dim = network4_features.shape[1]
# Build and train the autoencoder to reconstruct its own input (features are both x and y).
# NOTE(review): no random seed is set, so the learned weights differ between runs.
autoencoder = build_autoencoder(input_dim)
# Keep the training history (per-epoch metrics) instead of echoing the History repr.
history = autoencoder.fit(network4_features, network4_features, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100


2025-02-26 13:01:05.533801: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x7fa0559b6bf0>

In [48]:
# Build an encoder that stops at the output of layer index 1 of the trained autoencoder
# (presumably the bottleneck layer — verify against build_autoencoder), then encode all rows.
bottleneck_output = autoencoder.layers[1].output
encoder = Model(inputs=autoencoder.input, outputs=bottleneck_output)
latent_vectors = encoder.predict(network4_features)

# Report the dimensions of the encoded representation
print("Latent vectors shape:", latent_vectors.shape)

Latent vectors shape: (77195, 7)


In [49]:
# Inspect a small sample of the encoded vectors.
print(latent_vectors[:12])  # Print the first 12 rows

[[0.12415282 0.         0.02429291 0.         0.12671897 0.02076196
  0.        ]
 [0.180379   0.         0.07920459 0.         0.11015292 0.02350754
  0.        ]
 [0.12415282 0.         0.0242929  0.         0.12671895 0.02076199
  0.        ]
 [0.12399453 0.         0.02375165 0.         0.12626277 0.02171947
  0.        ]
 [0.12415272 0.         0.02429252 0.         0.12671857 0.02076272
  0.        ]
 [0.12415282 0.         0.0242929  0.         0.12671895 0.02076199
  0.        ]
 [0.12415272 0.         0.02429252 0.         0.12671857 0.02076272
  0.        ]
 [0.12415272 0.         0.02429252 0.         0.12671857 0.02076272
  0.        ]
 [0.180379   0.         0.07920459 0.         0.11015292 0.02350754
  0.        ]
 [0.12415272 0.         0.02429252 0.         0.12671857 0.02076272
  0.        ]
 [0.12415272 0.         0.02429252 0.         0.12671857 0.02076272
  0.        ]
 [0.12415282 0.         0.0242929  0.         0.12671895 0.02076199
  0.        ]]
