In [2]:
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# `datasets` is a list of dataset descriptors; each one provides the dataset path, its key column names, and the complex-network features suited to it
from src.data.dataset_info import datasets


In [2]:
# Select the dataset descriptor to preprocess.
dataset = datasets[4]
name = dataset.name

# Locations derived from the dataset name: raw partition, preprocessed
# frame, and exported graph.
path = "./datasets/partitions/{}.pkl".format(name)
new_path = "./datasets/preprocessed/{}.pkl".format(name)
graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

print("dataset: {}".format(name))

# NOTE: unpickling runs arbitrary code embedded in the file; only load
# partitions that were produced by a trusted pipeline.
df = pd.read_pickle(path)

dataset: partition_1


In [3]:
# Columns that decide whether two records are duplicates: everything except
# the timestamp and the flow identifier.
dedup_columns = list(set(df.columns) - {dataset.timestamp_col, dataset.flow_id_col})

# Turn +/-inf into NaN, drop every row holding a NaN, then keep only the
# first occurrence of each duplicated record.
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0, how="any")
    .drop_duplicates(subset=dedup_columns, keep="first")
)

In [4]:
# Summarise the cleaned dataset (size, class balance, attack names) and
# store the summary as JSON under ./datasets_properties/.
total_count = len(df)

# Counting matches on the boolean mask avoids materialising filtered copies.
num_benign = int((df['Label'] == 0).sum())
num_attack = int((df['Label'] == 1).sum())

properties = {
    "name": dataset.name,
    "length": total_count,
    "num_benign": num_benign,
    # Guard against an empty frame (everything dropped during cleaning).
    "percentage_of_benign_records": ((num_benign * 100) / total_count) if total_count else 0.0,
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100) / total_count) if total_count else 0.0,
    # .tolist() converts NumPy scalars to plain Python objects so the list
    # is always JSON-serialisable.
    "attacks": df["Attack"].unique().tolist(),
}

filename = './datasets_properties/{}.json'.format(dataset.name)
# The context manager closes the file even if serialisation fails.
with open(filename, 'w') as outfile:
    json.dump(properties, outfile)

In [5]:
# Directed graph whose edges go from each record's source IP column to its
# destination IP column.
G = nx.from_pandas_edgelist(
    df,
    dataset.src_ip_col,
    dataset.dst_ip_col,
    create_using=nx.DiGraph(),
)

In [6]:
# Collect the zero-degree nodes first (the graph must not change while it is
# being iterated), then remove them.
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)

In [7]:
import random

import igraph as ig

# Infomap is stochastic. python-igraph draws its random numbers from Python's
# `random` module by default, so seeding it here makes the partition (and
# every community-derived measure below) reproducible across runs.
INFOMAP_SEED = 42
random.seed(INFOMAP_SEED)

# Convert to igraph and run Infomap community detection.
G1 = ig.Graph.from_networkx(G)
part = G1.community_infomap()

# from_networkx keeps the original node identifier in the '_nx_name' vertex
# attribute; read it once and translate vertex indices back to node names.
vertex_names = G1.vs['_nx_name']
communities = [[vertex_names[node_index] for node_index in com] for com in part]

print(f"==>> number of communities: {len(communities)}")
for com in communities:
    print(f"==>> com: {len(com)}")

==>> number of communities: 147
==>> com: 73
==>> com: 89
==>> com: 66669
==>> com: 684
==>> com: 941
==>> com: 79
==>> com: 65
==>> com: 255
==>> com: 25
==>> com: 56
==>> com: 2
==>> com: 3343
==>> com: 841
==>> com: 98
==>> com: 106
==>> com: 162
==>> com: 118
==>> com: 547
==>> com: 6
==>> com: 83
==>> com: 85
==>> com: 10
==>> com: 6
==>> com: 92
==>> com: 78
==>> com: 7
==>> com: 474
==>> com: 1048
==>> com: 37
==>> com: 22
==>> com: 2
==>> com: 7
==>> com: 190
==>> com: 19
==>> com: 12
==>> com: 68
==>> com: 8
==>> com: 2
==>> com: 64
==>> com: 101
==>> com: 10
==>> com: 97
==>> com: 12
==>> com: 125
==>> com: 2
==>> com: 15
==>> com: 2
==>> com: 2
==>> com: 2
==>> com: 2
==>> com: 58
==>> com: 7
==>> com: 2
==>> com: 2
==>> com: 6
==>> com: 3
==>> com: 16
==>> com: 11
==>> com: 37
==>> com: 2
==>> com: 3
==>> com: 87
==>> com: 2
==>> com: 36
==>> com: 2
==>> com: 2
==>> com: 2
==>> com: 39
==>> com: 3
==>> com: 22
==>> com: 2
==>> com: 3
==>> com: 21
==>> com: 9
==>> com: 20
==

In [8]:
# Start a fresh summary dict for graph-level statistics.
properties = {
    "number_of_nodes": G.number_of_nodes(),
    "number_of_edges": G.number_of_edges(),
}

# Per-node degree values, in the graph's node order.
degrees = list(dict(G.degree()).values())
properties["max_degree"] = max(degrees)
properties["avg_degree"] = sum(degrees) / len(degrees)

In [9]:
# Graph transitivity as computed by networkx, recorded in the summary.
transitivity = nx.transitivity(G)
properties["transitivity"] = transitivity


In [10]:
# Edge density of the graph, recorded in the summary.
density = nx.density(G)
properties["density"] = density


In [11]:
# Mixing parameter: the share of edges whose endpoints lie in different
# communities. `communities` is the list of node-name lists built above.

# Look-up table from node to the index of the community containing it.
node_to_community = {
    node: community_index
    for community_index, community in enumerate(communities)
    for node in community
}

# Count the edges that cross a community boundary.
inter_cluster_edges = sum(
    1
    for u, v in G.edges()
    if node_to_community[u] != node_to_community[v]
)

properties["mixing_parameter"] = inter_cluster_edges / G.number_of_edges()

In [12]:
# Modularity of the detected partition, recorded in the summary.
modularity = nx.community.modularity(G, communities)
properties["modularity"] = modularity

In [13]:
# Store the graph-level statistics as JSON next to the dataset summary.
filename = './datasets_properties/{}.json'.format("graph_" + name)
# The context manager closes the file even if serialisation fails.
with open(filename, 'w') as outfile:
    json.dump(properties, outfile)

# Display the saved statistics as the cell output.
properties

{'number_of_nodes': 77530,
 'number_of_edges': 115734,
 'max_degree': 24818,
 'avg_degree': 2.9855281826389786,
 'transitivity': 0.04012570053928307,
 'density': 1.925426732344657e-05,
 'mixing_parameter': 0.054651182884891215,
 'modularity': 0.17697251172536843}

In [14]:
# Map every node to the index of its community and store that index on the
# graph as the "new_community" node attribute.
community_labels = {
    node: i
    for i, community in enumerate(communities)
    for node in community
}

nx.set_node_attributes(G, values=community_labels, name="new_community")

In [15]:
# Split the graph into its intra-community and inter-community parts; these
# are used to compute the local and global variant of each centrality.
from src.network.network_features import separate_graph

separated_graphs = separate_graph(G, communities)
intra_graph, inter_graph = separated_graphs

In [16]:
from src.network.network_features import cal_betweenness_centrality

# Betweenness on the full graph, only when the dataset asks for it.
if "betweenness" in dataset.cn_measures:
    betweenness_scores = cal_betweenness_centrality(G)
    nx.set_node_attributes(G, betweenness_scores, "betweenness")
    print("calculated")

calculated


In [17]:
# Betweenness computed on the intra-community graph, stored on G.
if "local_betweenness" in dataset.cn_measures:
    local_betweenness_scores = cal_betweenness_centrality(intra_graph)
    nx.set_node_attributes(G, local_betweenness_scores, "local_betweenness")
    print("calculated")

In [18]:
# Betweenness computed on the inter-community graph, stored on G.
if "global_betweenness" in dataset.cn_measures:
    global_betweenness_scores = cal_betweenness_centrality(inter_graph)
    nx.set_node_attributes(G, global_betweenness_scores, "global_betweenness")
    print("calculated")

calculated


In [19]:
# Degree centrality on the full graph.
if "degree" in dataset.cn_measures:
    degree_scores = nx.degree_centrality(G)
    nx.set_node_attributes(G, degree_scores, "degree")
    print("calculated")

calculated


In [20]:
# Degree centrality computed on the intra-community graph, stored on G.
if "local_degree" in dataset.cn_measures:
    local_degree_scores = nx.degree_centrality(intra_graph)
    nx.set_node_attributes(G, local_degree_scores, "local_degree")
    print("calculated")

In [21]:
# Degree centrality computed on the inter-community graph, stored on G.
if "global_degree" in dataset.cn_measures:
    global_degree_scores = nx.degree_centrality(inter_graph)
    nx.set_node_attributes(G, global_degree_scores, "global_degree")
    print("calculated")

calculated


In [22]:
# Eigenvector centrality on the full graph, with a raised iteration cap.
if "eigenvector" in dataset.cn_measures:
    eigenvector_scores = nx.eigenvector_centrality(G, max_iter=600)
    nx.set_node_attributes(G, eigenvector_scores, "eigenvector")
    print("calculated")


calculated


In [23]:
# Eigenvector centrality computed on the intra-community graph, stored on G.
# Uses the same raised iteration cap as the full-graph eigenvector cell, so
# the power iteration gets the same budget before it gives up.
if "local_eigenvector" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.eigenvector_centrality(intra_graph, max_iter=600), "local_eigenvector")
    print("calculated")

In [24]:
# Eigenvector centrality computed on the inter-community graph, stored on G.
# Uses the same raised iteration cap as the full-graph eigenvector cell, so
# the power iteration gets the same budget before it gives up.
if "global_eigenvector" in dataset.cn_measures:
    nx.set_node_attributes(G, nx.eigenvector_centrality(inter_graph, max_iter=600), "global_eigenvector")
    print("calculated")

In [25]:
# Closeness centrality on the full graph.
if "closeness" in dataset.cn_measures:
    closeness_scores = nx.closeness_centrality(G)
    nx.set_node_attributes(G, closeness_scores, "closeness")
    print("calculated")

calculated


In [26]:
# Closeness centrality computed on the intra-community graph, stored on G.
if "local_closeness" in dataset.cn_measures:
    local_closeness_scores = nx.closeness_centrality(intra_graph)
    nx.set_node_attributes(G, local_closeness_scores, "local_closeness")
    print("calculated")

In [27]:
# Closeness centrality computed on the inter-community graph, stored on G.
if "global_closeness" in dataset.cn_measures:
    global_closeness_scores = nx.closeness_centrality(inter_graph)
    nx.set_node_attributes(G, global_closeness_scores, "global_closeness")
    print("calculated")

In [28]:
# PageRank on the full graph with damping factor 0.85.
if "pagerank" in dataset.cn_measures:
    pagerank_scores = nx.pagerank(G, alpha=0.85)
    nx.set_node_attributes(G, pagerank_scores, "pagerank")
    print("calculated")

calculated


In [29]:
# PageRank (damping 0.85) computed on the intra-community graph, stored on G.
if "local_pagerank" in dataset.cn_measures:
    local_pagerank_scores = nx.pagerank(intra_graph, alpha=0.85)
    nx.set_node_attributes(G, local_pagerank_scores, "local_pagerank")
    print("calculated")

In [30]:
# PageRank (damping 0.85) computed on the inter-community graph, stored on G.
if "global_pagerank" in dataset.cn_measures:
    global_pagerank_scores = nx.pagerank(inter_graph, alpha=0.85)
    nx.set_node_attributes(G, global_pagerank_scores, "global_pagerank")
    print("calculated")

calculated


In [31]:
from src.network.network_features import cal_k_core

# k-core value per node, as returned by the project helper.
if "k_core" in dataset.cn_measures:
    k_core_scores = cal_k_core(G)
    nx.set_node_attributes(G, k_core_scores, "k_core")
    print("calculated")

calculated


In [32]:
from src.network.network_features import cal_k_truss

# k-truss value per node, as returned by the project helper.
if "k_truss" in dataset.cn_measures:
    k_truss_scores = cal_k_truss(G)
    nx.set_node_attributes(G, k_truss_scores, "k_truss")
    print("calculated")

calculated


In [33]:
from src.network.CommCentralityCode import comm_centreality

# Community-aware centrality computed from the graph and the node -> community
# index mapping.
if "Comm" in dataset.cn_measures:
    comm_scores = comm_centreality(G, community_labels)
    nx.set_node_attributes(G, comm_scores, "Comm")
    print("calculated")

In [34]:
from src.network.modularity_vitality import modularity_vitality

# Modularity vitality is computed on the igraph copy (G1) and its Infomap
# partition, then stored on the networkx graph.
# NOTE(review): presumably the helper returns a mapping keyed by the networkx
# node names -- confirm, since set_node_attributes matches values by node key.
if "mv" in dataset.cn_measures:
    mv_scores = modularity_vitality(G1, part)
    nx.set_node_attributes(G, mv_scores, "mv")
    print("calculated")

calculated


In [35]:
# Write G to graph_path in GEXF format; a later cell reloads it from the same
# path with nx.read_gexf.
nx.write_gexf(G, graph_path)

In [36]:
# Collect, for every configured measure, the {node: value} mapping stored on
# the graph, and report how many nodes carry each measure.
features_dicts = {}
for measure in dataset.cn_measures:
    features_dicts[measure] = nx.get_node_attributes(G, measure)
    print(f"==>> features_dicts: {measure , len(features_dicts[measure])}")

# Which endpoint column each feature-name prefix refers to.
endpoint_columns = {"src": dataset.src_ip_col, "dst": dataset.dst_ip_col}

# Feature names look like "<src|dst>_<measure>": the first three characters
# pick the endpoint and everything after the separator names the measure.
for feature in dataset.network_features:
    endpoint_col = endpoint_columns.get(feature[:3])
    if endpoint_col is None:
        # Neither a source nor a destination feature: nothing to attach.
        continue
    measure_values = features_dicts[feature[4:]]
    # Map the single endpoint column instead of applying a lambda to every
    # full row; endpoints missing from the graph get the sentinel -1.
    df[feature] = df[endpoint_col].map(lambda ip: measure_values.get(ip, -1))

==>> features_dicts: ('betweenness', 77530)
==>> features_dicts: ('global_betweenness', 77530)
==>> features_dicts: ('degree', 77530)
==>> features_dicts: ('global_degree', 77530)
==>> features_dicts: ('eigenvector', 77530)
==>> features_dicts: ('closeness', 77530)
==>> features_dicts: ('pagerank', 77530)
==>> features_dicts: ('global_pagerank', 77530)
==>> features_dicts: ('k_core', 77530)
==>> features_dicts: ('k_truss', 77530)
==>> features_dicts: ('mv', 77530)


In [37]:
# Preview the first rows, including the src_*/dst_* columns added above.
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,src_pagerank,dst_pagerank,src_global_pagerank,dst_global_pagerank,src_k_core,dst_k_core,src_k_truss,dst_k_truss,src_mv,dst_mv
1337940,18.194.169.124-192.168.1.36-80-52390-6,18.194.169.124,80.0,192.168.1.36,52390.0,6.0,27/04/2019 03:32:56 pm,742498.0,7.0,6.0,...,1.3e-05,4.3e-05,1.1e-05,8.6e-05,0.888889,1.0,0.003361,0.010084,0.000309,0.000616
1337941,192.168.1.195-192.168.1.39-80-39630-6,192.168.1.195,80.0,192.168.1.39,39630.0,6.0,27/04/2019 10:30:11 pm,15.0,2.0,0.0,...,0.000414,5e-05,0.000252,5.9e-05,1.0,1.0,0.020168,0.011765,0.002155,0.000605
1337942,192.168.1.35-18.194.169.124-52926-80-6,192.168.1.35,52926.0,18.194.169.124,80.0,6.0,27/04/2019 07:46:40 pm,825916.0,6.0,4.0,...,5.5e-05,1.3e-05,5.5e-05,1.1e-05,1.0,0.888889,0.013445,0.003361,0.000614,0.000309
1337943,98.102.10.246-69.151.192.168-0-0-0,98.102.10.246,0.0,69.151.192.168,0.0,0.0,30/04/2019 12:35:45 am,34488823.0,3.0,0.0,...,5e-06,0.076029,1.1e-05,1.1e-05,0.111111,0.222222,0.003361,0.005042,0.000198,-0.030097
1337944,192.168.1.39-192.168.1.1-45333-53-17,192.168.1.39,45333.0,192.168.1.1,53.0,17.0,27/04/2019 05:52:15 pm,61.0,2.0,2.0,...,5e-05,3.8e-05,5.9e-05,5.4e-05,1.0,1.0,0.011765,0.013445,0.000605,0.000473


In [38]:
# Persist the enriched frame to the preprocessed location.
df.to_pickle(new_path)

In [5]:
#from unfied_features import create_unified_features

In [3]:
# Re-select the dataset descriptor and rebuild the output locations, so this
# part of the notebook can start from the saved graph.
dataset = datasets[4]
name = dataset.name

new_path = "./datasets/preprocessed/{}.pkl".format(name)
graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

print("dataset: {}".format(name))

# Load the annotated graph that was written out earlier.
G = nx.read_gexf(graph_path)

dataset: partition_2


In [7]:
"""network3_features = create_unified_features(G, dataset)
print(network3_features.shape)
print(network3_features[:5])  # Slicing to get the first 5 rows

(77329, 15)
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 1.29319263e-05
  0.00000000e+00 0.00000000e+00 3.40761134e-13 0.00000000e+00
  5.43512577e-06 0.00000000e+00 8.98679610e-06 1.11111111e-01
  2.00000000e-01 0.00000000e+00 1.22176794e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.29319263e-05
  0.00000000e+00 0.00000000e+00 3.40761134e-13 0.00000000e+00
  5.43512577e-06 0.00000000e+00 8.98679610e-06 1.11111111e-01
  2.00000000e-01 0.00000000e+00 1.20684941e-03]
 [3.01026387e-08 0.00000000e+00 0.00000000e+00 1.17680530e-03
  0.00000000e+00 1.29319263e-05 3.37694284e-10 1.16387337e-03
  4.33571723e-04 0.00000000e+00 2.27660522e-05 1.11111111e-01
  2.00000000e-01 0.00000000e+00 2.42931761e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.29319263e-05
  0.00000000e+00 0.00000000e+00 3.40761134e-13 0.00000000e+00
  5.43512577e-06 0.00000000e+00 8.98679610e-06 1.11111111e-01
  2.00000000e-01 0.00000000e+00 1.20684941e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 1.293

In [5]:
from autoencoder import extract_features, build_autoencoder
import numpy as np
from tensorflow.keras import layers, Model

In [6]:
# Build the feature matrix from the graph for the configured measures.
# NOTE(review): presumably one row per node and one column per measure --
# confirm against autoencoder.extract_features.
network3_features = extract_features(G, dataset.cn_measures)
print(network3_features)

[[0.00000000e+00 0.00000000e+00 1.29319263e-05 ... 1.11111111e-01
  2.00000000e-01 1.22176794e-03]
 [0.00000000e+00 0.00000000e+00 1.29319263e-05 ... 1.11111111e-01
  2.00000000e-01 1.20684941e-03]
 [3.01026387e-08 0.00000000e+00 1.17680530e-03 ... 1.11111111e-01
  2.00000000e-01 2.42931761e-03]
 ...
 [0.00000000e+00 0.00000000e+00 2.58638527e-05 ... 2.22222222e-01
  2.00000000e-01 1.20550005e-03]
 [0.00000000e+00 0.00000000e+00 1.29319263e-05 ... 1.11111111e-01
  2.00000000e-01 1.22167293e-03]
 [0.00000000e+00 0.00000000e+00 1.29319263e-05 ... 1.11111111e-01
  2.00000000e-01 1.22167293e-03]]


In [13]:
# Size the input layer from the feature matrix itself rather than from the
# configured measure list, so the model always matches the data it is fed.
# NOTE(review): assumes extract_features returns a 2-D array (rows x
# features), as its printed output suggests.
input_dim = network3_features.shape[1]

# Build the autoencoder and train it to reconstruct its own input; 20% of the
# rows are held out for validation.
autoencoder = build_autoencoder(input_dim)
# Keep the History object so the loss curves can be inspected afterwards,
# instead of leaving it as an opaque cell output.
history = autoencoder.fit(
    network3_features,
    network3_features,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/100


2025-02-26 12:49:53.901978: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x7fb1d61de380>

In [14]:
# Build an encoder that shares the trained autoencoder's input and stops at
# the output of its layer at index 1, then encode every feature row.
# NOTE(review): layers[1] is presumably the latent layer -- confirm against
# build_autoencoder.
latent_output = autoencoder.layers[1].output
encoder = Model(inputs=autoencoder.input, outputs=latent_output)
latent_vectors = encoder.predict(network3_features)

# Report the shape of the encoded matrix.
print("Latent vectors shape:", latent_vectors.shape)

Latent vectors shape: (77329, 7)


In [15]:
# Show the first 12 encoded rows as a quick sanity check.
latent_preview = latent_vectors[:12]
print(latent_preview)

[[0.10269186 0.00070649 0.01854086 0.00341405 0.00087464 0.04781345
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]
 [0.10334972 0.00054727 0.01815611 0.00490579 0.00088614 0.04765771
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]
 [0.10582863 0.05044217 0.02614235 0.00243813 0.07013893 0.04284716
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]
 [0.10269185 0.00070651 0.01854094 0.00341411 0.00087458 0.04781333
  0.        ]
 [0.10269185 0.00070651 0.01854095 0.00341411 0.00087457 0.04781332
  0.        ]
 [0.16642408 0.09747075 0.09221076 0.01062869 0.10451015 0.07871825
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]
 [0.10269158 0.00070715 0.01854333 0.00341588 0.00087286 0.04780988
  0.        ]]
