# Analysis of Network Traffic for Intrusion Detection

This notebook analyzes network traffic data to detect potential intrusions. The analysis includes data preprocessing, feature exploration, and the application of machine learning models to classify network behavior as normal or suspicious. Three model families are supported: a convolutional neural network (CNN), a long short-term memory network (LSTM), and a gated recurrent unit network (GRU).

## Data Loading

In this section, we load the network traffic dataset from its source. The dataset includes various features related to network activity, such as source IP, destination IP, packet sizes, and timestamps. Understanding the structure of this data is crucial for our analysis and subsequent feature engineering.

In [5]:
import time
import json
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

from src.data.dataset_info import datasets
from src.models import MyCNN, MyLSTM, MyGRU
# from src.models.dense_nn import  MyDenseNN

# --- Experiment configuration -------------------------------------------------
# Classification mode: True -> one class per attack type, False -> two classes.
multi_class = True
# Whether the columns listed in dataset.network_features are kept as model inputs.
with_network_features = False

# Whether rows are ordered by timestamp before splitting.
with_sort_timestamp = True
sequence_length = 3
with_cross_validation = True
cross_validation_splits_num = 5

# --- Dataset selection --------------------------------------------------------
dataset = datasets[0]
name = dataset.name
print(f"dataset: {name}")
path = f"./datasets/preprocessed/{name}.pkl"
# graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

# --- Load the preprocessed dataframe ------------------------------------------
# NOTE: unpickling executes arbitrary code; only load files produced by this
# project's own preprocessing step.
df = pd.read_pickle(path)

dataset: cic_ton_iot


In [6]:
# Input dimension of the training set: every column of the loaded frame except
# the dropped columns, the weak columns, and the label column.
input_dim = df.shape[1] - len(dataset.drop_columns) - len(dataset.weak_columns) - 1  # for the label_column

# The network feature columns are removed as well when they are disabled.
if not with_network_features:
    input_dim = input_dim - len(dataset.network_features)

# Number of classes: 2 for binary classification; for multi-class it is the
# number of distinct values in the dataset's class column. The class column is
# read from the dataset description (instead of a hard-coded "Attack") so the
# count always comes from the same column the preprocessing cell factorizes
# into labels.
num_classes = 2
if multi_class:
    num_classes = len(df[dataset.class_col].unique())

num_epochs = 30

dropped_columns = dataset.drop_columns
dataset_name = dataset.name

## Model initialization

In [7]:
# Network feature names handed to the models; empty when they are disabled.
nf = dataset.network_features if with_network_features else []

# Models evaluated in the cross-validation loop. Uncomment an entry to include
# it in the experiment.
models = [
    MyCNN(
        input_dim=input_dim,
        dataset_name=dataset_name,
        num_classes=num_classes,
        multi_class=multi_class,
        network_features=nf,
        epochs=num_epochs,
        batch_size=256,
        early_stop_patience=10,
    ),
    # MyLSTM(
    #     sequence_length=sequence_length,
    #     input_dim=input_dim,
    #     dataset_name=dataset_name,
    #     num_classes=num_classes,
    #     multi_class=multi_class,
    #     network_features=nf,
    #     use_generator=True,
    #     epochs=num_epochs,
    #     batch_size=256,
    #     early_stop_patience=10,
    # ),
    # MyGRU(
    #     sequence_length=sequence_length,
    #     input_dim=input_dim,
    #     dataset_name=dataset_name,
    #     num_classes=num_classes,
    #     multi_class=multi_class,
    #     network_features=nf,
    #     use_generator=True,
    #     epochs=num_epochs,
    #     batch_size=256,
    #     early_stop_patience=10,
    # ),
]

In [8]:
# Dictionary holding every option of the experiment together with the results
# of the models, so that saved results can be traced back to their settings.
results = {
    # The notebook splits the data with sklearn's TimeSeriesSplit, so the
    # recorded configuration names that scheme (it previously claimed
    # "stratified k-fold", which did not match the split actually performed).
    "configuration": "time series split cross validation",
    "multi_class": multi_class,
    "with_sort_timestamp": with_sort_timestamp,
    "sequence_length": sequence_length,
    "with_cross_validation": with_cross_validation,
    "cross_validation_splits_num": cross_validation_splits_num,
    "with_network_features": with_network_features,
    # NOTE(review): this records dataset.cn_measures while the models are
    # given dataset.network_features -- confirm the two are meant to differ.
    "network_features": dataset.cn_measures,

    "dataset_name": dataset_name,
    "input_dim": input_dim,
    "dropped_columns": dropped_columns,
    "num_dropped_columns": len(dropped_columns),

    # Filled in by the training loop:
    "models": {},       # per-model scores / class reports / running averages
    "average_acc": {},  # per-model mean accuracy over the folds run so far
    "average": {},      # per-model mean of every metric over the folds run so far
}

## Data Preprocessing

In [9]:

# Parse the timestamp strings (after trimming surrounding whitespace) and
# order the rows by time.
if with_sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(
        df[dataset.timestamp_col].str.strip(),
        format=dataset.timestamp_format,
    )
    df = df.sort_values(dataset.timestamp_col)

# Mapping from integer label to a readable class name.
if multi_class:
    # Encode each distinct value of the class column as an integer label,
    # numbered in order of first appearance.
    codes, uniques = pd.factorize(df[dataset.class_col])
    labels_names = dict(enumerate(uniques))
    print(f"==>> labels_names: {labels_names}")
    df[dataset.label_col] = codes  # type: ignore
else:
    labels_names = {0: "benign", 1: "attack"}

# Remove the columns that are not used as model inputs.
df = df.drop(columns=dataset.drop_columns).drop(columns=dataset.weak_columns)

if not with_network_features:
    df = df.drop(columns=dataset.network_features)

# Re-number the rows 0..n-1 so positions match the (possibly re-sorted) order.
df = df.reset_index(drop=True)


==>> labels_names: {0: 'Benign', 1: 'xss', 2: 'password', 3: 'scanning', 4: 'injection', 5: 'ransomware', 6: 'backdoor', 7: 'mitm', 8: 'ddos', 9: 'dos'}


In [10]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,Protocol,Tot Fwd Pkts,TotLen Fwd Pkts,Fwd Pkt Len Min,Bwd Pkt Len Min,Flow Byts/s,Flow IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Std,...,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Max,Active Min,Idle Std,Idle Max,Idle Min,Label
0,0.0,3.0,0.0,0.0,0.0,0.0,63064970.0,102196170.0,13008834.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,897317100000000.0,1554199000000000.0,13008830.0,0
1,0.0,3.0,0.0,0.0,0.0,0.0,1373953.0,3991265.0,2048202.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1554199000000000.0,1554199000000000.0,0
2,0.0,3.0,0.0,0.0,0.0,0.0,15640270.0,43827824.0,21709138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,897317200000000.0,1554199000000000.0,21709140.0,0
3,0.0,3.0,0.0,0.0,0.0,0.0,42313000.0,75008320.0,15168707.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,897317200000000.0,1554199000000000.0,15168710.0,0
4,0.0,3.0,0.0,0.0,0.0,0.0,1880029.0,57137762.0,54478999.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,897317200000000.0,1554199000000000.0,54479000.0,0


In [11]:
# Separate the label vector from the feature matrix. The label column name is
# taken from the dataset description on both lines (it was hard-coded as
# 'Label' on the first one only).
labels = df[dataset.label_col].to_numpy()
# NOTE: `df` is rebound from a DataFrame to a NumPy array here, so this cell
# cannot be re-run without re-running the preprocessing cells first.
df = df.drop(columns=[dataset.label_col]).to_numpy()

## Time series Training

In [12]:

# Cross-validation with sklearn's TimeSeriesSplit: in every fold the test rows
# come after the training rows, so (when the rows were sorted by timestamp)
# a model is never evaluated on traffic older than what it was trained on.
tscv = TimeSeriesSplit(n_splits=cross_validation_splits_num)

# Score key returned by model.evaluate() -> key under which its mean over the
# folds is stored. The insertion order fixes the key order of the saved dicts.
average_keys = {
    "accuracy": "average_acc",
    "recall": "average_recall",
    "precision": "average_precision",
    "f1s": "average_f1s",
    "FPR": "average_FPR",
    "FNR": "average_FNR",
}

for fold, (train_index, test_index) in enumerate(tscv.split(df), start=1):
    training_labels = labels[train_index]
    testing_labels = labels[test_index]
    print(f"==>> train_index: {train_index}")
    print(f"==>> training_labels: {training_labels.shape}")
    print(f"==>> test_index: {test_index}")
    print(f"==>> testing_labels: {testing_labels.shape}")

    print("=====================================")
    print("=====================================")
    print("fold: {}/{}".format(fold, cross_validation_splits_num))

    # The split is the same for every model, so the features are scaled once
    # per fold instead of once per model. The scaler is fitted on the training
    # rows only and then applied to the test rows.
    scaler = MinMaxScaler()
    training = scaler.fit_transform(df[train_index])
    testing = scaler.transform(df[test_index])

    # --- train and evaluate every model on this fold --------------------------
    for model in models:
        print("training: {}".format(model.model_name()))
        print("sequential: {}".format(model.sequential))

        model.build()
        model.train(training, training_labels)  # type: ignore
        predictions, prediction_time = model.predict(testing)  # type: ignore
        model_name, scores, class_report = model.evaluate(  # type: ignore
            predictions,
            testing_labels,
            prediction_time
        )
        scores["fold"] = fold

        # Start from a clean entry on the first fold so that re-running this
        # cell does not append to the results of a previous run.
        if fold == 1:
            results["models"][model_name] = {
                "scores": [],
                "class_report": [],
                "average": [],
            }
        results["models"][model_name]["scores"].append(scores)
        results["models"][model_name]["class_report"].append(class_report)
        print("{}: {}".format(model_name, scores))

    # --- running averages over the folds completed so far ---------------------
    for model in models:
        model_name = model.model_name()
        fold_scores = results["models"][model_name]["scores"]  # type: ignore
        averages = {
            average_key: sum(score[metric] for score in fold_scores) / fold
            for metric, average_key in average_keys.items()
        }
        # History of the running averages, one entry per fold.
        results["models"][model_name]["average"].append({**averages, "fold": fold})
        # Latest running averages, overwritten on every fold.
        results["average_acc"][model_name] = averages["average_acc"]
        results["average"][model_name] = averages
        print("{} average accuracy: {}".format(model_name, averages["average_acc"]))

results["endtime"] = time.strftime("%Y:%m:%d-%H:%M:%S")

print(f"==>> results: {results}")


==>> train_index: [     0      1      2 ... 891765 891766 891767]
==>> training_labels: (891768,)
==>> test_index: [ 891768  891769  891770 ... 1783528 1783529 1783530]
==>> testing_labels: (891763,)
fold: 1
fold: 1/5
training: cnn mc  cnn-64-64
sequential: False
37
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 35, 64)            256       
                                                                 
 max_pooling1d (MaxPooling1  (None, 17, 64)            0         
 D)                                                              
                                                                 
 batch_normalization (Batch  (None, 17, 64)            256       
 Normalization)                                                  
                                                                 
 flatten (Flatten)           (None, 1088)          

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-02-0.5021.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-03-0.4890.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-04-0.4830.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-05-0.4794.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-06-0.4767.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-07-0.4744.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-08-0.4730.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-09-0.4721.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weights/cic_ton_iot/

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-02-0.5424.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-03-0.5352.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-04-0.5315.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-05-0.5292.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-06-0.5275.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-07-0.5260.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-08-0.5248.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-09-0.5237.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weights/cic_ton_iot/

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-02-0.4919.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-03-0.4853.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-04-0.4819.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-05-0.4797.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-06-0.4783.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-07-0.4772.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-08-0.4762.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-09-0.4753.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weights/cic_ton_iot/

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-02-0.4627.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-03-0.4573.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-04-0.4504.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-05-0.4524.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-06-0.4506.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-07-0.4492.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-08-0.4480.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-09-0.4471.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weights/cic_ton_iot/

  saving_api.save_model(


Epoch 2: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-02-0.4411.hdf5
Epoch 3/30
Epoch 3: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-03-0.4357.hdf5
Epoch 4/30
Epoch 4: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-04-0.4332.hdf5
Epoch 5/30
Epoch 5: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-05-0.4313.hdf5
Epoch 6/30
Epoch 6: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-06-0.4293.hdf5
Epoch 7/30
Epoch 7: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-07-0.4275.hdf5
Epoch 8/30
Epoch 8: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-08-0.4263.hdf5
Epoch 9/30
Epoch 9: saving model to ./models/weights/cic_ton_iot/cnn mc  cnn-64-64/weights-improvement-09-0.4253.hdf5
Epoch 10/30
Epoch 10: saving model to ./models/weights/cic_ton_iot/

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


confusion_matrix:
[[643212  28994   7585      4   1072      0      0      1      0      2]
 [  2082 139526    346      0     13      0      0      0      0      0]
 [   272  20507    271      0      5      0      0      0      0      0]
 [    16     13      0      0      0      0      0      0      0      0]
 [   236  14372    126      0      1      0      0      0      0      0]
 [  2666   2387     45      0      0      0      0      0      0      0]
 [ 26960    185      0      0      0      0      0      0      0      0]
 [   225    286      3      0      3      0      0      0      0      0]
 [    71    131      0      0      0      0      0      0      0      0]
 [    59     86      0      0      0      0      0      0      0      0]]
End of confusion_matrix:
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95    680870
           1       0.68      0.98      0.80    141967
           2       0.03      0.01      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  class_precision[i] = tp / (tp + fp)


## Saving Results

In [13]:
# Create ./results/<dataset_name> (and ./results itself) if missing.
# exist_ok=True replaces the separate isdir()/mkdir() steps, which could fail
# if the directory appeared between the check and the creation, and keeps the
# cell safe to re-run.
os.makedirs('./results/{}'.format(dataset_name), exist_ok=True)

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to plain Python equivalents.

    The standard encoder rejects NumPy scalars and arrays; this subclass
    converts them so a results dictionary containing them can be serialized.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the standard encoder could not serialize.

        Returns:
            ``int`` for NumPy integers, ``float`` for NumPy floats, ``bool``
            for NumPy booleans, and a (nested) ``list`` for NumPy arrays.

        Raises:
            TypeError: Raised by the base class for any other type.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Defer to the base class, which raises TypeError for unknown types.
        return super().default(obj)

# Save the results to a timestamped JSON file for future reference.
filename = './results/{}/{}.json'.format(dataset_name,
                                         time.strftime("%Y%m%d-%H%M%S"))
# Serialize before opening the file so a serialization error does not leave an
# empty result file behind.
serialized_results = json.dumps(results, cls=NumpyEncoder)
# The context manager closes the file even if the write fails.
with open(filename, 'w', encoding='utf-8') as outfile:
    outfile.write(serialized_results)
