In [64]:
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import sklearn as sk
import pandas as pd
from pandas import read_csv
from datetime import datetime
import math
import os

# Fix the random seed for reproducibility.
# Note: this cell seeds only NumPy's legacy global RNG (np.random.*); nothing in
# this cell seeds Python's `random` module or TensorFlow.
seed = 2022
np.random.seed(seed)


# New Section

In [65]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D,Input
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras import optimizers
from tensorflow.keras.losses import MAE


2.17.0


In [66]:

# Mount Google Drive at /content/drive so files under '/content/drive/My Drive/'
# are reachable from this runtime (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

# Optional sanity check (disabled): list the Drive root.
# NOTE(review): the path below uses '/content/gdrive', which differs from the
# '/content/drive' mount point used above -- adjust it before enabling.
##!ls '/content/gdrive/My Drive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
# Read the capture summary and keep only the 'packets_per_sec' series.
data = pd.read_csv('/content/drive/My Drive/pcap/packets_per_sec_analysis.csv')['packets_per_sec']

# Split boundaries: first 45% -> training, next 27.5% -> validation, remainder -> test.
train_split_index = int(len(data) * 0.45)
val_split_index = int(len(data) * 0.725)

# Contiguous positional slices, so the original row order is preserved (no shuffling).
train_dataset = data.iloc[:train_split_index]
val_dataset = data.iloc[train_split_index:val_split_index]
test_dataset = data.iloc[val_split_index:]

# One shape per line: train, validation, test.
print(train_dataset.shape, val_dataset.shape, test_dataset.shape, sep='\n')


(809,)
(494,)
(495,)


In [68]:
# The single column used as model input.
features = ['packets_per_sec']

# For each split build two float32 arrays from the same series:
#   *_values -> column vector of shape (n, 1), used as model input
#   *_labels -> flat vector of shape (n,), used as prediction target
train_values = train_dataset.to_numpy(dtype=np.float32)[:, np.newaxis]
train_labels = train_dataset.to_numpy(dtype=np.float32)

val_values = val_dataset.to_numpy(dtype=np.float32)[:, np.newaxis]
val_labels = val_dataset.to_numpy(dtype=np.float32)

test_values = test_dataset.to_numpy(dtype=np.float32)[:, np.newaxis]
test_labels = test_dataset.to_numpy(dtype=np.float32)

In [69]:
# imports to show that there are many different scalers
# especially with recurrent NNs, the choice of scaler can make a difference

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Fit the scaler on the training split only, then apply the same transform to
# every split so validation/test statistics never influence the scaling.
SS1 = StandardScaler().fit(train_values)

train_scaled, val_scaled, test_scaled = (
    SS1.transform(split_values)
    for split_values in (train_values, val_values, test_values)
)

In [70]:
# window-based and recurrent networks:

def create_dataset_windowed(features, labels, ahead=4, window_size=1, max_window_size=360):
    """Build sliding-window samples and their future targets for time-series models.

    Sample ``i`` holds the ``window_size`` consecutive rows of ``features`` that
    end at row ``i + max_window_size - 1``; its target is the entry of ``labels``
    located ``ahead`` steps after that last row. Because the sample count and the
    target positions depend on ``max_window_size`` (not on ``window_size``),
    calls that differ only in ``window_size`` produce the same targets.

    Parameters:
    - features: array of shape (n_steps, n_features). A 1-D array is treated as
      a single feature, i.e. as shape (n_steps, 1).
    - labels: sequence of length n_steps holding the target series.
    - ahead: how many steps after the end of the window the target lies. Default is 4.
    - window_size: number of rows per sample; clamped to [1, max_window_size].
    - max_window_size: largest window size to be compared; fixes the number of samples.

    Returns:
    - dataX: array of shape (num_samples, window_size, n_features).
    - dataY: the matching slice of ``labels`` with num_samples entries.

    num_samples is ``n_steps - ahead - (max_window_size - 1)``. When that is zero
    or negative (series too short for the requested horizon), empty results are
    returned and dataX still has shape (0, window_size, n_features).
    """
    features = np.asarray(features)
    if features.ndim == 1:
        # Treat a flat series as a single feature column.
        features = features[:, np.newaxis]

    # Clamp to zero so a too-short series yields empty outputs instead of a
    # negative count.
    samples = max(features.shape[0] - ahead - (max_window_size - 1), 0)
    window_size = min(max(window_size, 1), max_window_size)

    # Row indices of every window: entry [i, j] is the j-th row of sample i.
    # Integer-array indexing then gathers all windows in one vectorised step.
    first_row = max_window_size - window_size
    window_rows = first_row + np.arange(samples)[:, np.newaxis] + np.arange(window_size)[np.newaxis, :]
    dataX = features[window_rows]

    # Target of sample i sits `ahead` steps after the last row of its window.
    label_start = ahead + max_window_size - 1
    dataY = labels[label_start:label_start + samples]

    return dataX, dataY

In [71]:
def PlotResults(labels, predictions, window_size, ahead, dataset_type=""):
    """Plot observed against predicted traffic for one model configuration.

    Parameters:
    - labels: sequence of observed values, drawn as a black line.
    - predictions: sequence of predicted values, drawn as a red line.
    - window_size: look-back period shown in the title.
    - ahead: forecasting horizon shown in the title.
    - dataset_type: optional name of the split being plotted (e.g. "Validation",
      "Test"). When non-empty it is appended to the first title line so figures
      for different splits can be told apart; when empty the title is unchanged.

    Returns:
    - None. The figure is displayed and then closed.
    """
    fig = plt.figure(figsize=(6, 4), dpi=300)
    ax1 = fig.add_subplot(111)

    # Plotting the actual vs. predicted values against their sample index
    ax1.plot(labels, 'k-', label='Observed traffic', linewidth=1)
    ax1.plot(predictions, 'r-', label='Predicted traffic', linewidth=1)

    # Labeling the axes
    ax1.set_ylabel('Traffic (Packets per second)', fontsize=12)
    ax1.set_xlabel('Time (Seconds)', fontsize=12)
    # Adding the legend for clarification
    ax1.legend(loc='upper right', fontsize=10)

    # Descriptive title; the split name is included only when one was given
    split_suffix = f" ({dataset_type})" if dataset_type else ""
    ax1.set_title(f"Comparison of Actual vs Predicted traffic{split_suffix}\n"
                  f"Look-back period: {window_size} Seconds | Forecasting horizon: {ahead} Seconds", fontsize=14, pad=15)

    # Showing the plot
    plt.tight_layout()  # Ensuring everything fits without overlapping
    plt.show()
    # Release the figure explicitly: this function is called many times in a
    # loop and each 300-dpi figure would otherwise stay registered with pyplot.
    plt.close(fig)

In [72]:
# plotting the loss curves
def plot_history(history):
    """Plot the per-epoch training and validation MAE stored in a Keras History.

    `history` must expose `.epoch` and a `.history` dict containing the keys
    'mae' and 'val_mae'.
    """
    # (history key, line format, legend label) for each curve, in drawing order
    curves = (
        ('mae', 'g-', 'Train MAE'),
        ('val_mae', 'r-', 'Validation MAE'),
    )

    plt.figure(figsize=(6, 4))
    plt.xlabel('Epoch')
    plt.ylabel('Mae')
    for metric_key, line_format, legend_label in curves:
        plt.plot(history.epoch, np.array(history.history[metric_key]), line_format,
                 label=legend_label)
    plt.legend()
    plt.show()

In [73]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense, Activation
from tabulate import tabulate

"""
Grid search of a 1-D CNN traffic forecaster over look-back windows and forecasting horizons.

For every (WINDOW, AHEAD) combination the script:
1. builds windowed train/validation/test sets with `create_dataset_windowed`,
   passing both the look-back window and the forecasting horizon;
2. skips the combination (and reports it) when any split is too short to yield samples;
3. builds a fresh CNN, compiled with Adam (learning rate 1e-4), MAE loss and MAE/MSE metrics;
4. trains it for 50 epochs with a batch size of 32;
5. computes validation and test MAE, MSE and RMSE;
6. saves the validation/test predictions to CSV and plots predictions and training curves.

Inputs (defined in earlier cells):
- seed: integer random seed.
- train_scaled, val_scaled, test_scaled: scaled model inputs per split.
- train_labels, val_labels, test_labels: unscaled targets per split, so all
  reported errors are in the original units of the series.

Outputs:
- mae_results, mse_results, rmse_results: one row per trained combination,
  also written to '<folder_path>{mae,mse,rmse}_results_summary.csv'.
- histories: last training history per window size.
- histories_by_config: training history per (window, ahead) combination.
- skipped_configs: (window, ahead) combinations that produced no samples.
"""

# Seed Python, NumPy and TensorFlow together so weight initialisation and batch
# shuffling are repeatable; seeding NumPy alone does not cover Keras.
tf.keras.utils.set_random_seed(seed)

# `histories` keeps its original meaning (keyed by window size, so each horizon
# overwrites the previous one); `histories_by_config` keeps every run.
histories = {}
histories_by_config = {}

folder_path = '/content/drive/My Drive/pcap/'

# Defining window sizes and ahead values
window_sizes = [60, 120, 180, 300]
ahead_values = [30, 60, 300, 600]

batch_size = 32
epochs = 50

# Result rows are collected in plain lists and turned into DataFrames once,
# after the loop, instead of re-concatenating a DataFrame on every iteration.
mae_rows = []
mse_rows = []
rmse_rows = []
skipped_configs = []


def build_cnn_model(input_shape, learning_rate=0.0001):
    """Return a new compiled Conv1D regressor for inputs of shape (window, n_features)."""
    model = Sequential()

    # Adding input layer explicitly with Input()
    model.add(Input(shape=input_shape))

    for n_filters in (128, 128, 64, 32):
        model.add(Conv1D(filters=n_filters, kernel_size=4, padding='same', activation='relu'))

    model.add(Flatten())

    model.add(Dense(64, activation='relu'))
    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='mae', metrics=['mae', 'mse'])
    return model


for WINDOW in window_sizes:
    for AHEAD in ahead_values:

        print(f"Training model with window size: {WINDOW}, ahead: {AHEAD}")

        # The horizon must be passed explicitly: without `ahead=AHEAD` every
        # run would silently use the function's default horizon.
        X_train_w, r_train_w = create_dataset_windowed(train_scaled, train_labels, ahead=AHEAD, window_size=WINDOW)
        X_val_w, r_val_w = create_dataset_windowed(val_scaled, val_labels, ahead=AHEAD, window_size=WINDOW)
        X_test_w, r_test_w = create_dataset_windowed(test_scaled, test_labels, ahead=AHEAD, window_size=WINDOW)

        # A split shorter than horizon + maximum window yields no samples;
        # training or scoring on it is impossible, so skip and report.
        if min(len(X_train_w), len(X_val_w), len(X_test_w)) == 0:
            print(f"Skipping window size: {WINDOW}, ahead: {AHEAD} - not enough data "
                  f"(train/val/test samples: {len(X_train_w)}/{len(X_val_w)}/{len(X_test_w)})")
            skipped_configs.append((WINDOW, AHEAD))
            continue

        # Drop Keras global state left over from the previous model so memory
        # does not grow across the grid.
        tf.keras.backend.clear_session()

        # Rebuild the model for each configuration; the input shape is taken
        # from the data so it always matches the windows actually produced.
        CNNmodel = build_cnn_model(X_train_w.shape[1:])

        # Printing the model summary to check the shape
        CNNmodel.summary()

        CNN_history = CNNmodel.fit(X_train_w, r_train_w,
                                   batch_size=batch_size,
                                   epochs=epochs,
                                   verbose=1,
                                   validation_data=(X_val_w, r_val_w),
                                   shuffle=True)

        histories[WINDOW] = CNN_history.history
        histories_by_config[(WINDOW, AHEAD)] = CNN_history.history

        y_val_CNNmodel = CNNmodel.predict(X_val_w)
        y_test_CNNmodel = CNNmodel.predict(X_test_w)

        # Calculating MAE
        mae_val_CNNmodel = mean_absolute_error(r_val_w, y_val_CNNmodel)
        mae_test_CNNmodel = mean_absolute_error(r_test_w, y_test_CNNmodel)

        # Calculating MSE
        mse_val_CNNmodel = mean_squared_error(r_val_w, y_val_CNNmodel)
        mse_test_CNNmodel = mean_squared_error(r_test_w, y_test_CNNmodel)

        # Calculating RMSE
        rmse_val_CNNmodel = np.sqrt(mse_val_CNNmodel)
        rmse_test_CNNmodel = np.sqrt(mse_test_CNNmodel)

        # Storing the MAE, MSE and RMSE results
        mae_rows.append({
            'Look-back Period (Seconds)': WINDOW,
            'Forecasting horizon (Seconds)': AHEAD,
            'Validation MAE': mae_val_CNNmodel,
            'Test MAE': mae_test_CNNmodel
        })
        mse_rows.append({
            'Look-back Period (Seconds)': WINDOW,
            'Forecasting horizon (Seconds)': AHEAD,
            'Validation MSE': mse_val_CNNmodel,
            'Test MSE': mse_test_CNNmodel
        })
        rmse_rows.append({
            'Look-back Period (Seconds)': WINDOW,
            'Forecasting horizon (Seconds)': AHEAD,
            'Validation RMSE': rmse_val_CNNmodel,
            'Test RMSE': rmse_test_CNNmodel
        })

        # Printing the results
        print(f"Window size: {WINDOW}, Ahead: {AHEAD}")
        print(f"Validation MAE: {mae_val_CNNmodel}, Test MAE: {mae_test_CNNmodel}")
        print(f"Validation MSE: {mse_val_CNNmodel}, Test MSE: {mse_test_CNNmodel}")
        print(f"Validation RMSE: {rmse_val_CNNmodel}, Test RMSE: {rmse_test_CNNmodel}")

        # Saving predictions to CSV
        predictions_df = pd.DataFrame({
            'Actual': r_val_w.flatten(),
            'Predicted': y_val_CNNmodel.flatten()
        })
        predictions_df.to_csv(f'{folder_path}predictions_val_window_{WINDOW}_ahead_{AHEAD}.csv', index=False)

        predictions_test_df = pd.DataFrame({
            'Actual': r_test_w.flatten(),
            'Predicted': y_test_CNNmodel.flatten()
        })
        predictions_test_df.to_csv(f'{folder_path}predictions_test_window_{WINDOW}_ahead_{AHEAD}.csv', index=False)

        # Plotting results and history
        PlotResults(r_val_w[:1000], y_val_CNNmodel[:1000, 0], WINDOW, AHEAD, dataset_type="Validation")
        PlotResults(r_test_w[:1000], y_test_CNNmodel[:1000, 0], WINDOW, AHEAD, dataset_type="Test")
        plot_history(CNN_history)

# Building the summary tables once; explicit columns keep the CSV headers
# stable even if every configuration was skipped.
config_columns = ['Look-back Period (Seconds)', 'Forecasting horizon (Seconds)']
mae_results = pd.DataFrame(mae_rows, columns=config_columns + ['Validation MAE', 'Test MAE'])
mse_results = pd.DataFrame(mse_rows, columns=config_columns + ['Validation MSE', 'Test MSE'])
rmse_results = pd.DataFrame(rmse_rows, columns=config_columns + ['Validation RMSE', 'Test RMSE'])

# Saving MAE, MSE, and RMSE results to CSV files
mae_results.to_csv(f'{folder_path}mae_results_summary.csv', index=False)
mse_results.to_csv(f'{folder_path}mse_results_summary.csv', index=False)
rmse_results.to_csv(f'{folder_path}rmse_results_summary.csv', index=False)

if skipped_configs:
    print(f"Skipped (window, ahead) configurations with too little data: {skipped_configs}")

print("Loop stopped")

Output hidden; open in https://colab.research.google.com to view.