In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import logging

logging.basicConfig(format="%(asctime)s ; %(levelname)s ; %(message)s", level=logging.DEBUG)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
logger = logging.getLogger("adAPT")

from typing import Tuple
from pathlib import Path
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter

2023-05-14 17:11:44.319484: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-14 17:11:46,397 ; DEBUG ; Creating converter from 7 to 5
2023-05-14 17:11:46,397 ; DEBUG ; Creating converter from 5 to 7
2023-05-14 17:11:46,398 ; DEBUG ; Creating converter from 7 to 5
2023-05-14 17:11:46,399 ; DEBUG ; Creating converter from 5 to 7


In [4]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)

In [5]:
all_df = pd.concat([b_df, m_df])

In [6]:
all_df.sample(5)

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,...,base_domain_entropy,host_length,proto_packet_entropy,source_ip_class_a,source_ip_class_b,source_ip_class_c,dest_ip_class_a,dest_ip_class_b,dest_ip_class_c,malware
62431,IP,Unknown,172.16.1.137,50.87.177.214,64198,587,1500,b'E\x00\x05\xdc7\xef@\x00\x80\x06+f\xac\x10\x0...,1500,b'E\x00\x05\xdc7\xef@\x00\x80\x06+f\xac\x10\x0...,...,0.0,0,4.221928,172,172.16,172.16.1,50,50.87,50.87.177,1.0
8951,IP,HTTPResponse,165.22.246.219,10.3.18.101,8080,51020,71,b'E\x00\x00G\x97\xbc\x00\x00\x80\x06\xea\x9a\x...,71,b'E\x00\x00G\x97\xbc\x00\x00\x80\x06\xea\x9a\x...,...,0.0,0,3.921928,165,165.22,165.22.246,10,10.3,10.3.18,1.0
17932,IP,Unknown,51.195.169.87,10.1.11.101,8080,64823,40,b'E\x00\x00(7\x14\x00\x00\x80\x06\x11<3\xc3\xa...,40,b'E\x00\x00(7\x14\x00\x00\x80\x06\x11<3\xc3\xa...,...,0.0,0,3.921928,51,51.195,51.195.169,10,10.1,10.1.11,1.0
63661,IP,Unknown,178.128.31.80,172.16.1.137,443,64214,1500,b'E\x00\x05\xdc5`\x00\x00\x80\x06\x80R\xb2\x80...,1500,b'E\x00\x05\xdc5`\x00\x00\x80\x06\x80R\xb2\x80...,...,0.0,0,3.846439,178,178.128,178.128.31,172,172.16,172.16.1,1.0
13970,IP,Unknown,10.127.0.71,185.172.129.192,49825,1775,40,"b""E\x00\x00(\x12v@\x00\x80\x06\xa2'\n\x7f\x00G...",40,"b""E\x00\x00(\x12v@\x00\x80\x06\xa2'\n\x7f\x00G...",...,0.0,0,3.921928,10,10.127,10.127.0,185,185.172,185.172.129,1.0


In [7]:
def prepare_df_for_ml(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    fields = [
        "protocol",
        "app_layer",
        "source_port",
        "dest_port",
        "proto_packet_length",
        "ip_packet_length",
        # "url",
        "base_domain",
        "tld",
        "url_entropy",
        "host_entropy",
        "base_domain_entropy",
        "host_length",
        "proto_packet_entropy",
        "source_ip_class_a",
        "source_ip_class_b",
        "source_ip_class_c",
        "dest_ip_class_a",
        "dest_ip_class_b",
        "dest_ip_class_c",
    ]
    new_df = df[fields]
    new_df.loc[:, ["source_port", "dest_port"]] = new_df[
        [
            "source_port",
            "dest_port",
        ]
    ].astype(str)
    new_df.loc[:, ["ip_packet_length", "source_port"]] = new_df[
        [
            "ip_packet_length",
            "source_port",
        ]
    ].astype(float)

    return pd.get_dummies(new_df)

In [8]:
y = all_df["malware"].values
input_values = all_df.drop("malware", axis=1)
prepped = prepare_df_for_ml(input_values)
X = np.asarray(prepped.values).astype("float64")

In [9]:
y.shape

(196981,)

In [10]:
X.shape

(196981, 6867)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.40, random_state=234)

In [12]:
X_test, X_cv, Y_test, Y_cv = train_test_split(X_test, Y_test, test_size=0.50, random_state=33)

In [13]:
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_cv.shape: {X_cv.shape}")
print(f"Y_train.shape: {Y_train.shape}")
print(f"Y_test.shape: {Y_test.shape}")
print(f"Y_cv.shape: {Y_cv.shape}")

X_train.shape: (118188, 6867)
X_test.shape: (39396, 6867)
X_cv.shape: (39397, 6867)
Y_train.shape: (118188,)
Y_test.shape: (39396,)
Y_cv.shape: (39397,)


In [14]:
normalizer = tf.keras.layers.Normalization(axis=-1)

layer_1 = layers.Dense(units=9, input_shape=(X_train.shape[-1],), activation="relu", kernel_regularizer=L2(0.01))
layer_2 = layers.Dense(units=15, activation="relu", kernel_regularizer=L2(0.01))
layer_3 = layers.Dense(units=1, activation="sigmoid", kernel_regularizer=L2(0.01))

model = keras.Sequential([normalizer, layer_1, layer_2, layer_3])

In [15]:
model.compile(
    optimizer="adam",  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.FalseNegatives()],
)

In [16]:
model.fit(X_train, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16fcbdcd0>

In [17]:
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.11317555606365204, 0.9931718707084656, 16.0]


In [18]:
print("Evaluate on Cross Validation data")
results_cv = model.evaluate(X_cv, Y_cv)
print(f"cv loss and acc: {results_cv}")

Evaluate on Cross Validation data
cv loss and acc: [0.11276879906654358, 0.9936035871505737, 14.0]


In [19]:
print("Evaluate on full data set")
results_full = model.evaluate(X, y)
print(f"cv loss and acc: {results_full}")

Evaluate on full data set
cv loss and acc: [0.11373991519212723, 0.993146538734436, 86.0]
