In [32]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import logging

logging.basicConfig(
    format="%(asctime)s ; %(levelname)s ; %(message)s",
    level=logging.DEBUG
)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
logger = logging.getLogger("adAPT")

from typing import Tuple
from pathlib import Path
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter


In [34]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)


In [35]:
b_df["malware"] = 0.0

In [36]:
m_df["malware"] = 1.0


In [37]:
all_df = pd.concat([b_df, m_df])

In [40]:
all_df.sample(5)

Unnamed: 0,time,day_hour,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,...,host_entropy,base_domain_entropy,host_length,proto_packet_entropy,source_ip_class_a,source_ip_class_b,source_ip_class_c,dest_ip_class_a,dest_ip_class_b,dest_ip_class_c
4120,1663773679.201731,2022092108,IP,Unknown,10.9.20.144,76.13.32.146,49762,443,41,b'E\x00\x00)\xf6\xba@\x00\x80\x06x\xdc\n\t\x14...,...,0.0,0.0,0,4.084184,10,10.9,10.9.20,76,76.13,76.13.32
36656,1678205820.441952,2023030708,IP,Unknown,172.16.1.137,192.185.4.22,64164,465,393,b'E\x00\x01\x89\xe2\xad@\x00\x80\x06\xa4X\xac\...,...,0.0,0.0,0,4.021928,172,172.16,172.16.1,192,192.185,192.185.4
17576,1679109073.816603,2023031720,IP,Unknown,10.3.18.18,10.3.18.101,445,50954,308,b'E\x00\x0142\x9c@\x00\x80\x06\x8e\xab\n\x03\x...,...,0.0,0.0,0,3.784184,10,10.3,10.3.18,10,10.3,10.3.18
19375,1681410477.6595,2023041311,IP,Unknown,10.127.0.71,185.172.129.192,49825,1775,40,b'E\x00\x00(\x1a(@\x00\x80\x06\x9au\n\x7f\x00G...,...,0.0,0.0,0,3.821928,10,10.127,10.127.0,185,185.172,185.172.129
69480,1678205857.919678,2023030708,IP,Unknown,172.16.1.137,54.144.214.6,64234,587,1500,b'E\x00\x05\xdc\xe3B@\x00\x80\x06W\xa9\xac\x10...,...,0.0,0.0,0,4.121928,172,172.16,172.16.1,54,54.144,54.144.214


In [20]:
def prepare_df_for_ml(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    fields = [
        "protocol",
        "app_layer",
        "source_addr", 
        "dest_addr",
        "source_port",
        "dest_port",
        "proto_packet_length",
        "ip_packet_length",
        # "url",
        "url_entropy",
        "host_entropy",
        "base_domain_entropy",
        "host_length",
        "proto_packet_entropy",
    ]
    new_df = df[fields]
    new_df.loc[:, ["source_port", "dest_port"]] = new_df[["source_port", "dest_port", ]].astype(str)
    new_df.loc[:, ["ip_packet_length", "source_port"]] = new_df[["ip_packet_length", "source_port", ]].astype(float)
        
    return pd.get_dummies(new_df)

In [21]:
y = all_df["malware"].values
input_values = all_df.drop("malware", axis=1)
prepped = prepare_df_for_ml(input_values)
X = np.asarray(prepped.values).astype("float64")

In [22]:
y.shape

(196981,)

In [23]:
X.shape

(196981, 6057)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.40, random_state=234)

In [25]:
X_test, X_cv, Y_test, Y_cv = train_test_split(X_test, Y_test, test_size=0.50, random_state=33)

In [26]:
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_cv.shape: {X_cv.shape}")
print(f"Y_train.shape: {Y_train.shape}")
print(f"Y_test.shape: {Y_test.shape}")
print(f"Y_cv.shape: {Y_cv.shape}")


X_train.shape: (118188, 6057)
X_test.shape: (39396, 6057)
X_cv.shape: (39397, 6057)
Y_train.shape: (118188,)
Y_test.shape: (39396,)
Y_cv.shape: (39397,)


In [27]:
normalizer = tf.keras.layers.Normalization(axis=-1)

layer_1 = layers.Dense(units=9, input_shape=(X_train.shape[-1], ), activation="relu", kernel_regularizer=L2(0.01))
layer_2 = layers.Dense(units=15, activation="relu", kernel_regularizer=L2(0.01))
layer_3 = layers.Dense(units=1, activation="sigmoid", kernel_regularizer=L2(0.01))

model = keras.Sequential([
    normalizer,
    layer_1,
    layer_2,
    layer_3
])

In [28]:
model.compile(
    optimizer="adam",  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalseNegatives()
    ],
)

In [29]:
model.fit(X_train, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16db29c70>

In [30]:
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test)
print("test loss, test acc:", results)


Evaluate on test data
test loss, test acc: [0.19354166090488434, 0.9673570990562439, 170.0]


In [31]:
print(f"cv loss and acc: {model.evaluate(X_cv, Y_cv)}")

cv loss and acc: [0.19438008964061737, 0.9676371216773987, 164.0]
