In [149]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
import logging

logging.basicConfig(
    format="%(asctime)s ; %(levelname)s ; %(message)s",
    level=logging.DEBUG
)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
logger = logging.getLogger("adAPT")

from pathlib import Path
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter


In [151]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)


In [152]:
b_df["malware"] = 0.0

In [153]:
m_df["malware"] = 1.0


In [154]:
all_df = pd.concat([b_df, m_df])

In [155]:
from math import log

def shannon(counts, thing):
    frequencies = ((i / len(thing)) for i in counts.values())
    return - sum(f * log(f, 2) for f in frequencies)

def string_shannon(string):
    counts = Counter(string)
    return shannon(counts, string)
    

def bytes_shannon(bytes):
    counts = Counter(bytes)
    return shannon(counts, bytes)


In [156]:
all_df["url_entropy"] = all_df.url.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["host_entropy"] = all_df.host.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["base_domain_entropy"] = all_df.base_domain.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["host_length"] = all_df.host.apply(lambda x: len(x) if x is not None else 0)
all_df["proto_packet_entropy"] = all_df.proto_packet_cache.apply(lambda x: bytes_shannon(x) if x is not None else 0)

In [157]:
def prepare_df_for_ml(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    fields = [
        "protocol",
        "app_layer",
        "source_addr", 
        "dest_addr",
        "source_port",
        "dest_port",
        "proto_packet_length",
        "ip_packet_length",
        # "url",
        "url_entropy",
        "host_entropy",
        "base_domain_entropy",
        "host_length",
        "proto_packet_entropy",
    ]
    new_df = df[fields]
    new_df.loc[:, ["source_port", "dest_port"]] = new_df[["source_port", "dest_port", ]].astype(str)
    new_df.loc[:, ["ip_packet_length", "source_port"]] = new_df[["ip_packet_length", "source_port", ]].astype(float)
        
    return pd.get_dummies(new_df)

In [158]:
y = all_df["malware"].values
input_values = all_df.drop("malware", axis=1)
prepped = prepare_df_for_ml(input_values)
X = np.asarray(prepped.values).astype("float64")

In [159]:
y.shape

(196981,)

In [160]:
X.shape

(196981, 6057)

In [161]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.40, random_state=234)

In [162]:
X_test, X_cv, Y_test, Y_cv = train_test_split(X_test, Y_test, test_size=0.50, random_state=33)

In [163]:
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_cv.shape: {X_cv.shape}")
print(f"Y_train.shape: {Y_train.shape}")
print(f"Y_test.shape: {Y_test.shape}")
print(f"Y_cv.shape: {Y_cv.shape}")


X_train.shape: (118188, 6057)
X_test.shape: (39396, 6057)
X_cv.shape: (39397, 6057)
Y_train.shape: (118188,)
Y_test.shape: (39396,)
Y_cv.shape: (39397,)


In [164]:
normalizer = tf.keras.layers.Normalization(axis=-1)

layer_1 = layers.Dense(units=9, input_shape=(X_train.shape[-1], ), activation="relu", kernel_regularizer=L2(0.01))
layer_2 = layers.Dense(units=15, activation="relu", kernel_regularizer=L2(0.01))
layer_3 = layers.Dense(units=1, activation="sigmoid", kernel_regularizer=L2(0.01))

model = keras.Sequential([
    normalizer,
    layer_1,
    layer_2,
    layer_3
])

In [165]:
model.compile(
    optimizer="adam",  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalseNegatives()
    ],
)

In [166]:
model.fit(X_train, Y_train, epochs=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x172651d90>

In [167]:
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test)
print("test loss, test acc:", results)


Evaluate on test data
test loss, test acc: [0.1785838007926941, 0.9784241914749146, 284.0]


In [173]:
print(f"cv loss and acc: {model.evaluate(X_cv, Y_cv)}")

cv loss and acc: [0.17835314571857452, 0.9790847301483154, 268.0]
