In [9]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import logging

logging.basicConfig(
    format="%(asctime)s ; %(levelname)s ; %(message)s",
    level=logging.DEBUG
)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
logger = logging.getLogger("adAPT")

from typing import Tuple
from pathlib import Path
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter


In [11]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)


In [12]:
b_df["malware"] = 0.0

In [13]:
m_df["malware"] = 1.0


In [14]:
all_df = pd.concat([b_df, m_df])

In [15]:
from math import log

def shannon(counts, thing):
    frequencies = ((i / len(thing)) for i in counts.values())
    return - sum(f * log(f, 2) for f in frequencies)

def string_shannon(string):
    counts = Counter(string)
    return shannon(counts, string)
    

def bytes_shannon(bytes):
    counts = Counter(bytes)
    return shannon(counts, bytes)

def get_net_class(ip: str, class_type: str) -> Tuple[str]:
    """ For ip = 192.168.1.5, provide the fillowing:
    ("192", "192.168", "192.168.1", "192.168.1.5")
    """
    parts = ip.split(".")
    if len(parts) != 4:
        return None
    if class_type.lower() == "a":
        return parts[0]
    elif class_type.lower() == "b":
        return ".".join(parts[:2])
    elif class_type.lower() == "c":
        return ".".join(parts[:3])
    elif class_type.lower() == "d":
        return ip
    else:
        raise ValueError("Class type must be A, B, C, or D")
    



In [16]:
all_df["url_entropy"] = all_df.url.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["host_entropy"] = all_df.host.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["base_domain_entropy"] = all_df.base_domain.apply(lambda x: string_shannon(x) if x is not None else 0)
all_df["host_length"] = all_df.host.apply(lambda x: len(x) if x is not None else 0)
all_df["proto_packet_entropy"] = all_df.proto_packet_cache.apply(lambda x: bytes_shannon(x) if x is not None else 0)

all_df["source_ip_class_a"] = all_df.source_addr.apply(lambda x: get_net_class(x, "A"))
all_df["source_ip_class_b"] = all_df.source_addr.apply(lambda x: get_net_class(x, "B"))
all_df["source_ip_class_c"] = all_df.source_addr.apply(lambda x: get_net_class(x, "C"))
all_df["dest_ip_class_a"] = all_df.dest_addr.apply(lambda x: get_net_class(x, "A"))
all_df["dest_ip_class_b"] = all_df.dest_addr.apply(lambda x: get_net_class(x, "B"))
all_df["dest_ip_class_c"] = all_df.dest_addr.apply(lambda x: get_net_class(x, "C"))


In [19]:
all_df.sample(5)

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,...,host_entropy,base_domain_entropy,host_length,proto_packet_entropy,source_ip_class_a,source_ip_class_b,source_ip_class_c,dest_ip_class_a,dest_ip_class_b,dest_ip_class_c
10466,IP,Unknown,10.9.20.144,104.21.50.34,49790,80,40,b'E\x00\x00(\xe5Z@\x00\x80\x06\\\xa5\n\t\x14\x...,40,b'E\x00\x00(\xe5Z@\x00\x80\x06\\\xa5\n\t\x14\x...,...,0.0,0.0,0,4.084184,10,10.9,10.9.20,104,104.21,104.21.50
23087,Ethernet,Unknown,84.146.135.221,217.0.5.215,7078,5690,202,b'<a\x04P\xd2\x1a\xc8\x0e\x14~3\xa0\x81\x00',202,b'E\x00\x00\xc8m\xf6@\x00@\x11\x10\xe8T\x92\x8...,...,0.0,0.0,0,3.807355,84,84.146,84.146.135,217,217.0,217.0.5
468,IP,HTTPResponse,104.223.119.167,10.1.4.101,80,50046,1428,b'E\x00\x05\x94\x03\x01\x00\x00\x80\x06Cwh\xdf...,1428,b'E\x00\x05\x94\x03\x01\x00\x00\x80\x06Cwh\xdf...,...,0.0,0.0,0,3.884184,104,104.223,104.223.119,10,10.1,10.1.4
10932,Ethernet,Unknown,217.0.5.215,84.146.135.221,15020,7078,202,b'\xc8\x0e\x14~3\xa0<a\x04P\xd2\x1a\x81\x00',202,b'E\xb8\x00\xc8\x00\x00\x00\x00q\x11\x8d&\xd9\...,...,0.0,0.0,0,3.807355,217,217.0,217.0.5,84,84.146,84.146.135
16433,IP,Unknown,162.246.19.18,172.16.1.137,465,64088,40,b'E\x00\x00(\xc6\x04\x00\x00\x80\x06\x11*\xa2\...,40,b'E\x00\x00(\xc6\x04\x00\x00\x80\x06\x11*\xa2\...,...,0.0,0.0,0,3.921928,162,162.246,162.246.19,172,172.16,172.16.1


In [20]:
def prepare_df_for_ml(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    fields = [
        "protocol",
        "app_layer",
        "source_addr", 
        "dest_addr",
        "source_port",
        "dest_port",
        "proto_packet_length",
        "ip_packet_length",
        # "url",
        "url_entropy",
        "host_entropy",
        "base_domain_entropy",
        "host_length",
        "proto_packet_entropy",
    ]
    new_df = df[fields]
    new_df.loc[:, ["source_port", "dest_port"]] = new_df[["source_port", "dest_port", ]].astype(str)
    new_df.loc[:, ["ip_packet_length", "source_port"]] = new_df[["ip_packet_length", "source_port", ]].astype(float)
        
    return pd.get_dummies(new_df)

In [21]:
y = all_df["malware"].values
input_values = all_df.drop("malware", axis=1)
prepped = prepare_df_for_ml(input_values)
X = np.asarray(prepped.values).astype("float64")

In [22]:
y.shape

(196981,)

In [23]:
X.shape

(196981, 6057)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.40, random_state=234)

In [25]:
X_test, X_cv, Y_test, Y_cv = train_test_split(X_test, Y_test, test_size=0.50, random_state=33)

In [26]:
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_cv.shape: {X_cv.shape}")
print(f"Y_train.shape: {Y_train.shape}")
print(f"Y_test.shape: {Y_test.shape}")
print(f"Y_cv.shape: {Y_cv.shape}")


X_train.shape: (118188, 6057)
X_test.shape: (39396, 6057)
X_cv.shape: (39397, 6057)
Y_train.shape: (118188,)
Y_test.shape: (39396,)
Y_cv.shape: (39397,)


In [27]:
normalizer = tf.keras.layers.Normalization(axis=-1)

layer_1 = layers.Dense(units=9, input_shape=(X_train.shape[-1], ), activation="relu", kernel_regularizer=L2(0.01))
layer_2 = layers.Dense(units=15, activation="relu", kernel_regularizer=L2(0.01))
layer_3 = layers.Dense(units=1, activation="sigmoid", kernel_regularizer=L2(0.01))

model = keras.Sequential([
    normalizer,
    layer_1,
    layer_2,
    layer_3
])

In [28]:
model.compile(
    optimizer="adam",  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalseNegatives()
    ],
)

In [29]:
model.fit(X_train, Y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x16db29c70>

In [30]:
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test)
print("test loss, test acc:", results)


Evaluate on test data
test loss, test acc: [0.19354166090488434, 0.9673570990562439, 170.0]


In [31]:
print(f"cv loss and acc: {model.evaluate(X_cv, Y_cv)}")

cv loss and acc: [0.19438008964061737, 0.9676371216773987, 164.0]
