In [1]:
%load_ext autoreload
%autoreload 2


In [58]:
import logging

logging.basicConfig(
    format="%(asctime)s ; %(levelname)s ; %(message)s",
    level=logging.DEBUG
)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
log = logging.getLogger("adAPT")

from pathlib import Path
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L2
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import optimizers
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter



In [23]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)


In [24]:
b_df["malware"] = 0

In [25]:
m_df["malware"] = 1


In [26]:
all_df = pd.concat([b_df, m_df])

In [27]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196981 entries, 0 to 13709
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   protocol             196981 non-null  object
 1   app_layer            196981 non-null  object
 2   source_addr          196981 non-null  object
 3   dest_addr            196981 non-null  object
 4   source_port          196981 non-null  int64 
 5   dest_port            196981 non-null  int64 
 6   proto_packet_length  196981 non-null  int64 
 7   proto_packet_cache   196981 non-null  object
 8   ip_packet_length     196981 non-null  int64 
 9   ip_packet_cache      196981 non-null  object
 10  parsed               36832 non-null   object
 11  malware              196981 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 19.5+ MB


In [28]:
all_df

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,parsed,malware
0,IP,Unknown,192.168.110.10,80.237.133.136,1152,80,52,b'E\x00\x004\x10M@\x00\x80\x06\x00\x00\xc0\xa8...,52,b'E\x00\x004\x10M@\x00\x80\x06\x00\x00\xc0\xa8...,,0
1,IP,Unknown,80.237.133.136,192.168.110.10,80,1152,52,b'E\x00\x004\x00\x00@\x006\x06?\x9cP\xed\x85\x...,52,b'E\x00\x004\x00\x00@\x006\x06?\x9cP\xed\x85\x...,,0
2,IP,Unknown,192.168.110.10,80.237.133.136,1152,80,40,b'E\x00\x00(\x10N@\x00\x80\x06\x00\x00\xc0\xa8...,40,b'E\x00\x00(\x10N@\x00\x80\x06\x00\x00\xc0\xa8...,,0
3,IP,HTTPRequest,192.168.110.10,80.237.133.136,1152,80,351,b'E\x00\x01_\x10O@\x00\x80\x06\x00\x00\xc0\xa8...,351,b'E\x00\x01_\x10O@\x00\x80\x06\x00\x00\xc0\xa8...,"{'method': 'GET', 'path': '/', 'host': 'ip.web...",0
4,IP,Unknown,80.237.133.136,192.168.110.10,80,1152,40,b'E\x00\x00(:\xad@\x006\x06\x04\xfbP\xed\x85\x...,40,b'E\x00\x00(:\xad@\x006\x06\x04\xfbP\xed\x85\x...,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
13705,UDP,DNSRequestResponse,10.9.20.5,10.9.20.144,53,50860,100,b'\x005\xc6\xac\x00d?\xb1',100,b'E\x00\x00x\x0eR\x00\x00\x80\x11\xef|\n\t\x14...,"{'qname': 'vjur2fho2j3.clus.ga.', 'qtype': 1, ...",1
13706,UDP,DNSQueryRequest,10.9.20.144,10.9.20.5,62772,53,55,b'\xf54\x005\x007!j',55,b'E\x00\x00K\x9da\x00\x00\x80\x11`\x9a\n\t\x14...,"{'qname': 'v10.events.data.microsoft.com.', 'q...",1
13707,UDP,DNSRequestResponse,10.9.20.5,10.9.20.144,53,62772,185,b'\x005\xf54\x00\xb9\xb8\x97',185,b'E\x00\x00\xcd\x0eS\x00\x00\x80\x11\xef&\n\t\...,"{'qname': 'v10.events.data.microsoft.com.', 'q...",1
13708,UDP,DNSQueryRequest,10.9.20.144,10.9.20.5,58421,53,45,b'\xe45\x005\x00-\xea\x17',45,b'E\x00\x00A\x9d\x8d\x00\x00\x80\x11`x\n\t\x14...,"{'qname': 'vjur2fho2j3.clus.ga.', 'qtype': 1, ...",1


In [72]:
from valid_tlds import TLDS

def get_url(d: dict) -> str:
    if d is None:
        return ""
    url = ""
    if "host" in d:
        url = d["host"]
    if "qname" in d:
        url = d["qname"]
    if not isinstance(url, str):
        return ""
    if url.endswith("."):
        url = url[:-1]
    return url
    
def get_base_domain(s: str) -> str:
    if not isinstance(s, str):
        return s
    if not "." in s:
        return ""
    index = 0
    for tld in TLDS:
        if s.endswith("." + tld):
            tld_parts = tld.split(".")
            index = len(tld_parts)
            break
    if not index:
        return ""  # not a valid tld
    index = index + 1
    parts = s.rsplit(".", maxsplit=index)
    return ".".join(parts[-1 * index:])

def get_host_part(s: str) -> str:
    if not isinstance(s, str):
        return s
    if not "." in s:
        return ""
    base_domain = get_base_domain(s)
    if base_domain:
        tail_length = -1 * len(base_domain) - 1
    else:
        tail_length = len(s)
    return s[:tail_length]  # extra -1 to account for trailing "."

In [73]:
for d in [{"host": s} for s in ["domain.com", "www.domain.com", "asdf.1234.domain.com", "www.domain.com.br", "www.domain.co.uk", "asdf.132.domain.co.uk", "asf.local", "asf.arpa", "asf.doesntexist"]]:
    s = get_url(d)
    print(f"d: {d}")
    print(f"get_url(d): {get_url(d)}")
    print(f"get_base_domain(s): {get_base_domain(s)}")
    print(f"get_host_part(s): {get_host_part(s)}")


d: {'host': 'domain.com'}
get_url(d): domain.com
get_base_domain(s): domain.com
get_host_part(s): 
d: {'host': 'www.domain.com'}
get_url(d): www.domain.com
get_base_domain(s): domain.com
get_host_part(s): www
d: {'host': 'asdf.1234.domain.com'}
get_url(d): asdf.1234.domain.com
get_base_domain(s): domain.com
get_host_part(s): asdf.1234
d: {'host': 'www.domain.com.br'}
get_url(d): www.domain.com.br
get_base_domain(s): domain.com.br
get_host_part(s): www
d: {'host': 'www.domain.co.uk'}
get_url(d): www.domain.co.uk
get_base_domain(s): domain.co.uk
get_host_part(s): www
d: {'host': 'asdf.132.domain.co.uk'}
get_url(d): asdf.132.domain.co.uk
get_base_domain(s): domain.co.uk
get_host_part(s): asdf.132
d: {'host': 'asf.local'}
get_url(d): asf.local
get_base_domain(s): asf.local
get_host_part(s): 
d: {'host': 'asf.arpa'}
get_url(d): asf.arpa
get_base_domain(s): asf.arpa
get_host_part(s): 
d: {'host': 'asf.doesntexist'}
get_url(d): asf.doesntexist
get_base_domain(s): 
get_host_part(s): asf.doesnt

In [74]:
all_df["url"] = all_df.parsed.apply(lambda x: get_url(x))
all_df["domain"] = all_df.url.apply(lambda x: get_base_domain(x))
all_df["host_name"] = all_df.url.apply(lambda x: get_host_part(x))

In [79]:
def prepare_df_for_ml(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    new_df = df[[
        "protocol",
        "app_layer",
        "source_addr", 
        "dest_addr",
        "source_port",
        "dest_port",
        "proto_packet_length",
        "ip_packet_length",
        "url",
    ]]
    new_df.loc[:, ["source_port", "dest_port"]] = new_df[["source_port", "dest_port", ]].astype(str)
    new_df.loc[:, ["ip_packet_length", "source_port"]] = new_df[["ip_packet_length", "source_port", ]].astype(float)
        
    return pd.get_dummies(new_df)

In [80]:
y = np.asarray(all_df["malware"].values).astype("float64")
input_values = all_df.drop("malware", axis=1)
prepped = prepare_df_for_ml(input_values)
X = np.asarray(prepped.values).astype("float64")

In [81]:
y.shape

(196981,)

In [82]:
X.shape

(196981, 6422)

In [83]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.40, random_state=234)

In [84]:
X_test, X_cv, Y_test, Y_cv = train_test_split(X_test, Y_test, test_size=0.50, random_state=33)

In [85]:
print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")
print(f"X_cv.shape: {X_cv.shape}")
print(f"Y_train.shape: {Y_train.shape}")
print(f"Y_test.shape: {Y_test.shape}")
print(f"Y_cv.shape: {Y_cv.shape}")


X_train.shape: (118188, 6422)
X_test.shape: (39396, 6422)
X_cv.shape: (39397, 6422)
Y_train.shape: (118188,)
Y_test.shape: (39396,)
Y_cv.shape: (39397,)


In [86]:
normalizer = tf.keras.layers.Normalization(axis=-1)

layer_1 = layers.Dense(units=9, input_shape=(X_train.shape[-1], ), activation="relu", kernel_regularizer=L2(0.01))
layer_2 = layers.Dense(units=15, activation="relu", kernel_regularizer=L2(0.01))
layer_3 = layers.Dense(units=1, activation="sigmoid", kernel_regularizer=L2(0.01))

model = keras.Sequential([
    normalizer,
    layer_1,
    layer_2,
    layer_3
])

In [87]:
model.compile(
    optimizer="adam",  # Optimizer
    # Loss function to minimize
    loss=losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.FalseNegatives()
    ],
)

In [88]:
model.fit(X_train, Y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x168628850>

In [89]:
print("Evaluate on test data")
results = model.evaluate(X_test, Y_test)
print("test loss, test acc:", results)


Evaluate on test data
test loss, test acc: [0.172318235039711, 0.978094220161438, 171.0]


In [90]:
cv_output = model.predict(X_cv)
cv_output



array([[0.991368  ],
       [0.2177625 ],
       [0.97260594],
       ...,
       [0.9822493 ],
       [0.9917499 ],
       [0.9847761 ]], dtype=float32)

In [91]:
all_df[all_df.malware == False].head(5)

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,parsed,malware,url,domain,host_name
0,IP,Unknown,192.168.110.10,80.237.133.136,1152,80,52,b'E\x00\x004\x10M@\x00\x80\x06\x00\x00\xc0\xa8...,52,b'E\x00\x004\x10M@\x00\x80\x06\x00\x00\xc0\xa8...,,0,,,
1,IP,Unknown,80.237.133.136,192.168.110.10,80,1152,52,b'E\x00\x004\x00\x00@\x006\x06?\x9cP\xed\x85\x...,52,b'E\x00\x004\x00\x00@\x006\x06?\x9cP\xed\x85\x...,,0,,,
2,IP,Unknown,192.168.110.10,80.237.133.136,1152,80,40,b'E\x00\x00(\x10N@\x00\x80\x06\x00\x00\xc0\xa8...,40,b'E\x00\x00(\x10N@\x00\x80\x06\x00\x00\xc0\xa8...,,0,,,
3,IP,HTTPRequest,192.168.110.10,80.237.133.136,1152,80,351,b'E\x00\x01_\x10O@\x00\x80\x06\x00\x00\xc0\xa8...,351,b'E\x00\x01_\x10O@\x00\x80\x06\x00\x00\xc0\xa8...,"{'method': 'GET', 'path': '/', 'host': 'ip.web...",0,ip.webernetz.net,webernetz.net,ip
4,IP,Unknown,80.237.133.136,192.168.110.10,80,1152,40,b'E\x00\x00(:\xad@\x006\x06\x04\xfbP\xed\x85\x...,40,b'E\x00\x00(:\xad@\x006\x06\x04\xfbP\xed\x85\x...,,0,,,


In [92]:
all_df[all_df.malware == True].head(5)

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,parsed,malware,url,domain,host_name
0,IP,Unknown,172.16.1.137,203.26.41.132,64020,443,52,b'E\x00\x004\xaa!@\x00\x80\x06\xaej\xac\x10\x0...,52,b'E\x00\x004\xaa!@\x00\x80\x06\xaej\xac\x10\x0...,,1,,,
1,IP,Unknown,203.26.41.132,172.16.1.137,443,64020,44,"b'E\x00\x00,\x8f\xe3\x00\x00\x80\x06\x08\xb1\x...",44,"b'E\x00\x00,\x8f\xe3\x00\x00\x80\x06\x08\xb1\x...",,1,,,
2,IP,Unknown,172.16.1.137,203.26.41.132,64020,443,40,"b'E\x00\x00(\xaa""@\x00\x80\x06\xaeu\xac\x10\x0...",40,"b'E\x00\x00(\xaa""@\x00\x80\x06\xaeu\xac\x10\x0...",,1,,,
3,IP,Unknown,172.16.1.137,203.26.41.132,64020,443,230,b'E\x00\x00\xe6\xaa#@\x00\x80\x06\xad\xb6\xac\...,230,b'E\x00\x00\xe6\xaa#@\x00\x80\x06\xad\xb6\xac\...,,1,,,
4,IP,Unknown,203.26.41.132,172.16.1.137,443,64020,40,b'E\x00\x00(\x8f\xe4\x00\x00\x80\x06\x08\xb4\x...,40,b'E\x00\x00(\x8f\xe4\x00\x00\x80\x06\x08\xb4\x...,,1,,,


In [93]:
prepped

Unnamed: 0,proto_packet_length,ip_packet_length,protocol_Ethernet,protocol_IP,protocol_IPv6,protocol_UDP,protocol_cooked linux,app_layer_DNSQueryRequest,app_layer_DNSRequestResponse,app_layer_HTTPRequest,...,url_xn--ddabeekggjjjx59c0ay7a7a9dtb0a6a6b4b7f2bxcwc1e0cvc8c7c.weberdns.de,url_xn--dsire-bsad.weberdns.de,url_xn--fan-2na.weberdns.de,url_xn--fnf-hoa.weberdns.de,url_xn--heizlrckstossabdmpfung-g5b33b6e.weberdns.de,url_xn--hr-yia.weberdns.de,url_xn--ser-0ma.weberdns.de,url_xn--ss-xja9aehhiki25gyaz3a4a6a7a3bzb4b8b5b3bzcxczc1c1c2ewc3c.weberdns.de,url_xn--yourt-l1a.weberdns.de,url_yiuahd.sophiaemarlibuffetme.link
0,52,52,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,52,52,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,40,40,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,351,351,False,True,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,40,40,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13705,100,100,False,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
13706,55,55,False,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
13707,185,185,False,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
13708,45,45,False,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [95]:
all_df.domain.unique()

array(['', 'webernetz.net', 'heise.de', 'in-addr.arpa', 'ip6.arpa',
       'weberdns.de', '_tcp.local', 'Johannes-ei-Patt.local',
       'vtuner.com', '_udp.local', 'ripe.net', 'netflix.com', 'co.uk',
       'ultradns.net', 'ultradns.biz', 'ultradns.com', 'ultradns.org',
       'awsdns-29.net', 'awsdns-46.com', 'awsdns-58.org', 'weberlab.de',
       'sshfp.net', 'nobody.invalid', 'cisco.com', 'radb.net', 'denic.de',
       'verisign-grs.com', 'apnic.net', 'ubuntu.com', 'ntp.org',
       'netsec.blog', 'duckduckgo.com', 'hornbach.de', 'cloudfront.net',
       'outbrain.com', 'wp.com', 'kachelmannwetter.com',
       'google-analytics.com', 'youtube.com', 'riot.im', 'akamaiedge.net',
       'com.pk', 'co.jp', 'fskasna.com', 'securemail.pro', 'ne.jp',
       'net.pk', 'fujibousaisetubi.com', 'co.id', 'com.vn', 'sim23.ua',
       'goserver.host', 'com.br', 't-online.hu', 'gotrans.asia',
       'host.bg', 'ovh.net', 'alice.it', 'seymetal.com', 'ranus.hr',
       'com.au', 'lwspanel.com', 'un