In [188]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [189]:
import logging

logging.basicConfig(format="%(asctime)s ; %(levelname)s ; %(message)s", level=logging.DEBUG)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
logger = logging.getLogger("data_exploring")

from typing import Tuple
from pathlib import Path
import pandas as pd
import numpy as np
from collections import Counter
import plotly.graph_objects as go
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

from nfstream import NFStreamer, NFPlugin

init_notebook_mode(connected=True)


In [190]:
b_pkl = "./data/benign_features.pkl"
m_pkl = "./data/malicious_features.pkl"

b_df = pd.read_pickle(b_pkl)
m_df = pd.read_pickle(m_pkl)

all_df = pd.concat([b_df, m_df])
all_df.sample(5)

Unnamed: 0,protocol,app_layer,source_addr,dest_addr,source_port,dest_port,proto_packet_length,proto_packet_cache,ip_packet_length,ip_packet_cache,...,base_domain_entropy,host_length,proto_packet_entropy,source_ip_class_a,source_ip_class_b,source_ip_class_c,dest_ip_class_a,dest_ip_class_b,dest_ip_class_c,malware
4905,IP,HTTPRequest,192.168.7.26,192.168.7.12,57789,1030,156,b'E\x00\x00\x9c\x00\x00@\x00@\x06\xaa\xe5\xc0\...,156,b'E\x00\x00\x9c\x00\x00@\x00@\x06\xaa\xe5\xc0\...,...,0.0,0,3.341446,192,192.168,192.168.7,192,192.168,192.168.7,0.0
43381,IP,Unknown,172.16.1.137,162.241.194.95,64163,465,1500,b'E\x00\x05\xdc7\xfc@\x00\x80\x06\xaa5\xac\x10...,1500,b'E\x00\x05\xdc7\xfc@\x00\x80\x06\xaa5\xac\x10...,...,0.0,0,4.221928,172,172.16,172.16.1,162,162.241,162.241.194,1.0
48106,IP,Unknown,162.215.96.249,172.16.1.137,465,64176,40,b'E\x00\x00(\n\x82\x00\x00\x80\x06~\xe4\xa2\xd...,40,b'E\x00\x00(\n\x82\x00\x00\x80\x06~\xe4\xa2\xd...,...,0.0,0,3.921928,162,162.215,162.215.96,172,172.16,172.16.1,1.0
50418,IP,Unknown,119.18.54.136,172.16.1.137,587,64175,104,b'E\x00\x00h\x0f\xda\x00\x00\x80\x06\xcf\x82w\...,104,b'E\x00\x00h\x0f\xda\x00\x00\x80\x06\xcf\x82w\...,...,0.0,0,3.921928,119,119.18,119.18.54,172,172.16,172.16.1,1.0
68901,IP,Unknown,54.144.214.6,172.16.1.137,587,64234,40,b'E\x00\x00(@\xb7\x00\x00\x80\x06?\xe96\x90\xd...,40,b'E\x00\x00(@\xb7\x00\x00\x80\x06?\xe96\x90\xd...,...,0.0,0,3.821928,54,54.144,54.144.214,172,172.16,172.16.1,1.0


In [191]:
chal_path = Path("challenge-datasets")
pcaps = []

fns = [x for x in chal_path.glob("**/*.pcap*") if x.is_file()]
for pcap in fns:
    if "malware" in str(pcap):
        malware = True
    else:
        malware = False
    pcaps.append((pcap, malware))

In [192]:
dfs = []

for pcap, malware in pcaps:
    df = NFStreamer(source=pcap, accounting_mode=1, statistical_analysis=True).to_pandas()
    df["malicious"] = malware
    dfs.append(df)
    

In [193]:
df_0 = dfs[0]
# df_0.columns
dfs[8].application_name.unique()

array(['ICMP', 'OSPF', 'Unknown', 'BGP', 'ICMPV6', 'Skype_Teams',
       'CAPWAP', 'DNS', 'DHCP', 'DHCPV6', 'IGMP', 'HTTP', 'MDNS', 'SSDP',
       'RTP', 'HTTP.RTSP', 'GRE', 'NTP', 'HSRP'], dtype=object)

In [194]:
df_0 = df_0.drop([x for x in df_0.columns if x.endswith("oui")], axis=1)
df_0 = df_0.drop([x for x in df_0.columns if x.endswith("fingerprint")], axis=1)
df_0[df_0.application_is_guessed == 1]

Unnamed: 0,id,expiration_id,src_ip,src_mac,src_port,dst_ip,dst_mac,dst_port,protocol,ip_version,...,dst2src_rst_packets,dst2src_fin_packets,application_name,application_category_name,application_is_guessed,application_confidence,requested_server_name,user_agent,content_type,malicious
55,55,0,172.16.1.137,00:02:fb:34:b4:fa,64072,96.116.224.188,00:0b:46:93:86:da,587,6,4,...,1,0,SMTP,Email,1,1,,,,True
56,56,0,172.16.1.137,00:02:fb:34:b4:fa,64071,96.116.224.188,00:0b:46:93:86:da,587,6,4,...,1,0,SMTP,Email,1,1,,,,True
57,57,0,172.16.1.137,00:02:fb:34:b4:fa,64073,96.116.224.188,00:0b:46:93:86:da,587,6,4,...,1,0,SMTP,Email,1,1,,,,True
58,58,0,172.16.1.137,00:02:fb:34:b4:fa,64075,96.116.224.188,00:0b:46:93:86:da,587,6,4,...,1,0,SMTP,Email,1,1,,,,True
59,59,0,172.16.1.137,00:02:fb:34:b4:fa,64074,96.116.224.188,00:0b:46:93:86:da,587,6,4,...,1,0,SMTP,Email,1,1,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,360,0,172.16.1.137,00:02:fb:34:b4:fa,64253,219.117.46.74,00:0b:46:93:86:da,25,6,4,...,1,0,SMTP,Email,1,1,,,,True
361,361,0,172.16.1.137,00:02:fb:34:b4:fa,64248,191.6.216.7,00:0b:46:93:86:da,587,6,4,...,0,0,SMTP,Email,1,1,,,,True
363,363,0,172.16.1.137,00:02:fb:34:b4:fa,64254,98.137.156.39,00:0b:46:93:86:da,587,6,4,...,0,0,SMTP,Email,1,1,,,,True
364,364,0,172.16.1.137,00:02:fb:34:b4:fa,64251,191.252.112.194,00:0b:46:93:86:da,25,6,4,...,0,0,SMTP,Email,1,1,,,,True


In [202]:
def replace_with_indices(row, cols, node_list):
    for col in cols:
        row[col] = node_list.index(row[col])
    return row

In [205]:
# fig = make_subplots(rows=len(dfs), cols=1)
# row = 1

def create_sankey_from_df(df):
    data = df[[
    "src_ip",
    "application_name",
    "dst_ip",
    "dst_port"
    ]].groupby(
        ["src_ip", "application_name", "dst_ip"]
    ).count().reset_index()

    data.columns = ["src_ip", "application_name", "dst_ip", "count"]
    data.loc[:, "application_name"] = data.application_name.apply(lambda x: x.split(".")[0])
    
    ## All chart nodes
    all_nodes = data.src_ip.values.tolist() + data.dst_ip.values.tolist() + data.application_name.values.tolist()

    tmp_links_1_df = data.groupby(["src_ip", "application_name"]).count()["dst_ip"].reset_index(name="value")
    tmp_links_2_df = data.groupby(["application_name", "dst_ip"]).count()["src_ip"].reset_index(name="value")
    
    links_1_df = tmp_links_1_df.apply(lambda x: replace_with_indices(x, ["src_ip", "application_name"], all_nodes), axis=1)
    links_1_df.columns = ["source", "target", "value"]
    links_2_df = tmp_links_2_df.apply(lambda row: replace_with_indices(row, ["application_name", "dst_ip"], all_nodes), axis=1)
    links_2_df.columns = ["source", "target", "value"]
    
    links_df = pd.concat([links_1_df, links_2_df]).dropna(axis=0, how="any")
    sankey_trace = go.Sankey(
        node = {
            "label": all_nodes,
            # "color": "green",
        },
        link = {
            "source": links_df["source"],
            "target": links_df["target"],
            "value": links_df["value"],
        }
    )
    return sankey_trace
    
traces = [create_sankey_from_df(df) for df in dfs]


In [206]:
len(fns)

9

In [207]:

idx = 0
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [208]:

idx = 1
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [209]:
idx = 2
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [210]:
idx = 3
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [211]:

idx = 4
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [212]:

idx = 5
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [216]:
traces[5]

Sankey({
    'link': {'source': array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
                                0, 106, 106, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
                              120, 121, 244, 245, 250, 250, 250, 250, 250, 250, 250, 250, 250, 250,
                              260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260,
                              260, 260, 260, 260, 260, 260, 362, 280, 281, 282, 283, 284, 285, 286,
                              286, 286, 286, 292, 352, 293, 294, 294, 294, 294, 294, 294, 294, 294,
                              294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294,
                              294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294,
                              294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 294, 343, 343,
                              343, 343, 343, 343, 343]),
             'target': array([244,

In [213]:

idx = 6
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [214]:

idx = 7
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)


In [215]:

idx = 8
fig = dict(data=[traces[idx]], layout=(dict(title=str(fns[idx]))))
iplot(fig, validate=False)
