In [10]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import logging

logging.basicConfig(
    format="%(asctime)s ; %(levelname)s ; %(message)s",
    level=logging.DEBUG
)
logging.getLogger("scapy").setLevel(logging.CRITICAL)
log = logging.getLogger("adAPT")

from pathlib import Path
import tensorflow as tf
import pandas as pd
import numpy as np
from scapy import all as sp


In [12]:
FILE_DIR = Path("~/GitRepos/challenge-datasets/").expanduser()
BENIGN_DIR = FILE_DIR / "benign"
MALWAR_DIR = FILE_DIR / "malware"

BENIGN_FILES = list([f for f in BENIGN_DIR.iterdir() if str(f).endswith(".pcap") or str(f).endswith(".pcapng")])
MALWAR_FILES = list([f for f in MALWAR_DIR.iterdir() if str(f).endswith(".pcap") or str(f).endswith(".pcapng")])


assert BENIGN_DIR.exists(), "Benign dir cannot be found"
assert MALWAR_DIR.exists(), "Malware dir cannot be found"


In [13]:
BENIGN_FILES

[PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/benign/The Ultimate PCAP v20221220.pcapng')]

In [14]:
MALWAR_FILES

[PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2023-03-07-Emotet-epoch4-infection-with-spambot-traffic-carved.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2023-04-13-MetaStealer-C2-traffic.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2023-01-05-Agent-Tesla-variant-traffic.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2023-01-16-IcedID-infection-with-Backonnect-and-VNC-and-Cobalt-Strike.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2022-01-04-Remcos-RAT-infection-traffic.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2023-03-18-Emotet-E5-infection-traffic.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2022-12-07-Bumblebee-infection-with-Cobalt-Strike.pcap'),
 PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/malware/2022-09-21-Astaroth-Guildma-infection.pcap')]

In [15]:
from collections import Counter

EXCLUDE_NAMES = ["Ethernet", "802.3", "cooked linux", "MPacket Preamble"]
INTERESTING_SERVICE_PORTS = [80, 443, 22, 53, 21, 20, 25, 179, 465]
IGNORE_SERVICE_PORTS = list(range(20))  # skip packets with ports lower than 20
IGNORE_SERVICE_PORTS.append(37)  # time protocol
IGNORE_SERVICE_PORTS.append(646)  # ldp protocol
IGNORE_SERVICE_PORTS.append(179)  # bgp protocol


In [16]:
from typing import Any, Dict, List
sp.load_layer("http")

class Protocol:
    UDP = 17
    IPv4 = 6
    IPv6 = 34525
    IPv6_enc = 41
    

class App:
    Unknown = -1
    HTTP = 0
    HTTPS = 1
    DNS = 2
    FTP = 3
    SSH = 4
    SMTP = 5
    
HTTP_METHODS = ["GET", "POST", "HEAD", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"]
    

def get_proto(pkt: sp.Packet) -> Any:
    try:
        if pkt.proto == Protocol.IPv6 or pkt.proto == Protocol.IPv6_enc:
            return sp.IPv6
        if pkt.proto == Protocol.IPv4:
            return sp.IP
        if pkt.proto == Protocol.UDP:
            return sp.UDP
        return None
    except:
        return None
    

def identify_raw(pkt: sp.Packet) -> App:
    lines = pkt.load.encode().split("\n")
    log.debug(f"identifying raw from lines[0]: {lines[0]}")
    if lines[0].split(" ")[0] in HTTP_METHODS:
        return App.HTTP
    return App.Unknown

    
def parse_raw(pkt: sp.Packet) -> Dict:
    """ Turn a Raw payload into a dictionary of data """
    log.info("entering parse_raw...")
    raw_app = identify_raw(pkt)
    if raw_app != App.Unknown:
        log.debug(f"Identified app: {raw_app}")

    try:
        raw_text = pkt.load.encode()
        lines = raw_text.split("\n")
        if raw_app == App.HTTP:
            command, path, _ = lines[0].split(" ", maxsplit=2)
            _, host = lines[1].split(": ")
            _, user_agent = lines[2].split(": ")
            _, accept = lines[3].split(": ")
            _, accept_language = lines[4].split(": ")
            _, accept_encoding = lines[5].split(": ")
            
            d = {
                "command": command,
                "path": path,
                "host": host,
                "user_agent": user_agent,
                "accept": accept,
                "accept_language": accept_language,
                "accept_encoding": accept_encoding,
            }
            
            return d
        else:
            log.warning(f"Could not parse {raw_text}")
            return {"raw": raw_text}
    except:
        pass
    
def make_rows(pkts: sp.PacketList) -> List[Dict]:
    """ Read a packet, output a dict of values """
    
    log.debug(f"Filtering packets on IP, IPv6, and UDP")

    for pkt in pkts[sp.IP] + pkts[sp.IPv6] + pkts[sp.UDP]:
        proto = get_proto(pkt)
        log.debug(f"Identified proto as {proto}")
        try:
            try:
                pkt[proto].sport
                pkt[proto].dport
            except:
                # this is not a packet with necessary attrs
                log.warning("Could not find src or dst ports, skipping...")
                continue
            if pkt[proto].sport in IGNORE_SERVICE_PORTS or pkt[proto].dport in IGNORE_SERVICE_PORTS:
                # skip packets with certain service ports
                continue
            parsed = None
            if 'Raw' in pkt:
                log.debug("Found 'Raw' layer in packet. Parsing...")
                parsed = parse_raw(pkt['Raw'])
            row = {
                "protocol": pkt[proto].name,
                "source_addr": pkt[sp.IP].src,
                "dest_addr": pkt[sp.IP].dst,
                "source_port": pkt[proto].sport,
                "dest_port": pkt[proto].dport,
                "proto_packet_length": pkt[proto].len,
                "proto_packet_cache": pkt[proto].raw_packet_cache,
                "ip_packet_length": pkt[sp.IP].len,
                "ip_options": pkt[sp.IP].options,
                "ip_packet_cache": pkt[sp.IP].raw_packet_cache,
                "parsed": parsed,
            }
            yield row
        except Exception as e:
            continue
            

In [18]:
with open(BENIGN_FILES[0], "rb") as f:
    log.info(f"Reading {BENIGN_FILES[0]}...")
    rows = make_rows(sp.rdpcap(f))

2023-05-09 15:07:59,015 ; INFO ; Reading /Users/jedmitten/GitRepos/challenge-datasets/benign/The Ultimate PCAP v20221220.pcapng...


In [72]:
[r for r in rows if r['parsed'] is not None]

[]

In [42]:
len(list(rows))

0

In [23]:
from collections import Counter

EXCLUDE_NAMES = ["Ethernet", "802.3", "cooked linux", "MPacket Preamble"]
INTERESTING_SERVICE_PORTS = [80, 443, 22, 53, 21, 20, 25, 179, 465]
IGNORE_SERVICE_PORTS = list(range(20))  # skip packets with ports lower than 20
IGNORE_SERVICE_PORTS.append(37)  # time protocol
IGNORE_SERVICE_PORTS.append(646)  # ldp protocol
IGNORE_SERVICE_PORTS.append(179)  # bgp protocol

def process_pcap(pcap_file):
    pcap = sp.rdpcap(pcap_file)
    dports = []
    urls = []
    
    clients = []
    servers = []
    
    ipv4_packets = pcap[sp.IP]
    ipv6_packets = pcap[sp.IPv6]
    tcp_packets = pcap[sp.TCP]
    udp_packets = pcap[sp.UDP]
    protocols = []
    
    print(f"Found {len(ipv4_packets)} IPv4 packets")
    print(f"Found {len(ipv6_packets)} IPv6 packets")
    print(f"Found {len(tcp_packets)} TCP packets")
    print(f"Found {len(udp_packets)} UDP packets")
    # print(f"TCP Fields: {tcp_packets[0].fields}")
    # print(f"UDP Fields: {udp_packets[0].fields}")
    
    sessions = pcap.sessions()
    
    for sess in sessions:
        for pkt in sessions[sess]:
            try:
                if pkt.sport in IGNORE_SERVICE_PORTS or pkt.dport in IGNORE_SERVICE_PORTS:
                    # skip certain packets
                    continue

                protocols.append(pkt.proto)
                if pkt.proto == 34525 or pkt.proto == 41:
                    payload_class = sp.IPv6
                elif pkt.proto == 6:
                    payload_class = sp.IP
                elif pkt.proto == 17:
                    payload_class = sp.UDP
                else:
                    # skip non-IP packets
                    continue
                if pkt.sport in INTERESTING_SERVICE_PORTS:
                    servers.append(pkt[payload_class].src)
                    clients.append(pkt[payload_class].dst)
                elif pkt.dport in INTERESTING_SERVICE_PORTS:
                    servers.append(pkt[payload_class].dst)
                    clients.append(pkt[payload_class].src)

                dports.append(pkt.dport)
                pkt_len = len(pkt[sp.TCP])
                # urls.append(sp.get_url_from_payload(payload))
                pass
            except IndexError as e:
                # print("TCP not found in packet")
                pass
            except AttributeError as e:
                # no url found 
                pass

    print(f"dports: {Counter(dports)}")
    print(f"servers: {Counter(servers)}")
    print(f"protocols: {Counter(protocols)}")
    # print(f"urls: {Counter(urls)}")


In [24]:
for fn in BENIGN_FILES:
    with open(fn, "rb") as pcap_file:
        print(f"Processing {fn}...")
        process_pcap(pcap_file)


Processing /Users/jedmitten/GitRepos/challenge-datasets/benign/The Ultimate PCAP v20221220.pcapng...
Found 23056 IPv4 packets
Found 11145 IPv6 packets
Found 8374 TCP packets
Found 17442 UDP packets
dports: Counter({1303: 3368, 123: 703, 3785: 472, 3222: 388, 10051: 373, 1985: 338, 500: 226, 5353: 204, 1030: 197, 8080: 172, 57789: 169, 514: 161, 162: 158, 57556: 146, 53: 143, 1305: 103, 601: 103, 21: 102, 443: 81, 38889: 73, 520: 68, 1900: 68, 3784: 65, 65535: 60, 54723: 46, 1307: 46, 51111: 46, 3295: 42, 51079: 41, 51104: 39, 49: 36, 3296: 34, 53390: 33, 65534: 30, 51108: 30, 57895: 28, 51072: 27, 51109: 26, 53702: 26, 3293: 25, 45271: 23, 80: 21, 65439: 20, 6514: 20, 9100: 19, 515: 19, 1967: 18, 3292: 18, 3298: 18, 51078: 16, 50274: 16, 57221: 14, 50912: 13, 51106: 12, 58486: 12, 3128: 11, 5246: 11, 5247: 11, 25: 11, 33878: 11, 64199: 11, 64091: 11, 57994: 11, 52984: 11, 51297: 11, 51587: 11, 59001: 11, 59325: 11, 57595: 11, 54445: 11, 12176: 11, 56372: 11, 40925: 11, 33054: 11, 1556:

In [86]:
for fn in MALWAR_FILES:
    with open(fn, "rb") as pcap_file:
        print(f"Processing {fn}...")
        scapy_pcap = sp.rdpcap(pcap_file)
        process_pcap(scapy_pcap)

Processing /Users/jedmitten/GitRepos/challenge-datasets/malware/2023-03-07-Emotet-epoch4-infection-with-spambot-traffic-carved.pcap...
Found 75333 IPv4 packets
Found 214 IPv6 packets
Found 75225 TCP packets
Found 322 UDP packets
dports: Counter({465: 16125, 587: 7545, 64234: 6164, 64164: 6043, 64173: 5562, 64068: 5362, 64088: 5012, 64163: 4038, 443: 2017, 64195: 1759, 64176: 1573, 64110: 1385, 64067: 1364, 8080: 1193, 7080: 1186, 64198: 991, 64157: 965, 64101: 910, 64129: 656, 64021: 649, 64214: 649, 64065: 623, 64055: 419, 64031: 385, 25: 245, 64046: 215, 64039: 205, 80: 125, 64060: 121, 64058: 119, 64026: 113, 64023: 112, 64033: 53, 64036: 53, 64204: 34, 64135: 32, 64160: 32, 64108: 25, 64085: 21, 64070: 21, 64136: 21, 64169: 21, 64175: 21, 64181: 21, 64180: 21, 64208: 21, 64216: 21, 64099: 20, 64134: 20, 64133: 20, 64138: 20, 64161: 20, 64167: 20, 64183: 20, 64241: 20, 64246: 20, 64247: 20, 64086: 17, 64090: 17, 64132: 17, 64182: 17, 64215: 17, 64252: 17, 64104: 16, 64158: 16, 64239

### Using PyCapKIT

In [83]:
BENIGN_FILES[0]

PosixPath('/Users/jedmitten/GitRepos/challenge-datasets/benign/The Ultimate PCAP v20221220.pcapng')

In [85]:
from pcapkit import extract

d = extract(fin=BENIGN_FILES[0], nofile=True)


[ERROR] 05/08/2023 09:50:19 PM - FileNotFound: 


FileNotFound: [Errno 2] No such file or directory: '/Users/jedmitten/GitRepos/challenge-datasets/benign/The Ultimate PCAP v20221220.pcapng.pcap'