In [1]:
import numpy as np
import pandas as pd
from scapy.all import rdpcap, IP, IPv6, TCP, UDP

# What we consider "local" IPs to decide sent vs received
DEFAULT_LOCAL_PREFIXES = ["192.168.", "10.", "172.16."]

def is_local_ip(ip: str, local_prefixes=None) -> bool:
    """
    Return True if the IP belongs to a local/private range
    based on configured prefixes.
    """
    if local_prefixes is None:
        local_prefixes = DEFAULT_LOCAL_PREFIXES
    return any(ip.startswith(pref) for pref in local_prefixes)




In [2]:
def get_flow_key(pkt):
    """
    Build a 5-tuple flow key: (src_ip, dst_ip, src_port, dst_port, proto)
    Only for TCP/UDP over IPv4/IPv6.
    Returns None for non-IP or non-TCP/UDP packets.
    """
    ip_layer = None
    if IP in pkt:
        ip_layer = pkt[IP]
    elif IPv6 in pkt:
        ip_layer = pkt[IPv6]
    else:
        return None  # non-IP traffic

    if TCP in pkt:
        l4 = pkt[TCP]
        proto = "TCP"
    elif UDP in pkt:
        l4 = pkt[UDP]
        proto = "UDP"
    else:
        return None  # ignore non-TCP/UDP traffic

    src_ip = ip_layer.src
    dst_ip = ip_layer.dst
    src_port = int(l4.sport)
    dst_port = int(l4.dport)

    return (src_ip, dst_ip, src_port, dst_port, proto)


In [3]:
def process_pcap(pcap_path, local_prefixes=None):
    """
    Read a pcap file and aggregate packets into flows.
    Returns a DataFrame with flow-level features compatible
    with your CORE_FEATURES for the ML model.
    """
    if local_prefixes is None:
        local_prefixes = DEFAULT_LOCAL_PREFIXES

    print(f"Loading pcap from {pcap_path} ...")
    packets = rdpcap(pcap_path)
    print(f"Total packets: {len(packets)}")

    flows = {}

    for pkt in packets:
        key = get_flow_key(pkt)
        if key is None:
            continue

        ts = float(pkt.time)
        pkt_len = len(pkt)

        src_ip, dst_ip, src_port, dst_port, proto = key

        if key not in flows:
            flows[key] = {
                "SourceIP": src_ip,
                "DestinationIP": dst_ip,
                "SourcePort": src_port,
                "DestinationPort": dst_port,
                "Protocol": proto,
                "first_ts": ts,
                "last_ts": ts,
                "bytes_sent": 0,
                "bytes_received": 0,
                "pkt_lengths": [],
                "pkt_times": [],
            }

        f = flows[key]

        # Update timestamps
        if ts < f["first_ts"]:
            f["first_ts"] = ts
        if ts > f["last_ts"]:
            f["last_ts"] = ts

        # Direction: from local to remote = sent, else received
        if is_local_ip(src_ip, local_prefixes):
            f["bytes_sent"] += pkt_len
        else:
            f["bytes_received"] += pkt_len

        f["pkt_lengths"].append(pkt_len)
        f["pkt_times"].append(ts)

    # Build feature rows
    rows = []
    for key, f in flows.items():
        src_ip = f["SourceIP"]
        dst_ip = f["DestinationIP"]
        src_port = f["SourcePort"]
        dst_port = f["DestinationPort"]

        times = np.array(f["pkt_times"])
        lengths = np.array(f["pkt_lengths"])

        if len(times) < 2:
            deltas = np.array([0.0])
        else:
            deltas = np.diff(np.sort(times))

        duration = max(f["last_ts"] - f["first_ts"], 1e-6)  # avoid zero

        flow_bytes_sent = float(f["bytes_sent"])
        flow_bytes_received = float(f["bytes_received"])

        flow_sent_rate = flow_bytes_sent / duration
        flow_received_rate = flow_bytes_received / duration

        # Packet length stats
        pl_mean = float(lengths.mean())
        pl_var = float(lengths.var(ddof=0))
        pl_std = float(lengths.std(ddof=0))
        pl_cv = float(pl_std / pl_mean) if pl_mean != 0 else 0.0

        # Packet time stats (inter-arrival times)
        pt_mean = float(deltas.mean())
        pt_var = float(deltas.var(ddof=0))
        pt_std = float(deltas.std(ddof=0))
        pt_cv = float(pt_std / pt_mean) if pt_mean != 0 else 0.0

        row = {
            "SourceIP": src_ip,
            "DestinationIP": dst_ip,
            "SourcePort": src_port,
            "DestinationPort": dst_port,
            "TimeStamp": f["first_ts"],  # first seen
            "Duration": duration,
            "FlowBytesSent": flow_bytes_sent,
            "FlowSentRate": flow_sent_rate,
            "FlowBytesReceived": flow_bytes_received,
            "FlowReceivedRate": flow_received_rate,
            "PacketLengthMean": pl_mean,
            "PacketLengthVariance": pl_var,
            "PacketLengthStandardDeviation": pl_std,
            "PacketLengthCoefficientofVariation": pl_cv,
            "PacketTimeMean": pt_mean,
            "PacketTimeVariance": pt_var,
            "PacketTimeStandardDeviation": pt_std,
            "PacketTimeCoefficientofVariation": pt_cv,
        }

        rows.append(row)

    df = pd.DataFrame(rows)
    print(f"Built {len(df)} flows")
    return df


In [4]:
pcap_path = "capture.pcap"          # change this to your file
output_csv = "capture_flows.csv"    # output path

df_flows = process_pcap(pcap_path)
df_flows.to_csv(output_csv, index=False)

df_flows.head()


Loading pcap from capture.pcap ...
Total packets: 198
Built 51 flows


Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthMean,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthCoefficientofVariation,PacketTimeMean,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeCoefficientofVariation
0,172.31.233.30,185.125.190.57,45011,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,185.125.190.57,172.31.233.30,123,45011,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,172.31.233.30,185.125.190.57,40054,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,185.125.190.57,172.31.233.30,123,40054,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,172.31.233.30,185.125.190.57,60345,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df_flows.sample(n=10, random_state=42)

Unnamed: 0,SourceIP,DestinationIP,SourcePort,DestinationPort,TimeStamp,Duration,FlowBytesSent,FlowSentRate,FlowBytesReceived,FlowReceivedRate,PacketLengthMean,PacketLengthVariance,PacketLengthStandardDeviation,PacketLengthCoefficientofVariation,PacketTimeMean,PacketTimeVariance,PacketTimeStandardDeviation,PacketTimeCoefficientofVariation
43,185.125.190.57,172.31.233.30,123,32873,1764169000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,172.31.233.30,185.125.190.57,45782,123,1764169000.0,15.250419,0.0,0.0,180.0,11.80295,90.0,0.0,0.0,0.0,15.250419,0.0,0.0,0.0
46,172.31.233.30,185.125.190.57,41905,123,1764169000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,172.31.233.30,185.125.190.57,46378,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,172.31.233.30,185.125.190.57,42880,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,185.125.190.57,172.31.233.30,123,36388,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,185.125.190.57,172.31.233.30,123,54457,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,172.31.233.30,185.125.190.57,42717,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,185.125.190.57,172.31.233.30,123,40054,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,172.31.233.30,185.125.190.57,36388,123,1764168000.0,1e-06,0.0,0.0,90.0,90000000.0,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
