In [1]:
import pyshark
from pathlib import Path
import nest_asyncio
nest_asyncio.apply()
import pandas as pd

In [2]:
# Data directory lives next to the notebook's working directory: <cwd>/../Data
path_to_data = Path.cwd().parent / "Data"

# Capture file to analyse
path_to_pcapng = path_to_data / "raw/ALL_20250721_194104.pcapng"

# Only packets on TCP port 443 or UDP port 55555 pass the display filter.
cap = pyshark.FileCapture(path_to_pcapng,
                          display_filter='tcp.port == 443 or udp.port == 55555')

# Preview every packet: number, highest decoded layer, TCP payload length.
for pkt in cap:
    # The filter also admits UDP packets, which carry no `tcp` layer, so the
    # layer is resolved with a default instead of dereferencing `pkt.tcp`
    # directly; packets without TCP (or without `tcp.len`) print ''.
    tcp_layer = getattr(pkt, 'tcp', None)
    print(pkt.number, pkt.highest_layer, getattr(tcp_layer, 'len', ''))

1 TCP 0
2 TCP 0
3 TCP 0
4 TCP 0
5 TCP 0
6 TCP 0
7 TLS 1440
8 TCP 0
9 TLS 13
10 TCP 0
11 TLS 1440
12 TLS 1440
13 TLS 271
14 TCP 0
15 TCP 0
16 TCP 0
17 TLS 80
18 TLS 91
19 TLS 271
20 TLS 271
21 TLS 1283
22 TCP 0
23 TCP 0
24 TCP 0
25 TCP 0
26 TCP 0
27 TCP 0
28 TCP 0
29 TLS 1408
30 TCP 0
31 TLS 328
32 TCP 0
33 TCP 0
34 TLS 1408
35 TCP 0
36 TLS 360
37 TCP 0
38 TLS 1408
39 TLS 1408
40 TLS 354
41 TLS 1408
42 TLS 1408
43 TLS 354
44 TCP 0
45 TCP 0
46 TCP 0
47 TCP 0
48 TLS 80
49 TLS 80
50 TLS 955
51 TCP 0
52 TCP 0
53 TLS 319
54 TLS 303
55 TLS 319
56 TLS 319
57 TLS 896
58 TCP 0
59 TCP 0
60 TCP 0
61 TLS 909
62 TCP 0
63 TLS 932
64 TCP 0
65 TLS 252
66 TCP 0
67 TCP 0
68 TLS 249
69 TCP 0
70 TCP 0
71 TCP 0
72 TLS 1408
73 TCP 0
74 TLS 663
75 TCP 0
76 TLS 1408
77 TCP 0
78 TLS 663
79 TCP 0
80 TLS 892
81 TLS 885
82 TLS 260
83 TLS 260
84 TLS 250
85 TLS 243
86 TCP 0
87 TLS 80
88 TLS 319
89 TCP 0
90 TLS 80
91 TLS 319
92 TCP 0
93 TCP 0
94 TLS 971
95 TCP 0
96 TLS 250
97 TCP 0
98 TCP 0
99 TLS 933
100 TLS 444
101

In [3]:
# Collect a compact per-packet summary: metadata, lengths, timing and the
# UDP marker payload. One dict per packet, turned into a DataFrame at the end.
records = []
for pkt in cap:
    # Layers that are not present on this packet resolve to None.
    tcp = getattr(pkt, 'tcp', None)
    tls = getattr(pkt, 'tls', None)
    udp = getattr(pkt, 'udp', None)

    # TLS record length (None when there is no TLS layer or no record_length field)
    tls_record_length = getattr(tls, 'record_length', None)

    # UDP marker payload: only datagrams addressed to port 55555 count as markers.
    udp_marker = None
    if udp is not None and int(udp.dstport) == 55555:
        payload = getattr(udp, 'payload', None)
        # A marker datagram without a payload field yields an empty string
        # instead of aborting the whole extraction with AttributeError.
        udp_marker = '' if payload is None else payload.binary_value.decode('utf-8', errors='ignore')

    records.append({
        # Packet metadata
        'packet_number': int(pkt.number),
        'timestamp': pkt.sniff_time,
        'layer': pkt.highest_layer,
        # Frame length
        'frame_len': int(pkt.frame_info.len),
        # Inter-packet delta as reported in the frame info.
        # NOTE(review): presumably relative to the previous *captured* frame
        # rather than the previous frame that passed the display filter — confirm.
        'delta_time': float(pkt.frame_info.time_delta),
        # TCP payload length (None for packets without a TCP layer)
        'tcp_payload_len': int(tcp.len) if tcp is not None else None,
        'tls_record_len': int(tls_record_length) if tls_record_length is not None else None,
        'udp_marker': udp_marker,
    })

# Create DataFrame
df = pd.DataFrame(records)

In [4]:
# Display the packet DataFrame (a bare last expression gets the notebook's rich rendering).
df

Unnamed: 0,packet_number,timestamp,layer,frame_len,delta_time,tcp_payload_len,tls_record_len,udp_marker
0,1,2025-07-21 21:41:29.502857,TCP,60,0.000000,0,,
1,2,2025-07-21 21:41:29.503312,TCP,58,0.000455,0,,
2,3,2025-07-21 21:41:29.534823,TCP,60,0.031511,0,,
3,4,2025-07-21 21:41:29.552265,TCP,74,0.017442,0,,
4,5,2025-07-21 21:41:29.552460,TCP,74,0.000195,0,,
...,...,...,...,...,...,...,...,...
12264,12265,2025-07-21 21:43:56.778651,TCP,66,0.010560,0,,
12265,12266,2025-07-21 21:43:56.784629,TLS,321,0.005977,255,250.0,
12266,12267,2025-07-21 21:43:56.791352,TCP,66,0.006723,0,,
12267,12268,2025-07-21 21:43:56.791502,TCP,66,0.000151,0,,


In [5]:
# Collect the full per-packet feature set (Ethernet / IP / TCP / TLS / HTTP /
# DNS / UDP marker) and build the DataFrame used for the CSV export.

MARKER_UDP_PORT = 55555  # UDP destination port whose payload is stored as `udp_marker`


def _field(layer, name, cast=None, default=None):
    """Return ``layer.<name>``, optionally converted with ``cast``.

    ``default`` is returned when ``layer`` is None (layer absent from the
    packet) or when the layer has no field called ``name``.
    """
    if layer is None:
        return default
    value = getattr(layer, name, None)
    if value is None:
        return default
    return value if cast is None else cast(value)


def _parse_ip_id(raw):
    """Parse the IP identification field; returns None when it cannot be parsed.

    The value can be hex or decimal, so base 0 lets ``int`` pick the base
    from the prefix.
    """
    try:
        return int(raw, 0)
    except (TypeError, ValueError):
        return None


records = []
for pkt in cap:
    # Layers missing from this packet resolve to None; `_field` then yields
    # the per-column default for every field of that layer.
    eth = getattr(pkt, 'eth', None)
    ip = getattr(pkt, 'ip', None)
    tcp = getattr(pkt, 'tcp', None)
    tls = getattr(pkt, 'tls', None)
    http = getattr(pkt, 'http', None)
    dns = getattr(pkt, 'dns', None)
    udp = getattr(pkt, 'udp', None)

    # UDP marker: only datagrams addressed to the marker port carry one.
    udp_marker = None
    if udp is not None and int(udp.dstport) == MARKER_UDP_PORT:
        payload = getattr(udp, 'payload', None)
        # A marker datagram without a payload field becomes '' rather than
        # aborting the whole extraction with AttributeError.
        udp_marker = '' if payload is None else payload.binary_value.decode('utf-8', errors='ignore')

    # Key order defines the column order of the DataFrame / exported CSV.
    records.append({
        # Packet metadata
        'packet_number': int(pkt.number),
        # Absolute timestamp
        'timestamp': pkt.sniff_time,
        # Layer
        'layer': pkt.highest_layer,

        # Ethernet
        'eth_src': _field(eth, 'src'),
        'eth_dst': _field(eth, 'dst'),

        # IP
        'ip_src': _field(ip, 'src'),
        'ip_dst': _field(ip, 'dst'),
        'ip_ttl': _field(ip, 'ttl', int),
        'ip_id': _field(ip, 'id', _parse_ip_id),

        # Frame length
        'frame_len': int(pkt.frame_info.len),

        # TCP ports
        'tcp_srcport': _field(tcp, 'srcport', int),
        'tcp_dstport': _field(tcp, 'dstport', int),
        # TCP flags
        'tcp_syn': _field(tcp, 'flags_syn', int),
        'tcp_ack': _field(tcp, 'flags_ack', int),
        'tcp_fin': _field(tcp, 'flags_fin', int),
        'tcp_rst': _field(tcp, 'flags_reset', int),
        'tcp_psh': _field(tcp, 'flags_push', int),
        'tcp_urg': _field(tcp, 'flags_urg', int),
        # Sequence / Ack numbers
        'tcp_seq': _field(tcp, 'seq', int),
        'tcp_acknum': _field(tcp, 'ack', int),
        # Window size
        'tcp_window_size': _field(tcp, 'window_size_value', int),
        # TCP payload length
        'tcp_payload_len': _field(tcp, 'len', int),

        # TLS (record length defaults to 0 so the column stays integer-valued)
        'tls_record_len': _field(tls, 'record_length', int, default=0),
        'tls_version': _field(tls, 'record_version'),
        'tls_handshake_type': _field(tls, 'handshake_type'),
        'tls_ciphersuite': _field(tls, 'handshake_ciphersuite'),
        'tls_sni': _field(tls, 'handshake_extensions_server_name'),

        # HTTP
        'http_method': _field(http, 'request_method'),
        'http_uri': _field(http, 'request_uri'),
        'http_response_code': _field(http, 'response_code', int),

        # DNS
        'dns_qry_name': _field(dns, 'qry_name'),
        'dns_a': _field(dns, 'a'),
        'dns_txt': _field(dns, 'txt'),

        # UDP marker
        'udp_marker': udp_marker,
    })

# Create DataFrame (pandas is already imported as `pd` in the notebook's first cell)
df = pd.DataFrame(records)

# Compute true inter-arrival times between consecutive rows.
# A stable sort keeps packets with identical timestamps in capture order,
# so the row order feeding `diff()` is deterministic.
df = df.sort_values('timestamp', kind='mergesort')
df['delta_time'] = df['timestamp'].diff().dt.total_seconds().fillna(0)

# Defensive: make sure the TLS record length column is a plain integer column
df['tls_record_len'] = df['tls_record_len'].fillna(0).astype(int)

### Column (feature) explanations


In [None]:
# Write the packet table to test_data_pcapng.csv inside the data directory, without the row index.
df.to_csv(path_to_data.joinpath('test_data_pcapng.csv'), index=False)

In [None]:
# Test data conversion: turn a raw epoch timestamp into a formatted local datetime
from datetime import datetime

# Value copied from the real capture file - "frame.time_epoch": "1753127036.808682606"
epoch = 1753127036.808682606
dt = datetime.fromtimestamp(epoch)  # no tz argument, so the result is in local time
print(format(dt, "%Y-%m-%d %H:%M:%S.%f"))


2025-07-21 21:43:56.808683
