In [1]:
import os
import json 
import pandas as pd

### FILE PREPARATION

In [2]:
CWD = os.getcwd()
DATA_FOLDER = os.path.join(CWD, 'data')
SUR_SSH_FOLDER = os.path.join(DATA_FOLDER, "suricata", "ssh")
SUR_FTP_FOLDER = os.path.join(DATA_FOLDER, "suricata", "ftp")
SUR_DVWA_FOLDER = os.path.join(DATA_FOLDER, "suricata", "dvwa")

In [3]:
EVE_SSH_PATH = os.path.join(SUR_SSH_FOLDER, 'eve.json')
EVE_FTP_PATH = os.path.join(SUR_FTP_FOLDER, 'eve.json')
EVE_DVWA_PATH = os.path.join(SUR_DVWA_FOLDER, 'eve.json')

### DATA PREPATATION

In [4]:
def read_file(file_path):
    with open(file_path) as f:
        raw_file = f.readlines()
    return raw_file

In [5]:
def line_to_json(line):
    return json.loads(line)

In [6]:
def read_eve_file(file_path):
    raw_eve = read_file(file_path)
    json_eve = []
    for l in raw_eve:
        json_eve.append(line_to_json(l))
    return json_eve

#### JSON 

In [7]:
eve_ssh_json = read_eve_file(EVE_SSH_PATH)
eve_ftp_json = read_eve_file(EVE_FTP_PATH)
eve_dvwa_json = read_eve_file(EVE_DVWA_PATH)

eve_jsons = {
    'ssh': eve_ssh_json,
    'ftp': eve_ftp_json,
    'dvwa': eve_dvwa_json 
}

#### DATAFRAME

In [8]:
eve_ssh_df = pd.DataFrame(eve_ssh_json)
eve_ftp_df = pd.DataFrame(eve_ftp_json)
eve_dvwa_df = pd.DataFrame(eve_dvwa_json)

eve_dfs = {
    'ssh': eve_ssh_df,
    'ftp': eve_ftp_df,
    'dvwa': eve_dvwa_df 
}

### FUNCTIONS

In [9]:
def filter_json_data(data: list[dict], key, value, comparator):
    if comparator == '==':
        return list(filter(lambda d: d[key] == value, data))
    elif comparator == '!=':
        return list(filter(lambda d: d[key] != value, data))
    else:
        raise NotImplementedError(f'Not implemented filtering for comporator "{comparator}"!')

In [21]:
def join_jsons(json_data: list[json]):
    result = []
    for data in json_data:
        result.extend(data)
    return result

In [30]:
def save_json(json_to_save, file_name):
    with open(f'results/{file_name}', 'w') as f:
        json.dump(json_to_save, f, indent=4)

#### COLUMNS 

In [10]:
all_columns = []
for df in eve_dfs.values():
    all_columns.extend(df.columns.to_list())
all_columns = set(all_columns)

columns_in_dfs = []
for column in all_columns:
    result = {'column': column}
    for name, df in eve_dfs.items():
        columns = df.columns.to_list()
        result[name] = int(column in columns)
    columns_in_dfs.append(result)
columns_in_dfs = pd.DataFrame(columns_in_dfs)

In [11]:
columns_in_dfs.to_csv('results/columns_in_eve_dfs.csv', index=False)

##### EVENT TYPES

In [12]:
for name, df in eve_dfs.items():
    print(name.upper())
    print(f'event types: {df['event_type'].unique()}')
    print()

SSH
event types: ['ssh' 'stats' 'flow']

FTP
event types: ['stats' 'flow' 'ftp' 'alert' 'anomaly']

DVWA
event types: ['stats' 'http' 'flow' 'fileinfo']



Description:

    'flow' – contains connection metadata: IPs, ports, packet/byte counts, duration, TCP flags
    → Important

    'stats' – internal Suricata statistics (packet counts, memory usage, errors)
    → Not important

    'alert' – triggered IDS rules with signature, category, and severity
    → Important

    'anomaly' – protocol parsing issues or unexpected behavior
    → Moderately important

    'ssh' – SSH handshake data: protocol versions, client/server info, algorithms
    → Important

    'ftp' – FTP commands, replies, and session details
    → Important

    'http' – HTTP method, URL, headers, user-agent, content-type
    → Important

    'fileinfo' – file transfers: filename, MIME type, size, hash
    → Important

### FLOW event type 

In [14]:
flow_jsons = {name: filter_json_data(json_data, 'event_type', 'flow', '==') for name, json_data in eve_jsons.items()}

In [32]:
flow_jsons

{'ssh': [{'timestamp': '2025-07-10T16:50:32.430730+0000',
   'flow_id': 1523555128178732,
   'in_iface': 'eth0',
   'event_type': 'flow',
   'src_ip': '172.24.0.6',
   'src_port': 34674,
   'dest_ip': '172.24.0.4',
   'dest_port': 22,
   'proto': 'TCP',
   'app_proto': 'ssh',
   'flow': {'pkts_toserver': 21,
    'pkts_toclient': 21,
    'bytes_toserver': 4786,
    'bytes_toclient': 5164,
    'start': '2025-07-10T16:49:25.682410+0000',
    'end': '2025-07-10T16:49:25.942650+0000',
    'age': 0,
    'state': 'closed',
    'reason': 'timeout',
    'alerted': False},
   'tcp': {'tcp_flags': '1b',
    'tcp_flags_ts': '1b',
    'tcp_flags_tc': '1b',
    'syn': True,
    'fin': True,
    'psh': True,
    'ack': True,
    'state': 'closed',
    'ts_max_regions': 1,
    'tc_max_regions': 1}},
  {'timestamp': '2025-07-10T16:50:33.425045+0000',
   'flow_id': 480210616721987,
   'in_iface': 'eth0',
   'event_type': 'flow',
   'src_ip': 'fe80:0000:0000:0000:9c2b:a8ff:fe1a:b548',
   'dest_ip': 'ff02

In [33]:
json_to_save = join_jsons(list(flow_jsons.values()))
save_json(json_to_save, 'eve_flow_data.json')

In [35]:
def flatten_entry(entry):
    flat = {}
    for k, v in entry.items():
        if isinstance(v, dict):
            for sub_k, sub_v in v.items():
                flat[f"{k}.{sub_k}"] = sub_v
        else:
            flat[k] = v
    return flat

flow_jsons = json.dumps(flow_jsons)
flattened_records = [flatten_entry(e) for e in flow_jsons]
df = pd.DataFrame(flattened_records)

AttributeError: 'str' object has no attribute 'items'