In [21]:
%load_ext autoreload
%autoreload 2

import loading
import pandas as pd

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
ignored_logs = [ # some of the logs contain information about the zeek session and are not useful for the analysis
    "loaded_scripts.log",
    "capture_loss.log",
    "stats.log",
    "packet_filter.log",
]
zeek_logs = loading.load_all_zeek_logs("../stratosphere-work-challenge-v1/zeek",ignored_logs) # change this to the path of the zeek logs

print(zeek_logs.keys())
print()

merged_df = loading.merge_logs(
    zeek_logs, primary_log="conn"
)  # primary log is the log that will be used as the base for the merge


Loading conn.log...
Loading dns.log...
Loading files.log...
Loading http.log...
Loading ssl.log...
Loading x509.log...

dict_keys(['conn', 'dns', 'files', 'http', 'ssl', 'x509'])

Merging dns...
Merging http...
Merging ssl...
Merging files...
Merging x509...


In [33]:
print("flows in merged log file: ", len(merged_df))
print("columns: ", len(merged_df.columns), merged_df.columns)

# save for inspection
merged_df.to_csv("merged_zeek_logs.csv", index=False)

flows in merged log file:  296
columns:  144 Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'proto', 'service', 'duration', 'orig_bytes',
       ...
       'certificate.key_length', 'certificate.exponent', 'certificate.curve',
       'san.dns', 'san.uri', 'san.email', 'san.ip', 'basic_constraints.ca',
       'basic_constraints.path_len', 'log_type_x509'],
      dtype='object', length=144)


In [34]:
print("drop UID column, calculate time of the start w.r.t. first capture and convert IP to int")
merged_df.drop(columns=["uid"], inplace=True)
merged_df = loading.convert_ip_addresses(merged_df, ["id.orig_h", "id.resp_h"])
merged_df['time_from_beginning'] = (merged_df['ts'] - merged_df['ts'].min()).dt.total_seconds()

drop UID column, calculate time of the start w.r.t. first capture and convert IP to int


In [14]:
print("columns with only one unique value and drop them:")
for col in merged_df.columns:
    # print(col, len(merged_df[col].unique()))
    if len(merged_df[col].unique()) == 1:
        print(col)
        merged_df.drop(columns=[col], inplace=True)


columns with only one unique value and drop them:
local_orig
local_resp
missed_bytes
tunnel_parents
log_type
referrer
origin
username
password
proxied
orig_filenames
orig_mime_types
resp_filenames
last_alert
client_subject
client_issuer
filename
local_orig_files
parent_fuid
extracted
extracted_cutoff
extracted_size
san.uri
san.email
san.ip
basic_constraints.path_len


In [15]:
print("remove columns which just announce the log file name/type (dns etc.)")
merged_df = merged_df.loc[:, ~merged_df.columns.str.contains("log_type")]
print("columns after removal: ",len(merged_df.columns),"\n", merged_df.columns)

remove columns which just announce the log file name/type (dns etc.)
columns after removal:  106 
 Index(['id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service',
       'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
       ...
       'certificate.not_valid_before', 'certificate.not_valid_after',
       'certificate.key_alg', 'certificate.sig_alg', 'certificate.key_type',
       'certificate.key_length', 'certificate.exponent', 'certificate.curve',
       'san.dns', 'basic_constraints.ca'],
      dtype='object', length=106)


In [16]:
# Convert T/F columns to 1/0 and make them numerical
tf_columns = merged_df.select_dtypes(include=['object']).columns
for col in tf_columns:
    if merged_df[col].isin(['T', 'F']).all():
        merged_df[col] = merged_df[col].map({'T': 1, 'F': 0}).astype(int)

In [17]:
print("column types: ")
for col in merged_df.columns:
    print(col, merged_df[col].dtype)
merged_df.describe(include="all")
print(merged_df.head())

column types: 
id.orig_h category
id.orig_p UInt16
id.resp_h category
id.resp_p UInt16
proto category
service category
duration timedelta64[ns]
orig_bytes UInt64
resp_bytes UInt64
conn_state category
history category
orig_pkts UInt64
orig_ip_bytes UInt64
resp_pkts UInt64
resp_ip_bytes UInt64
id.orig_h_dns category
id.orig_p_dns UInt16
id.resp_h_dns category
id.resp_p_dns UInt16
proto_dns category
trans_id UInt64
rtt timedelta64[ns]
query category
qclass UInt64
qclass_name category
qtype UInt64
qtype_name category
rcode UInt64
rcode_name category
AA category
TC category
RD category
RA category
Z UInt64
answers category
TTLs category
rejected category
id.orig_h_http category
id.orig_p_http UInt16
id.resp_h_http category
id.resp_p_http UInt16
trans_depth UInt64
method category
host category
uri category
version category
user_agent category
request_body_len UInt64
response_body_len UInt64
status_code UInt64
status_msg category
info_code UInt64
info_msg category
tags category
orig_fuids obj

In [None]:
cleaned_df = loading.preprocess_zeek_data(merged_df)
print("Number of columns in cleaned_df:", len(cleaned_df.columns))
cleaned_df.describe(include="all")
print(cleaned_df.head())

print("Column with time:")
print(cleaned_df["duration"].to_string())


Number of columns in cleaned_df: 73
   id.orig_h  id.orig_p  id.resp_h  id.resp_p proto service  duration  \
0  168296565       1210  134744072         53   udp     dns  0.010542   
1  168296565      43814  134744072         53   udp     dns  0.010908   
2  168296565      51631  134744072         53   udp     dns  0.010734   
3  168296565      65449  134744072         53   udp     dns  0.010405   
4  168296565      63247  134744072         53   udp     dns  0.009332   

   orig_bytes  resp_bytes conn_state  ... curve  server_name  resumed  \
0          31          80         SF  ...   NaN          NaN      NaN   
1          33          87         SF  ...   NaN          NaN      NaN   
2          50          66         SF  ...   NaN          NaN      NaN   
3          37          91         SF  ...   NaN          NaN      NaN   
4          40          80         SF  ...   NaN          NaN      NaN   

   next_protocol  established cert_chain_fuids  client_cert_chain_fuids  \
0          