In [1]:
%load_ext autoreload
%autoreload 2

import loading
import pandas as pd
import numpy as np

## Load selected logs for processing
Some logs are not used, because they describe the zeek session and not the data.

We decided to not use x509.log, since the data pertains to a very small portion of flows. This log captures details on certificates exchanged during certain TLS negotiations.
https://docs.zeek.org/en/master/logs/x509.html

In [2]:
ignored_logs = [ # some of the logs contain information about the zeek session and are not useful for the analysis
    "loaded_scripts.log",
    "capture_loss.log",
    "stats.log",
    "packet_filter.log",
    "x509.log",
]
zeek_logs = loading.load_all_zeek_logs("../stratosphere-work-challenge-v1/zeek",ignored_logs) # change this to the path of the zeek logs

print(zeek_logs.keys())
cleaned_logs = {} #where cleaned and aggregated data will be stored


Loading conn.log...
Loading dns.log...
Loading files.log...
Loading http.log...
Loading ssl.log...

dict_keys(['conn', 'dns', 'files', 'http', 'ssl'])


### Etracting info from individual logs
We select only a small number of important columns (based on prior knowledge AND inspecting the logs themselves. Some columns would be selected, but they have only one unique value (for example))

In [3]:
# files log
# multiple fuids for one uid. cannot connect 1:1, I want some kind of aggregation
files_df = zeek_logs["files"]
http_count = []
ssl_count = []
avg_seen_bytes = []

# for each uid, count the number of http and ssl files and calculate the average seen_bytes
for uid in files_df['conn_uids'].unique():
    http_files = files_df[(files_df['conn_uids'] == uid) & (files_df['source'] == 'HTTP')]
    ssl_files = files_df[(files_df['conn_uids'] == uid) & (files_df['source'] == 'SSL')]
    
    http_count.append(len(http_files))
    ssl_count.append(len(ssl_files))
    avg_seen_bytes.append(files_df[files_df['conn_uids'] == uid]['seen_bytes'].mean())

aggregated_file_df = pd.DataFrame({
    'uid': files_df['conn_uids'].unique(),
    'http_count': http_count,
    'ssl_count': ssl_count,
    'avg_seen_bytes_files': avg_seen_bytes
})

print(aggregated_file_df.head())
cleaned_logs["files"] = aggregated_file_df # no missing values and no duplicates

                  uid  http_count  ssl_count  avg_seen_bytes_files
0  CYaRbd1LgVHyMi0os7           0          2                1392.0
1  CE2v1V1PiJwfenwq22           0          2                1392.0
2   CnCVN6i60NAbKmFxl           0          2                1392.0
3   CXVBUUhqkJEhiN2s6           0          2                1424.0
4  CriN9h1d6hCNTMi3P6           1          0                  22.0


In [4]:
# ssl contains info about https (but in conn, there already is tcp with ssl tags)
ssl = zeek_logs["ssl"]
aggregated_ssl_df = ssl[
    ["uid", "version", "resumed", "next_protocol", "established", "validation_status"]
]

# Replace missing values and specific string in 'version' column
versions_col = aggregated_ssl_df.copy()["version"]
versions_col = versions_col.cat.rename_categories({"unknown-64282": "unknown"})
versions_col = versions_col.cat.add_categories("missing")
versions_col = versions_col.replace("-", None).fillna("missing")
aggregated_ssl_df["version"] = versions_col

# fill in next protocol column (missing if not specififed)
next_protocol_col = aggregated_ssl_df.copy()["next_protocol"]
next_protocol_col = next_protocol_col.cat.add_categories("missing")
next_protocol_col = next_protocol_col.fillna("missing")
aggregated_ssl_df["next_protocol"] = next_protocol_col

# fill in validation status column (missing if not specified)
validation_status_col = aggregated_ssl_df.copy()["validation_status"]
validation_status_col = validation_status_col.cat.add_categories("missing")
validation_status_col = validation_status_col.fillna("missing")
aggregated_ssl_df["validation_status"] = validation_status_col

cleaned_logs["ssl"] = aggregated_ssl_df
print(aggregated_ssl_df)

                   uid  version resumed next_protocol established  \
0    CgnCcKyQn6Fuvtaaa  unknown       T       missing           T   
1    CyaZhP168fSnjEW4i  unknown       T       missing           T   
2   C1joeV1VXsHCLjiswg  unknown       T       missing           T   
3    CplZRBVCFLcGq4Dbl  unknown       T       missing           T   
4    C05gpYcCZjI5tZ4m5  unknown       T       missing           T   
..                 ...      ...     ...           ...         ...   
73  CoqeK14slRRWT3jSG3   TLSv13       F       missing           T   
74   Cmz3aX3s2GpoaPURW   TLSv13       T       missing           T   
75  Ci1SH62mLUPecPJx5d   TLSv12       T      http/1.1           T   
76   CupYkjJ1BXaXaZk94   TLSv13       T       missing           T   
77  CokYEW2q0cjgDIE5Gi   TLSv12       T            h2           T   

   validation_status  
0            missing  
1            missing  
2            missing  
3            missing  
4            missing  
..               ...  
73        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_ssl_df["version"] = versions_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_ssl_df["next_protocol"] = next_protocol_col
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_ssl_df["validation_status"] = validation_status_col


In [5]:
#http log 
http = zeek_logs["http"]
aggregated_http_df = http[
    ["uid","method", "status_code", "request_body_len", "response_body_len"]]
aggregated_http_df['status_code'] = aggregated_http_df['status_code'].fillna(0)

cleaned_logs["http"] = aggregated_http_df
print(aggregated_http_df)

                   uid method  status_code  request_body_len  \
0   CBPoVA3CEo9RnTQNDj    GET          204                 0   
1   CriN9h1d6hCNTMi3P6    GET          200                 0   
2   CKwfPy2GoRYlUeD35i    GET          200                 0   
3   CKwfPy2GoRYlUeD35i    GET          200                 0   
4   CKwfPy2GoRYlUeD35i    GET          200                 0   
5   Cpl4DO1V4VtiWE0oBj    GET          200                 0   
6   Cpl4DO1V4VtiWE0oBj    GET          200                 0   
7   Cpl4DO1V4VtiWE0oBj    GET          200                 0   
8    C0Uxcf30mJ2Lq5wbX    GET          101                 0   
9   CNPVre20YbYQSMf7Ke    GET          101                 0   
10  Ctq0ZM2pGM8l5HdQS6    GET          200                 0   
11  CtCYjh1iQvaBV7zgZ1    GET          200                 0   
12  CnJ79x4juoPxV9NMW8    GET          200                 0   
13  CUhwFI2YjMkHyGju97    GET          200                 0   
14  CIrUMN3KNrQM1GRnU7    GET          2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_http_df['status_code'] = aggregated_http_df['status_code'].fillna(0)


In [6]:
dns = zeek_logs["dns"]
aggregated_dns_df = dns[["uid", "qtype_name", "rcode_name"]]
dns_answer_count = dns['answers'].apply(lambda x: len(x.split(',')) if pd.notna(x) else 0)

# add new columns to the aggregated dns dataframe
# calculate average ttl of answer packet (can mean very fast name change?)
aggregated_dns_df['dns_answer_count'] = dns_answer_count # number of dns answers
aggregated_dns_df['avg_TTL'] = dns['TTLs'].apply(lambda x: np.mean([float(ttl) for ttl in x.split(',')]) if pd.notna(x) else 0)


# extract TLD (instead of the full query)
aggregated_dns_df["TLD"] = dns["query"].apply(lambda x: x.split(".")[-1])
# extract SLD (google in google.com ...)
aggregated_dns_df["SLD"] = dns["query"].apply(lambda x: x.split(".")[-2] if len(x.split(".")) > 1 else "")
# number of subdomains in the query (longer can be used for obfuscation)
aggregated_dns_df["query_length"] = dns["query"].apply(lambda x: len(x.split(".")))

print(aggregated_dns_df)
cleaned_logs["dns"] = aggregated_dns_df

                   uid qtype_name rcode_name  dns_answer_count  avg_TTL  TLD  \
0   CMLZbu3FDJYoZwa27k          A    NOERROR                 2    116.5  com   
1   CgRCjV3z8dKmNVIvhb          A    NOERROR                 2   1800.5  com   
2   CgFfWv3PUApAZUINNf          A    NOERROR                 1     59.0  com   
3   CIkFu02IznJPZcp1El          A    NOERROR                 2   1814.5  com   
4   Cylq6E2mc9lVLjs8ua          A    NOERROR                 2    636.5  com   
..                 ...        ...        ...               ...      ...  ...   
59  Cyw1E94W8FfeSPczJk          A    NOERROR                 1    299.0  com   
60  C0jMGu1frh9ei1Wmqc          A    NOERROR                 1    247.0  com   
61  COs0MF2BvtxqwwWZBl          A    NOERROR                 1    248.0  com   
62  CovR8F24TUZtn0RYGj          A    NOERROR                 1    576.0   co   
63   Cj4sBHa0s2uYpSCc3          A    NOERROR                 5   1793.0  com   

             SLD  query_length  
0     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_dns_df['dns_answer_count'] = dns_answer_count # number of dns answers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aggregated_dns_df['avg_TTL'] = dns['TTLs'].apply(lambda x: np.mean([float(ttl) for ttl in x.split(',')]) if pd.notna(x) else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [7]:
conn = zeek_logs["conn"]
# here we only drop some of the columns, most are useful and we need stuff liek IPs..)
conn.drop(columns=["local_orig", "local_resp", "tunnel_parents","proto","log_type"], inplace=True)
merged_df = loading.convert_ip_addresses(conn, ["id.orig_h", "id.resp_h"])

print(conn.columns)
cleaned_logs["conn"] = conn

Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
       'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts',
       'resp_ip_bytes'],
      dtype='object')


In [8]:
# still, throw away columns with only one unique value
for log_name, log_df in cleaned_logs.items():
    for col in log_df.columns:
        if log_df[col].nunique() == 1:
            print(f"{log_name} - {col}: {log_df[col].nunique()} unique value")
            log_df.drop(columns=[col], inplace=True)

# finally merge logs
merged_df = loading.merge_logs(
    cleaned_logs, primary_log="conn"
)  
merged_df.drop(columns=["uid"], inplace=True)

# results?
print("flows in merged log file: ", len(merged_df))
print("columns: ", len(merged_df.columns), merged_df.columns)

#save for inspection?
merged_df.to_csv("merged_zeek_logs.csv", index=False)

dns - qtype_name: 1 unique value
dns - rcode_name: 1 unique value
conn - missed_bytes: 1 unique value
Merging files...
Merging ssl...
Merging http...
Merging dns...
flows in merged log file:  234
columns:  32 Index(['ts', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'service',
       'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'history',
       'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
       'http_count', 'ssl_count', 'avg_seen_bytes_files', 'version', 'resumed',
       'next_protocol', 'established', 'validation_status', 'method',
       'status_code', 'request_body_len', 'response_body_len',
       'dns_answer_count', 'avg_TTL', 'TLD', 'SLD', 'query_length'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log_df.drop(columns=[col], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  log_df.drop(columns=[col], inplace=True)


## Further clean-up the data 
Label the data and drop the time columns

Fill in the missing values (some columns are filled just for some data).

Remove columns which are obsolete (have only 1 value)

In [9]:
print("calculate time of the start w.r.t. the first log entry and label the data as benign if the time is less than 4 minutes")# based on task description
merged_df['time_from_beginning'] = (merged_df['ts'] - merged_df['ts'].min()).dt.total_seconds()
merged_df['label'] = np.where(merged_df['time_from_beginning'] < 4 * 60, 'benign', 'unknown')
merged_df.drop(columns=["ts", "time_from_beginning"], inplace=True)

calculate time of the start w.r.t. the first log entry and label the data as benign if the time is less than 4 minutes


In [10]:
# Convert T/F columns to 1/0 and make them numerical
tf_columns = merged_df.select_dtypes(include=['object']).columns
for col in tf_columns:
    if merged_df[col].isin(['T', 'F']).all():
        merged_df[col] = merged_df[col].map({'T': 1, 'F': 0}).astype(int)

In [11]:
print("column types: ")
for col in merged_df.columns:
    print(col, merged_df[col].dtype, merged_df[col].nunique())
merged_df.describe(include="all")
print(merged_df.head())

column types: 
id.orig_h category 3
id.orig_p UInt16 170
id.resp_h category 41
id.resp_p UInt16 8
service category 3
duration timedelta64[ns] 189
orig_bytes UInt64 116
resp_bytes UInt64 131
conn_state category 8
history category 46
orig_pkts UInt64 50
orig_ip_bytes UInt64 125
resp_pkts UInt64 46
resp_ip_bytes UInt64 133
http_count float64 5
ssl_count float64 3
avg_seen_bytes_files float64 18
version category 4
resumed category 2
next_protocol category 3
established category 2
validation_status category 2
method category 2
status_code UInt64 4
request_body_len UInt64 2
response_body_len UInt64 6
dns_answer_count float64 6
avg_TTL float64 57
TLD object 4
SLD object 17
query_length float64 4
label object 2
   id.orig_h  id.orig_p  id.resp_h  id.resp_p service               duration  \
0  168296565       1210  134744072         53     dns 0 days 00:00:00.010542   
1  168296565      43814  134744072         53     dns 0 days 00:00:00.010908   
2  168296565      51631  134744072         53  

In [12]:
# categores to dummies
# integers, floats
# missing values everywhere?
# see the jupyter for help?

print("Number of columns in cleaned_df:", len(cleaned_df.columns))
cleaned_df.describe(include="all")
print(cleaned_df.head())


NameError: name 'cleaned_df' is not defined

## Feature selection, normalisation and dimensionality reduction
Next steps in the pipeline: 

cluster the features or calculate colinearity/covariance (for feature selection)

select normalisation, visualise the data, reduction (PCA?)

