# Initial imports

In [1]:
import re
import os
import json
import random
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.model_selection import train_test_split
from typing import List, Dict, Literal, Optional

# Config

In [113]:
# Maximum request length in characters; passed as `filter_len` to filter_chunks_by_len.
MAX_TEXT_LEN = 700
# NOTE(review): not referenced anywhere in the visible part of this notebook —
# presumably a token budget for the downstream model; confirm before relying on it.
MAX_TOKENS_LEN = 400

# Prepare additional data

In [112]:
def remove_duplicates(lines: List[str]) -> List[str]:
    """Return the distinct items of `lines`, keeping first-occurrence order.

    A plain `set` would also de-duplicate, but its iteration order for strings
    changes between interpreter runs, which makes every list built from it
    (and every sample drawn from those lists) non-reproducible.

    Args:
        lines: Strings to de-duplicate; the input list is not modified.

    Returns:
        A new list with each distinct string exactly once, in the order in
        which it first appeared in `lines`.
    """
    # dict preserves insertion order, so this is an ordered de-duplication.
    return list(dict.fromkeys(lines))

def read_unique_line_by_line(path: Path) -> List[str]:
    """Read a UTF-8 text file and return its distinct lines.

    The content is split on the newline character only, so a file that ends
    with a newline contributes one empty-string entry. De-duplication (and the
    resulting order) is delegated to `remove_duplicates`.
    """
    content = path.read_text(encoding='utf-8')
    raw_lines = content.split("\n")
    return remove_duplicates(raw_lines)

def ignore_lines_with_spaces(lines: List[str]) -> List[str]:
    """Return the lines that contain no space character, in input order."""
    kept = []
    for candidate in lines:
        if " " in candidate:
            continue
        kept.append(candidate)
    return kept

def random_select(lines: List[str], count: int, seed: Optional[int] = None) -> List[str]:
    """Pick `count` distinct items from `lines` at random.

    Args:
        lines: Population to sample from.
        count: Number of items to draw. Zero or a negative value disables
            sampling and returns `lines` itself (the same list object).
        seed: Optional seed. When given, a dedicated `random.Random(seed)` is
            used, so the same inputs always produce the same sample and the
            module-level random state is left untouched. When omitted, the
            module-level generator is used.

    Returns:
        A new list of `count` sampled items, or `lines` when `count <= 0`.

    Raises:
        ValueError: If `count` is larger than `len(lines)` (from `sample`).
    """
    if count <= 0:
        return lines

    if seed is None:
        return random.sample(lines, count)

    return random.Random(seed).sample(lines, count)

def replace_chars(lines: List[str], replacements: dict) -> List[str]:
    """Apply a set of substring replacements to every line.

    Replacements are applied in the iteration order of `replacements`, each
    one to the result of the previous one, so a later rule can rewrite text
    produced by an earlier rule.

    Args:
        lines: Input strings; this list is left unmodified.
        replacements: Maps the substring to search for to its replacement.
            Replacement values are converted with `str()`.

    Returns:
        A new list with the replaced strings, in the same order as `lines`.
    """
    replaced = list(lines)

    for old, new in replacements.items():
        new_text = str(new)
        replaced = [line.replace(old, new_text) for line in replaced]

    return replaced

# TODO: read about annotations
# TODO: read about * in Python

In [85]:
# URLs
# Benign: every unique line of good_urls.txt, followed by a random sample of
# the unique, space-free lines of good_queries.txt.
benign_url_lines = (
    read_unique_line_by_line(Path("./../../data/raw/benign_lines/good_urls.txt"))
    + random_select(
        ignore_lines_with_spaces(
            read_unique_line_by_line(Path("./../../data/raw/benign_lines/good_queries.txt"))
        ),
        count=6000,  # we don't need all 1kk+ rows from the original data
    )
    # + read_unique_line_by_line(Path("./../../data/raw/benign_lines/good_urls_with_sql.txt"))
)
# Anomalous: a random sample of the unique lines of bad_queries.txt.
anomaly_url_lines = list(
    random_select(
        read_unique_line_by_line(Path("./../../data/raw/attack_lines/bad_queries.txt")),
        count=10000,
    )
)

# Injections
# Unique payload lines of the three attack files, concatenated in this order.
injection_lines = (
    read_unique_line_by_line(Path("./../../data/raw/attack_lines/command_injection.txt"))
    + read_unique_line_by_line(Path("./../../data/raw/attack_lines/sql_injection.txt"))
    + read_unique_line_by_line(Path("./../../data/raw/attack_lines/xss.txt"))
)

In [86]:
len(benign_url_lines), len(anomaly_url_lines), len(injection_lines)
# NOTE(review): the figures below are the author's rough class-balance plan;
# they are approximate and are not computed by this cell.
#
# anomaly:
# 17k anomaly,
# +10k <- anomaly_url_lines placed into GET requests taken from benign data
# +3k <- injection_lines placed into GET requests taken from benign data
# ----
# 30k total

# normal:
# 22k
# +8k <- benign_url_lines placed into GET or POST requests
# ----
# 30k total

(6790, 10000, 2805)

# Prepare HTTP requests
## Parse seq2seq vulnbank

In [98]:
def split_vulnbank(file_content: str) -> List[str]:
    """Cut a vulnbank dump into chunks delimited by "ST@RT" ... "END".

    The text is split on every "ST@RT" marker; each piece is then truncated
    at its first "END" marker (pieces without one are kept whole). The text
    before the first "ST@RT" is returned as the first chunk.
    """
    pieces = []
    for segment in file_content.split("ST@RT"):
        head, _, _ = segment.partition("END")
        pieces.append(head)
    return pieces

def remove_empty(chunks: List[str]) -> List[str]:
    """Drop every chunk that is exactly one newline character.

    Only that exact value is removed; empty strings and chunks made of
    several newlines pass through unchanged.
    """
    return list(filter(lambda chunk: chunk != "\n", chunks))

def remove_lines_with_vals(chunks: List[str], contain_vals: List[str], exactly_vals: List[str]) -> List[str]:
    """Remove unwanted lines from every chunk.

    Each chunk is split into lines; a line is dropped when it contains any of
    `contain_vals` as a substring or is equal to any of `exactly_vals`. The
    surviving lines are joined back with newlines, one output chunk per input
    chunk (a chunk whose lines are all dropped becomes an empty string).
    """
    def is_kept(line: str) -> bool:
        if any(fragment in line for fragment in contain_vals):
            return False
        return not any(exact == line for exact in exactly_vals)

    return [
        "\n".join(line for line in chunk.split("\n") if is_kept(line))
        for chunk in chunks
    ]

# def save_txt(chunks: List[str]):
#     with open('./../../data/interim/vulnbank_train.txt', 'w') as filehandle:
#         for chunk in chunks:
#             filehandle.writelines(chunk + "\n\n\n")

def filter_chunks_by_len(chunks: List[str], filter_len: int) -> List[str]:
    """Keep the non-empty chunks whose character length is at most `filter_len`."""
    kept = []
    for chunk in chunks:
        if 0 < len(chunk) <= filter_len:
            kept.append(chunk)
    return kept

def prepare_seq2seq_data(path: str, filter_len: int) -> List[str]:
    """Load a seq2seq vulnbank dump and return its cleaned, unique chunks.

    Steps: split the file into chunks, drop newline-only chunks, remove lines
    containing "15 Mar 2018" as well as blank lines, keep non-empty chunks of
    at most `filter_len` characters, then de-duplicate. The chunk count is
    printed after each of the last three steps.
    """
    raw_text = Path(path).read_text()
    raw_chunks = remove_empty(split_vulnbank(raw_text))

    cleaned = remove_lines_with_vals(
        raw_chunks,
        contain_vals=["15 Mar 2018"],
        exactly_vals=[""],
    )
    print(f"Len before filter: {len(cleaned)} in {path}")

    short_enough = filter_chunks_by_len(cleaned, filter_len=filter_len)
    print(f"Len after  filter: {len(short_enough)} in {path}")

    unique = remove_duplicates(short_enough)
    print(f"Len after  removing_duplicates: {len(unique)} in {path}")
    return unique

# Benign requests are capped at MAX_TEXT_LEN characters.
seq2seq_benign_chunks = prepare_seq2seq_data("./../../data/raw/seq2seq/vulnbank_train.txt", filter_len=MAX_TEXT_LEN)
# Anomalous requests get a far looser cap (10**5 characters) at this stage.
seq2seq_anomaly_chunks = prepare_seq2seq_data("./../../data/raw/seq2seq/vulnbank_anomaly.txt", filter_len=10**5)

Len before filter: 21991 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len after  filter: 10805 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len after  removing_duplicates: 10292 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len before filter: 1098 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt
Len after  filter: 1097 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt
Len after  removing_duplicates: 1078 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt


In [88]:
def get_min_row_len(chunks: List[str]):
    """Print the shortest chunk together with its length and index.

    Output is two lines: "<length> <index>" followed by the chunk itself.
    When several chunks share the minimum length, the first one is reported.
    For an empty list a short notice is printed instead of failing.

    Args:
        chunks: Chunks to inspect.

    Returns:
        None; the result is only printed.
    """
    if not chunks:
        print("No chunks to inspect")
        return

    # min() returns the first index that reaches the minimum length.
    min_idx = min(range(len(chunks)), key=lambda idx: len(chunks[idx]))

    print(len(chunks[min_idx]), min_idx)
    print(chunks[min_idx])

def get_max_row_len(chunks: List[str]):
    """Print the longest chunk together with its length and index.

    Output is two lines: "<length> <index>" followed by the chunk itself.
    When several chunks share the maximum length, the first one is reported.
    For an empty list a short notice is printed instead of failing.

    Args:
        chunks: Chunks to inspect.

    Returns:
        None; the result is only printed.
    """
    if not chunks:
        print("No chunks to inspect")
        return

    # max() returns the first index that reaches the maximum length.
    max_idx = max(range(len(chunks)), key=lambda idx: len(chunks[idx]))

    print(len(chunks[max_idx]), max_idx)
    print(chunks[max_idx])

## Parse csic2010


In [43]:
def split_csic2010(file_content: str) -> List[str]:
    """Split a CSIC 2010 traffic dump into individual HTTP requests.

    A request starts at a GET, POST or PUT token and runs up to (but not
    including) the newlines that precede the next such token; the last request
    runs to the end of the text. The tokens are matched anywhere in the text,
    not only at the start of a line.
    """
    request_pattern = re.compile(
        r"(?:GET|POST|PUT)[\s\S]+?(?=\n*POST|\n*GET|\n*PUT)|(?:GET|POST|PUT)[\s\S]+"
    )
    return request_pattern.findall(file_content)
    
# Benign traffic: training file first, then test file.
csic2010_benign_chunks = (
    split_csic2010(Path("./../../data/raw/csic2010/normalTrafficTraining.txt").read_text())
    + split_csic2010(Path("./../../data/raw/csic2010/normalTrafficTest.txt").read_text())
)

# Anomalous traffic comes from a single file.
csic2010_anomaly_chunks = split_csic2010(
    Path("./../../data/raw/csic2010/anomalousTrafficTest.txt").read_text()
)

len(csic2010_benign_chunks), len(csic2010_anomaly_chunks)

(72000, 25065)

In [124]:
# De-duplicate the csic2010 chunks and report the counts before/after.
# NOTE: both variables are rebound to their de-duplicated versions here.
print(f"Len before removing_duplicates: {len(csic2010_benign_chunks)} for benign")
csic2010_benign_chunks = remove_duplicates(csic2010_benign_chunks)
print(f"Len after  removing_duplicates: {len(csic2010_benign_chunks)} for benign")

print(f"Len before removing_duplicates: {len(csic2010_anomaly_chunks)} for attack")
csic2010_anomaly_chunks = remove_duplicates(csic2010_anomaly_chunks)
print(f"Len after  removing_duplicates: {len(csic2010_anomaly_chunks)} for attack")

Len before removing_duplicates: 72000 for benign
Len after  removing_duplicates: 72000 for benign
Len before removing_duplicates: 25065 for attack
Len after  removing_duplicates: 25065 for attack


In [9]:
# get_max_row_len(csic2010_anomaly_chunks)

In [127]:
# How many csic2010 chunks fit within MAX_TEXT_LEN characters (benign, then anomaly).
# The filtered lists are only counted here, not stored.
print(len(filter_chunks_by_len(csic2010_benign_chunks, MAX_TEXT_LEN)))
print(len(filter_chunks_by_len(csic2010_anomaly_chunks, MAX_TEXT_LEN)))

64000
19004


# Prepare openappsec

## Prepare all data for validation 

In [10]:
# Directories with the openappsec request dumps, read by read_chunks_from_dir.
openappsec_anomaly_dir_path = "./../../data/raw/openappsec/malicious/"
openappsec_benign_dir_path = "./../../data/raw/openappsec/legitimate/"

In [11]:
def parse_json_request(text_json: dict) -> str:
    """Render one JSON request record as raw HTTP/1.1 request text.

    Args:
        text_json: Record with the keys "method", "url" and "headers" (a
            mapping of header name to value), plus an optional "data" body.

    Returns:
        The request line, one "Name: value" line per header and, only when
        the record carries a non-empty body, the body as the last line. Lines
        are joined with newlines and there is no trailing newline.

    Raises:
        KeyError: If "method", "url" or "headers" is missing.
    """
    rows = [
        f"{text_json['method']} {text_json['url']} HTTP/1.1"
    ]

    # append headers
    rows.extend(f"{name}: {value}" for name, value in text_json["headers"].items())

    # append the body only when there is one: a missing, None or empty body
    # would otherwise break the join or leave a dangling blank line
    data = text_json.get("data")
    if data:
        rows.append(data if isinstance(data, str) else str(data))

    return "\n".join(rows)

def read_chunks_from_dir(dir_path):
    """Read every JSON file in a directory and render its requests as text.

    Each file is expected to contain a JSON array of request records, which
    are converted with `parse_json_request`.

    Args:
        dir_path: Directory to read (str or Path); a trailing slash is not
            required.

    Returns:
        A list with one request string per record. Files are processed in
        sorted name order so the result does not depend on the arbitrary
        order in which the OS lists the directory.
    """
    base_dir = Path(dir_path)
    chunks = []

    for file_name in sorted(os.listdir(dir_path)):
        content = (base_dir / file_name).read_text()

        for request_record in json.loads(content):
            chunks.append(parse_json_request(request_record))

    return chunks

# Load all openappsec requests (legitimate and malicious) as raw request text.
openappsec_benign_chunks = read_chunks_from_dir(openappsec_benign_dir_path)
openappsec_anomaly_chunks = read_chunks_from_dir(openappsec_anomaly_dir_path)

In [90]:
# Raw request counts before de-duplication: (benign, anomaly).
len(openappsec_benign_chunks), len(openappsec_anomaly_chunks)

(973964, 73924)

In [91]:
# Exact-duplicate requests are collapsed with the shared helper.
unique_openappsec_benign_chunks = remove_duplicates(openappsec_benign_chunks)
unique_openappsec_anomaly_chunks = remove_duplicates(openappsec_anomaly_chunks)

len(unique_openappsec_benign_chunks), len(unique_openappsec_anomaly_chunks)

(178514, 73924)

In [14]:
# Spot check: count how often one arbitrarily chosen benign request (index 45435)
# occurs in the raw list.
print("around 5 duplicates per row")
openappsec_benign_chunks.count(openappsec_benign_chunks[45435])

around 5 duplicates per row


5

## Remove Cookie, filter len, make unique

In [145]:
def prepare_openappsec_dataset(chunks):
    """Clean openappsec requests and return the unique, length-limited ones.

    Lines containing "Cookie:" and blank lines are removed from every chunk;
    non-empty chunks of at most MAX_TEXT_LEN characters are kept and then
    de-duplicated. The chunk count is printed before and after the length
    filter (not after de-duplication).
    """
    without_cookies = remove_lines_with_vals(
        chunks,
        contain_vals=["Cookie:"],
        exactly_vals=[""],
    )
    print(f"Len before filter: {len(without_cookies)}")

    short_enough = filter_chunks_by_len(without_cookies, filter_len=MAX_TEXT_LEN)
    print(f"Len after  filter: {len(short_enough)}")

    return remove_duplicates(short_enough)
    
# Cookie-free, length-limited, unique openappsec requests for both classes.
filtered_openappsec_benign_chunks = prepare_openappsec_dataset(unique_openappsec_benign_chunks)
filtered_openappsec_anomaly_chunks = prepare_openappsec_dataset(unique_openappsec_anomaly_chunks)


Len before filter: 178514
Len after  filter: 82197
Len before filter: 73924
Len after  filter: 73853


Save the data used to test the final model, so that its results can be compared with
https://www.openappsec.io/post/best-waf-solutions-in-2023-real-world-comparison

In [146]:
# save all data for validation purposes
# Build the validation frame from all filtered openappsec requests.
# Label convention: 1 = benign, 0 = anomaly.
benign_y = [1 for _ in filtered_openappsec_benign_chunks]
anomaly_y = [0 for _ in filtered_openappsec_anomaly_chunks]

data = dict(
    text=filtered_openappsec_benign_chunks + filtered_openappsec_anomaly_chunks,
    benign=benign_y + anomaly_y,
)
df = pd.DataFrame(data)

In [51]:
# Summary of the numeric `benign` label column; its mean is the share of benign rows.
df.describe()

Unnamed: 0,benign
count,172133.0
mean,0.570797
std,0.494964
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [18]:
# Display the validation frame (columns: text, benign).
df

Unnamed: 0,text,benign
0,GET /system/extImages/ZVC7zCtgG1b9a2plZCVaEhdW...,1
1,GET /on/demandware.static/Sites-mandslondon-Si...,1
2,GET /v1/pages/ChRDaHJvbWUvMTA1LjAuNTE5NS41NBIQ...,1
3,GET /maps/vt?pb=!1m5!1m4!1i15!2i16135!3i10624!...,1
4,GET /s3/web4static/_/v2/static/chunks/a68825.2...,1
...,...,...
181823,POST / HTTP/1.1\nUser-Agent: Mozilla/5.0 (Wind...,0
181824,GET /?p=..%5C..%5C..%5C..%5C..%5Cetc%5Cissue%0...,0
181825,POST / HTTP/1.1\nUser-Agent: Mozilla/5.0 (Wind...,0
181826,GET /?p=%3Ctbody%20onbeforepaste%3D%22prompt%2...,0


In [147]:
# df.to_csv("./../../data/processed/openappsec_final_comparison.csv", index=False, escapechar="~")

Select the openappsec data that goes into the final dataset

In [129]:
# random selected to make data balanced
# balanced data for dataset
# Down-sample the benign openappsec requests to 78000 (drawn at random, without
# replacement) so the two classes are of similar size; all anomaly requests are kept.
final_openappsec_benign_chunks = random_select(filtered_openappsec_benign_chunks, 78000)
# Same list object as filtered_openappsec_anomaly_chunks (no copy is made).
final_openappsec_anomaly_chunks = filtered_openappsec_anomaly_chunks

len(final_openappsec_benign_chunks), len(final_openappsec_anomaly_chunks)

(78000, 73853)

# Combine and prepare

In [130]:
# do not include openappsec, use it later
# csic2010 followed by seq2seq, per class; openappsec is merged in later.
all_benign_chunks = csic2010_benign_chunks + seq2seq_benign_chunks
all_anomaly_chunks = csic2010_anomaly_chunks + seq2seq_anomaly_chunks

len(all_benign_chunks), len(all_anomaly_chunks)

(82292, 26143)

In [132]:
def filter_headers(chunks: List[str]) -> List[str]:
    """Strip Cookie headers from every request chunk.

    Each chunk is split into lines and every line that starts with "Cookie:"
    is removed; the remaining lines are re-joined with newlines. One output
    chunk is produced per input chunk, in the same order.
    """
    return [
        "\n".join(
            line for line in chunk.split("\n") if not line.startswith("Cookie:")
        )
        for chunk in chunks
    ]

# Strip Cookie headers, de-duplicate, then keep chunks of at most MAX_TEXT_LEN characters.
print("After removing duplicates. Cookie header had random values")
unique_anomaly_chunks = filter_chunks_by_len(remove_duplicates(filter_headers(all_anomaly_chunks)), MAX_TEXT_LEN)
unique_benign_chunks = filter_chunks_by_len(remove_duplicates(filter_headers(all_benign_chunks)), MAX_TEXT_LEN)

# filter_headers returns exactly one chunk per input chunk, so the "before"
# count equals the input length; there is no need to run it a second time.
print(len(all_anomaly_chunks), len(unique_anomaly_chunks))
print(len(all_benign_chunks), len(unique_benign_chunks))

After removing duplicates. Cookie header had random values
26143 11765
82292 14052


## Append new attack lines to data

In [133]:
def append_url_lines_to_requests(url_lines: List[str], chunks: List[str], get_only: bool = False) -> List[str]:
    """Create new requests by planting URLs into existing request chunks.

    The chunks are shuffled (via `np.random.shuffle`) and URLs are consumed
    from the end of `url_lines`; neither input list is modified. Processing
    stops when the URLs run out or all chunks have been visited, so fewer
    requests than URLs may be returned.

    * GET chunk: the request line becomes "GET <url> HTTP/1.1".
    * POST chunk (skipped when `get_only` is true): the request line becomes
      "POST <url without query> HTTP/1.1" and the last line of the chunk is
      replaced by the query string (empty when the URL has none). A chunk
      that consists of the request line alone keeps that line and gets the
      query string appended as a new line instead.
    * Any other chunk is skipped.

    Args:
        url_lines: URLs to plant.
        chunks: Request chunks used as templates.
        get_only: When true, only GET chunks are used.

    Returns:
        The newly built request chunks.
    """
    new_chunks = []
    remaining_urls = url_lines.copy()
    shuffled_chunks = chunks.copy()
    np.random.shuffle(shuffled_chunks)

    for chunk in shuffled_chunks:
        if not remaining_urls:
            break

        chunk_lines = chunk.split("\n")
        request_line = chunk_lines[0]

        if request_line.startswith("GET "):
            chunk_lines[0] = f"GET {remaining_urls.pop()} HTTP/1.1"
        elif request_line.startswith("POST ") and not get_only:
            # Everything after the first "?" is moved into the body line.
            base_url, _, query = remaining_urls.pop().partition("?")
            chunk_lines[0] = f"POST {base_url} HTTP/1.1"

            if len(chunk_lines) > 1:
                chunk_lines[-1] = query
            elif query:
                # The request line is the only line: writing to index -1
                # would overwrite it, so add the body as a new line.
                chunk_lines.append(query)
        else:
            continue

        new_chunks.append("\n".join(chunk_lines))

    return new_chunks
    
# Extra benign requests: benign URLs planted into benign GET/POST chunks.
additional_benign_chunks = append_url_lines_to_requests(benign_url_lines, unique_benign_chunks)
# Extra anomalous requests: anomalous URLs planted into benign GET chunks only.
# NOTE(review): both names are assigned again in a later cell of this notebook.
additional_anomaly_chunks = append_url_lines_to_requests(anomaly_url_lines, unique_benign_chunks, get_only=True)

len(additional_benign_chunks), len(additional_anomaly_chunks)

(6790, 7845)

In [106]:
def append_injection_to_requests(injection_code: List[str], chunks: List[str]) -> List[str]:
    """Create attack requests by injecting payloads into GET query parameters.

    The chunks are shuffled (via `np.random.shuffle`) and payloads are
    consumed from the end of `injection_code`; neither input list is
    modified. For every GET chunk whose request line has at least one query
    parameter, one parameter is chosen at random and its value is replaced
    by the next payload. All other parameters and the rest of the request
    are left untouched. Chunks that are not GET requests, or have no
    parameters, are skipped without consuming a payload.

    Args:
        injection_code: Payload strings to inject.
        chunks: Request chunks used as templates.

    Returns:
        The newly built request chunks; processing stops when the payloads
        run out or all chunks have been visited.
    """
    new_chunks = []
    # Raw string: the pattern contains backslash escapes meant for `re`.
    search_url_params_pattern = r"([&|?]\w+=[0-9a-zA-Z\+\%.*]+)"
    remaining_payloads = injection_code.copy()
    shuffled_chunks = chunks.copy()
    np.random.shuffle(shuffled_chunks)

    for chunk in shuffled_chunks:
        if not remaining_payloads:
            break

        chunk_lines = chunk.split("\n")
        first_line = chunk_lines[0]

        if not first_line.startswith("GET "):
            continue

        # Match objects (not strings) so the exact position of the chosen
        # parameter is known even when the same text occurs more than once.
        params = list(re.finditer(search_url_params_pattern, first_line))
        if not params:
            continue

        target = random.choice(params)
        # The name part of a match cannot contain "=", so the first "=" at or
        # after the match start is the one separating name and value.
        value_start = first_line.index("=", target.start()) + 1

        chunk_lines[0] = (
            first_line[:value_start]
            + remaining_payloads.pop()
            + first_line[target.end():]
        )
        new_chunks.append("\n".join(chunk_lines))

    return new_chunks

# Preview the first two generated attack requests (the result is not stored).
append_injection_to_requests(injection_lines, unique_benign_chunks)[:2]

['GET http://localhost:8080/tienda1/publico/anadir.jsp?id=<input onfocus=javascript:alert(1) autofocus> HTTP/1.1\nUser-Agent: Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)\nPragma: no-cache\nCache-control: no-cache\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\nAccept-Encoding: x-gzip, x-deflate, gzip, deflate\nAccept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\nAccept-Language: en\nHost: localhost:8080\nConnection: close',
 'GET http://localhost:8080/tienda1/publico/registro.jsp?modo=<SCRIPT>document.cookie=true;</SCRIPT>&ntc=1373405933057290&B1=Registrar HTTP/1.1\nUser-Agent: Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)\nPragma: no-cache\nCache-control: no-cache\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\nAccept-Encoding: x-gzip, x-deflate, gzip, deflate\nAccept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\nAccept-Langu

In [25]:
# Last payload in the list, i.e. the first one append_injection_to_requests pops.
injection_lines[-1]

'<input onfocus=javascript:alert(1) autofocus>'

In [134]:
# Final synthetic anomalies: payload injections into benign GET parameters,
# followed by anomalous URLs planted into benign GET chunks.
# NOTE: this rebinds both `additional_*` names assigned in an earlier cell;
# the chunks are reshuffled, so the generated requests differ from that run.
additional_anomaly_chunks = [
    *append_injection_to_requests(injection_lines, unique_benign_chunks),
    *append_url_lines_to_requests(anomaly_url_lines, unique_benign_chunks, get_only=True)
]
additional_benign_chunks = append_url_lines_to_requests(benign_url_lines, unique_benign_chunks)

# Generated counts first, then the sizes of the source lists they were built from.
print(len(additional_anomaly_chunks), len(additional_benign_chunks))
print(len(injection_lines), len(anomaly_url_lines), len(benign_url_lines))

10650 6790
2805 10000 6790


# Save data

In [135]:
# Final per-class data: synthetic requests, then csic2010/seq2seq, then openappsec.
final_anomaly_chunks = [
    *additional_anomaly_chunks,
    *unique_anomaly_chunks,
    *final_openappsec_anomaly_chunks,
]
final_benign_chunks = [
    *additional_benign_chunks,
    *unique_benign_chunks,
    *final_openappsec_benign_chunks,
]

# Total counts, then unique counts; the two lines match when there are no duplicates.
print(len(final_benign_chunks), len(final_anomaly_chunks))
print(len(remove_duplicates(final_benign_chunks)), len(remove_duplicates(final_anomaly_chunks)))

# Check the "no duplicates" claim instead of printing it unconditionally.
if (
    len(set(final_benign_chunks)) == len(final_benign_chunks)
    and len(set(final_anomaly_chunks)) == len(final_anomaly_chunks)
):
    print("No duplicates, data is balanced")
else:
    print("WARNING: duplicates found in the final chunks")

98842 96268
98842 96268
No duplicates, data is balanced


In [136]:
def check(chunks):
    """Print how many chunks fit within MAX_TEXT_LEN characters.

    A second line prints the limit divided by 2 and by 3, labelled "Tokens".
    """
    limit = MAX_TEXT_LEN
    within_limit = filter_chunks_by_len(chunks, limit)
    print(len(within_limit))
    print("Tokens", limit / 2, limit / 3)
    
# Compare the printed counts with the list lengths to see how many chunks exceed MAX_TEXT_LEN.
check(final_benign_chunks)
check(final_anomaly_chunks)

98842
Tokens 350.0 233.33333333333334
96060
Tokens 350.0 233.33333333333334


Shuffle and create .csv

In [137]:
# Shuffle both lists in place.
# NOTE(review): no NumPy seed is set anywhere in the visible notebook, so the
# resulting order differs between runs — confirm whether that is intended.
np.random.shuffle(final_benign_chunks)
np.random.shuffle(final_anomaly_chunks)

In [138]:
# Build the final labelled frame; this rebinds `benign_y`, `anomaly_y`, `data`
# and `df` from the openappsec validation cell.
# Label convention: 1 = benign, 0 = anomaly.
benign_y = [1 for _ in final_benign_chunks]
anomaly_y = [0 for _ in final_anomaly_chunks]

data = dict(
    text=final_benign_chunks + final_anomaly_chunks,
    benign=benign_y + anomaly_y,
)
df = pd.DataFrame(data)

In [140]:
# Summary of the `benign` label column; its mean is the share of benign rows.
df.describe()

Unnamed: 0,benign
count,195110.0
mean,0.506596
std,0.499958
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [143]:
# Step 1: 60% of the rows become the training set, 40% are held out.
df_train, df_test = train_test_split(df, test_size=0.4, shuffle=True, random_state=228)
# Step 2: split the held-out 40% again. train_test_split returns the "train"
# part first, so validation receives 35% of the held-out rows and test keeps
# 65% of them (about 14% and 26% of all rows respectively).
df_val, df_test = train_test_split(df_test, test_size=0.65, random_state=228)

len(df_train), len(df_test), len(df_val)

(117066, 50729, 27315)