# Initial imports

In [252]:
import re
import random
import numpy as np
import pandas as pd

from pathlib import Path
from typing import List, Dict, Literal, Optional

# Prepare additional data

In [3]:
def remove_duplicates(lines: List[str]) -> List[str]:
    """Return the distinct entries of ``lines`` in first-occurrence order.

    The previous ``list(set(lines))`` produced the entries in an arbitrary
    order that can differ between interpreter runs (string hashing is
    randomised per process), so any sampling or shuffling done on the result
    could not be reproduced. ``dict.fromkeys`` removes duplicates while
    keeping the order in which entries first appear.

    Args:
        lines: Strings that may contain repeated entries.

    Returns:
        A new list holding each distinct entry exactly once.
    """
    return list(dict.fromkeys(lines))

def read_unique_line_by_line(path: Path) -> List[str]:
    """Read a UTF-8 text file and return its distinct, non-empty lines.

    Args:
        path: File to read; it is decoded as UTF-8 and split on "\\n".

    Returns:
        The de-duplicated lines as produced by ``remove_duplicates``.
    """
    lines = path.read_text(encoding='utf-8').split("\n")
    # Splitting a file that ends with a newline (or contains blank lines)
    # yields "" entries; an empty URL / payload is never a useful sample.
    return remove_duplicates([line for line in lines if line])

def ignore_lines_with_spaces(lines: List[str]) -> List[str]:
    """Keep only the entries that contain no space character (" ")."""
    kept = []
    for entry in lines:
        if " " in entry:
            continue
        kept.append(entry)
    return kept

def random_select(lines: List[str], count: int) -> List[str]:
    """Pick ``count`` random entries from ``lines`` without replacement.

    Args:
        lines: Candidate entries.
        count: Number of entries wanted. A value <= 0 means "take everything".

    Returns:
        ``lines`` itself (same object) when ``count`` <= 0; otherwise a new
        list with ``min(count, len(lines))`` randomly chosen entries.
    """
    if count <= 0:
        return lines
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the population size.
    return random.sample(lines, min(count, len(lines)))

def replace_chars(lines: List[str], replacements: dict) -> List[str]:
    """Apply every substring replacement to every line.

    Replacements are applied to each line in the iteration order of
    ``replacements``; each replacement value is converted with ``str``.

    Unlike the previous version, this does not print the index of every
    processed line (leftover debug output) and does not modify ``lines`` in
    place; use the returned list.

    Args:
        lines: Strings to transform.
        replacements: Mapping of substring -> replacement value.

    Returns:
        A new list with the transformed lines, in the original order.
    """
    replaced = []
    for line in lines:
        for old, new in replacements.items():
            line = line.replace(old, str(new))
        replaced.append(line)
    return replaced

# TODO: read about annotations
# TODO: read about * in Python

In [179]:
# URLs
benign_lines_dir = Path("./../../data/raw/benign_lines")
attack_lines_dir = Path("./../../data/raw/attack_lines")

good_urls = read_unique_line_by_line(benign_lines_dir / "good_urls.txt")
good_queries = ignore_lines_with_spaces(
    read_unique_line_by_line(benign_lines_dir / "good_queries.txt")
)
# The original query file has over a million rows; a 6000-row sample is enough.
sampled_good_queries = random_select(good_queries, count=6000)
good_urls_with_sql = read_unique_line_by_line(benign_lines_dir / "good_urls_with_sql.txt")

benign_url_lines = good_urls + sampled_good_queries + good_urls_with_sql
anomaly_url_lines = list(
    random_select(
        read_unique_line_by_line(attack_lines_dir / "bad_queries.txt"),
        count=10000,
    )
)

# Injections
injection_lines = []
for payload_file in ("command_injection.txt", "sql_injection.txt", "xss.txt"):
    injection_lines.extend(read_unique_line_by_line(attack_lines_dir / payload_file))

In [181]:
len(benign_url_lines), len(anomaly_url_lines), len(injection_lines)
# 17k anomaly, 
# +10k <- anomaly_url_lines from benign origin in GETs
# +3k <- injection_lines from benign origin in GETs
# ---- 
# 30k total

# normal:
# 22k 
# +8k <- benign_url_lines in GETs or POSTs
# ----
# 30k total

(7777, 10000, 2805)

In [98]:
'http://faketinder.com/profile?p_id=user123&comment=Swipe_right!+You_seem_interesting.'[len('http://faketinder.com/profile?p_id=user123&comment=Swipe_right!+You_seem_interesting.'.split("?")[0]):]

'?p_id=user123&comment=Swipe_right!+You_seem_interesting.'

In [245]:
injection_lines[:4]

['\\n/usr/bin/id|',
 '%0Acat%20/etc/passwd',
 '& uname -a',
 '<?php system("echo XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [7]:
# The auxiliary URL and injection lines are ready; next we parse the HTTP request datasets so these lines can be inserted into the requests

# Prepare HTTP requests
## Parse seq2seq vulnbank

In [8]:
def split_vulnbank(file_content: str) -> List[str]:
    """Cut the file content into one piece per "ST@RT" marker.

    The text is split on every "ST@RT"; from each resulting piece only the
    part before the first "END" substring is kept. The piece preceding the
    first "ST@RT" marker is part of the result as well.
    """
    records = []
    for piece in file_content.split("ST@RT"):
        head, _, _ = piece.partition("END")
        records.append(head)
    return records

def remove_empty(chunks: List[str]) -> List[str]:
    """Drop every chunk that is exactly a single newline character."""
    return list(filter(lambda chunk: chunk != "\n", chunks))

def remove_lines_with_vals(chunks: List[str], contain_vals: List[str], exactly_vals: List[str]) -> List[str]:
    """Remove unwanted lines from every chunk.

    Each chunk is split on "\\n". A line is discarded when it contains any of
    ``contain_vals`` as a substring or is equal to any of ``exactly_vals``.
    The surviving lines are joined with "\\n" again, one result per chunk.
    """
    def _keep(row: str) -> bool:
        # Substring blacklist first, then the exact-match blacklist.
        if any(fragment in row for fragment in contain_vals):
            return False
        return row not in exactly_vals

    return [
        "\n".join(row for row in chunk.split("\n") if _keep(row))
        for chunk in chunks
    ]

# def save_txt(chunks: List[str]):
#     with open('./../../data/interim/vulnbank_train.txt', 'w') as filehandle:
#         for chunk in chunks:
#             filehandle.writelines(chunk + "\n\n\n")

def filter_chunks_by_len(chunks: List[str], filter_len: int) -> List[str]:
    """Keep the non-empty chunks whose length does not exceed ``filter_len``."""
    selected = []
    for chunk in chunks:
        size = len(chunk)
        if size == 0 or size > filter_len:
            continue
        selected.append(chunk)
    return selected

def prepare_seq2seq_data(path: str, filter_len: int) -> List[str]:
    """Load a seq2seq vulnbank dump and return its cleaned, unique chunks.

    Steps: read the file, split it into chunks, drop newline-only chunks,
    remove lines containing "15 Mar 2018" and empty lines, discard chunks that
    are empty or longer than ``filter_len``, then de-duplicate. The chunk
    count is printed after each of the last three stages.
    """
    raw_text = Path(path).read_text()
    records = remove_empty(split_vulnbank(raw_text))
    chunks = remove_lines_with_vals(
        records,
        contain_vals=["15 Mar 2018"],
        exactly_vals=[""],
    )
    print(f"Len before filter: {len(chunks)} in {path}")

    chunks = filter_chunks_by_len(chunks, filter_len=filter_len)
    print(f"Len after  filter: {len(chunks)} in {path}")

    chunks = remove_duplicates(chunks)
    print(f"Len after  removing_duplicates: {len(chunks)} in {path}")
    return chunks

# Benign (training) traffic: chunks longer than 900 characters are skipped.
seq2seq_benign_chunks = prepare_seq2seq_data("./../../data/raw/seq2seq/vulnbank_train.txt", filter_len=900)
# Anomalous traffic: a far larger length limit (10**5 characters) is used.
seq2seq_anomaly_chunks = prepare_seq2seq_data("./../../data/raw/seq2seq/vulnbank_anomaly.txt", filter_len=10**5)

Len before filter: 21991 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len after  filter: 19246 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len after  removing_duplicates: 18125 in ./../../data/raw/seq2seq/vulnbank_train.txt
Len before filter: 1098 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt
Len after  filter: 1097 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt
Len after  removing_duplicates: 1078 in ./../../data/raw/seq2seq/vulnbank_anomaly.txt


In [9]:
def get_min_row_len(chunks: List[str]):
    """Print the length and index of the shortest chunk, then the chunk itself.

    When several chunks share the minimum length, the first one is reported.
    An empty ``chunks`` list prints a short notice instead of printing a
    float sentinel and then failing with ``IndexError``.

    Args:
        chunks: Chunks to inspect.
    """
    if not chunks:
        print("No chunks to inspect")
        return

    # min() returns the first index among equally short chunks.
    min_idx = min(range(len(chunks)), key=lambda idx: len(chunks[idx]))

    print(len(chunks[min_idx]), min_idx)
    print(chunks[min_idx])

def get_max_row_len(chunks: List[str]):
    """Print the length and index of the longest chunk, then the chunk itself.

    When several chunks share the maximum length, the first one is reported.
    An empty ``chunks`` list prints a short notice instead of failing with
    ``IndexError``.

    Args:
        chunks: Chunks to inspect.
    """
    if not chunks:
        print("No chunks to inspect")
        return

    # max() returns the first index among equally long chunks.
    max_idx = max(range(len(chunks)), key=lambda idx: len(chunks[idx]))

    print(len(chunks[max_idx]), max_idx)
    print(chunks[max_idx])
    
# if len > 900 we can skip it

# get_min_row_len(split_vulnbank(Path("./../../data/raw/seq2seq/vulnbank_anomaly.txt").read_text())[1:])
get_max_row_len(split_vulnbank(Path("./../../data/raw/seq2seq/vulnbank_train.txt").read_text())[1:])

2109 16053

Thu, 15 Mar 2018 14:45:52 INFO
POST /vulnbank/online/api.php HTTP/1.1
Host: 10.0.212.25
Connection: close
Content-Length: 122
Accept: application/json, text/javascript, */*; q=0.01
Origin: http://10.0.212.25
X-Requested-With: XMLHttpRequest
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Referer: http://10.0.212.25/vulnbank/online/transactions.php
Accept-Encoding: gzip, deflate
Accept-Language: en-US,en;q=0.9
Cookie: PHPSESSID=c4ca4238a0b923820dcc509a6f

type=user&action=check&firstname=Theodore&lastname=Sales&creditcard=5325-0124-0353-9606&recipient=DE44444635142635466464
POST /vulnbank/online/api.php HTTP/1.1
Host: 10.0.212.25
Connection: close
Content-Length: 122
Accept: application/json, text/javascript, */*; q=0.01
Origin: http://10.0.212.25
X-Requested-With: XMLHttpRequest
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64)

## Parse csic2010


In [10]:
def split_csic2010(file_content: str) -> List[str]:
    """Split a CSIC 2010 traffic dump into individual request strings.

    Every match starts at a "GET", "POST" or "PUT" token and extends up to,
    but not including, the next such token together with any newlines that
    directly precede it; the last match runs to the end of the text. The
    tokens are not anchored to line starts, and text before the first token
    is not returned.
    """
    request_pattern = re.compile(
        r"(?:GET|POST|PUT)[\s\S]+?(?=\n*POST|\n*GET|\n*PUT)|(?:GET|POST|PUT)[\s\S]+"
    )
    return request_pattern.findall(file_content)
    
# Benign CSIC 2010 requests: the training and test captures of normal traffic
# are split into requests and concatenated into one list.
# NOTE(review): read_text() is called without an explicit encoding, so the
# platform default applies — confirm that matches the files.
csic2010_benign_chunks = [
    *split_csic2010(Path("./../../data/raw/csic2010/normalTrafficTraining.txt").read_text()),
    *split_csic2010(Path("./../../data/raw/csic2010/normalTrafficTest.txt").read_text()),
]

# Anomalous CSIC 2010 requests come from a single capture file.
csic2010_anomaly_chunks = split_csic2010(Path("./../../data/raw/csic2010/anomalousTrafficTest.txt").read_text())

# Number of requests per class (benign, anomalous).
len(csic2010_benign_chunks), len(csic2010_anomaly_chunks)

(72000, 25065)

In [11]:
# De-duplicate each CSIC 2010 class, printing the chunk count before and
# after so the effect of the de-duplication is visible.
print(f"Len before removing_duplicates: {len(csic2010_benign_chunks)} for benign")
csic2010_benign_chunks = remove_duplicates(csic2010_benign_chunks)
print(f"Len after  removing_duplicates: {len(csic2010_benign_chunks)} for benign")

print(f"Len before removing_duplicates: {len(csic2010_anomaly_chunks)} for attack")
csic2010_anomaly_chunks = remove_duplicates(csic2010_anomaly_chunks)
print(f"Len after  removing_duplicates: {len(csic2010_anomaly_chunks)} for attack")

Len before removing_duplicates: 72000 for benign
Len after  removing_duplicates: 72000 for benign
Len before removing_duplicates: 25065 for attack
Len after  removing_duplicates: 25065 for attack


In [12]:
# get_max_row_len(csic2010_anomaly_chunks)

## Combine and prepare

In [13]:
# Merge the CSIC 2010 and seq2seq chunks per class (CSIC 2010 chunks first).
all_benign_chunks = csic2010_benign_chunks + seq2seq_benign_chunks
all_anomaly_chunks = csic2010_anomaly_chunks + seq2seq_anomaly_chunks

len(all_benign_chunks), len(all_anomaly_chunks)

(90125, 26143)

In [182]:
def filter_headers(chunks: List[str]) -> List[str]:
    """Remove every line that starts with "Cookie:" from each chunk.

    Chunks are split on "\\n" and re-joined with "\\n"; one output chunk is
    produced per input chunk, in the same order.
    """
    def _without_cookie(chunk: str) -> str:
        return "\n".join(
            row for row in chunk.split("\n") if not row.startswith("Cookie:")
        )

    return [_without_cookie(chunk) for chunk in chunks]

print("After removing duplicates. Cookie header had random values")
# Strip the Cookie header once per class and reuse the result; previously
# filter_headers ran a second time on each full list only to print a length.
filtered_anomaly_chunks = filter_headers(all_anomaly_chunks)
filtered_benign_chunks = filter_headers(all_benign_chunks)

unique_anomaly_chunks = remove_duplicates(filtered_anomaly_chunks)
unique_benign_chunks = remove_duplicates(filtered_benign_chunks)

# Chunk counts before and after de-duplication, per class.
print(len(filtered_anomaly_chunks), len(unique_anomaly_chunks))
print(len(filtered_benign_chunks), len(unique_benign_chunks))

After removing duplicates. Cookie header had random values
26143 17013
90125 21573


## Append new attack lines to data

In [186]:
def append_url_lines_to_requests(url_lines: List[str], chunks: List[str], get_only: bool = False) -> List[str]:
    """Build new requests by putting the given URLs into shuffled request chunks.

    A shuffled copy of ``chunks`` is walked until the URLs run out (URLs are
    taken from the end of the list) or the chunks are exhausted:

    * "GET " chunk: the first line becomes ``GET <url> HTTP/1.1``.
    * "POST " chunk (skipped when ``get_only`` is true): the first line
      becomes ``POST <url before the first "?"> HTTP/1.1`` and the last line
      of the chunk is replaced by the text after that "?" ("" if there is
      no "?").
    * any other chunk is skipped without consuming a URL.

    Only the first and last lines are rewritten; all other lines are kept
    as they are. Neither input list is modified.

    Returns:
        The rewritten chunks, one per consumed URL.
    """
    pending_urls = list(url_lines)
    templates = list(chunks)
    np.random.shuffle(templates)

    rewritten = []
    for template in templates:
        if not pending_urls:
            break

        rows = template.split("\n")
        request_line = rows[0]

        if request_line.startswith("GET "):
            rows[0] = f"GET {pending_urls.pop()} HTTP/1.1"
        elif not get_only and request_line.startswith("POST "):
            url = pending_urls.pop()
            path_part, separator, query = url.partition("?")
            # Clear the last line first: for a one-line chunk the first and
            # last line are the same element, so the assignment order matters.
            rows[-1] = ""
            rows[0] = f"POST {path_part} HTTP/1.1"
            if separator:
                rows[-1] = query
        else:
            continue

        rewritten.append("\n".join(rows))

    return rewritten
    
# Benign URL lines are placed into benign GET or POST request chunks.
additional_benign_chunks = append_url_lines_to_requests(benign_url_lines, unique_benign_chunks)
# Anomalous URL lines are placed into benign GET request chunks only (get_only=True).
additional_anomaly_chunks = append_url_lines_to_requests(anomaly_url_lines, unique_benign_chunks, get_only=True)

# Number of generated chunks per class (benign, anomalous).
len(additional_benign_chunks), len(additional_anomaly_chunks)

(7777, 10000)

In [243]:
def append_injection_to_requests(injection_code: List[str], chunks: List[str]) -> List[str]:
    """Create attack requests by replacing one URL parameter value with a payload.

    A shuffled copy of ``chunks`` is walked until the payloads run out
    (payloads are taken from the end of the list) or the chunks are
    exhausted. For every chunk whose first line starts with "GET " and
    contains at least one ``name=value`` URL parameter, one parameter is
    picked at random and its value is replaced by the next payload. Chunks
    that are not GET requests, or have no matching parameter, are skipped
    without consuming a payload.

    Fixes over the previous version:

    * Only the chosen parameter's value is replaced. Before, the request line
      was cut at its FIRST "=", so choosing a later parameter deleted every
      parameter in between and always injected into the first one.
    * The position of the chosen match is used directly instead of searching
      for its text again, which could hit an earlier identical occurrence.
    * The pattern is a raw string, avoiding invalid-escape warnings.

    Args:
        injection_code: Payload strings; the list itself is not modified.
        chunks: Request chunks used as templates; not modified.

    Returns:
        The rewritten chunks, one per consumed payload.
    """
    # Same regular expression as before, now written as a raw string.
    search_url_params_pattern = r"([&|?]\w+=[0-9a-zA-Z\+\%.*]+)"

    new_chunks = []
    injection_code_copy = injection_code.copy()
    shuffled_chunks = chunks.copy()
    np.random.shuffle(shuffled_chunks)

    for chunk in shuffled_chunks:
        if not injection_code_copy:
            break

        chunk_lines = chunk.split("\n")
        first_line = chunk_lines[0]
        if not first_line.startswith("GET "):
            continue

        params = list(re.finditer(search_url_params_pattern, first_line))
        if not params:
            continue

        target = random.choice(params)
        # The parameter name (\w+) cannot contain "=", so the first "=" at or
        # after the match start is the one separating name and value.
        value_start = first_line.index("=", target.start()) + 1
        chunk_lines[0] = (
            first_line[:value_start]
            + injection_code_copy.pop()
            + first_line[target.end():]
        )
        new_chunks.append("\n".join(chunk_lines))

    return new_chunks

append_injection_to_requests(injection_lines, unique_benign_chunks)[:2]

['GET http://localhost:8080/tienda1/miembros/editar.jsp?modo=<IMG LOWSRC="javascript:alert(\'CrossSiteScripting\')">&nombre=Felisa&apellidos=Lecuona+Gasull&email=priestley%40stockdefotos.vu&dni=02143129N&direccion=C%2F+Garcia+Mosquera%2C+184%2C+&ciudad=Castellanos+de+Moriscos&cp=37254&provincia=Navarra&ntc=6206191254660151&B1=Registrar HTTP/1.1\nUser-Agent: Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)\nPragma: no-cache\nCache-control: no-cache\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\nAccept-Encoding: x-gzip, x-deflate, gzip, deflate\nAccept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\nAccept-Language: en\nHost: localhost:8080\nConnection: close',
 'GET http://localhost:8080/tienda1/publico/anadir.jsp?id=exp/*<A STYLE=\'no\\xss:noxss("*//*");xss:&#101;x&#x2F;*XSS*//*/*/pression(alert("XSS"))\'>&precio=100&cantidad=82&B1=A%F1adir+al+carrito HTTP/1.1\nUser-Agent: Mozilla/5.0 (compatible; Konqueror/3.5

In [244]:
injection_lines[-1]

'<IMG LOWSRC="javascript:alert(\'CrossSiteScripting\')">'

In [247]:
# Anomalous additions, both built on benign request chunks:
#   1) injection payloads placed into URL parameters of GET requests,
#   2) anomalous URL lines placed into GET requests (get_only=True).
additional_anomaly_chunks = [
    *append_injection_to_requests(injection_lines, unique_benign_chunks),
    *append_url_lines_to_requests(anomaly_url_lines, unique_benign_chunks, get_only=True)
]
# Benign additions: benign URL lines placed into GET or POST requests.
additional_benign_chunks = append_url_lines_to_requests(benign_url_lines, unique_benign_chunks)

# Generated chunk counts, followed by the sizes of the source line lists.
print(len(additional_anomaly_chunks), len(additional_benign_chunks))
print(len(injection_lines), len(anomaly_url_lines), len(benign_url_lines))

12805 7777
2805 10000 7777


# Save data

In [251]:
# Final per-class lists: the generated chunks first, then the original unique ones.
final_anomaly_chunks = additional_anomaly_chunks + unique_anomaly_chunks
final_benign_chunks = additional_benign_chunks + unique_benign_chunks

# Sizes as built, then sizes after de-duplicating each class separately.
benign_total = len(final_benign_chunks)
anomaly_total = len(final_anomaly_chunks)
print(benign_total, anomaly_total)

benign_unique_total = len(remove_duplicates(final_benign_chunks))
anomaly_unique_total = len(remove_duplicates(final_anomaly_chunks))
print(benign_unique_total, anomaly_unique_total)
print("No duplicates, data is balanced")

29350 29818
29350 29818
No duplicates, data is balanced


## Shuffle and create .csv

In [261]:
[1] * 10

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [None]:
# Shuffle each class list in place. This only changes the order inside a
# class; it does not interleave benign and anomalous chunks.
np.random.shuffle(final_benign_chunks)
np.random.shuffle(final_anomaly_chunks)



In [262]:
# Labels: 1 = benign, 0 = anomalous.
benign_y = [1] * len(final_benign_chunks)
anomaly_y = [0] * len(final_anomaly_chunks)

data = {
    "text": [*final_benign_chunks, *final_anomaly_chunks],
    "benign": [*benign_y, *anomaly_y]
}
# The lists above put every benign row before every anomalous row, so the
# frame would be sorted by class. Shuffle the rows (fixed seed so the order
# is reproducible) and renumber the index.
df = (
    pd.DataFrame(data)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [266]:
# Write the dataset ("text" and "benign" columns) to CSV without the row index.
df.to_csv("./../../data/processed/01-04-2024-full-balanced.csv", index=False)