# Starting the Feature Creation (dataset level)

### Now, let's start creating some features!
We will take inspiration from the OWASP Core Rule Set (coreruleset) repository for detecting XSS and SQL injection (SQLi) attacks.
It provides plug-and-play configurations for Apache that reject requests matching certain regular expressions.
* [Corerules XSS configurations](https://github.com/coreruleset/coreruleset/blob/main/rules/REQUEST-941-APPLICATION-ATTACK-XSS.conf)
* [Corerules SQLi configurations](https://github.com/coreruleset/coreruleset/blob/main/rules/REQUEST-942-APPLICATION-ATTACK-SQLI.conf)


Using those rules, we will create features for the dataset and update them as needed.

In [5]:
# Basic imports
import re
from urllib.parse import parse_qsl, unquote_plus
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset files stored there can be read with normal file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the condensed XSS/SQLi dataset from the mounted Drive folder.
df_xss_sqli = pd.read_csv('/content/drive/MyDrive/xss_sqli_detector/datasets/xss_sqli_condensed.csv')

In [6]:
# Print column dtypes and non-null counts to spot missing values.
df_xss_sqli.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273705 entries, 0 to 273704
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       273704 non-null  object
 1   label         273705 non-null  int64 
 2   attack_label  273705 non-null  object
 3   type          273705 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.4+ MB


In [7]:
## Creating new features for XSS
# `payload` has one missing value (273704 non-null out of 273705 rows), so
# `na=False` is passed to keep both feature columns strictly boolean instead
# of letting NaN propagate into them.

# has_url: absolute http(s) URLs and protocol-relative ("//host/...") URLs.
# The word boundary is attached to the optional scheme only; placing it in
# front of the whole expression prevented matches such as `"//host"` or a
# payload that starts with `//`, because there is no word boundary before "/".
df_xss_sqli['has_url'] = df_xss_sqli['payload'].str.contains(
    r'(?i)(?:\bhttps?:)?//[^\s"\'<>()]+', na=False)

# has_special_chars: payload starts with one of the listed quote/tag-breaking
# sequences. A non-capturing group is used because pandas warns when a
# pattern handed to `str.contains` has capture groups.
# NOTE(review): the long alternative `<>"'`(){}[];\` is matched as one literal
# sequence, not as a character class - confirm that this is intended.
df_xss_sqli['has_special_chars'] = df_xss_sqli['payload'].str.contains(
    r'^(?:"|">|<|/><|<>\"\'`\(\){}\[\];\\|:)', na=False)

In [None]:
# Matches http(s)://... as well as protocol-relative //... URLs.
# The word boundary belongs to the optional scheme only: with `\b` in front of
# the whole expression a protocol-relative URL was matched only when preceded
# by a word character, so `src="//host/x.js"` or a leading `//host` was missed.
URL_RE = re.compile(r'(?i)(?:\bhttps?:)?//[^\s"\'<>()]+')

# na=False maps missing payloads to False so the result is a pure boolean mask.
has_url = df_xss_sqli['payload'].str.contains(URL_RE, na=False)


In [None]:
# Strips an optional "scheme:" plus the "//authority" prefix of a URL.
# The authority ends at the first "/", "?" or "#" (RFC 3986, section 3.2);
# stopping only at "/" swallowed the query of URLs such as "http://host?x=1".
PATTERN_REMOVE_DOMAIN = re.compile(r'^(?:[a-z][a-z0-9+.\-]*:)?//[^/?#]*', re.I)


def path_and_query_regex(u: str) -> str:
    """Return the path-and-query part of *u*, always starting with "/".

    The scheme and authority (``scheme://host`` or ``//host``) are removed
    after trimming surrounding whitespace. Input without such a prefix is
    kept as is and only gets a leading "/" when it lacks one.

    Args:
        u: URL or raw payload. Non-string or empty values are tolerated.

    Returns:
        "" for non-string or empty input, "/" when nothing is left after the
        authority, otherwise the remainder prefixed with "/" if necessary.
    """
    if not isinstance(u, str) or not u:
        return ""
    s = PATTERN_REMOVE_DOMAIN.sub("", u.strip())
    if not s:  # e.g. "http://domain" with no path
        return "/"
    # Guarantee that the result starts with "/".
    return s if s.startswith("/") else "/" + s


In [None]:
def only_query(url: str) -> str:
    """Return the text after the first "?" in *url*, or "" if there is none.

    The scheme/authority prefix is removed with ``PATTERN_REMOVE_DOMAIN``
    (after trimming surrounding whitespace) before looking for "?".
    Non-string and empty inputs yield "".
    """
    if not (isinstance(url, str) and url):
        return ""
    remainder = PATTERN_REMOVE_DOMAIN.sub("", url.strip())
    # partition() returns "" as the tail when "?" is absent.
    _, _, query = remainder.partition("?")
    return query


# Compiled XSS indicator patterns, keyed by feature name.
# All patterns are raw strings, so regex escapes take a single backslash:
# `\s` is whitespace and `\b` a word boundary. A doubled backslash (`\\s`)
# would instead require a literal "\" in the input and never match real
# payloads such as "<script>" or "onerror=".
AVAILABLE_REGEXES = {
    # Value begins with a quote / tag-breaking sequence.
    'XSS_VALUE_SPECIAL_CHARS': re.compile(r'^("|">|<|/><|<>\"\'`\(\){}\[\];\\)', re.IGNORECASE),
    # Complete <script ...> ... </script> element (may span several lines).
    'XSS_SCRIPT_RELATED': re.compile(r'(?is)\<\s*script\b[^>]*>.*?<\s*/\s*script\s*>', re.IGNORECASE),
    # Opening tag of an element able to load or execute content.
    'XSS_OPENING_TAGS': re.compile(r'(?i)<\s*(?:script|iframe|object|embed|svg|math|link|meta|img|video|audio)\b',
                                   re.IGNORECASE),
    # Inline DOM event handler: on<name>= (or URL-encoded "=").
    'XSS_DOM_EVENTS': re.compile(r'(?i)\bon[a-z]+\s*(?:=|%3[dD])', re.IGNORECASE),
    # URL-encoded or entity-encoded "<" / ">".
    'XSS_CODIFIED_TAGS': re.compile(r'(?i)(?:%3[cC]|%3[eE]|#x?0*3[cC];|#x?0*3[eE];|#0*60;|#0*62;|lt;|gt;)',
                                    re.IGNORECASE),
    # URL-bearing attribute whose value uses a script-capable scheme.
    'XSS_SOURCE_ATTRS': re.compile(
        r'(?i)\b(?:src|href|xlink:href|formaction|srcdoc)\s*=\s*["\']?\s*(?:javascript|vbscript|data)\s*:',
        re.IGNORECASE),
    # <img> carrying an onerror/onload handler.
    'XSS_IMG_RELATED': re.compile(r'(?i)<\s*img\b[^>]*\bon(?:error|load)\s*(?:=|%3[dD])', re.IGNORECASE),
    # style="..." containing expression( or url(javascript:/data:).
    'XSS_INLINE_CSS': re.compile(
        r'(?i)\bstyle\s*=\s*["\'][^"\']*(?:expression\s*\(|url\s*\(\s*[\'"]?\s*(?:javascript|data)\s*:)[^"\']*["\']',
        re.IGNORECASE),
    # CSS @import or an import( call.
    'XSS_IMPORT_CSS': re.compile(r'(?i)(?:@import\b|import\s*\()', re.IGNORECASE),
    # Opening <textarea> / <template> tags.
    'XSS_POTENTIAL_INJECTION_TAGS': re.compile(r'(?i)<\s*(?:textarea|template)\b', re.IGNORECASE)
}


# 3) From a URL -> dict of {param: decoded_value}, only for values that "smell" like XSS
def suspicious_params_from_url(url: str) -> dict[str, str]:
    """Return the query parameters of *url* that trip at least one XSS pattern.

    The query string is extracted with ``only_query`` and split with
    ``parse_qsl`` (blank values kept). Each parameter name and decoded value
    is then searched with every pattern in ``AVAILABLE_REGEXES``.

    Args:
        url: URL or raw payload to inspect.

    Returns:
        ``{param: decoded_value}`` for every parameter whose name or value
        matches one or more patterns; an empty dict when there is no query
        string or nothing matches. If a parameter name repeats, the last
        occurrence wins.

    Side effects:
        Prints the per-parameter hit counts and all parsed parameters.
    """
    qs = only_query(url)
    if not qs:
        return {}

    # parse_qsl already percent-decodes each value; unquote_plus applies a
    # second decoding pass on top of that.
    # NOTE(review): presumably meant to unwrap double-encoded payloads - confirm.
    parsed_params = {
        k: unquote_plus(v) for k, v in parse_qsl(qs, keep_blank_values=True)
    }

    key_hits = {}
    suspected_params = {}
    for k, v in parsed_params.items():
        # search(), not match(): most patterns describe a fragment that may
        # appear anywhere in the text, and the one start-anchored pattern
        # carries its own "^".
        hits = sum(
            1 for rgx in AVAILABLE_REGEXES.values()
            if rgx.search(v) or rgx.search(k)
        )
        if hits:
            key_hits[f"{k}_hits"] = hits
            suspected_params[k] = v
    print("KeyHitsAndParams: ", key_hits, parsed_params)
    return suspected_params


# Smoke test: run the detector over the first 10 payloads typed "Malicious".
# NOTE(review): `df_fmereani` and its "unsanitized_payload" column are not
# defined in the visible part of this notebook - confirm they are loaded
# before this cell runs.
malicious_urls = df_fmereani.loc[
    df_fmereani["type"] == "Malicious", "unsanitized_payload"
].head(10)
for val in malicious_urls:
    suspicious_params_from_url(val)