<a href="https://colab.research.google.com/github/jresendiz27/xss_sqli_detector/blob/main/00_01_CreatingFeatures_XSS_SQLi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Starting the Feature Creation (dataset level)

### Now, let's start creating some features!
We will take inspiration from the OWASP Core Rule Set (coreruleset) repository for detecting XSS and SQL injection (SQLi) attacks.
It provides plug-and-play configurations for Apache that reject requests matching certain regexes.
* [Corerules XSS configurations](https://github.com/coreruleset/coreruleset/blob/main/rules/REQUEST-941-APPLICATION-ATTACK-XSS.conf)
* [Corerules SQLi configurations](https://github.com/coreruleset/coreruleset/blob/main/rules/REQUEST-942-APPLICATION-ATTACK-SQLI.conf)


Using those rules, we will create features for the dataset and update the dataset as needed.

In [1]:
# Basic imports
import re
from urllib.parse import parse_qsl, unquote_plus
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive so files under it can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the condensed XSS/SQLi CSV from the mounted Drive into a DataFrame.
df_xss_sqli = pd.read_csv('/content/drive/MyDrive/xss_sqli_detector/datasets/xss_sqli_condensed.csv')

In [4]:
# Print column names, non-null counts and dtypes as a quick sanity check.
df_xss_sqli.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273705 entries, 0 to 273704
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       273704 non-null  object
 1   label         273705 non-null  int64 
 2   attack_label  273705 non-null  object
 3   type          273705 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.4+ MB


In [None]:
## Creating new features for XSS considering CoreRulesSet Regexes for XSS/SQLi
# Every feature is a boolean column answering "does the payload match this regex?".
# `na=False` is passed everywhere because the dataset holds a null payload
# (273704 non-null out of 273705 rows): without it `str.contains` yields NaN for
# that row and the whole column degrades to object dtype instead of bool.
df_xss_sqli['contains_http'] = df_xss_sqli['payload'].str.contains(r'(?i)\b(?:https?:)?//[^\s"\'<>()]+', na=False)
# NOTE: the trailing lazy `[\s\S]*?` can match the empty string, so this is
# effectively "contains an opening <script ...> tag".
df_xss_sqli['contains_script_tag'] = df_xss_sqli['payload'].str.contains(r'(?i)<script[^>]*>[\s\S]*?', na=False)
df_xss_sqli['contains_external_payloads'] = df_xss_sqli['payload'].str.contains(r'(?i).(?:\b(?:(?:x(?:link:href|html|mlns)|data:text/html|formaction)\b|pattern[\s\x0b]*=)|(?:!ENTITY[\s\x0b]+(?:%[\s\x0b]+)?[^\s\x0b]+[\s\x0b]+(?:SYSTEM|PUBLIC)|@import|;base64)\b)', na=False)
df_xss_sqli['contains_javascript_uri'] = df_xss_sqli['payload'].str.contains(r'(?i)[a-z]+=(?:[^:=]+:.+;)*?[^:=]+:url\(javascript', na=False)
# The pattern below must stay on ONE physical line: a single-quoted raw string
# cannot span lines, and a break inside it also changes the regex.
df_xss_sqli['contains_html_injection'] = df_xss_sqli['payload'].str.contains(r'(?i)<[^0-9<>A-Z_a-z]*(?:[^\s\x0b\"\'<>]*:)?[^0-9<>A-Z_a-z]*[^0-9A-Z_a-z]*?(?:s[^0-9A-Z_a-z]*?(?:c[^0-9A-Z_a-z]*?r[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?p[^0-9A-Z_a-z]*?t|t[^0-9A-Z_a-z]*?y[^0-9A-Z_a-z]*?l[^0-9A-Z_a-z]*?e|v[^0-9A-Z_a-z]*?g|e[^0-9A-Z_a-z]*?t[^0-9>A-Z_a-z])|f[^0-9A-Z_a-z]*?o[^0-9A-Z_a-z]*?r[^0-9A-Z_a-z]*?m|d[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?a[^0-9A-Z_a-z]*?l[^0-9A-Z_a-z]*?o[^0-9A-Z_a-z]*?g|m[^0-9A-Z_a-z]*?(?:a[^0-9A-Z_a-z]*?r[^0-9A-Z_a-z]*?q[^0-9A-Z_a-z]*?u[^0-9A-Z_a-z]*?e[^0-9A-Z_a-z]*?e|e[^0-9A-Z_a-z]*?t[^0-9A-Z_a-z]*?a[^0-9>A-Z_a-z])|(?:l[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?n[^0-9A-Z_a-z]*?k|o[^0-9A-Z_a-z]*?b[^0-9A-Z_a-z]*?j[^0-9A-Z_a-z]*?e[^0-9A-Z_a-z]*?c[^0-9A-Z_a-z]*?t|e[^0-9A-Z_a-z]*?m[^0-9A-Z_a-z]*?b[^0-9A-Z_a-z]*?e[^0-9A-Z_a-z]*?d|a[^0-9A-Z_a-z]*?(?:p[^0-9A-Z_a-z]*?p[^0-9A-Z_a-z]*?l[^0-9A-Z_a-z]*?e[^0-9A-Z_a-z]*?t|u[^0-9A-Z_a-z]*?d[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?o|n[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?m[^0-9A-Z_a-z]*?a[^0-9A-Z_a-z]*?t[^0-9A-Z_a-z]*?e)|p[^0-9A-Z_a-z]*?a[^0-9A-Z_a-z]*?r[^0-9A-Z_a-z]*?a[^0-9A-Z_a-z]*?m|i?[^0-9A-Z_a-z]*?f[^0-9A-Z_a-z]*?r[^0-9A-Z_a-z]*?a[^0-9A-Z_a-z]*?m[^0-9A-Z_a-z]*?e|b[^0-9A-Z_a-z]*?(?:a[^0-9A-Z_a-z]*?s[^0-9A-Z_a-z]*?e|o[^0-9A-Z_a-z]*?d[^0-9A-Z_a-z]*?y|i[^0-9A-Z_a-z]*?n[^0-9A-Z_a-z]*?d[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?n[^0-9A-Z_a-z]*?g[^0-9A-Z_a-z]*?s)|i[^0-9A-Z_a-z]*?m[^0-9A-Z_a-z]*?a?[^0-9A-Z_a-z]*?g[^0-9A-Z_a-z]*?e?|v[^0-9A-Z_a-z]*?i[^0-9A-Z_a-z]*?d[^0-9A-Z_a-z]*?e[^0-9A-Z_a-z]*?o)[^0-9>A-Z_a-z])|(?:<[0-9A-Z_a-z][^\s\x0b/]*[\s\x0b/]|[\"\'](?:[^\s\x0b/]*[\s\x0b/])?)(?:background|formaction|lowsrc|on(?:a(?:bort|ctivate|d(?:apteradded|dtrack)|fter(?:print|(?:scriptexecu|upda)te)|lerting|n(?:imation(?:cancel|end|iteration|start)|tennastatechange)|ppcommand|u(?:dio(?:end|process|start)|xclick))|b(?:e(?:fore(?:(?:(?:(?:de)?activa|scriptexecu)t|toggl)e|c(?:opy|ut)|editfocus|input|p(?:aste|rint)|u(?:nload|pdate))|gin(?:Event)?)|l(?:ocked|ur)|oun(?:ce|dary)|roadcast|busy)|c(?:a(?:(?:ch|llschang)ed|nplay(?:through)?|rdstatechange)|(?:ell|fstate)change|h(?:a(?:rging(?:time)?cha)?nge|ecking)|l(?:ick|ose)|o(?:m(?:mand(?:update)?|p(?:lete|osition(?:end|start|update)))|n(?:nect(?:ed|ing)|t(?:extmenu|rolselect))|py)|u(?:echange|t))|d(?:ata(?:(?:availabl|chang)e|error|setc(?:hanged|omplete))|blclick|e(?:activate|livery(?:error|success)|vice(?:found|light|(?:mo|orienta)tion|proximity))|i(?:aling|s(?:abled|c(?:hargingtimechange|onnect(?:ed|ing))))|o(?:m(?:a(?:ctivate|ttrmodified)|(?:characterdata|subtree)modified|focus(?:in|out)|mousescroll|node(?:inserted(?:intodocument)?|removed(?:fromdocument)?))|wnloading)|r(?:ag(?:drop|e(?:n(?:d|ter)|xit)|(?:gestur|leav)e|over|start)|op)|urationchange)|e(?:mptied|n(?:abled|d(?:ed|Event)?|ter)|rror(?:update)?|xit)|f(?:ailed|i(?:lterchange|nish)|o(?:cus(?:in|out)?|rm(?:change|input))|ullscreenchange)|g(?:amepad(?:axismove|button(?:down|up)|(?:dis)?connected)|et)|h(?:ashchange|e(?:adphoneschange|l[dp])|olding)|i(?:cc(?:cardlockerror|infochange)|n(?:coming|put|valid))|key(?:down|press|up)|l(?:evelchange|o(?:ad(?:e(?:d(?:meta)?data|nd)|start)?|secapture)|y)|m(?:ark|essage|o(?:use(?:down|enter|(?:lea|mo)ve|o(?:ut|ver)|up|wheel)|ve(?:end|start)?|z(?:a(?:fterpaint|udioavailable)|(?:beforeresiz|orientationchang|t(?:apgestur|imechang))e|(?:edgeui(?:c(?:ancel|omplet)|start)e|network(?:down|up)loa)d|fullscreen(?:change|error)|m(?:agnifygesture(?:start|update)?|ouse(?:hittest|pixelscroll))|p(?:ointerlock(?:change|error)|resstapgesture)|rotategesture(?:start|update)?|s(?:crolledareachanged|wipegesture(?:end|start|update)?))))|no(?:match|update)|o(?:(?:bsolet|(?:ff|n)lin)e|pen|verflow(?:changed)?)|p(?:a(?:ge(?:hide|show)|int|(?:st|us)e)|lay(?:ing)?|o(?:inter(?:down|enter|(?:(?:lea|mo)v|rawupdat)e|o(?:ut|ver)|up)|p(?:state|up(?:hid(?:den|ing)|show(?:ing|n))))|ro(?:gress|pertychange))|r(?:atechange|e(?:adystatechange|ceived|movetrack|peat(?:Event)?|quest|s(?:et|ize|u(?:lt|m(?:e|ing)))|trieving)|ow(?:e(?:nter|xit)|s(?:delete|inserted)))|s(?:croll(?:end)?|e(?:arch|ek(?:complete|ed|ing)|lect(?:ionchange|start)?|n(?:ding|t)|t)|how|(?:ound|peech)(?:end|start)|t(?:a(?:lled|rt|t(?:echange|uschanged))|k(?:comma|sessione)nd|op)|u(?:bmit|ccess|spend)|vg(?:abort|error|(?:un)?load|resize|scroll|zoom))|t(?:ext|ime(?:out|update)|o(?:ggle|uch(?:cancel|en(?:d|ter)|(?:lea|mo)ve|start))|ransition(?:cancel|end|run|start))|u(?:n(?:derflow|handledrejection|load)|p(?:dateready|gradeneeded)|s(?:erproximity|sdreceived))|v(?:ersion|o(?:ic|lum)e)change|w(?:a(?:it|rn)ing|ebkit(?:animation(?:end|iteration|start)|(?:playbacktargetavailabilitychange|transitionen)d)|heel)|zoom)|ping|s(?:rc|tyle))[\x08-\n\f\r ]*?=', na=False)
# NOTE(review): this pattern ends in `u\W*?r\\` (a literal backslash after "r");
# it looks like a truncated copy of the upstream rule -- confirm against the
# coreruleset source before relying on the `-moz-binding` branch.
df_xss_sqli['contains_attribute_injection'] = df_xss_sqli['payload'].str.contains(r'(?i)(?:\W|^)(?:javascript:(?:[\s\S]+[=\x5c\(\[\.<]|[\s\S]*?(?:\bname\b|\x5c[ux]\d))|data:(?:(?:[a-z]\w+/\w[\w+-]+\w)?[;,]|[\s\S]*?;[\s\S]*?\b(?:base64|charset=)|[\s\S]*?,[\s\S]*?<[\s\S]*?\w[\s\S]*?>))|@\W*?i\W*?m\W*?p\W*?o\W*?r\W*?t\W*?(?:/\*[\s\S]*?)?(?:[\"\']|\W*?u\W*?r\W*?l[\s\S]*?\()|[^-]*?-\W*?m\W*?o\W*?z\W*?-\W*?b\W*?i\W*?n\W*?d\W*?i\W*?n\W*?g[^:]*?:\W*?u\W*?r\\', na=False)

# Payload *starts with* one of the listed alternatives. The group is
# non-capturing so pandas does not emit its "pattern has match groups" warning.
df_xss_sqli['has_special_chars'] = df_xss_sqli['payload'].str.contains(r'^(?:"|">|<|/><|<>\"\'`\(\){}\[\];\\|:)', na=False)

In [None]:
# Compiled once so the same pattern object can be reused across cells.
URL_RE = re.compile(
    r'(?i)\b(?:https?:)?//[^\s"\'<>()]+'
)  # http(s):// or //...

# Per-row boolean flag: True where the payload matches URL_RE; missing payloads count as False.
has_url = df_xss_sqli['payload'].str.contains(pat=URL_RE, na=False)


In [None]:
# Leading "[scheme:]//authority" prefix, i.e. everything up to the first "/" after "//".
PATTERN_REMOVE_DOMAIN = re.compile(r'^(?:[a-z][a-z0-9+.\-]*:)?//[^/]*', re.I)


def path_and_query_regex(u: str) -> str:
    """Return *u* without its leading ``[scheme:]//host`` part.

    The result always starts with ``/``. Non-string or empty input yields
    ``""``; a URL that consists only of scheme and host yields ``"/"``.
    """
    if not (isinstance(u, str) and u):
        return ""

    remainder = PATTERN_REMOVE_DOMAIN.sub("", u.strip())
    if not remainder:
        # e.g. "http://domain" with no path at all
        return "/"
    if remainder.startswith("/"):
        return remainder
    # make sure the result begins with "/"
    return "/" + remainder


In [None]:
def only_query(url: str) -> str:
    """Return the raw (still encoded) query string of *url*.

    The query is everything after the first ``?`` of the stripped URL; any
    ``#fragment`` is left in place. Non-string or empty input, and URLs
    without ``?``, yield ``""``.

    The query is taken from the URL itself rather than from the
    domain-stripped remainder: stripping ``//host`` up to the next ``/`` also
    swallowed a query that directly follows the host (``http://host?x=1``),
    so those parameters were silently lost.
    """
    if not isinstance(url, str) or not url:
        return ""
    # partition() returns "" for the tail when there is no "?" at all.
    _, _, query = url.strip().partition("?")
    return query


# Named XSS heuristics applied to query-string parameter names and values.
# All patterns are raw strings, so regex escapes are written with a SINGLE
# backslash (`\s`, `\b`); a doubled one (`\\s`) would demand a literal "\" in
# the input and the pattern would never match a real payload.
AVAILABLE_REGEXES = {
    # Value *starts with* one of the listed quote/bracket sequences.
    'XSS_VALUE_SPECIAL_CHARS': re.compile(r'^("|">|<|/><|<>\"\'`\(\){}\[\];\\)', re.IGNORECASE),
    # A complete <script ...> ... </script> element (DOTALL so the body may span lines).
    'XSS_SCRIPT_RELATED': re.compile(r'(?is)\<\s*script\b[^>]*>.*?<\s*/\s*script\s*>', re.IGNORECASE),
    # Opening tag of an element commonly abused for script execution.
    'XSS_OPENING_TAGS': re.compile(r'(?i)<\s*(?:script|iframe|object|embed|svg|math|link|meta|img|video|audio)\b',
                                   re.IGNORECASE),
    # on<event>= handler attribute, with "=" optionally percent-encoded.
    'XSS_DOM_EVENTS': re.compile(r'(?i)\bon[a-z]+\s*(?:=|%3[dD])', re.IGNORECASE),
    # Percent- or entity-encoded "<" / ">".
    'XSS_CODIFIED_TAGS': re.compile(r'(?i)(?:%3[cC]|%3[eE]|#x?0*3[cC];|#x?0*3[eE];|#0*60;|#0*62;|lt;|gt;)',
                                    re.IGNORECASE),
    # URL-bearing attribute whose value uses a javascript:/vbscript:/data: scheme.
    'XSS_SOURCE_ATTRS': re.compile(
        r'(?i)\b(?:src|href|xlink:href|formaction|srcdoc)\s*=\s*["\']?\s*(?:javascript|vbscript|data)\s*:',
        re.IGNORECASE),
    # <img ...> carrying an onerror/onload handler.
    'XSS_IMG_RELATED': re.compile(r'(?i)<\s*img\b[^>]*\bon(?:error|load)\s*(?:=|%3[dD])', re.IGNORECASE),
    # Inline style using expression(...) or url(javascript:/data:).
    'XSS_INLINE_CSS': re.compile(
        r'(?i)\bstyle\s*=\s*["\'][^"\']*(?:expression\s*\(|url\s*\(\s*[\'"]?\s*(?:javascript|data)\s*:)[^"\']*["\']',
        re.IGNORECASE),
    # CSS @import or a dynamic import( call.
    'XSS_IMPORT_CSS': re.compile(r'(?i)(?:@import\b|import\s*\()', re.IGNORECASE),
    # Opening <textarea> / <template> tag.
    'XSS_POTENTIAL_INJECTION_TAGS': re.compile(r'(?i)<\s*(?:textarea|template)\b', re.IGNORECASE)
}


# 3) From a URL -> dict of {param: decoded_value}, kept only if the param "smells" like XSS
def suspicious_params_from_url(url: str, *, verbose: bool = True) -> dict[str, str]:
    """Return the query parameters of *url* that hit at least one XSS regex.

    Args:
        url: URL (or path with query string) to inspect.
        verbose: When true (the default), print the per-parameter hit
            counts and all decoded parameters, as the function always did.

    Returns:
        ``{param_name: decoded_value}`` for every parameter whose name or
        value matches any pattern in ``AVAILABLE_REGEXES``; ``{}`` when the
        URL has no query string or nothing matched.
    """
    qs = only_query(url)
    if not qs:
        return {}

    # parse_qsl already percent-decodes once; the extra unquote_plus pass is
    # kept as-is. NOTE(review): presumably meant to expose double-encoded
    # payloads -- confirm. Repeated parameter names keep the last value.
    parsed_params = {
        k: unquote_plus(v) for k, v in parse_qsl(qs, keep_blank_values=True)
    }

    key_hits: dict[str, int] = {}
    suspicious: dict[str, str] = {}
    for k, v in parsed_params.items():
        # search(), not match(): the patterns describe fragments that may
        # appear anywhere in the text, and match() would only test position 0.
        hits = sum(
            1 for rgx in AVAILABLE_REGEXES.values()
            if rgx.search(v) or rgx.search(k)
        )
        if hits:
            # One counter per parameter (number of patterns that matched).
            key_hits[f"{k}_hits"] = hits
            suspicious[k] = v

    if verbose:
        print("KeyHitsAndParams: ", key_hits, parsed_params)
    return suspicious


# Smoke-test the parameter scanner on the first 10 rows whose type is "Malicious".
# NOTE(review): `df_fmereani` is not created in the visible cells -- confirm it is
# loaded earlier in the notebook.
malicious_urls = df_fmereani.loc[
    df_fmereani["type"].eq("Malicious"), "unsanitized_payload"
].head(10)
for val in malicious_urls:
    suspicious_params_from_url(val)