In [53]:
import pandas as pd
import seaborn as sns
import requests
from urllib.parse import urlsplit, parse_qs, unquote_plus, parse_qsl
import re

In [2]:
## DataSources:
# https://github.com/fmereani/Cross-Site-Scripting-XSS
# https://www.kaggle.com/datasets/syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning?select=XSS_dataset.csv
# https://huggingface.co/datasets/shengqin/web-attacks/
# https://huggingface.co/datasets/shengqin/web-attacks-ab2
# https://www.kaggle.com/datasets/gambleryu/biggest-sql-injection-dataset
# https://www.kaggle.com/datasets/sajid576/sql-injection-dataset


In [12]:
# download fmereani dataset
raw_payload = requests.get(
    "https://raw.githubusercontent.com/fmereani/Cross-Site-Scripting-XSS/refs/heads/master/XSSDataSets/Payloads.csv").text
with open("datasets/raw/fmereani_xss_dataset_payload.csv", 'w') as f:
    f.write(raw_payload)

In [33]:
# huggingface shengqin dataset
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/shengqin/web-attacks-ab2/" + splits["train"])
df.to_csv("datasets/raw/shengqin_web_attacks_ab2_train.csv", index=False)
# huggingface shenqin datase test
df = pd.read_csv("hf://datasets/shengqin/web-attacks-ab2/" + splits["test"])
df.to_csv("datasets/raw/shengqin_web_attacks_ab2_test.csv", index=False)

In [5]:
# huggingface shengqin web-attacks dataset train
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/shengqin/web-attacks/" + splits["train"])
df.to_csv("datasets/raw/shengqin_web_attacks_train.csv", index=False)

df = pd.read_csv("hf://datasets/shengqin/web-attacks/" + splits["test"])
df.to_csv("datasets/raw/shengqin_web_attacks_test.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm


In [108]:
# Load downloaded raw datasets
df_fmereani = pd.read_csv("datasets/raw/fmereani_xss_dataset_payload.csv")
df_xss_kaggle = pd.read_csv("datasets/raw/XSS_dataset.csv")
df_shengqin_ab2 = pd.read_csv("datasets/raw/shengqin_web_attacks_ab2_train.csv")
df_shengqin_web = pd.read_csv("datasets/raw/shengqin_web_attacks_train.csv")

In [3]:
df_fmereani.sample(5)

Unnamed: 0,Payloads,Class
21351,"excellent hotel and location my mom, husband a...",Benign
38599,great business hotel near suntec with a few di...,Benign
34273,comfortable but not perfect i stayed at the be...,Benign
42110,insertar&precio=8916&b1=confirmar,Benign
33122,http://www.wikihow.com/start-a-successful-yout...,Benign


In [8]:
df_xss_kaggle.sample(5)

Unnamed: 0.1,Unnamed: 0,Sentence,Label
5065,5065,"<b draggable=""true"" ondrag=""alert(1)"">test</b>",1
12323,12323,"<div draggable=""true"" contenteditable>drag me<...",1
11163,11163,"\t <div style=""padding:0em 0.25em"">",0
712,712,"<td onmouseover=""alert(1)"">test</td>",1
12947,12947,"<th onmouseup=""alert(1)"">test</th>",1


In [9]:
df_shengqin_ab2.sample(5)

Unnamed: 0,Payload,Label,text_label,ID
29116,aaaaaaa 'or 1=1,1,SQLi,29117
12712,Ø£Ø¨Ù„: Â«Ø¬Ù†ÙˆØ¨ Ø¹Ø¨Ø¯Ø§Ù„Ù„Ù‡ Ø§Ù„Ù…Ø¨Ø§Ø±...,0,normal,12713
20812,"p6,$mbt@lk+w[s`;j{\-rn:+c\z96#$/2+4di4w-p@h$`c...",1,SQLi,20813
42322,"<style>@keyframes x{}</style><isindex style=""a...",2,XSS,42323
11707,ã¨ã¦ã‚‚ã‚ˆã„ã®ã§ã™ã‹ã€ãªã«ã›è‡­ã„ã...,0,normal,11708


In [10]:
df_shengqin_web.sample(5)

Unnamed: 0,Payload,Label,text_label,ID
11098,//some user agreement and sending to server lo...,1,XSS,315
1295,select,0,normal,26634
15663,Select your dream career.,0,normal,24326
4003,"<track draggable=""true"" ondragenter=""alert(1)""...",1,XSS,6997
13990,"-6118 union all select 5906,5906,5906,5906,5906#",2,SQLi,19204


In [34]:
# Merge Shengqin sources
df_shengqin_merged = pd.concat([df_shengqin_ab2, df_shengqin_web], ignore_index=True)
df_shengqin_merged.rename(columns={"text_label": "attack_label", "Payload": 'payload', "Label": 'label'}, inplace=True)
df_shengqin_merged.drop(columns=["ID"], inplace=True, errors="ignore")
df_shengqin_merged["type"] = df_shengqin_merged["attack_label"].map(
    {'SQLi': "Malicious", "XSS": "Malicious", "normal": "Benign"})
df_shengqin_merged.sample(5)

Unnamed: 0,payload,label,attack_label,type
37802,9999999999999999999999999999999999999999999999...,1,SQLi,Malicious
47977,Hi are you live,0,normal,Benign
39816,CRINGEEEEE,0,normal,Benign
55552,"ORDER BY 1,SLEEP(5),BENCHMARK(1000000,MD5('A')...",2,SQLi,Malicious
31678,oooooooooooooooooooooooooooooooooooooooooooooo...,1,SQLi,Malicious


In [36]:
# Adapt Kaggle datasource
df_xss_kaggle["type"] = df_xss_kaggle['Label'].map({0: "Benign", 1: "Malicious"})
df_xss_kaggle["attack_label"] = df_xss_kaggle['Label'].map({0: "normal", 1: 'XSS'})
df_xss_kaggle.drop(["Unnamed: 0"], axis=1, inplace=True, errors="ignore")
df_xss_kaggle.rename(columns={"Sentence": "payload", "Label": "label"}, inplace=True)

In [31]:
df_xss_kaggle.sample(5)

Unnamed: 0,payload,label,type,attack_label
7280,"<div draggable=""true"" contenteditable>drag me<...",1,Malicious,XSS
12681,<ul>,0,Benign,normal
11073,"<div draggable=""true"" contenteditable>drag me<...",1,Malicious,XSS
9845,"<p draggable=""true"" ondragend=""alert(1)"">test</p>",1,Malicious,XSS
10291,<style>:target {color:red;}</style><param id=x...,1,Malicious,XSS


In [109]:
df_fmereani.sample(5)

Unnamed: 0,Payloads,Class
21685,fabulous garden suite (lower),Benign
3276,http://www.kaspersky-protects.com/buy/features...,Malicious
16198,http://www.wikihow.com/create-a-bat-extension-...,Benign
12110,http://localhost:8080/tienda1/publico/registro...,Benign
26983,"function venuedetailassistant(venue,u,p,i,fui,...",Benign


In [125]:
df_fmereani.rename(columns={"Class": "type", 'Payloads': 'unsanitized_payload'}, inplace=True)
df_fmereani['attack_label'] = df_fmereani['type'].map({"Benign": "normal", "Malicious": 'XSS'})
df_fmereani['has_url'] = df_fmereani['unsanitized_payload'].astype('string').str.match(r'(?i)^\s*(?:https?:)?//', na=False)
df_fmereani.loc[df_fmereani["type"].eq("Malicious") & df_fmereani["has_url"].eq(False)].info()


<class 'pandas.core.frame.DataFrame'>
Index: 899 entries, 24128 to 43216
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   unsanitized_payload  899 non-null    object 
 1   type                 899 non-null    object 
 2   attack_label         899 non-null    object 
 3   has_url              899 non-null    boolean
dtypes: boolean(1), object(3)
memory usage: 29.8+ KB


In [116]:
URL_RE = re.compile(r'(?i)\b(?:https?:)?//[^\s"\'<>()]+')  # http(s):// o //...

mal = df_fmereani["type"].eq("Malicious")
s   = df_fmereani['unsanitized_payload'].astype("string")

has_url = s.str.contains(URL_RE, na=False)

con_url = (mal & has_url).sum()
sin_url = (mal & ~has_url).sum()

print({"Malicious con URL": con_url, "Malicious sin URL": sin_url})

{'Malicious con URL': np.int64(14363), 'Malicious sin URL': np.int64(786)}


In [107]:
PATTERN_REMOVE_DOMAIN = re.compile(r'^(?:[a-z][a-z0-9+.\-]*:)?//[^/]*', re.I)

def path_and_query_regex(u: str) -> str:
    if not isinstance(u, str) or not u:
        return ""
    s = PATTERN_REMOVE_DOMAIN.sub("", u.strip())
    if s == "":  # caso "http://dominio" sin path
        return "/"
    # garantiza que empiece con "/"
    return s if s.startswith("/") else "/" + s

# Path query
df_fmereani["removed_urls"] = df_fmereani["unsanitized_payload"].apply(path_and_query_regex)
df_fmereani.sample(10)

Unnamed: 0,unsanitized_payload,type,attack_label,removed_urls,possible_payload
37382,"great hotel my husband, our 6-year-old daughte...",Benign,normal,"/great hotel my husband, our 6-year-old daught...",
3738,http://www.aowassoc.com/gallery/login.asp?fold...,Malicious,XSS,/gallery/login.asp?folder=&amp;n=&quot;&gt;&lt...,
25385,"<script>eval(function(p,a,c,k,e,d){e=function(...",Malicious,XSS,"/<script>eval(function(p,a,c,k,e,d){e=function...",c%a)>35?string.fromcharcode(c 29):c.tostring(3...
6979,http://tools.neb.com/inbase/intein.php?name=go...,Malicious,XSS,/inbase/intein.php?name=gob+dnae}%3c/style%3e%...,gob dnae}</style><script>a=eval;b=alert;a(b(/x...
17762,http://www.wikihow.com/assemble-a-concrete-gar...,Benign,normal,/assemble-a-concrete-garage&t=1396563418956&n=...,
7596,http://search.gzkj.gov.cn:8082/search.jsp?dir=...,Malicious,XSS,/search.jsp?dir=scut&amp;q=scut&amp;hitsperpag...,scut
7327,http://openssi.org/cgi-bin/view?page=%22%3e%3c...,Malicious,XSS,/cgi-bin/view?page=%22%3e%3c/title%3e%3cscript...,"""></title><script>alert(1337)</script>><mar<br..."
36507,"great location, great history we enjoyed our s...",Benign,normal,"/great location, great history we enjoyed our ...",
26466,"curiosity.controller(\'modulemanagerctrl\', fu...",Benign,normal,"/curiosity.controller(\'modulemanagerctrl\', f...",
10272,http://localhost:8080/tienda1/publico/pagar.js...,Benign,normal,/tienda1/publico/pagar.jsp?modo=insertar&preci...,insertar


In [105]:
def only_query(url: str) -> str:
    if not isinstance(url, str) or not url:
        return ""
    s = PATTERN_REMOVE_DOMAIN.sub("", url.strip())
    return s.split("?", 1)[1] if "?" in s else ""


AVAILABLE_REGEXES = {
    'XSS_VALUE_SPECIAL_CHARS': re.compile(r'^("|">|<|/><|<>\"\'`\(\){}\[\];\\)', re.IGNORECASE),
    'XSS_SCRIPT_RELATED': re.compile(r'(?is)\<\\s*script\\b[^>]*>.*?<\\s*/\\s*script\\s*>', re.IGNORECASE),
    'XSS_OPENING_TAGS': re.compile(r'(?i)<\\s*(?:script|iframe|object|embed|svg|math|link|meta|img|video|audio)\b',
                                   re.IGNORECASE),
    'XSS_DOM_EVENTS': re.compile(r'(?i)\bon[a-z]+\\s*(?:=|%3[dD])', re.IGNORECASE),
    'XSS_CODIFIED_TAGS': re.compile(r'(?i)(?:%3[cC]|%3[eE]|#x?0*3[cC];|#x?0*3[eE];|#0*60;|#0*62;|lt;|gt;)',
                                    re.IGNORECASE),
    'XSS_SOURCE_ATTRS': re.compile(
        r'(?i)\b(?:src|href|xlink:href|formaction|srcdoc)\s*=\s*["\']?\s*(?:javascript|vbscript|data)\s*:',
        re.IGNORECASE),
    'XSS_IMG_RELATED': re.compile(r'(?i)<\s*img\b[^>]*\bon(?:error|load)\s*(?:=|%3[dD])', re.IGNORECASE),
    'XSS_INLINE_CSS': re.compile(
        r'(?i)\bstyle\s*=\s*["\'][^"\']*(?:expression\s*\(|url\s*\(\s*[\'"]?\s*(?:javascript|data)\s*:)[^"\']*["\']',
        re.IGNORECASE),
    'XSS_IMPORT_CSS': re.compile(r'(?i)(?:@import\b|import\s*\()', re.IGNORECASE),
    'XSS_POTENTIAL_INJECTION_TAGS': re.compile(r'(?i)<\s*(?:textarea|template)\b', re.IGNORECASE)
}


# 3) De una URL -> dict de {param: valor_decodificado} solo si el valor "huele" a XSS
def suspicious_params_from_url(url: str) -> dict[str, str]:
    qs = only_query(url)
    if not qs:
        return {}
    parsed_params = {}
    for k, v in parse_qsl(qs, keep_blank_values=True):
        val = unquote_plus(v)
        parsed_params[k] = val

    key_hits = {}
    for k, v in parsed_params.items():
        for rgx_key, rgx in AVAILABLE_REGEXES.items():
            if rgx.match(v) or rgx.match(k):
                if key_hits.get(f"url_hits") is None:
                    key_hits[f"{k}_hits"] = 1
                else:
                    key_hits[f"{k}_hits"] += 1
    print("KeyHitsAndParams: ", key_hits, parsed_params)
    # print("SuspectParams: ", suspected_params)
    return {}


malicious_urls = df_fmereani["unsanitized_payload"].loc[df_fmereani["type"].eq("Malicious")].head(10)
for val in malicious_urls:
    suspicious_params_from_url(val)

KeyHitsAndParams:  {'keyword_hits': 1, 'keyword': '"><script>alert(document.cookie);<<br>/script>'}
KeyHitsAndParams:  {'q_hits': 1, 'q': '"><script>alert(document.<br>cookie);</script>', 'amp;btng': 'search', 'amp;ie': '', 'amp;site': '', 'amp;output': 'xml', 'amp;client': '', 'amp;lr': '', 'amp;oe': '', 'amp;filter': '0'}
KeyHitsAndParams:  {'action': 'missionary.info<marquee>pappy</marquee>', 'amp;missi<br>onary_id': '69'}
KeyHitsAndParams:  {'lforenam': '\\"><script>alert(docume<br>nt.cookie);</script>', 'amp;subdwell': '', 'amp;dwelling': '', 'amp;streetnm': '', 'amp;locality': '', 'amp;hometown': '', 'amp;postcode': '', 'amp;datebrth': '', 'amp;learn<br>gen': '', 'amp;ethnicor': '', 'amp;tel_numb': '', 'amp;tel_mob': '', 'amp;email_add': '', 'amp;email_add2': '', 'amp;agree_info': '', 'amp;username': '', 'amp;password': '', 'amp;err': 'sur'}
KeyHitsAndParams:  {'q_hits': 1, 'q': '"><script>alert(document.<br>cookie);</script>', 'amp;btng': 'search', 'amp;ie': '', 'amp;site': '', 

In [91]:
df_fmereani[['removed_urls', 'possible_payload']].loc[df_fmereani["type"].eq("Malicious")].sample(3)

Unnamed: 0,removed_urls,possible_payload
4767,/webcards/tools/calendar.php?month=&quot;&gt;&...,
28863,/cvresults.php?id='%3cscript%3ealert(/st@r-gaz...,'<script>alert(/st@r-gaz3r/)</script><marquee>...
1203,/index.php?page=&lt;script&gt;alert('http://xs...,
