# XSS-SQLi datasources

## Downloading, standardizing and condensing datasources for XSS and SQLinjection

In [1]:
import csv

import pandas as pd
import requests

In [2]:
## DataSources:
# https://github.com/fmereani/Cross-Site-Scripting-XSS
# https://www.kaggle.com/datasets/syedsaqlainhussain/cross-site-scripting-xss-dataset-for-deep-learning?select=XSS_dataset.csv
# https://huggingface.co/datasets/shengqin/web-attacks/
# https://huggingface.co/datasets/shengqin/web-attacks-ab2
# https://www.kaggle.com/datasets/gambleryu/biggest-sql-injection-dataset
# https://www.kaggle.com/datasets/sajid576/sql-injection-dataset
# https://github.com/payloadbox/xss-payload-list/blob/master/Intruder/xss-payload-list.txt
# https://www.kaggle.com/datasets/alextrinity/sqli-xss-dataset/data

# Useful Papers:
# https://inria.hal.science/hal-03273564/document
# https://www.diva-portal.org/smash/get/diva2:1883365/FULLTEXT01.pdf
# https://medium.com/@madhubleh/detection-of-cross-site-scripting-xss-attacks-with-cnn-4633b4873480
# https://pmc.ncbi.nlm.nih.gov/articles/PMC9252680/
# https://arxiv.org/pdf/2509.01835

# Useful Resources
# https://www.yeswehack.com/learn-bug-bounty/xss-attacks-exploitation-ultimate-guide
# https://github.com/Sivnerof/Sources-And-Sinks-Cheatsheet
# For reading https://www.kaggle.com/code/shashikiran42/data-read-process the https://www.kaggle.com/datasets/shashikiran42/cookie-security-and-vulnerabilities-dataset-cisc/data

In [38]:
text_payload = requests.get(
    "https://raw.githubusercontent.com/payloadbox/xss-payload-list/refs/heads/master/Intruder/xss-payload-list.txt").content

# automatically create the dataset as we expect it to be
with open("datasets/raw/xss_payload_list_payloadbox.csv", "w", newline='') as file:
    writer = csv.writer(file)
    fieldnames = ["payload", "label", "type", "attack_label"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for payload in text_payload.decode("utf-8").splitlines():
        writer.writerow({'payload': payload, "label": 1, "type": "Malicious", "attack_label": "XSS"})


In [12]:
# download fmereani dataset
raw_payload = requests.get(
    "https://raw.githubusercontent.com/fmereani/Cross-Site-Scripting-XSS/refs/heads/master/XSSDataSets/Payloads.csv").text
with open("datasets/raw/fmereani_xss_dataset_payload.csv", 'w') as f:
    f.write(raw_payload)

In [33]:
# huggingface shengqin dataset
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/shengqin/web-attacks-ab2/" + splits["train"])
df.to_csv("datasets/raw/shengqin_web_attacks_ab2_train.csv", index=False)
# huggingface shenqin datase test
df = pd.read_csv("hf://datasets/shengqin/web-attacks-ab2/" + splits["test"])
df.to_csv("datasets/raw/shengqin_web_attacks_ab2_test.csv", index=False)

In [5]:
# huggingface shengqin web-attacks dataset train
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/shengqin/web-attacks/" + splits["train"])
df.to_csv("datasets/raw/shengqin_web_attacks_train.csv", index=False)

df = pd.read_csv("hf://datasets/shengqin/web-attacks/" + splits["test"])
df.to_csv("datasets/raw/shengqin_web_attacks_test.csv", index=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load downloaded raw datasets
df_fmereani = pd.read_csv("datasets/raw/fmereani_xss_dataset_payload.csv")
df_xss_kaggle = pd.read_csv("datasets/raw/XSS_dataset.csv")

df_shengqin_ab2 = pd.read_csv("datasets/raw/shengqin_web_attacks_ab2_train.csv")
df_shengqin_ab2_test = pd.read_csv("datasets/raw/shengqin_web_attacks_ab2_test.csv")
df_shengqin_ab2 = pd.concat([df_shengqin_ab2, df_shengqin_ab2_test], ignore_index=True)

df_shengqin_web = pd.read_csv("datasets/raw/shengqin_web_attacks_train.csv")
df_shengqin_web_test = pd.read_csv("datasets/raw/shengqin_web_attacks_test.csv")
df_shengqin_web = pd.concat([df_shengqin_web, df_shengqin_web_test], ignore_index=True)

df_payloadbox_xss = pd.read_csv("datasets/raw/xss_payload_list_payloadbox.csv")
df_alextrinity = pd.read_csv("datasets/raw/alextrinity_xss_sqli_dataset.csv")

In [3]:
df_fmereani.sample(5)

Unnamed: 0,Payloads,Class
14318,http://localhost:8080/tienda1/publico/pagar.js...,Benign
36148,awesome location-great staff-good value first ...,Benign
3343,http://www.the-morph.co.uk/gallery/igallery.as...,Malicious
15176,http://www.wikihow.com/appreciate-robert-patti...,Benign
33304,http://www.wikihow.com/accept-a-trade-on-anima...,Benign


In [4]:
df_xss_kaggle.sample(5)

Unnamed: 0.1,Unnamed: 0,Sentence,Label
4864,4864,"<font ondblclick=""alert(1)"">test</font>",1
11790,11790,"<li class=""toctree-l1""><a class=""reference int...",0
10457,10457,"<section onmousemove=""alert(1)"">test</section>",1
7560,7560,</li>,0
4657,4657,"<meter onbeforecut=""alert(1)"" contenteditable>...",1


In [5]:
df_shengqin_ab2.sample(5)

Unnamed: 0,Payload,Label,text_label,ID
20643,"""1 ) ) ) and 8148 = like ( 'abc...",1,SQLi,20644
37625,ä½¿ã„å§‹ã‚ã¦ã‹ã‚‰7æ—¥ç›®ã§1æœ¬ç›®ãŒåå¿...,0,normal,37626
19889,80k6g9 fil13vv9jljqaw gatp1q9gvqti8gk4mlvfufb3...,1,SQLi,19890
21647,"1"" where 6325 = 6325 union all select null,nul...",1,SQLi,21648
12279,"1"" and 9254 = ( select count ( * ) from rdb$...",1,SQLi,12280


In [6]:
df_shengqin_web.sample(5)

Unnamed: 0,Payload,Label,text_label,ID
15244,"<blockquote onmouseup=""alert(1)"">test</blockqu...",1,XSS,5962
9431,Up to 30% off select smart and electronic door...,0,normal,25215
6237,"<u draggable=""true"" ondragleave=""alert(1)"">tes...",1,XSS,5970
10521,It Was Proposed By A Committee Of The British ...,0,normal,25949
13383,`'><script>-javascript:alert(1)</script>,1,XSS,573


In [7]:
df_payloadbox_xss.sample(5)

Unnamed: 0,payload,label,type,attack_label
3730,"<rp onkeydown=""alert(1)"" contenteditable>test<...",1,Malicious,XSS
4987,<style>@keyframes x{from {left:0;}to {left: 10...,1,Malicious,XSS
2742,<link id=x tabindex=1 onbeforedeactivate=alert...,1,Malicious,XSS
4654,<style>:target {transform: rotate(180deg);}</s...,1,Malicious,XSS
6263,<iframe/onreadystatechange=alert(1),1,Malicious,XSS


In [8]:
df_alextrinity.sample(5)

Unnamed: 0,Sentence,SQLInjection,XSS,CommandInjection,Normal
75506,"The plot of 'Edison' was decent, but one actor...",0.0,0.0,0.0,1.0
172273,TMTWr580weBOp8AcmJ8gPuiRAbNyzrfIYXnSJ0 NbcndlF...,0.0,0.0,1.0,0.0
145860,action=search&uri=%2Findex.html&__locale=en&qu...,0.0,1.0,0.0,0.0
195007,Have you ever sat watching a movie when 20 or ...,0.0,0.0,1.0,0.0
174156,HDOxb8HjJo1e44EC5Obl4swbqX7Onsdt5BH9CMCAk ZJa...,0.0,0.0,1.0,0.0


In [3]:
# Merge Shengqin sources
df_shengqin_merged = pd.concat([df_shengqin_ab2, df_shengqin_web], ignore_index=True)
df_shengqin_merged.rename(columns={"text_label": "attack_label", "Payload": 'payload', "Label": 'label'}, inplace=True)
df_shengqin_merged.drop(columns=["ID"], inplace=True, errors="ignore")
df_shengqin_merged["type"] = df_shengqin_merged["attack_label"].map(
    {'SQLi': "Malicious", "XSS": "Malicious", "normal": "Benign"})
df_shengqin_merged.sample(5)

Unnamed: 0,payload,label,attack_label,type
4497,"""UNION SELECT @@VERSION,SLEEP(5),USER(),BENCHM...",1,SQLi,Malicious
44395,"sick,sick,sick",0,normal,Benign
72154,-4684 ) as uqjn where 4550 = 4550 or 1689 = 5825,2,SQLi,Malicious
52470,1%' ) ) and char ( 120 ) ||char ( 106 ) ||...,1,SQLi,Malicious
83854,"<style>@keyframes x{}</style><col style=""anima...",1,XSS,Malicious


In [4]:
# Adapt Kaggle datasource
df_xss_kaggle["type"] = df_xss_kaggle['Label'].map({0: "Benign", 1: "Malicious"})
df_xss_kaggle["attack_label"] = df_xss_kaggle['Label'].map({0: "normal", 1: 'XSS'})
df_xss_kaggle.drop(["Unnamed: 0"], axis=1, inplace=True, errors="ignore")
df_xss_kaggle.rename(columns={"Sentence": "payload", "Label": "label"}, inplace=True)

In [11]:
df_xss_kaggle.sample(5)

Unnamed: 0,payload,label,type,attack_label
2064,"\t </span> <span class=""reference-text"">",0,Benign,normal
1001,"<dir draggable=""true"" ondragend=""alert(1)"">tes...",1,Malicious,XSS
11425,<font id=x tabindex=1 onfocusin=alert(1)></font>,1,Malicious,XSS
5668,\t </span> </li>,0,Benign,normal
12978,"<textarea onmouseover=""alert(1)"">test</textarea>",1,Malicious,XSS


In [5]:
# merge shengqin with kaggle
df_shengqin_kaggle_merged = pd.concat([df_shengqin_merged, df_xss_kaggle], ignore_index=True)
df_shengqin_kaggle_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99777 entries, 0 to 99776
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   payload       99751 non-null  object
 1   label         99777 non-null  int64 
 2   attack_label  99777 non-null  object
 3   type          99777 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.0+ MB


In [6]:
# Adapt fmereani dataset
df_fmereani.rename(columns={"Class": "type", 'Payloads': 'payload'}, inplace=True)
df_fmereani['attack_label'] = df_fmereani['type'].map({"Benign": "normal", "Malicious": 'XSS'})
df_fmereani['label'] = df_fmereani['attack_label'].map({'normal': 0, 'XSS': 1})
df_fmereani.sample(5)

Unnamed: 0,payload,type,attack_label,label
27892,http://localhost:8080/tienda1/publico/registro...,Benign,normal,0
32268,http://www.wikihow.com/add-rss-and-web-feeds-t...,Benign,normal,0
14617,http://www.wikihow.com/deal-with-widespread-un...,Benign,normal,0
9413,http://www.ct.gov/ctportal/taxonomy/ct_taxonom...,Malicious,XSS,1
1717,http://www.stats.gov.cn/was40/reldetail.jsp?do...,Malicious,XSS,1


In [7]:
# Merge shengqin, kaggle and fmereani datasets
df_fmereani_shengqin_kaggle_merged = pd.concat([df_shengqin_kaggle_merged, df_fmereani], ignore_index=True)
df_fmereani_shengqin_kaggle_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142994 entries, 0 to 142993
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       142968 non-null  object
 1   label         142994 non-null  int64 
 2   attack_label  142994 non-null  object
 3   type          142994 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.4+ MB


In [8]:
# Merge shengqin, kaggle, fmereani and payloadbox datasets
df_fmereani_shengqin_kaggle_payloadbox_merged = pd.concat([df_fmereani_shengqin_kaggle_merged, df_payloadbox_xss],
                                                          ignore_index=True)
df_fmereani_shengqin_kaggle_payloadbox_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149607 entries, 0 to 149606
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       149581 non-null  object
 1   label         149607 non-null  int64 
 2   attack_label  149607 non-null  object
 3   type          149607 non-null  object
dtypes: int64(1), object(3)
memory usage: 4.6+ MB


In [9]:
df_fmereani_shengqin_kaggle_payloadbox_merged['attack_label'].value_counts()

attack_label
normal    75268
XSS       40398
SQLi      33941
Name: count, dtype: int64

In [10]:
# Standardizing alex trinity dataset
attack_labels = ["normal", "XSS", "SQLi"]
type_labels = ['Benign', 'Malicious']

df_alextrinity.rename(columns={"SQLInjection": "SQLi", 'Sentence': 'payload', 'Normal': 'normal'}, inplace=True)
# Removing all with Command Injection
df_alextrinity = df_alextrinity.loc[~df_alextrinity['CommandInjection'].eq(1.0)].copy()
# Assigning attack_label for XSS
df_alextrinity["attack_label"] = df_alextrinity[attack_labels].idxmax(axis=1).where(
    df_alextrinity[attack_labels].any(axis=1), "unknown")

df_alextrinity['label'] = df_alextrinity['attack_label'].map({'normal': 0, 'XSS': 1, 'SQLi': 2})

df_alextrinity.loc[df_alextrinity["attack_label"].eq("normal"), "type"] = "Benign"
df_alextrinity.loc[df_alextrinity["attack_label"].isin(["XSS", "SQLi"]), "type"] = "Malicious"

df_alextrinity.drop(["SQLi", "XSS", "CommandInjection", 'normal'], axis=1, inplace=True, errors="ignore")
df_alextrinity.info()

<class 'pandas.core.frame.DataFrame'>
Index: 156636 entries, 0 to 156635
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       156636 non-null  object
 1   attack_label  156636 non-null  object
 2   label         156636 non-null  int64 
 3   type          156636 non-null  object
dtypes: int64(1), object(3)
memory usage: 6.0+ MB


In [11]:
df_fmereani_shengqin_kaggle_payloadbox_alextrinity_merged = pd.concat(
    [df_fmereani_shengqin_kaggle_payloadbox_merged, df_alextrinity], ignore_index=True)
df_fmereani_shengqin_kaggle_payloadbox_alextrinity_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306243 entries, 0 to 306242
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       306217 non-null  object
 1   label         306243 non-null  int64 
 2   attack_label  306243 non-null  object
 3   type          306243 non-null  object
dtypes: int64(1), object(3)
memory usage: 9.3+ MB


In [23]:
df_fmereani_shengqin_kaggle_payloadbox_alextrinity_merged['attack_label'].value_counts()

attack_label
normal    133789
SQLi       91257
XSS        81197
Name: count, dtype: int64

In [18]:
df_condensed = df_fmereani_shengqin_kaggle_payloadbox_alextrinity_merged.copy()

In [19]:
df_condensed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306243 entries, 0 to 306242
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       306217 non-null  object
 1   label         306243 non-null  int64 
 2   attack_label  306243 non-null  object
 3   type          306243 non-null  object
dtypes: int64(1), object(3)
memory usage: 9.3+ MB


In [20]:
# Remove duplicates!
df_condensed = df_condensed.drop_duplicates().reset_index(drop=True)
df_condensed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273705 entries, 0 to 273704
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   payload       273704 non-null  object
 1   label         273705 non-null  int64 
 2   attack_label  273705 non-null  object
 3   type          273705 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.4+ MB


In [15]:
df_condensed['attack_label'].value_counts()

attack_label
normal    121199
SQLi       88569
XSS        63635
Name: count, dtype: int64

In [21]:
df_condensed.to_csv('datasets/raw/xss_sqli_condensed.csv', index=False)

In [22]:
df_condensed.sample(15)

Unnamed: 0,payload,label,attack_label,type
221697,George Carlin is probably my favorite comedian...,0,normal,Benign
260845,"sign=';alert(String.fromCharCode(88,<br/>83,83...",1,XSS,Malicious
153614,_#%yh&^|c%!2n4no!i0l0^us@)<}3*@c%vb\{[-6]th7>0...,2,SQLi,Malicious
185802,9999999999999999999999999999999999999999999999...,2,SQLi,Malicious
112341,"superior quality and value showreview(2240962,...",0,normal,Benign
203982,I caught this on the dish last night. I liked ...,0,normal,Benign
236553,Wow! I remember so many awful films that loose...,0,normal,Benign
51384,select case when 2095 = 9074 t/*This was a ter...,1,SQLi,Malicious
184497,dj89xyxiwqvd577amuha2fr10giv6tiw32kh33858n7qve...,2,SQLi,Malicious
248316,srchval=%3Cscript%3Ealert%28document.cookie%29...,1,XSS,Malicious


### Now that the datasets are condensed and normalized, we need to start creating new features and deep understand