In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import json
import re
import math
from collections import Counter
from urllib.parse import unquote
from typing import Union, Dict
from urllib.parse import urlparse, parse_qs

## Data Loading

In [2]:
with open('datasets/with_sql_injection_payload.json', 'r') as file:
    data = json.load(file)

## Data Preprocessing

In [3]:
parsed_requests = []

for entry in data:
    request = entry["request"]
    url = request.get("url", "")
    method = request.get("method", "")
    body = request.get("body", "")
        
    headers = request.get("headers", {})
        
    parsed_request = {
        "url": url,
        "method": method,
        "body": body,
        "header": json.dumps(headers)
    }
    parsed_requests.append(parsed_request)
    
df = pd.DataFrame(parsed_requests)

In [4]:
df['url'].apply(unquote)
df['body'].apply(unquote)
df['header'].apply(unquote)

0      {"Content-Length": "51", "Host": "testphp.vuln...
1      {"Content-Length": "52", "Host": "testphp.vuln...
2      {"Content-Length": "83", "Host": "testphp.vuln...
3      {"Content-Length": "83", "Host": "testphp.vuln...
4      {"Content-Length": "77", "Host": "testphp.vuln...
                             ...                        
434    {"Content-Length": "239", "Cookie": "login=qpv...
435    {"Content-Length": "253", "Cookie": "login=qpv...
436    {"Content-Length": "253", "Cookie": "login=qpv...
437    {"Content-Length": "236", "Cookie": "login=qpv...
438    {"Content-Length": "236", "Cookie": "login=qpv...
Name: header, Length: 439, dtype: object

## Algorithms Methods

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

In [6]:
def calculate_entropy(text: str) -> float:
    if not isinstance(text, str) or not text.strip():
        return 0.0
    
    freq = Counter(text)
    total_length = len(text)

    entropy = sum(
        (-count / total_length) * math.log2(count / total_length) 
        for count in freq.values()
    )
    
    return entropy

In [7]:
def number_of_special_chars(request: str) -> int:
    if not isinstance(request, str) or not request.strip():
        return 0
    special_chars_pattern = r"[!@#$%^&*()_+\-=\[\]{};:'\"\\|,.<>?/`~]"
    special_chars = re.findall(special_chars_pattern, request)    
    return len(special_chars)

In [8]:
def count_query_params(url: str) -> int:
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return len(query_params)

In [9]:
def calculate_query_param_length(url: str) -> int:
    parsed_url = urlparse(url)
    query_string = parsed_url.query
    return len(query_string)

In [10]:
def calculate_path_depth(url: str) -> int:
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.strip("/").split("/")
    return len(path_segments)

In [11]:
sql_injection_patterns = [
    r"\bSELECT\b", r"\bINSERT\b", r"\bUPDATE\b", r"\bDELETE\b", r"\bDROP\b", r"\bUNION\b", r"\bJOIN\b",
    r"\bWHERE\b", r"\bORDER\s+BY\b", r"\bGROUP\s+BY\b", r"\bHAVING\b", r"\bEXEC\b", r"\bDECLARE\b",
    r"\bCASE\b", r"\bWHEN\b", r"\bTHEN\b", r"\bEND\b", r"\bIF\b", r"\bELSE\b", r"\bCAST\b", r"\bCONVERT\b",
    r"\bTABLE\b", r"\bCOLUMN\b", r"\bDATABASE\b", r"\bSCHEMA\b", r"\bINFORMATION_SCHEMA\b",
    r"\bTRUNCATE\b", r"\bALTER\b", r"\bCREATE\b", r"\bREPLACE\b", r"\bRENAME\b", r"\bGRANT\b", r"\bREVOKE\b",
    r"\bMERGE\b", r"\bINTERSECT\b", r"\bEXCEPT\b", r"\bEXECUTE\b", r"\bFETCH\b", r"\bOPEN\b", r"\bCLOSE\b",
    r"\bDEALLOCATE\b", r"\bUSE\b", r"\bLIMIT\b", r"\bOFFSET\b", r"\bNULL\b", r"\bISNULL\b", r"\bCOALESCE\b",
    r"\bXP_CMDSHELL\b", r"\bWAITFOR\s+DELAY\b", r"\bBENCHMARK\b", r"\bCHAR\b", r"\bASCII\b", r"\bHEX\b",
    r"\bCONCAT\b", r"\bSUBSTRING\b", r"\bMID\b", r"\bIFNULL\b", r"\bLOAD_FILE\b", r"\bOUTFILE\b",
    r"\bINTO\b", r"\bDUMPFILE\b", r"\bFLOOR\b", r"\bRAND\b", r"\bMD5\b", r"\bSHA1\b", r"\bPASSWORD\b",
    r"\bCURRENT_USER\b", r"\bSESSION_USER\b", r"\bSYSTEM_USER\b", r"\bUSER\b", r"\bVERSION\b",
    r"\bFOUND_ROWS\b", r"\bROW_COUNT\b", r"\bDATABASE\(\)\b", r"\bSCHEMA\(\)\b", r"\bTABLE_NAME\b",
    r"\bCOLUMN_NAME\b", r"\bCURRENT_TIMESTAMP\b", r"\bCURRENT_DATE\b", r"\bCURRENT_TIME\b",
    r"\bSESSION_ID\b", r"\bWAITFOR\s+TIME\b", r"\bEXEC\s+sp_executesql\b", r"\bEXEC\s+sp_sqlexec\b",
    r"\bSYSOBJECTS\b", r"\bSYSCOLUMNS\b", r"\bPG_SLEEP\b", r"\bEXTRACTVALUE\b", r"\bUPDATEXML\b",
    r"\bLTRIM\b", r"\bRTRIM\b", r"\bUPPER\b", r"\bLOWER\b", r"\bSYSADMIN\b", r"\bEXEC\s+MASTER\.DBO\.XP_CMDSHELL\b",
    r"\bINFORMATION_SCHEMA\.TABLES\b", r"\bINFORMATION_SCHEMA\.COLUMNS\b",
    r"--", r";", r"'", r"\"", r"/\*", r"\*/", r"\(", r"\)", r"\{", r"\}", r"\[", r"\]", r"<", r">",
    r"\|\|", r"\|", r"\^", r"\\", r"\@", r"\#", r"\%", r"\!", r"\$", r"\+", r"-", r"/", r"\*",
    r"\bOR\b", r"\bAND\b", r"\bLIKE\b", r"\bSLEEP\b", r"\bREGEXP\b", r"\bRLIKE\b",
    r"ORDER\s+BY\s+1--", r"UNION\s+SELECT", r"DROP\s+TABLE", r"ALTER\s+TABLE", r"INTO\s+OUTFILE",
    r"\b\d+\s*(=|!=|<|>|<=|>=)\s*\d+\b",
    r"\b\d+\s*[\+\-\*/%]\s*\d+\b",
    r"(\|\|)", r"\bCONCAT\s*\(", r"\bCHAR\s*\(", r"\bASCII\s*\(", r"\bUNHEX\s*\(",
    r"\b0x[0-9A-Fa-f]+\b",
    r"\b\d+\s*(&|\||\^)\s*\d+\b",
    r"--", r"#", r"/\*", r"\*/",
    r"\)\s*AND\s*\(",
    r"\bAND\b.*&.*\b",
    r"::", r"\@\@", r"\bAS\s+\w+",
    r"-\d+'?",
    r"^-?\d+'?",
]

sql_injection_regex = re.compile("|".join(sql_injection_patterns), re.IGNORECASE)

def count_sql_keywords(body: str) -> int:
    if not body:
        return 0

    matches = sql_injection_regex.findall(body)
    return len(matches)

In [12]:
df["sql_injection_keywords_url"] = df["url"].apply(count_sql_keywords)

In [13]:
df["sql_injection_keywords_body"] = df["body"].apply(count_sql_keywords)

In [14]:
df["sql_injection_keywords_header"] = df["header"].apply(count_sql_keywords)

In [15]:
df["header_entropy"] = df["header"].apply(calculate_entropy)

In [16]:
df["body_entropy"] = df["body"].apply(calculate_entropy)

In [17]:
df["number_of_special_chars"] = df["header"].apply(number_of_special_chars)

In [18]:
df["count_query_params"] = df["url"].apply(count_query_params)

In [19]:
df["body_special_chars"] = df["body"].apply(number_of_special_chars)

In [20]:
df["query_param_length"] = df["url"].apply(calculate_query_param_length)

In [21]:
df["path_depth"] = df["url"].apply(calculate_path_depth)

In [22]:
df["method"] = pd.Categorical(df["method"]).codes

In [23]:
df["sql_injection"] = True

In [24]:
df[["method","body_entropy","header_entropy","number_of_special_chars","body_special_chars","query_param_length","path_depth","sql_injection_keywords_url","sql_injection_keywords_body","sql_injection_keywords_header","sql_injection"]]

Unnamed: 0,method,body_entropy,header_entropy,number_of_special_chars,body_special_chars,query_param_length,path_depth,sql_injection_keywords_url,sql_injection_keywords_body,sql_injection_keywords_header,sql_injection
0,0,3.703727,5.260223,154,13,0,1,3,8,98,True
1,0,4.263342,5.266637,154,8,0,1,3,5,98,True
2,0,4.336093,5.261628,154,15,0,1,3,12,98,True
3,0,4.283197,5.261628,154,15,0,1,3,12,98,True
4,0,4.320702,5.262706,154,13,0,1,3,10,98,True
...,...,...,...,...,...,...,...,...,...,...,...
434,0,4.485386,5.312636,162,35,0,1,3,32,103,True
435,0,4.572221,5.310132,162,36,0,1,3,32,103,True
436,0,4.562335,5.310132,162,36,0,1,3,32,103,True
437,0,4.482815,5.310132,162,36,0,1,3,32,103,True


In [25]:
df.to_csv("unclean.csv",index=False)

In [30]:
df_clean = pd.read_csv('clean.csv')
df_unclean = pd.read_csv('unclean.csv')
df_combined = pd.concat([df_clean, df_unclean], ignore_index=False)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined['sql_injection'] = df_combined['sql_injection'].astype(int)

In [35]:
df_combined_selected = df_combined[["method","body_entropy","header_entropy","number_of_special_chars","body_special_chars","query_param_length","path_depth","sql_injection_keywords_url","sql_injection_keywords_body","sql_injection_keywords_header","sql_injection"]]

In [36]:
from sklearn.model_selection import train_test_split

X = df_combined_selected.drop(columns=['sql_injection'])
y = df_combined_selected['sql_injection']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (716, 10)
Testing set size: (180, 10)


## Random Forest Classifier

In [38]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

randomforest_classifier = RandomForestClassifier()
randomforest_classifier.fit(X_train, y_train)

y_pred = randomforest_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9944444444444445

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.99      1.00      0.99        89

    accuracy                           0.99       180
   macro avg       0.99      0.99      0.99       180
weighted avg       0.99      0.99      0.99       180


Confusion Matrix:
 [[90  1]
 [ 0 89]]


## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9944444444444445

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.99      1.00      0.99        89

    accuracy                           0.99       180
   macro avg       0.99      0.99      0.99       180
weighted avg       0.99      0.99      0.99       180


Confusion Matrix:
 [[90  1]
 [ 0 89]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
request_test = {
    "request": {
        "url": "http://testphp.vulnweb.com/users.php?q='",
        "method": "GET",
        "headers": {
            "Content-Length": "77",
            "Host": "testphp.vulnweb.com",
            "Cache-Control": "max-age=0",
            "Origin": "http://testphp.vulnweb.com",
            "Content-Type": "application/x-www-form-urlencoded",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Sec-GPC": "1",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "http://testphp.vulnweb.com/login.php",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "close"
        },
        "body": ""
    }
}

In [51]:
url = request_test["request"]["url"]
method = request_test["request"]["method"]
headers = unquote(request_test["request"]["headers"])
body = unquote(request_test["request"]["body"])

features = {
    "method": 0 if method == "GET" else 1,
    "body_entropy": calculate_entropy(body),
    "header_entropy": calculate_entropy(headers),
    "number_of_special_chars": number_of_special_chars(headers),
    "body_special_chars": number_of_special_chars(body),
    "query_param_length": calculate_query_param_length(url),
    "path_depth": calculate_path_depth(url),
    "sql_injection_keywords_url": count_sql_keywords(url),
    "sql_injection_keywords_body": count_sql_keywords(body),
    "sql_injection_keywords_header": count_sql_keywords(headers)
}

features_df = pd.DataFrame([features])

# Optional: print the features DataFrame to check the result
print(features_df.head())

AttributeError: 'dict' object has no attribute 'split'