In [191]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import json
import re
import math
from collections import Counter
from urllib.parse import unquote
from typing import Union, Dict
from urllib.parse import urlparse, parse_qs

## Data Loading

In [192]:
with open('datasets/with_sql_injection_payload.json', 'r') as file:
    data = json.load(file)

In [193]:
def deep_decode(text):
    """Handle nested URL encoding"""
    while True:
        decoded = unquote_plus(text)
        if decoded == text:
            return decoded
        text = decoded

def detect_quote_imbalance(text):
    """Detect unbalanced quotes in SQL"""
    single_quotes = text.count("'")
    double_quotes = text.count('"')
    imbalance = 0
    if single_quotes % 2 != 0:
        imbalance += 1
    if double_quotes % 2 != 0:
        imbalance += 1
    return imbalance

def detect_statement_termination(text):
    """Detect SQL statement termination"""
    patterns = [
        r";\s*--",
        r";\s*$",
        r";\s*[^\s]",
    ]
    return 1 if any(re.search(p, text) for p in patterns) else 0

def detect_time_based(text):
    """Detect time-based SQLi patterns"""
    patterns = [
        r"(?:^|\W)SLEEP\(\s*\d+\s*\)",
        r"(?:^|\W)WAITFOR\s+DELAY\s+'\d+:\d+:\d+'",
        r"(?:^|\W)BENCHMARK\(\s*\d+,",
        r"(?:^|\W)pg_sleep\(\s*\d+\s*\)"
    ]
    return 1 if any(re.search(p, text, re.IGNORECASE) for p in patterns) else 0

## Data Preprocessing

In [194]:
parsed_requests = []

for entry in data:
    request = entry["request"]
    url = request.get("url", "")
    method = request.get("method", "")
    body = request.get("body", "")
        
    headers = request.get("headers", {})
        
    parsed_request = {
        "url": url,
        "method": method,
        "body": body,
        "header": json.dumps(headers)
    }
    parsed_requests.append(parsed_request)
    
df = pd.DataFrame(parsed_requests)

In [195]:
df["quote_imbalance"] = df["body"].apply(detect_quote_imbalance) + df["url"].apply(detect_quote_imbalance)
df["stmt_termination"] = df["body"].apply(detect_statement_termination) | df["url"].apply(detect_statement_termination)
df["time_based"] = df["body"].apply(detect_time_based) | df["url"].apply(detect_time_based)

## Algorithms Methods

In [196]:
def calculate_entropy(text: str) -> float:
    if not isinstance(text, str) or not text.strip():
        return 0.0
    
    freq = Counter(text)
    total_length = len(text)

    entropy = sum(
        (-count / total_length) * math.log2(count / total_length) 
        for count in freq.values()
    )
    
    return entropy

In [197]:
def number_of_special_chars(request: str) -> int:
    if not isinstance(request, str) or not request.strip():
        return 0
    special_chars_pattern = r"[!@#$%^&*()_+\-=\[\]{};:'\"\\|,.<>?/`~]"
    special_chars = re.findall(special_chars_pattern, request)    
    return len(special_chars)

In [198]:
def count_query_params(url: str) -> int:
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    return len(query_params)

In [199]:
def calculate_query_param_length(url: str) -> int:
    parsed_url = urlparse(url)
    query_string = parsed_url.query
    return len(query_string)

In [200]:
def calculate_path_depth(url: str) -> int:
    parsed_url = urlparse(url)
    path_segments = parsed_url.path.strip("/").split("/")
    return len(path_segments)

In [201]:
sql_injection_regex = re.compile(
    r'(?i)([\s\'"]+(?:or|and|union|where|select|insert|update|delete|drop|alter|create|truncate|exec|declare|xp_cmdshell|waitfor)[\s\'"]+)|(--|\#|\/\*)[^\n]*|([<>\'"\\;])|(%27|%22|%3C|%3E)|(?:&\w+;)|(\b1\s*=\s*1\b)',
    re.IGNORECASE
)

def count_sql_keywords(body: str) -> int:
    if not body:
        return 0

    matches = sql_injection_regex.findall(body)
    return len(matches)

In [202]:
df["sql_injection_keywords_url"] = df["url"].apply(count_sql_keywords)

In [203]:
df["sql_injection_keywords_body"] = df["body"].apply(count_sql_keywords)

In [204]:
df["sql_injection_keywords_header"] = df["header"].apply(count_sql_keywords)

In [205]:
df["header_entropy"] = df["header"].apply(calculate_entropy)

In [206]:
df["body_entropy"] = df["body"].apply(calculate_entropy)

In [207]:
df["number_of_special_chars"] = df["header"].apply(number_of_special_chars)

In [208]:
df["count_query_params"] = df["url"].apply(count_query_params)

In [209]:
df["body_special_chars"] = df["body"].apply(number_of_special_chars)

In [210]:
df["query_param_length"] = df["url"].apply(calculate_query_param_length)

In [211]:
df["path_depth"] = df["url"].apply(calculate_path_depth)

In [212]:
df["method"] = pd.Categorical(df["method"]).codes

In [213]:
df["sql_injection"] = True

In [214]:
df[[
    "method",
    "body_entropy",
    "header_entropy",
    "number_of_special_chars",
    "body_special_chars",
    "query_param_length",
    "path_depth",
    "sql_injection_keywords_url",
    "sql_injection_keywords_body",
    "sql_injection_keywords_header",
    "quote_imbalance",
    "stmt_termination",
    "time_based",
    "sql_injection"
]]

Unnamed: 0,method,body_entropy,header_entropy,number_of_special_chars,body_special_chars,query_param_length,path_depth,sql_injection_keywords_url,sql_injection_keywords_body,sql_injection_keywords_header,quote_imbalance,stmt_termination,time_based,sql_injection
0,0,3.703727,5.260223,154,13,0,1,0,2,34,0,0,0,True
1,0,4.263342,5.266637,154,8,7,1,0,5,34,0,0,0,True
2,0,4.336093,5.261628,154,15,0,1,0,4,34,0,0,0,True
3,0,4.283197,5.261628,154,15,0,1,0,4,34,0,0,0,True
4,0,4.320702,5.262706,154,13,0,1,0,4,34,0,0,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434,0,4.485386,5.312636,162,35,0,1,0,1,38,0,0,0,True
435,0,4.572221,5.310132,162,36,0,1,0,1,38,0,0,0,True
436,0,4.562335,5.310132,162,36,0,1,0,1,38,0,0,0,True
437,0,4.482815,5.310132,162,36,0,1,0,1,38,0,0,0,True


In [25]:
df.to_csv("unclean.csv",index=False)

In [30]:
df_clean = pd.read_csv('clean.csv')
df_unclean = pd.read_csv('unclean.csv')
df_combined = pd.concat([df_clean, df_unclean], ignore_index=False)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined['sql_injection'] = df_combined['sql_injection'].astype(int)

In [35]:
df_combined_selected = df_combined[["method","body_entropy","header_entropy","number_of_special_chars","body_special_chars","query_param_length","path_depth","sql_injection_keywords_url","sql_injection_keywords_body","sql_injection_keywords_header","sql_injection"]]

In [216]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[df_combined_selected] = scaler.fit_transform(df[df_combined_selected])

ValueError: Boolean array expected for the condition, not int64

In [36]:
from sklearn.model_selection import train_test_split

X = df_combined_selected.drop(columns=['sql_injection'])
y = df_combined_selected['sql_injection']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (716, 10)
Testing set size: (180, 10)


## Random Forest Classifier

In [38]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

randomforest_classifier = RandomForestClassifier()
randomforest_classifier.fit(X_train, y_train)

y_pred = randomforest_classifier.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9944444444444445

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.99      1.00      0.99        89

    accuracy                           0.99       180
   macro avg       0.99      0.99      0.99       180
weighted avg       0.99      0.99      0.99       180


Confusion Matrix:
 [[90  1]
 [ 0 89]]


## Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = LogisticRegression(max_iter=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9944444444444445

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.99      1.00      0.99        89

    accuracy                           0.99       180
   macro avg       0.99      0.99      0.99       180
weighted avg       0.99      0.99      0.99       180


Confusion Matrix:
 [[90  1]
 [ 0 89]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Decision Trees

In [99]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred_clf = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_clf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_clf))

Accuracy: 0.9944444444444445

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99        91
           1       0.99      1.00      0.99        89

    accuracy                           0.99       180
   macro avg       0.99      0.99      0.99       180
weighted avg       0.99      0.99      0.99       180


Confusion Matrix:
 [[90  1]
 [ 0 89]]


In [81]:
request_test = {
    "request": {
            "url": "http://testphp.vulnweb.com/userinfo.php",
            "method": "POST",
            "headers": {
                "Content-Length": "52",
                "Host": "testphp.vulnweb.com",
                "Cache-Control": "max-age=0",
                "Origin": "http://testphp.vulnweb.com",
                "Content-Type": "application/x-www-form-urlencoded",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
                "Sec-GPC": "1",
                "Accept-Language": "en-US,en;q=0.5",
                "Referer": "http://testphp.vulnweb.com/login.php",
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "close"
            },
            "body": "uname=' OR '1' = 1 --&pass=fsdfsdf"
        }
}

In [100]:
url = request_test["request"]["url"]
method = request_test["request"]["method"]
headers = json.dumps(request_test["request"]["headers"])
body = request_test["request"]["body"]

features = {
    "method": 0 if method == "GET" else 1,
    "body_entropy": calculate_entropy(body),
    "header_entropy": calculate_entropy(headers),
    "number_of_special_chars": number_of_special_chars(headers),
    "body_special_chars": number_of_special_chars(body),
    "query_param_length": calculate_query_param_length(url),
    "path_depth": calculate_path_depth(url),
    "sql_injection_keywords_url": count_sql_keywords(url),
    "sql_injection_keywords_body": count_sql_keywords(body),
    "sql_injection_keywords_header": count_sql_keywords(headers)
}

features_df = pd.DataFrame([features])

In [101]:
features_df

Unnamed: 0,method,body_entropy,header_entropy,number_of_special_chars,body_special_chars,query_param_length,path_depth,sql_injection_keywords_url,sql_injection_keywords_body,sql_injection_keywords_header
0,1,5.118723,4.810381,21,12,0,1,3,5,18


In [102]:
prediction = clf.predict(features_df)
label = "SQL Injection" if prediction[0] == 1 else "Legitimate"

print(f"Prediction: {label}")

Prediction: SQL Injection


## Test Cases

In [106]:
test_requests = [
    {
        "request": {
            "url": "http://www.shopnplay.com/login.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "username=admin&password=1234' OR '1' = '1"
        }
    },
    {
        "request": {
            "url": "http://www.bookstore.com/account.php",
            "method": "GET",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "",
            "url": "http://www.bookstore.com/account.php?id=1' UNION SELECT null, username, password FROM users --"
        }
    },
    {
        "request": {
            "url": "http://www.repairshop.com/submit_request.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "issue=broken+screen&description=My+phone+screen+is+cracked&name=JohnDoe&email=johndoe%40example.com' AND SLEEP(5) --"
        }
    },
    {
        "request": {
            "url": "http://www.socialmedia.com/login.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "username=admin&password=1234' --"
        }
    },
    {
        "request": {
            "url": "http://www.travelagency.com/search.php",
            "method": "GET",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "",
            "url": "http://www.travelagency.com/search.php?destination=Paris' UNION SELECT null, username, password FROM users --"
        }
    },
    {
        "request": {
            "url": "http://www.hotelbooking.com/reservation.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "check_in=2025-05-01&check_out=2025-05-10&guest_name=JaneDoe&room_type=Deluxe"
        }
    },
    {
        "request": {
            "url": "http://www.newsportal.com/search.php",
            "method": "GET",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "",
            "url": "http://www.newsportal.com/search.php?q=technology' UNION SELECT null, username, password FROM users --"
        }
    },
    {
        "request": {
            "url": "http://www.contactus.com/submit_query.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "name=Alice&email=alice%40example.com&message=Please+help%21' OR 1=1; DROP TABLE messages --"
        }
    },
    {
        "request": {
            "url": "http://www.membershipportal.com/login.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "username=admin&password=1234' AND 1=1 --"
        }
    },
    {
        "request": {
            "url": "http://www.ecommerceplatform.com/checkout.php",
            "method": "POST",
            "headers": {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0"
            },
            "body": "product_id=15&quantity=2&coupon_code=DISCOUNT123' UNION SELECT username, password, email FROM users --"
        }
    }
]


for request_test in test_requests:
    url = request_test["request"]["url"]
    method = request_test["request"]["method"]
    headers = json.dumps(request_test["request"]["headers"])
    body = request_test["request"]["body"]

    features = {
        "method": 0 if method == "GET" else 1,
        "body_entropy": calculate_entropy(body),
        "header_entropy": calculate_entropy(headers),
        "number_of_special_chars": number_of_special_chars(headers),
        "body_special_chars": number_of_special_chars(body),
        "query_param_length": calculate_query_param_length(url),
        "path_depth": calculate_path_depth(url),
        "sql_injection_keywords_url": count_sql_keywords(url),
        "sql_injection_keywords_body": count_sql_keywords(body),
        "sql_injection_keywords_header": count_sql_keywords(headers)
    }

    features_df = pd.DataFrame([features])
    prediction = randomforest_classifier.predict(features_df)
    label = "SQL Injection" if prediction[0] == 1 else "Legitimate"
    print(f"URL: {url}\nPrediction: {label}\n")

URL: http://www.shopnplay.com/login.php
Prediction: SQL Injection

URL: http://www.bookstore.com/account.php?id=1' UNION SELECT null, username, password FROM users --
Prediction: Legitimate

URL: http://www.repairshop.com/submit_request.php
Prediction: SQL Injection

URL: http://www.socialmedia.com/login.php
Prediction: SQL Injection

URL: http://www.travelagency.com/search.php?destination=Paris' UNION SELECT null, username, password FROM users --
Prediction: Legitimate

URL: http://www.hotelbooking.com/reservation.php
Prediction: SQL Injection

URL: http://www.newsportal.com/search.php?q=technology' UNION SELECT null, username, password FROM users --
Prediction: Legitimate

URL: http://www.contactus.com/submit_query.php
Prediction: SQL Injection

URL: http://www.membershipportal.com/login.php
Prediction: SQL Injection

URL: http://www.ecommerceplatform.com/checkout.php
Prediction: SQL Injection



Unnamed: 0,method,body_entropy,header_entropy,number_of_special_chars,body_special_chars,query_param_length,path_depth,sql_injection_keywords_url,sql_injection_keywords_body,sql_injection_keywords_header
0,1,5.118723,4.810381,21,12,0,1,3,5,18
