In [1]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.feature_extraction.text import HashingVectorizer

import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./dataset/clean_sql_dataset.csv')

In [3]:
df.head(50)

Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
5,select name from syscolumns where id = ...,1
6,select * from users where id = 1 +$+ or 1 =...,1
7,"1; ( load_file ( char ( 47,101,116,99,47...",1
8,select * from users where id = '1' or ||/1 ...,1
9,select * from users where id = '1' or \.<\ ...,1


In [4]:
df.rename(columns={'Query': 'payload', 'Label': 'label'}, inplace=True)
df['type'] = np.where(df['label'] == 1, "SQL Injection", "Normal")

In [5]:
X = df['payload']
y = df['label']

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [7]:
X_processed = [preprocess_text(payload) for payload in X]
vectorizer = HashingVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    n_features=2**20,
    dtype=np.float32
)
X_features = vectorizer.transform(X_processed)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)

In [8]:
random_forest_model = RandomForestClassifier(
    n_estimators=50,
    max_samples=0.5,
    max_features='sqrt',
    max_depth=10,
    n_jobs=-1,
    random_state=42
)
random_forest_model.fit(X_train, y_train)

In [13]:
decision_tree_model = DecisionTreeClassifier(max_features="sqrt")
decision_tree_model.fit(X_train, y_train)

In [15]:
y_pred = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_pred)
scores_random_forest = cross_val_score(decision_tree_model, X_features, y, cv=5, scoring='f1')

print("F1 scores:", scores_random_forest)
print("Mean F1 score:", scores_random_forest.mean())
print("Accuracy Score:", accuracy_random_forest)

F1 scores: [0.96413217 0.97191898 0.89890132 0.92727509 0.94229837]
Mean F1 score: 0.9409051860442478
Accuracy Score: 0.8024405591262529


In [30]:
y_pred_tree = decision_tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred)
scores_tree = cross_val_score(decision_tree_model, X_features, y, cv=5, scoring='f1')

print("Decision Tree Classifier F1 scores:", scores_tree)
print("Decision Tree Classifier Mean F1 score:", scores_tree.mean())
print("Decision Tree Classifier Accuracy Score:", accuracy_tree)

Decision Tree Classifier F1 scores: [0.95798162 0.97479734 0.96703585 0.98294086 0.97569522]
Decision Tree Classifier Mean F1 score: 0.9716901764346838
Decision Tree Classifier Accuracy Score: 0.9807407074475257


In [70]:
sql_injection_payloads = [
    "sleep(5)#",
    "1 or sleep(5)#",
    '" or sleep(5)#',
    "' or sleep(5)#",
    '" or sleep(5)="',
    "' or sleep(5)='",
    "1) or sleep(5)#",
    ") or sleep(5)=\"",
    "') or sleep(5)='",
    "1)) or sleep(5)#",
    ")) or sleep(5)=\"",
    "')) or sleep(5)='",
    ";waitfor delay '0:0:5'--",
    ");waitfor delay '0:0:5'--",
    ";'waitfor delay '0:0:5'--",
    '";waitfor delay \'0:0:5\'--',
    "');waitfor delay '0:0:5'--",
    "\");waitfor delay '0:0:5'--",
    "));waitfor delay '0:0:5'--",
    "'));waitfor delay '0:0:5'--",
    "\"));waitfor delay '0:0:5'--",
    "benchmark(10000000,MD5(1))#",
    "1 or benchmark(10000000,MD5(1))#",
    '" or benchmark(10000000,MD5(1))#',
    "' or benchmark(10000000,MD5(1))#",
    "1) or benchmark(10000000,MD5(1))#",
    ") or benchmark(10000000,MD5(1))#",
    "') or benchmark(10000000,MD5(1))#",
    "1)) or benchmark(10000000,MD5(1))#",
    ")) or benchmark(10000000,MD5(1))#",
    "')) or benchmark(10000000,MD5(1))#",
    "pg_sleep(5)--",
    "1 or pg_sleep(5)--",
    '" or pg_sleep(5)--',
    "' or pg_sleep(5)--",
    "1) or pg_sleep(5)--",
    ") or pg_sleep(5)--",
    "') or pg_sleep(5)--",
    "1)) or pg_sleep(5)--",
    ")) or pg_sleep(5)--",
    "')) or pg_sleep(5)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))bAKL) AND 'vRxe'='vRxe",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))YjoC) AND '%'='",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)#",
    "SLEEP(5)#",
    "SLEEP(5)--",
    "SLEEP(5)=\"",
    "SLEEP(5)='",
    "or SLEEP(5)",
    "or SLEEP(5)#",
    "or SLEEP(5)--",
    "or SLEEP(5)=\"",
    "or SLEEP(5)='",
    "waitfor delay '00:00:05'",
    "waitfor delay '00:00:05'--",
    "waitfor delay '00:00:05'#",
    "benchmark(50000000,MD5(1))",
    "benchmark(50000000,MD5(1))--",
    "benchmark(50000000,MD5(1))#",
    "or benchmark(50000000,MD5(1))",
    "or benchmark(50000000,MD5(1))--",
    "or benchmark(50000000,MD5(1))#",
    "pg_SLEEP(5)",
    "pg_SLEEP(5)--",
    "pg_SLEEP(5)#",
    "or pg_SLEEP(5)",
    "or pg_SLEEP(5)--",
    "or pg_SLEEP(5)#",
    "'\"",
    "AnD SLEEP(5)",
    "AnD SLEEP(5)--",
    "AnD SLEEP(5)#",
    "&&SLEEP(5)",
    "&&SLEEP(5)--",
    "&&SLEEP(5)#",
    "' AnD SLEEP(5) ANd '1",
    "'&&SLEEP(5)&&'1",
    "ORDER BY SLEEP(5)",
    "ORDER BY SLEEP(5)--",
    "ORDER BY SLEEP(5)#",
    "sleep(5)#",
    "1 or sleep(5)#",
    '" or sleep(5)#',
    "' or sleep(5)#",
    '" or sleep(5)="',
    "' or sleep(5)='",
    "1) or sleep(5)#",
    ") or sleep(5)=\"",
    "') or sleep(5)='",
    "1)) or sleep(5)#",
    ")) or sleep(5)=\"",
    "')) or sleep(5)='",
    ";waitfor delay '0:0:5'--",
    ");waitfor delay '0:0:5'--",
    ";'waitfor delay '0:0:5'--",
    '";waitfor delay \'0:0:5\'--',
    "');waitfor delay '0:0:5'--",
    "\");waitfor delay '0:0:5'--",
    "));waitfor delay '0:0:5'--",
    "'));waitfor delay '0:0:5'--",
    "\"));waitfor delay '0:0:5'--",
    "benchmark(10000000,MD5(1))#",
    "1 or benchmark(10000000,MD5(1))#",
    '" or benchmark(10000000,MD5(1))#',
    "' or benchmark(10000000,MD5(1))#",
    "1) or benchmark(10000000,MD5(1))#",
    ") or benchmark(10000000,MD5(1))#",
    "') or benchmark(10000000,MD5(1))#",
    "1)) or benchmark(10000000,MD5(1))#",
    ")) or benchmark(10000000,MD5(1))#",
    "')) or benchmark(10000000,MD5(1))#",
    "pg_sleep(5)--",
    "1 or pg_sleep(5)--",
    '" or pg_sleep(5)--',
    "' or pg_sleep(5)--",
    "1) or pg_sleep(5)--",
    ") or pg_sleep(5)--",
    "') or pg_sleep(5)--",
    "1)) or pg_sleep(5)--",
    ")) or pg_sleep(5)--",
    "')) or pg_sleep(5)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))bAKL) AND 'vRxe'='vRxe",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))YjoC) AND '%'='",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)#",
    "SLEEP(5)#",
    "SLEEP(5)--",
    "SLEEP(5)=\"",
    "SLEEP(5)='",
    "or SLEEP(5)",
    "or SLEEP(5)#",
    "or SLEEP(5)--",
    "or SLEEP(5)=\"",
    "or SLEEP(5)='",
    "waitfor delay '00:00:05'",
    "waitfor delay '00:00:05'--",
    "waitfor delay '00:00:05'#",
    "sleep(5)#",
    "1 or sleep(5)#",
    '" or sleep(5)#',
    "' or sleep(5)#",
    '" or sleep(5)="',
    "' or sleep(5)='",
    "1) or sleep(5)#",
    ") or sleep(5)=\"",
    "') or sleep(5)='",
    "1)) or sleep(5)#",
    ")) or sleep(5)=\"",
    "')) or sleep(5)='",
    ";waitfor delay '0:0:5'--",
    ");waitfor delay '0:0:5'--",
    ";'waitfor delay '0:0:5'--",
    '";waitfor delay \'0:0:5\'--',
    "');waitfor delay '0:0:5'--",
    "\");waitfor delay '0:0:5'--",
    "));waitfor delay '0:0:5'--",
    "'));waitfor delay '0:0:5'--",
    "\"));waitfor delay '0:0:5'--",
    "benchmark(10000000,MD5(1))#",
    "1 or benchmark(10000000,MD5(1))#",
    '" or benchmark(10000000,MD5(1))#',
    "' or benchmark(10000000,MD5(1))#",
    "1) or benchmark(10000000,MD5(1))#",
    ") or benchmark(10000000,MD5(1))#",
    "') or benchmark(10000000,MD5(1))#",
    "1)) or benchmark(10000000,MD5(1))#",
    ")) or benchmark(10000000,MD5(1))#",
    "')) or benchmark(10000000,MD5(1))#",
    "pg_sleep(5)--",
    "1 or pg_sleep(5)--",
    '" or pg_sleep(5)--',
    "' or pg_sleep(5)--",
    "1) or pg_sleep(5)--",
    ") or pg_sleep(5)--",
    "') or pg_sleep(5)--",
    "1)) or pg_sleep(5)--",
    ")) or pg_sleep(5)--",
    "')) or pg_sleep(5)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))bAKL) AND 'vRxe'='vRxe",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))YjoC) AND '%'='",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)--",
    "AND (SELECT * FROM (SELECT(SLEEP(5)))nQIP)#",
    "SLEEP(5)#",
    "SLEEP(5)--",
    "SLEEP(5)=\"",
    "SLEEP(5)='",
    "or SLEEP(5)",
    "or SLEEP(5)#",
    "or SLEEP(5)--",
    "or SLEEP(5)=\"",
    "or SLEEP(5)='",
    "waitfor delay '00:00:05'",
    "waitfor delay '00:00:05'--",
    "waitfor delay '00:00:05'#",
    "benchmark(50000000,MD5(1))",
    "benchmark(50000000,MD5(1))--",
    "benchmark(50000000,MD5(1))#",
    "or benchmark(50000000,MD5(1))",
    "or benchmark(50000000,MD5(1))--",
    "or benchmark(50000000,MD5(1))#",
    "pg_SLEEP(5)",
    "pg_SLEEP(5)--",
    "pg_SLEEP(5)#",
    "or pg_SLEEP(5)",
    "or pg_SLEEP(5)--",
    "or pg_SLEEP(5)#",
    "'\"",
    "AnD SLEEP(5)",
    "AnD SLEEP(5)--",
    "AnD SLEEP(5)#",
    "&&SLEEP(5)",
    "&&SLEEP(5)--",
    "&&SLEEP(5)#",
    "' AnD SLEEP(5) ANd '1",
    "'&&SLEEP(5)&&'1",
    "ORDER BY SLEEP(5)",
    "ORDER BY SLEEP(5)--",
    "ORDER BY SLEEP(5)#",
    "UNION ALL SELECT 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27--"
]

In [71]:
print("SQL Injection Payload Count", len(sql_injection_payloads))
for new_request in sql_injection_payloads:
    new_request_processed = preprocess_text(new_request)
    prediction = random_forest_model.predict(vectorizer.transform([new_request_processed]))
    print("Block Request" if prediction == 1 else "Allow Request")

SQL Injection Payload Count 223
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Allow Request
Allow Request
Allow Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Bl

In [72]:
normal_requests = [
    "GET / HTTP/1.1",
    "GET /index.html HTTP/1.1",
    "POST /login HTTP/1.1",
    "GET /about-us HTTP/1.1",
    "GET /contact-us HTTP/1.1",
    "GET /products HTTP/1.1",
    "POST /register HTTP/1.1",
    "GET /faqs HTTP/1.1",
    "GET /support HTTP/1.1",
    "GET /terms-of-service HTTP/1.1",
    "GET /privacy-policy HTTP/1.1",
    "GET /sitemap.xml HTTP/1.1",
    "GET /robots.txt HTTP/1.1",
    "POST /search HTTP/1.1",
    "GET /category/electronics HTTP/1.1",
    "GET /category/clothing HTTP/1.1",
    "GET /product/123 HTTP/1.1",
    "POST /cart/add HTTP/1.1",
    "GET /cart/view HTTP/1.1",
    "GET /checkout HTTP/1.1",
    "POST /payment/process HTTP/1.1",
    "GET /?page=1 HTTP/1.1",
    "GET /?category=electronics&sort=price HTTP/1.1",
    "GET /?search=apple&sort=relevance HTTP/1.1",
    "GET /?id=123&name=john HTTP/1.1",
    "GET /?year=2022&month=12 HTTP/1.1",
    "GET /?limit=10&offset=20 HTTP/1.1",
    "GET /?filter=price&order=asc HTTP/1.1",
    "GET /?lang=en&country=us HTTP/1.1",
    "GET /?user=john&password=hello HTTP/1.1",
    "GET /?token=1234567890 HTTP/1.1",
    "GET /?redirect=/home HTTP/1.1",
    "GET /?status=success HTTP/1.1",
    "GET /?error=404 HTTP/1.1",
    "GET /?message=hello+world HTTP/1.1",
    "GET /?format=json HTTP/1.1",
    "GET /?callback=myFunction HTTP/1.1",
    "GET /?q=hello+world HTTP/1.1",
    "GET /?query=select+*+from+users HTTP/1.1",
    "GET /?sort=name&order=desc HTTP/1.1",
    "GET /?filter=category&value=electronics HTTP/1.1",
    "GET /?search=apple&sort=price&order=asc HTTP/1.1",
    "GET /?lang=en&country=us&currency=usd HTTP/1.1",
    "GET /?user=john&password=hello&token=1234567890 HTTP/1.1",
    "POST /login HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nusername=john&password=hello",
    "POST /register HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nname=John+Doe&email=john@example.com&password=hello",
    "POST /contact HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nname=John+Doe&email=john@example.com&message=Hello+World",
    "POST /search HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nq=hello+world",
    "POST /create HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\ntitle=Hello+World&content=This+is+a+test",
    "POST /update HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nid=123&title=Hello+World&content=This+is+a+test",
    "POST /delete HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nid=123",
    "POST /upload HTTP/1.1\nContent-Type: multipart/form-data\nfile=@/path/to/file.txt",
    "POST /forgot-password HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nemail=john@example.com",
    "POST /reset-password HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\ntoken=1234567890&password=newpassword",
    "POST /change-password HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\noldpassword=oldpassword&newpassword=newpassword",
    "POST /add-to-cart HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nproduct_id=123&quantity=2",
    "POST /remove-from-cart HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nproduct_id=123",
    "POST /checkout HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\npayment_method=paypal&shipping_address=123+Main+St",
    "POST /payment HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\npayment_method=paypal&amount=10.99",
    "POST /refund HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\norder_id=123&reason=defective",
    "POST /review HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\nproduct_id=123&rating=5&review=Great+product",
    "POST /comment HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\npost_id=123&comment=Great+post",
    "POST /vote HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\npost_id=123&vote=up",
    "POST /flag HTTP/1.1\nContent-Type: application/x-www-form-urlencoded\npost_id=123&reason=offensive"
]

In [74]:
print("Normal Request Count: ",len(normal_requests))
for new_request in normal_requests:
    new_request_processed = preprocess_text(new_request)
    prediction = random_forest_model.predict(vectorizer.transform([new_request_processed]))
    print("Block Request" if prediction == 1 else "Allow Request")

Normal Request Count:  64
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Block Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Block Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
Allow Request
