In [2]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.feature_extraction.text import HashingVectorizer

import numpy as np
import pandas as pd

In [3]:
df1 = pd.read_csv('./dataset/sql_payloads.csv')
df2 = pd.read_csv('./dataset/normal_request.csv')
df3 = pd.read_csv('./dataset/bad_request.csv')
df4 = pd.read_csv('./dataset/clean_sql_dataset.csv')
df4.rename(columns={'Query': 'payload', 'Label': 'label'}, inplace=True)

In [4]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.to_csv('combined_dataset.csv')

In [4]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150568 entries, 0 to 151253
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   payload  150568 non-null  object
 1   label    150568 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.4+ MB


In [5]:
X = df['payload']
y = df['label']

In [6]:
def preprocess_text(text):

    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [7]:
X_processed = [preprocess_text(payload) for payload in X]
vectorizer = HashingVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    n_features=2**20,
    dtype=np.float32
)
X_features = vectorizer.transform(X_processed)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)

In [8]:
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42
)
random_forest_model.fit(X_train, y_train)

In [66]:
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)

In [67]:
y_pred = random_forest_model.predict(X_test)
accuracy_random_forest = accuracy_score(y_test, y_pred)
scores_random_forest = cross_val_score(decision_tree_model, X_features, y, cv=5, scoring='f1')

print("F1 scores:", scores_random_forest)
print("Mean F1 score:", scores_random_forest.mean())
print("Accuracy Score:", accuracy_random_forest)

F1 scores: [0.95638639 0.97295947 0.90366957 0.93717612 0.92359335]
Mean F1 score: 0.9387569817598329
Accuracy Score: 0.9772767817322273


In [68]:
y_pred_tree = decision_tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_test, y_pred)
scores_tree = cross_val_score(decision_tree_model, X_features, y, cv=5, scoring='f1')

print("Decision Tree Classifier F1 scores:", scores_tree)
print("Decision Tree Classifier Mean F1 score:", scores_tree.mean())
print("Decision Tree Classifier Accuracy Score:", accuracy_tree)

Decision Tree Classifier F1 scores: [0.9601894  0.97340608 0.91836267 0.92607029 0.92581109]
Decision Tree Classifier Mean F1 score: 0.940767904413953
Decision Tree Classifier Accuracy Score: 0.9772767817322273


In [78]:
import joblib
joblib.dump(random_forest_model, 'random_forest_waf.pkl')
joblib.dump(decision_tree_model, 'decision_tree_waf.pkl')

['decision_tree_waf.pkl']

In [79]:
random_forest_waf = joblib.load('random_forest_waf.pkl')
decision_tree_waf = joblib.load('decision_tree_waf.pkl')

In [88]:
new__training_df

Unnamed: 0,payload,label
0,POST /contact HTTP/1.0,0
1,GET /profile HTTP/2,0
2,DELETE /home HTTP/1.1,0
3,PUT /search HTTP/1.1,0
4,GET /api/data HTTP/1.0,0
...,...,...
497,PATCH /user/subscribe_newsletter HTTP/1.1\nCon...,0
498,PATCH /notification/preferences HTTP/1.1\nCont...,0
499,PATCH /payment/update HTTP/1.1\nContent-Type: ...,0
500,PATCH /product/update_details HTTP/1.1\nConten...,0


In [89]:
X = new__training_df['payload']
y = new__training_df['label']

In [90]:
X_processed = [preprocess_text(payload) for payload in X]
X_features = vectorizer.transform(X_processed)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)

In [92]:
random_forest_waf.fit(X_train, y_train)

In [93]:
decision_tree_waf.fit(X_train, y_train)

In [95]:
joblib.dump(random_forest_waf, 'random_forest_waf_v1.pkl')
joblib.dump(decision_tree_waf, 'decision_tree_waf_v1.pkl')

['decision_tree_waf_v1.pkl']

In [96]:
random_forest_waf = joblib.load('random_forest_waf.pkl')
decision_tree_waf = joblib.load('decision_tree_waf.pkl')

new__training_df = pd.read_csv('./dataset/bad_request.csv')

In [98]:
X = new__training_df['payload']
y = new__training_df['label']

X_processed = [preprocess_text(payload) for payload in X]
X_features = vectorizer.transform(X_processed)
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.3, random_state=42)

decision_tree_waf.fit(X_train, y_train)
random_forest_waf.fit(X_train, y_train)

In [112]:
payloads = [
    "GET /search?product=apple' AND 1=1 -- HTTP/1.1",
    "GET /search?product=apple' AND 1=2 -- HTTP/1.1",
    "GET /search?product=banana' AND 1=1 -- HTTP/1.1",
    "GET /search?product=banana' AND 1=2 -- HTTP/1.1",
    "GET /search?product=tv' AND 1=1 -- HTTP/1.1",
    "GET /search?product=tv' AND 1=2 -- HTTP/1.1",
    "GET /search?product=monitor' AND 1=1 -- HTTP/1.1",
    "GET /search?product=monitor' AND 1=2 -- HTTP/1.1",
    "GET /search?product=laptop' AND 1=1 -- HTTP/1.1",
    "GET /search?product=laptop' AND 1=2 -- HTTP/1.1",
    "GET /search?product=smartphone' AND 1=1 -- HTTP/1.1",
    "GET /search?product=smartphone' AND 1=2 -- HTTP/1.1",
    "GET /search?product=tablet' AND 1=1 -- HTTP/1.1",
    "GET /search?product=tablet' AND 1=2 -- HTTP/1.1",
    "GET /search?product=watch' AND 1=1 -- HTTP/1.1",
    "GET /search?product=watch' AND 1=2 -- HTTP/1.1",
    "GET /search?product=car' AND 1=1 -- HTTP/1.1",
    "GET /search?product=car' AND 1=2 -- HTTP/1.1",
    "GET /search?product=bike' AND 1=1 -- HTTP/1.1",
    "GET /search?product=bike' AND 1=2 -- HTTP/1.1",
    "GET /search?product=printer' AND 1=1 -- HTTP/1.1",
    "GET /search?product=printer' AND 1=2 -- HTTP/1.1",
    "GET /search?product=keyboard' AND 1=1 -- HTTP/1.1",
    "GET /search?product=keyboard' AND 1=2 -- HTTP/1.1",
    "GET /search?product=mouse' AND 1=1 -- HTTP/1.1",
    "GET /search?product=mouse' AND 1=2 -- HTTP/1.1",
    "GET /search?product=monitor' AND 1=1 -- HTTP/1.1",
    "GET /search?product=monitor' AND 1=2 -- HTTP/1.1",
    "GET /search?product=tv' AND 1=1 -- HTTP/1.1",
    "GET /search?product=tv' AND 1=2 -- HTTP/1.1",
    "GET /product?item=watch' UNION SELECT username, password FROM users -- HTTP/1.1",
    "GET /product?item=shoes' UNION ALL SELECT null, database(), null -- HTTP/1.1",
    "GET /product?item=shirt' UNION SELECT null, username, password FROM users -- HTTP/1.1",
    "GET /product?item=shoes' UNION SELECT null, table_name, null FROM information_schema.tables -- HTTP/1.1",
    "GET /product?item=hat' UNION SELECT null, column_name, null FROM information_schema.columns -- HTTP/1.1",
    "GET /product?item=pants' UNION SELECT null, version(), null -- HTTP/1.1",
    "GET /product?item=socks' UNION SELECT null, user(), null -- HTTP/1.1",
    "GET /product?item=glasses' UNION SELECT null, group_concat(username), null FROM users -- HTTP/1.1",
    "GET /product?item=watch' UNION SELECT null, password, null FROM users -- HTTP/1.1",
    "GET /product?item=earrings' UNION SELECT null, concat(username, ':', password), null FROM users -- HTTP/1.1",
    "GET /product?item=gloves' UNION SELECT null, database(), null -- HTTP/1.1",
    "GET /product?item=belt' UNION SELECT null, schema_name, null FROM information_schema.schemata -- HTTP/1.1",
    "GET /product?item=jacket' UNION SELECT null, host(), null -- HTTP/1.1",
    "GET /product?item=jeans' UNION SELECT null, @@version, null -- HTTP/1.1",
    "GET /product?item=sneakers' UNION SELECT null, null, null -- HTTP/1.1",
    "GET /product?item=sandals' UNION SELECT null, null, null -- HTTP/1.1",
    "GET /product?item=slippers' UNION SELECT null, table_name, null FROM information_schema.tables -- HTTP/1.1",
    "GET /product?item=boots' UNION SELECT null, column_name, null FROM information_schema.columns -- HTTP/1.1",
    "GET /product?item=cap' UNION SELECT null, username, null FROM users -- HTTP/1.1",
    "GET /product?item=beanie' UNION SELECT null, password, null FROM users -- HTTP/1.1",
    "GET /product?item=bag' UNION SELECT null, version(), null -- HTTP/1.1",
    "GET /product?item=wallet' UNION SELECT null, database(), null -- HTTP/1.1",
    "GET /product?item=sunglasses' UNION SELECT null, schema_name, null FROM information_schema.schemata -- HTTP/1.1",
    "GET /product?item=hat' UNION SELECT null, user(), null -- HTTP/1.1",
    "GET /product?item=scarves' UNION SELECT null, group_concat(username), null FROM users -- HTTP/1.1",
    "GET /product?item=bracelet' UNION SELECT null, group_concat(password), null FROM users -- HTTP/1.1",
    "GET /product?item=necklace' UNION SELECT null, concat(username, ':', password), null FROM users -- HTTP/1.1",
    "GET /product?item=wallet' UNION SELECT null, group_concat(table_name), null FROM information_schema.tables -- HTTP/1.1",
    "GET /user?name=admin' AND 1=1 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=2 -- HTTP/1.1",
    "GET /user?name=guest' AND 1=1 -- HTTP/1.1",
    "GET /user?name=guest' AND 1=2 -- HTTP/1.1",
    "GET /user?name=test' AND 1=1 -- HTTP/1.1",
    "GET /user?name=test' AND 1=2 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=3 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=4 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=5 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=6 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=7 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=8 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=9 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=10 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=11 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=12 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=13 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=14 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=15 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=16 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=17 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=18 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=19 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=20 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=21 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=22 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=23 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=24 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=25 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=26 -- HTTP/1.1",
    "GET /user?name=admin' AND 1=27 -- HTTP/1.1",
    "GET /product?id=10' WAITFOR DELAY '0:0:5' -- HTTP/1.1",
    "GET /product?id=10' OR IF(1=1, SLEEP(5), 0) -- HTTP/1.1",
    "GET /product?id=20' WAITFOR DELAY '0:0:5' -- HTTP/1.1",
    "GET /product?id=20' OR IF(1=2, SLEEP(5), 0) -- HTTP/1.1",
    "GET /product?id=30' WAITFOR DELAY '0:0:10' -- HTTP/1.1",
    "GET /product?id=30' OR IF(1=1, SLEEP(10), 0) -- HTTP/1.1",
    "GET /product?id=40' WAITFOR DELAY '0:0:15' -- HTTP/1.1",
    "GET /product?id=40' OR IF(1=2, SLEEP(15), 0) -- HTTP/1.1",
    "GET /product?id=50' WAITFOR DELAY '0:0:20' -- HTTP/1.1",
    "GET /product?id=50' OR IF(1=1, SLEEP(20), 0) -- HTTP/1.1",
    "GET /product?id=60' WAITFOR DELAY '0:0:30' -- HTTP/1.1",
    "GET /product?id=60' OR IF(1=1, SLEEP(30), 0) -- HTTP/1.1",
    "GET /product?id=70' WAITFOR DELAY '0:0:60' -- HTTP/1.1",
    "GET /product?id=70' OR IF(1=1, SLEEP(60), 0) -- HTTP/1.1",
    "GET /product?id=80' WAITFOR DELAY '0:0:90' -- HTTP/1.1",
    "GET /product?id=80' OR IF(1=1, SLEEP(90), 0) -- HTTP/1.1",
    "GET /product?id=90' WAITFOR DELAY '0:0:120' -- HTTP/1.1",
    "GET /product?id=90' OR IF(1=1, SLEEP(120), 0) -- HTTP/1.1",
    "GET /product?id=100' WAITFOR DELAY '0:0:150' -- HTTP/1.1",
    "GET /product?id=100' OR IF(1=1, SLEEP(150), 0) -- HTTP/1.1",
    "GET /product?id=110' WAITFOR DELAY '0:0:180' -- HTTP/1.1",
    "GET /product?id=110' OR IF(1=1, SLEEP(180), 0) -- HTTP/1.1",
    "GET /product?id=120' WAITFOR DELAY '0:0:210' -- HTTP/1.1",
    "GET /product?id=120' OR IF(1=1, SLEEP(210), 0) -- HTTP/1.1",
    "GET /product?id=130' WAITFOR DELAY '0:0:240' -- HTTP/1.1",
    "GET /product?id=130' OR IF(1=1, SLEEP(240), 0) -- HTTP/1.1",
    "GET /product?id=140' WAITFOR DELAY '0:0:300' -- HTTP/1.1",
    "GET /product?id=140' OR IF(1=1, SLEEP(300), 0) -- HTTP/1.1",
    "GET /product?id=110 HTTP/1.1",
    "GET /product?id=120 HTTP/1.1",
    "GET /search?query=laptop&category=electronics HTTP/1.1",
    "GET /search?query=tv&category=home-theater HTTP/1.1",
    "GET /product?id=130 HTTP/1.1",
    "GET /category?name=clothing&sort=price_asc HTTP/1.1",
    "GET /product?id=140&category=home-garden HTTP/1.1",
    "GET /product?id=150 HTTP/1.1",
    "GET /product?id=160&color=red HTTP/1.1",
    "GET /search?query=shoes&color=black HTTP/1.1",
    "GET /product?id=170&size=large HTTP/1.1",
    "GET /search?query=tablet&brand=apple HTTP/1.1",
    "GET /account/settings HTTP/1.1",
    "GET /transactions?type=credit HTTP/1.1",
    "GET /account/balance HTTP/1.1",
    "GET /category/electronics?brand=samsung HTTP/1.1",
    "GET /order/history HTTP/1.1",
    "GET /product?id=180&quantity=3 HTTP/1.1",
    "GET /order?id=1002 HTTP/1.1",
    "GET /wishlist?id=250 HTTP/1.1",
    "GET /product?id=190&discount=true HTTP/1.1",
    "GET /account/login HTTP/1.1",
    "GET /cart/view HTTP/1.1",
]

In [113]:
for request in normal_requests:
    request_processed = preprocess_text(request)
    prediction = random_forest_waf.predict(vectorizer.transform([request_processed]))
    print("Block Request" if prediction == 1 else "Allow Request")

Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
Block Request
