## SVM / TFIDF with NGrams

In [1]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, average_precision_score, f1_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight

# Default dataset location when the notebook runs outside Google Colab.
STORAGE_PATH = './datasets/raw'
try:
    from google.colab import drive
except ImportError:
    # Only a failed import means "not on Colab". Anything else (a failed
    # drive.mount, a KeyboardInterrupt) must surface instead of silently
    # falling back to the local path.
    print('Not running on Google Colab')
else:
    # On Colab the dataset is read from the mounted Google Drive.
    drive.mount('/content/drive')
    STORAGE_PATH = '/content/drive/MyDrive/xss_sqli_detector/datasets'

XSS_SQLI_CONDENSED_WITH_FEATURES_DATASET_PATH = f'{STORAGE_PATH}/xss_sqli_condensed_with_features.csv'

Not running on Google Colab


In [2]:
# Load the CSV at the configured dataset path into a DataFrame.
df = pd.read_csv(filepath_or_buffer=XSS_SQLI_CONDENSED_WITH_FEATURES_DATASET_PATH)

In [None]:
# Features are the raw payload strings; the target is the `label_type` column.
# NOTE(review): astype(str) turns missing payloads into the literal string
# "nan" — confirm the CSV has no null payloads, or drop them before this step.
X = df["payload"].astype(str).values
y = df["label_type"].values
num_classes = len(np.unique(y))

# One seed shared by the split and the solver, so that Restart & Run All
# reproduces the same fitted model and report.
RANDOM_STATE = 42

# Stratified 80/20 hold-out split (class proportions preserved in both parts).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE
)

# Character n-gram TF-IDF (2..8 chars, terms seen in at least 4 documents)
# feeding a class-balanced linear SVM.
svm_pipe = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(2, 8), min_df=4, sublinear_tf=True)),
    # random_state pins the solver's pseudo-random data shuffling (used by the
    # dual coordinate-descent solver); without it repeated runs may differ.
    ("svm", LinearSVC(C=1.0, class_weight="balanced", verbose=2, max_iter=2000,
                      random_state=RANDOM_STATE)),
])

svm_pipe.fit(X_train, y_train)
y_pred = svm_pipe.predict(X_test)
print("SVM report:\n", classification_report(y_test, y_pred, digits=3))


In [16]:
# Sanity check: a long Spanish/HTML snippet that the label below expects the
# fitted pipeline to classify as benign.
benign_sample = """Hola! Me da gusto saber de ti y de todos los siguientes: Juan, Paco, Pedro aka. Paquito y el t0x1c en l33tc0d3 '%%%%%.href=google.com <li><a href="/wiki/File:Socrates.png" class="image"><img alt="Socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/27px-Socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/36px-Socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/Portal:Philosophy" title="Portal:Philosophy">Philosophy&#32;portal </a> </li> </ul> """
benign_prediction = svm_pipe.predict([benign_sample])
print("This is benign", benign_prediction)


This is benign [0]


In [15]:
# Sanity check: a short event-handler payload (onblur=alert(1)) that the label
# below expects the fitted pipeline to flag as malicious.
malicious_sample = "<a onblur=alert(1) tabindex=1 id=x></a><input autofocus>"
malicious_prediction = svm_pipe.predict([malicious_sample])
print("This is malicious", malicious_prediction)

This is malicious [1]


In [17]:
# Sanity check: the same long benign text as the earlier benign check, with a
# drag-and-drop snippet containing `ondrop=alert(1)` appended at the end.
# NOTE(review): the recorded output for this cell is [0], the same value the
# plain benign sample gets, so the appended handler is not detected here.
# NOTE(review): the appended `div` has no opening `<` — confirm whether that
# is intentional before drawing conclusions from this sample.
mixed_sample = """Hola! Me da gusto saber de ti y de todos los siguientes: Juan, Paco, Pedro aka. Paquito y el t0x1c en l33tc0d3 '%%%%%.href=google.com <li><a href="/wiki/File:Socrates.png" class="image"><img alt="Socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/27px-Socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/36px-Socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/Portal:Philosophy" title="Portal:Philosophy">Philosophy&#32;portal </a> </li> </ul>  div draggable="true" contenteditable>drag me</div><sup ondrop=alert(1) contenteditable>drop here</sup>"""
mixed_prediction = svm_pipe.predict([mixed_sample])
print("This is benign???? (inserted some bad code)", mixed_prediction)

This is benign???? (inserted some bad code) [0]
