# First SVM XSS Demo

## Defining Imports

In [1]:
# Regular Expressions
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
# from sklearn import metrics
import m2cgen as m2c 
# To ignore any warnings
import warnings
warnings.filterwarnings("ignore")

## Defining Constants

In [2]:
# Constants: XSS Features

URL = '(http://)|(https://)'
SYMBOLS = "[<>,\'\"/]&?\{\}"
WORDS = "(alert)|(script)|(%3c)|(%3e)|(%20)|(onerror)|(onload)|(eval)|(src=)|(prompt)|(onclick)|(document)|(window)|(iframe)|(location)|(img)|(div)|(break)|(.js)"

## Defining Functions

In [3]:
# Extract Features

def get_features(url):
    is_url = 1 if re.search(URL, url, re.IGNORECASE) else 0
    xss_symbol_count = len(re.findall(SYMBOLS, url, re.IGNORECASE))
    xss_word_count = len(re.findall(WORDS, url, re.IGNORECASE))
    return [len(url), is_url, xss_symbol_count, xss_word_count]

def read_data(file, x, y, isXss):
    with open(file) as f:
        for line in f:
            x.append(get_features(line))
            if isXss: y.append(1)
            else: y.append(0)
    return x

In [4]:
# Evaluate model

def evaluate(name, y_test, y_pred):  
    """Calculate the main metrics supporting multi-target cases for a given model.  

    Args:
        name (str): The trained model's name
        y_test (series): Contains the ground truth values (aka y_true)
        y_pred (series): Contains the predicted values for the test set
        
    Returns:
        df_metrics (DataFrame): The predicted metrics in a DataFrame
        
    """
    dict_metrics = {
        'Accuracy Score': accuracy_score(y_test, y_pred),
        'Confusion Matrix': confusion_matrix(y_test, y_pred),
        'Precision Score': precision_score(y_test, y_pred),
        'Recall Score': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }
    #df_metrics = pd.DataFrame.from_dict
    df_metrics = pd.DataFrame.from_dict(dict_metrics, orient="index")
    df_metrics.columns = [name]
    return df_metrics

## Prepare Data - read and split

In [7]:
# 1. Create x and y to store datas
x = []
y = []

# 2. Read datas and get features: scrapy + github
# xf = XSSFeatures()
read_data('../../datasets/gitee/xss-200000.txt', x, y, 1)
read_data('../../datasets/gitee/good-xss-200000.txt', x, y, 0)

# 3. Split datas into test and train: 0.6 of data using for train
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.4, random_state=0)

## Classifiers and Tests

In [54]:
%%time
# Support Vector Classification Linear kernel
svm = SVC(kernel='linear')
y_pred_svm = svm.fit(x_train, y_train).predict(x_test)

# Evaluate SVM model
evaluate("SVM Linear", y_test, y_pred_svm)

CPU times: user 1min 20s, sys: 1.36 s, total: 1min 22s
Wall time: 1min 29s


Unnamed: 0,SVM Linear
Accuracy Score,0.983087
Confusion Matrix,"[[53931, 234], [792, 5707]]"
Precision Score,0.960613
Recall Score,0.878135
F1 Score,0.917524


In [60]:
%%time
# Support Vector Classification RBF kernel
svm_rbf = SVC(kernel='rbf')
y_pred_svm_rbf = svm_rbf.fit(x_train, y_train).predict(x_test)

# Evaluate SVM model
evaluate("SVM RBF", y_test, y_pred_svm_rbf)

CPU times: user 59.3 s, sys: 796 ms, total: 1min
Wall time: 1min 4s


Unnamed: 0,SVM RBF
Accuracy Score,0.982642
Confusion Matrix,"[[54100, 65], [988, 5511]]"
Precision Score,0.988343
Recall Score,0.847977
F1 Score,0.912795


In [47]:
%%time
# Support Vector Classification RBF kernel
svm_poly = SVC(kernel='poly', degree=8)
y_pred_svm_poly = svm_poly.fit(x_train, y_train).predict(x_test)

# Evaluate SVM model
evaluate("SVM Poly", y_test, y_pred_svm_poly)

CPU times: user 1min 23s, sys: 1.44 s, total: 1min 24s
Wall time: 1min 36s


Unnamed: 0,SVM Poly
Accuracy Score,0.945767
Confusion Matrix,"[[54158, 7], [3283, 3216]]"
Precision Score,0.997828
Recall Score,0.494845
F1 Score,0.661592


In [79]:
%%time
# Linear Regression
reg = LinearRegression()
y_pred_reg = reg.fit(x_train, y_train).predict(x_test)

# Evaluate
evaluate("Linear Regression", y_test, y_pred_svm)

CPU times: user 416 ms, sys: 23.2 ms, total: 439 ms
Wall time: 443 ms


Unnamed: 0,Linear Regression
Accuracy Score,0.983087
Confusion Matrix,"[[53931, 234], [792, 5707]]"
Precision Score,0.960613
Recall Score,0.878135
F1 Score,0.917524


In [98]:
%%time
# Naive Bayes Gaussian
gnb = GaussianNB()
y_pred_gnb = gnb.fit(x_train, y_train).predict(x_test)

# Evaluate
evaluate("NB Gaussian", y_test, y_pred_gnb)

CPU times: user 397 ms, sys: 13.7 ms, total: 411 ms
Wall time: 573 ms


Unnamed: 0,NB Gaussian
Accuracy Score,0.968136
Confusion Matrix,"[[53086, 1079], [854, 5645]]"
Precision Score,0.83953
Recall Score,0.868595
F1 Score,0.853815


In [13]:
# 通过网格交叉验证的方式，找出效果最好的超参数
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

#定义需要尝试的超参数组合
grid = {"n_neighbors":range(1,11,1),"weights":['uniform','distance']}
'''
estimator:对哪个模型调整超参数
param_grid 需要验证的超参数组合。
scoring 模型评估标准
n_jobs %%bash发数量
cv 交叉验证折数
verbose 输出冗余信息，值越大，输出信息越多
'''
gs = GridSearchCV(estimator=knn,param_grid=grid,scoring="accuracy",n_jobs=-1,cv=5,verbose=10,return_train_score=True)
gs.fit(x_train, y_train)

# 最好的分值
print(gs.best_score_)

#最好的超参数组合
print(gs.best_params_)

#使用最好的超参数训练好的模型
print(gs.best_estimator_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 3/5; 1/20] START n_neighbors=1, weights=uniform.............................
[CV 2/5; 1/20] START n_neighbors=1, weights=uniform.............................
[CV 4/5; 1/20] START n_neighbors=1, weights=uniform.............................
[CV 1/5; 1/20] START n_neighbors=1, weights=uniform.............................
[CV 3/5; 1/20] END n_neighbors=1, weights=uniform;, score=(train=0.986, test=0.984) total time=   4.9s
[CV 1/5; 1/20] END n_neighbors=1, weights=uniform;, score=(train=0.995, test=0.992) total time=   4.8s
[CV 2/5; 1/20] END n_neighbors=1, weights=uniform;, score=(train=0.994, test=0.993) total time=   4.8s
[CV 5/5; 1/20] START n_neighbors=1, weights=uniform.............................
[CV 4/5; 1/20] END n_neighbors=1, weights=uniform;, score=(train=0.986, test=0.986) total time=   4.8s
[CV 1/5; 2/20] START n_neighbors=1, weights=distance............................
[CV 2/5; 2/20] START n_neighbors=1, weig

In [118]:
%%time
# K-Nearest Neighbor
knn = KNeighborsClassifier(n_neighbors=10, weights='distance')
y_pred_knn = knn.fit(x_train, y_train).predict(x_test)

# Evaluate
evaluate("K-Nearest Neighbor", y_test, y_pred_knn)

CPU times: user 4.76 s, sys: 104 ms, total: 4.86 s
Wall time: 5.81 s


Unnamed: 0,K-Nearest Neighbor
Accuracy Score,0.994873
Confusion Matrix,"[[53985, 180], [131, 6368]]"
Precision Score,0.972511
Recall Score,0.979843
F1 Score,0.976163


In [138]:
%%time
# Decision Tree
dt = DecisionTreeClassifier(random_state=0)
y_pred_dt = dt.fit(x_train, y_train).predict(x_test)

# Evaluate
evaluate("Decision Tree", y_test, y_pred_dt)

CPU times: user 391 ms, sys: 13.6 ms, total: 405 ms
Wall time: 465 ms


Unnamed: 0,Decision Tree
Accuracy Score,0.994939
Confusion Matrix,"[[54028, 137], [170, 6329]]"
Precision Score,0.978812
Recall Score,0.973842
F1 Score,0.976321


In [200]:
%%time
# Random Forest
rf = RandomForestClassifier(max_depth=2, random_state=0)
y_pred_rf = rf.fit(x_train, y_train).predict(x_test)

# Evaluate
evaluate("Random Forest", y_test, y_pred_rf)

CPU times: user 1.89 s, sys: 47.8 ms, total: 1.94 s
Wall time: 2.19 s


Unnamed: 0,Random Forest
Accuracy Score,0.991758
Confusion Matrix,"[[54043, 122], [378, 6121]]"
Precision Score,0.980458
Recall Score,0.941837
F1 Score,0.96076


## Combine Classifiers and Test

In [221]:
%%time
# Linear Regression, SVM, Decision Tree, KNearest Neighbor, Random Forest, Naive Baiyes Gaussian
# Bests: KNN + DT

# knn1 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn_dt = VotingClassifier(estimators=[('knn', knn), ('dt', dt)], voting='hard')
y_pred_knn_dt = knn_dt.fit(x_train, y_train).predict(x_test)
evaluate("KNN + DT", y_test, y_pred_knn_dt)

CPU times: user 5.42 s, sys: 113 ms, total: 5.53 s
Wall time: 6.4 s


Unnamed: 0,KNN + DT
Accuracy Score,0.994939
Confusion Matrix,"[[54029, 136], [171, 6328]]"
Precision Score,0.97896
Recall Score,0.973688
F1 Score,0.976317


In [220]:
%%time
rf_dt = VotingClassifier(estimators=[('rf', rf), ('dt', dt)], voting='hard')
y_pred_rf_dt = rf_dt.fit(x_train, y_train).predict(x_test)
evaluate("RF + DT", y_test, y_pred_rf_dt)

CPU times: user 2.52 s, sys: 54.4 ms, total: 2.57 s
Wall time: 3.08 s


Unnamed: 0,RF + DT
Accuracy Score,0.991741
Confusion Matrix,"[[54070, 95], [406, 6093]]"
Precision Score,0.984648
Recall Score,0.937529
F1 Score,0.960511


In [219]:
%%time
rf_knn = VotingClassifier(estimators=[('rf', rf), ('knn', knn)], voting='hard')
y_pred_rf_knn = rf_knn.fit(x_train, y_train).predict(x_test)
evaluate("RF + KNN", y_test, y_pred_rf_knn)

CPU times: user 6.65 s, sys: 133 ms, total: 6.79 s
Wall time: 7.88 s


Unnamed: 0,RF + KNN
Accuracy Score,0.991708
Confusion Matrix,"[[54068, 97], [406, 6093]]"
Precision Score,0.98433
Recall Score,0.937529
F1 Score,0.960359


In [20]:
%%time
rf_svm = VotingClassifier(estimators=[('rf', rf), ('svm', svm)], voting='hard')
y_pred_rf_svm = rf_svm.fit(x_train, y_train).predict(x_test)
evaluate("RF + SVM", y_test, y_pred_rf_svm)

CPU times: user 1min 25s, sys: 1.23 s, total: 1min 26s
Wall time: 1min 33s


Unnamed: 0,RF + SVM
Accuracy Score,0.985313
Confusion Matrix,"[[54093, 72], [819, 5680]]"
Precision Score,0.987483
Recall Score,0.873981
F1 Score,0.927271


In [21]:
# convert model to pure Javascript code
classifier = dt.fit(x_train, y_train)
model_to_javascript = m2c.export_to_javascript(classifier)
print(model_to_javascript)

function score(input) {
    var var0;
    if (input[3] <= 1.5) {
        if (input[0] <= 35.5) {
            if (input[0] <= 31.5) {
                if (input[3] <= 0.5) {
                    var0 = [1.0, 0.0];
                } else {
                    if (input[0] <= 28.5) {
                        if (input[0] <= 18.5) {
                            if (input[0] <= 12.5) {
                                var0 = [1.0, 0.0];
                            } else {
                                if (input[0] <= 17.5) {
                                    if (input[0] <= 14.5) {
                                        if (input[0] <= 13.5) {
                                            var0 = [0.9615384615384616, 0.038461538461538464];
                                        } else {
                                            var0 = [1.0, 0.0];
                                        }
                                    } else {
                                        if (input[0] <= 15