In [1]:
import json
import pprint
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# multi hot bag of bytes encoding
def path_to_vector(path):
    """Encode a path as a 256-element multi-hot "bag of bytes" vector.

    Slot ``b`` of the result is 1 when byte value ``b`` occurs anywhere in the
    encoded path and 0 otherwise. Byte order and repeat counts are discarded.

    Args:
        path: The path to encode, as a ``str``.

    Returns:
        A list of 256 ints, each either 0 or 1.
    """
    # UTF-8 agrees with ASCII on every ASCII character, so ASCII paths yield
    # the same vector as an 'ascii' encode would, while non-ASCII characters
    # populate the upper (>= 0x80) slots instead of raising UnicodeEncodeError.
    vector = [0] * 256
    for byte_value in bytes(path, 'utf-8'):
        vector[byte_value] = 1
    return vector

# Collect the raw path strings and their class labels:
# 0 for lines of the benign file, 1 for lines of the malicious file.
X = []
y = []

for source_file, label in (('data/paths-benign.txt', 0),
                           ('data/paths-malicious.txt', 1)):
    with open(source_file, 'r') as f:
        stripped_paths = [line.strip() for line in f.readlines()]
    X.extend(stripped_paths)
    y.extend([label] * len(stripped_paths))

# Replace every path string with its bag-of-bytes feature vector.
X = list(map(path_to_vector, X))

# Hold out 33% of the samples for evaluation; the fixed random_state keeps
# the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1234,
)

# hidden_layer_sizes=() requests a network without hidden layers, so the
# coefs_[0] matrix inspected later is the direct input-to-output weights.
classifier = MLPClassifier(
    hidden_layer_sizes=(),
    random_state=1234,
).fit(X_train, y_train)

score = classifier.score(X_test, y_test)
print('mean accuracy: ', score)

# Open up the trained model: pair each byte value (shown as its character)
# with its learned input weight and list them from largest weight to smallest,
# to see which bytes act as signatures of SQL injection.
input_weights = classifier.coefs_[0]
coefficients = [
    (chr(byte_value), input_weights[byte_value]) for byte_value in range(256)
]
coefficients.sort(key=lambda pair: -pair[1])
pprint.pprint(coefficients)

mean accuracy:  0.9986009094088842
[(' ', array([1.5640511])),
 ("'", array([1.55660452])),
 ('=', array([1.29466259])),
 (')', array([1.07534899])),
 ('(', array([1.03737089])),
 ('1', array([0.96597745])),
 ('3', array([0.89979062])),
 ('U', array([0.89895137])),
 ('7', array([0.88764806])),
 ('O', array([0.83535017])),
 ('6', array([0.78722743])),
 ('9', array([0.67743621])),
 ('4', array([0.63773061])),
 ('X', array([0.62352166])),
 ('0', array([0.59579563])),
 ('2', array([0.56146101])),
 ('8', array([0.55401311])),
 ('_', array([0.54545924])),
 (',', array([0.53146352])),
 (':', array([0.50019453])),
 ('Y', array([0.43659955])),
 ('I', array([0.39925241])),
 ('E', array([0.34280822])),
 ('5', array([0.34121652])),
 ('J', array([0.34077929])),
 ('|', array([0.30503143])),
 ('.', array([0.27947921])),
 ('N', array([0.27835602])),
 ('>', array([0.27811108])),
 ('Q', array([0.27663099])),
 ('~', array([0.26102349])),
 ('D', array([0.21669652])),
 ('R', array([0.16536365])),
 ('-', ar