# SQL Injection detection with TensorFlow Neural Network 

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [91]:
# Import data
sql_datasets = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/sqliv2.csv", encoding='utf-16').dropna()

In [92]:
sql_datasets.head()

Unnamed: 0,Sentence,Label
1,""" or pg_sleep ( __TIME__ ) --",1.0
2,create user name identified by pass123 tempora...,1.0
3,%29,1.0
4,' AND 1 = utl_inaddr.get_host_address ( ( S...,1.0
5,select * from users where id = '1' or @ @1 = ...,1.0


In [93]:
sql_datasets.describe()

Unnamed: 0,Label
count,46564.0
mean,0.438665
std,0.496229
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [94]:
sql_datasets.shape

(46564, 2)

## Preparing data

### Split data into training and test sets

In [95]:
X = sql_datasets["Sentence"].values.astype("U")
y = sql_datasets["Label"]

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Using TensorFlow TextVectorizer

In [97]:
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=None,
    output_mode='int',
    output_sequence_length=100)

vectorize_layer.adapt(X_train)

In [98]:
vocab_size = vectorize_layer.vocabulary_size()
vocab_size

41994

### Creating a model

In [99]:
model = tf.keras.Sequential([
  vectorize_layer,
  tf.keras.layers.Embedding(vocab_size, 20, name="embedding"),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(10, activation='relu'),
  tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f68f6fb3450>

In [104]:
model.summary()

Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_8 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding (Embedding)       (None, 100, 20)           839880    
                                                                 
 global_average_pooling1d_18  (None, 20)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_55 (Dense)            (None, 10)                210       
                                                                 
 dense_56 (Dense)            (None, 1)                 11        
                                                                 
Total params: 840,101
Trainable params: 840,101
Non-t

In [100]:
model.evaluate(X_test, y_test)



[0.016177844256162643, 0.9974229335784912]

### Make predictions

In [111]:
probabilty_predictions = model.predict(X_test)

In [112]:
predictions = tf.round(probabilty_predictions)

## Test on SQL payload cheatsheet

In [144]:
def score_predictions(preds) -> float:
  """Return score of predictions assuming that all predictions should be 1"""
  num_correct = 0
  num_predictions = len(preds)

  for pred in preds:
    if int(pred) == 1:
      num_correct += 1

  print("Predictions score: {}%".format((num_correct / num_predictions) * 100))

In [162]:
def put_in_prediction_in_dataframe(test_data, predictions):
  """Show prediction in DataFrame"""

  predictions_data = [["Test data", "Prediction"]]
  for i in range(0, len(test_data)):
    predictions_data.append([test_data[i], True if predictions[i] == 1 else False])

  return pd.DataFrame(predictions_data)

In [167]:
def make_predictions(test_data, mmodel):
  """Make predictions"""
  
  probs = mmodel.predict(test_data)
  return probs, tf.round(probs)

### Test on common payloads

[Payloads](https://github.com/payloadbox/sql-injection-payload-list)

In [182]:
login_payloads = [
  "' --",
  "' #",
  "'/*",
  "' or 1=1--",
  "' or 1=1#",
  "' or 1=1/*",
  "') or '1'='1--",
  "') or ('1'='1--",
  "' UNION SELECT 1, 'anotheruser', 'doesnt matter', 1--",
  "' or ",
  "-- or # ",
  "' OR '1",
  "' OR 1 -- -",
  '" OR "" = "',
  '" OR 1 = 1 -- -',
  "' OR '' = '",
  "-1 UNION SELECT 1 INTO @,@",
  "-1 UNION SELECT 1 INTO @,@,@",
  "ORDER BY 1--",
]


login_payload_df = pd.DataFrame(login_payloads)

In [183]:
probs_pred, preds = make_predictions(login_payload_df, model)
score_predictions(preds)
pred_df = put_in_prediction_in_dataframe(login_payloads, preds)
pred_df

Predictions score: 89.47368421052632%


Unnamed: 0,0,1
0,Test data,Prediction
1,' --,True
2,' #,True
3,'/*,False
4,' or 1=1--,True
5,' or 1=1#,True
6,' or 1=1/*,True
7,') or '1'='1--,True
8,') or ('1'='1--,True
9,"' UNION SELECT 1, 'anotheruser', 'doesnt matte...",True
