3.3 Load data set

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


In [10]:
df = pd.read_csv("./datasets/SQLIV3.csv")
print(df.head())

                                            Sentence Label Unnamed: 2  \
0                  " or pg_sleep  (  __TIME__  )  --     1        NaN   
1  create user name identified by pass123 tempora...   NaN          1   
2   AND 1  =  utl_inaddr.get_host_address   (    ...     1        NaN   
3   select * from users where id  =  '1' or @ @1 ...     1        NaN   
4   select * from users where id  =  1 or 1#"  ( ...     1        NaN   

   Unnamed: 3  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


In [15]:

# Extract sentences and labels
sentences = df['Sentence']
labels = df['Label']

# Preprocess the data
max_words = 1000  # Maximum number of words in your vocabulary
max_len = 50  # M

sentences = sentences.fillna('').astype(str)




In [22]:
#Tokenize and pad sequences
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
X = pad_sequences(sequences, maxlen=max_len)

In [25]:
# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
#Build an advanced LSTM model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [35]:
#compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Implement early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 572ms/step - accuracy: 0.0000e+00 - loss: -7681.2988 - val_accuracy: 2.0214e-04 - val_loss: -59201.8906
Epoch 2/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 432ms/step - accuracy: 0.0000e+00 - loss: -94212.8984 - val_accuracy: 2.0214e-04 - val_loss: -229004.1562
Epoch 3/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 443ms/step - accuracy: 0.0000e+00 - loss: -289796.9375 - val_accuracy: 2.0214e-04 - val_loss: -501445.4062
Epoch 4/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 415ms/step - accuracy: 0.0000e+00 - loss: -587413.8750 - val_accuracy: 2.0214e-04 - val_loss: -865840.5000
Epoch 5/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 417ms/step - accuracy: 0.0000e+00 - loss: -970798.7500 - val_accuracy: 2.0214e-04 - val_loss: -1314547.1250
Epoch 6/20
[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143

<keras.src.callbacks.history.History at 0x262cbdc1670>

In [36]:

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy*100:.2f}%')


[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 108ms/step - accuracy: 0.0000e+00 - loss: -15417264.0000
Test Loss: -15420262.0000, Test Accuracy: 0.00%


In [41]:
# Save the model
model.save('model.h5')



In [38]:
def log_attack_details(sentence, predicted_label, actual_label):
    with open('attack_log.txt', 'a') as log_file:
        log_file.write(f"Sentence: {sentence}\n")
        log_file.write(f"Predicted Label: {predicted_label}\n")
        log_file.write(f"Actual Label: {actual_label}\n\n")


In [39]:

# Assuming you have a list of sentences to classify for feedback
sentences_to_classify = ["Suspicious SQL injection attempt", "Normal user query", "Another attack example"]

# Classify the sentences and log details for attacks
for sentence in sentences_to_classify:
    # Preprocess the sentence and tokenize it
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)

    # Predict the label for the sentence
    predicted_label = model.predict(padded_sequence)

    # Assuming you have a threshold for classifying attacks
    threshold = 0.5
    if predicted_label > threshold:
        # Log the attack details for feedback
        log_attack_details(sentence, predicted_label, "Attack")
    else:
        log_attack_details(sentence, predicted_label, "Normal")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 176ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step


In [43]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import load_model


# Load your best classification model (replace 'best_model.h5' with your model's file)
best_model = load_model('model.h5')




In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Create an instance of Tokenizer
max_words = 1000  # Maximum number of words in your vocabulary
tokenizer = Tokenizer(num_words=max_words)

# Rest of the code
sentence = "SQL injection attempt"
sequence = tokenizer.texts_to_sequences([sentence])
padded_sequence = pad_sequences(sequence, maxlen=max_len)
predicted_label = best_model.predict(padded_sequence)
print(predicted_label)

NameError: name 'pad_sequences' is not defined