In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, RepeatVector, Embedding, TimeDistributed, Dense, Dropout, Conv1D, GRU, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import backend as K
from sklearn.metrics import classification_report, confusion_matrix
import ast

In [13]:
df2 = pd.read_csv('CVE-2012-2122.csv')
attack = []
normal = []
for i in range(len(df2)):
    calls = ast.literal_eval(df2.iloc[i]['syscalls'])
    check = df2.iloc[i]['is_exploit']
    
    temp_list = []
    for j in range(len(calls)):
        temp_list.append(calls[j]['name'])
    if check:
        attack.append(temp_list)
    else:
        normal.append(temp_list)

both_lists = attack + normal
tokenizer = Tokenizer()
tokenizer.fit_on_texts(both_lists)
word_index = tokenizer.word_index

X_train = normal[:850]
X_val = normal[850:1050]
X_test = normal[1050:] + attack
tokened_Xtrain = tokenizer.texts_to_sequences(X_train)
tokened_Xtest = tokenizer.texts_to_sequences(X_test)
tokened_Xval = tokenizer.texts_to_sequences(X_val)
max_length = 7500
X_train_padded = pad_sequences(tokened_Xtrain, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(tokened_Xtest, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(tokened_Xval, maxlen=max_length, padding='post')

K = len(word_index) + 1

In [3]:
model = Sequential()
model.add(Embedding(input_dim=K, output_dim=400, input_length=None))
model.add(Conv1D(filters=16, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=32, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=64, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=128, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=256, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=512, kernel_size=3, activation="relu", padding="causal"))
model.add(Conv1D(filters=1024, kernel_size=3, activation="relu", padding="causal"))
model.add(BatchNormalization())
model.add(GRU(500, return_sequences=True))
model.add(Dropout(0.5))
model.add(Dense(K, activation='softmax'))

early_stopping = EarlyStopping(
    monitor='val_loss',  # Metric to monitor
    patience=3,          # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Restore the model weights from the epoch with the best value of the monitored metric
)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_crossentropy'])

In [16]:
def add_train_labels(x):
        return x[:-1], x[1:]
K = len(word_index) + 1
train = [add_train_labels(seq) for seq in X_train_padded]
val = [add_train_labels(seq) for seq in X_val_padded]

train_inputs, train_targets = zip(*train)
val_inputs, val_targets = zip(*val)

# Convert to numpy arrays
train_inputs = np.array(train_inputs)
train_targets = np.array(train_targets)
val_inputs = np.array(val_inputs)
val_targets = np.array(val_targets)

In [17]:
model.fit(
            train_inputs,
            train_targets,
            validation_data=(val_inputs, val_targets),
            epochs=150,
            verbose=2,
            shuffle=True,
            callbacks=[early_stopping],
        )


Epoch 1/150
27/27 - 795s - loss: 0.1202 - sparse_categorical_crossentropy: 0.1202 - val_loss: 0.1283 - val_sparse_categorical_crossentropy: 0.1283 - 795s/epoch - 29s/step
Epoch 2/150
27/27 - 810s - loss: 0.1195 - sparse_categorical_crossentropy: 0.1195 - val_loss: 0.1289 - val_sparse_categorical_crossentropy: 0.1289 - 810s/epoch - 30s/step
Epoch 3/150
27/27 - 812s - loss: 0.1197 - sparse_categorical_crossentropy: 0.1197 - val_loss: 0.1290 - val_sparse_categorical_crossentropy: 0.1290 - 812s/epoch - 30s/step
Epoch 4/150
27/27 - 812s - loss: 0.1188 - sparse_categorical_crossentropy: 0.1188 - val_loss: 0.1278 - val_sparse_categorical_crossentropy: 0.1278 - 812s/epoch - 30s/step
Epoch 5/150
27/27 - 822s - loss: 0.1177 - sparse_categorical_crossentropy: 0.1177 - val_loss: 0.1277 - val_sparse_categorical_crossentropy: 0.1277 - 822s/epoch - 30s/step
Epoch 6/150
27/27 - 811s - loss: 0.1174 - sparse_categorical_crossentropy: 0.1174 - val_loss: 0.1280 - val_sparse_categorical_crossentropy: 0.128

<keras.src.callbacks.History at 0x3399afaf0>

In [18]:
val = X_val_padded
y_val = model.predict(val)
sums_val = np.array([-np.log(pred.max(axis=-1)).sum(axis=-1) for pred in y_val])
threshold = np.mean(sums_val) + 2*np.std(sums_val)
print(threshold)

825.7450561523438


In [19]:
test = X_test_padded
print(len(test))
y_pred = model.predict(test)
sums = np.array([-np.log(pred.max(axis=-1)).sum(axis=-1) for pred in y_pred])

345


In [22]:
threshold = np.mean(sums_val) + np.std(sums_val)
successes = 0
total = len(test)
pred = []
norms_len = total - len(attack)
testY = []
for i in range(total):
    if sums[i] <= threshold:
        pred.append(0)
    else:
        pred.append(1)
    if i < norms_len:
        testY.append(0)
        if sums[i] <= threshold:
            successes += 1
    else:
        testY.append(1)
        if sums[i] > threshold:
            successes += 1
print(successes)
print(total)
print(successes / total)

285
345
0.8260869565217391


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
precision = precision_score(testY, pred, pos_label=0)
recall = recall_score(testY, pred, pos_label=0)
f1 = f1_score(testY, pred, pos_label=0)
print("Precision: ",precision)
print("Recall: ",recall)
print("f1: ",f1)

Precision:  0.825
Recall:  0.868421052631579
f1:  0.8461538461538461


In [2]:
df = pd.read_csv('Bruteforce_CWE-307.csv')
attack = []
normal = []
for i in range(len(df)):
    calls = ast.literal_eval(df.iloc[i]['syscalls'])
    check = df.iloc[i]['is_exploit']
    
    temp_list = []
    for j in range(len(calls)):
        temp_list.append(calls[j]['name'])
    if check:
        attack.append(temp_list)
    else:
        normal.append(temp_list)

both_lists = attack + normal
tokenizer = Tokenizer()
tokenizer.fit_on_texts(both_lists)
word_index = tokenizer.word_index

X_train = normal[:500]
X_val = normal[500:750]
X_test = normal[750:] + attack
tokened_Xtrain = tokenizer.texts_to_sequences(X_train)
tokened_Xtest = tokenizer.texts_to_sequences(X_test)
tokened_Xval = tokenizer.texts_to_sequences(X_val)
max_length = 5000
X_train_padded = pad_sequences(tokened_Xtrain, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(tokened_Xtest, maxlen=max_length, padding='post')
X_val_padded = pad_sequences(tokened_Xval, maxlen=max_length, padding='post')

K = len(word_index) + 1

In [7]:
model.fit(
            train_inputs,
            train_targets,
            validation_data=(val_inputs, val_targets),
            epochs=100,
            verbose=2,
            shuffle=True,
            callbacks=[early_stopping],
        )

Epoch 1/100
16/16 - 333s - loss: 2.5387 - sparse_categorical_crossentropy: 2.5387 - val_loss: 3.4459 - val_sparse_categorical_crossentropy: 3.4459 - 333s/epoch - 21s/step
Epoch 2/100
16/16 - 319s - loss: 1.2582 - sparse_categorical_crossentropy: 1.2582 - val_loss: 3.2569 - val_sparse_categorical_crossentropy: 3.2569 - 319s/epoch - 20s/step
Epoch 3/100
16/16 - 318s - loss: 0.8021 - sparse_categorical_crossentropy: 0.8021 - val_loss: 3.0178 - val_sparse_categorical_crossentropy: 3.0178 - 318s/epoch - 20s/step
Epoch 4/100
16/16 - 320s - loss: 0.5837 - sparse_categorical_crossentropy: 0.5837 - val_loss: 2.9821 - val_sparse_categorical_crossentropy: 2.9821 - 320s/epoch - 20s/step
Epoch 5/100
16/16 - 320s - loss: 0.4987 - sparse_categorical_crossentropy: 0.4987 - val_loss: 2.9729 - val_sparse_categorical_crossentropy: 2.9729 - 320s/epoch - 20s/step
Epoch 6/100
16/16 - 318s - loss: 0.4330 - sparse_categorical_crossentropy: 0.4330 - val_loss: 3.0163 - val_sparse_categorical_crossentropy: 3.016

<keras.src.callbacks.History at 0x2f47a3a60>

In [8]:
print(len(normal), len(attack))


994 98


In [9]:
val = X_val_padded
y_val = model.predict(val)
sums_val = np.array([-np.log(pred.max(axis=-1)).sum(axis=-1) for pred in y_val])
threshold = np.mean(sums_val) + 2*np.std(sums_val)
print(threshold)

4857.457183837891


In [11]:
test = X_test_padded
print(len(test))
y_pred = model.predict(test)
sums = np.array([-np.log(pred.max(axis=-1)).sum(axis=-1) for pred in y_pred])

342


In [17]:
threshold = np.mean(sums_val) + 2*np.std(sums_val)
successes = 0
total = len(test)
avg_1 = 0
avg_2 = 0
norms_len = total - len(attack)
for i in range(total):
    if i < norms_len:
        if sums[i] <= threshold:
            successes += 1
            avg_1 += sums[i]
    else:
        if sums[i] > threshold:
            successes += 1
            avg_2 += sums[i]
avg_1 = avg_1/norms_len
avg_2 = avg_2/len(attack)
print(successes)
print(total)
print(successes / total)

227
342
0.6637426900584795


In [10]:
print(sums_val)

[4552.3003 4540.8105 4583.8555 4409.601  4611.3086 4910.6797 4843.3643
 4473.8755 4443.9985 4889.338  4402.541  4502.336  4415.4404 5031.5083
 4281.996  4518.9287 4214.032  4441.5117 4470.049  4524.8076 4532.829
 4519.027  4338.6094 4535.181  4732.416  4386.562  4368.0703 4682.0576
 4487.8037 4411.709  4333.343  4536.8486 4555.843  4320.2383 4491.8906
 4505.4526 4548.61   4722.8843 4413.823  4365.9844 4422.5557 4452.543
 4304.526  5167.8047 4510.2466 5126.9287 4494.554  4474.2603 4374.2676
 5179.496  4426.0234 4581.5977 4499.27   4446.5576 4425.83   4658.9814
 4425.29   4412.532  4620.1484 4477.7217 4432.608  4334.0947 4392.8105
 4776.123  4530.4297 4482.6455 4351.923  4367.523  4759.582  4444.2544
 4346.2637 4371.7944 4527.904  4771.5566 4350.9775 4430.251  4295.73
 4339.7764 4397.6006 4444.384  4343.5205 4659.679  4379.338  4476.6875
 4943.079  4439.079  4583.384  4577.2217 4487.795  4372.707  4526.122
 4421.695  4518.0156 4496.7153 4416.9785 4312.794  4317.863  4463.708
 4446.796  4

In [13]:
print(sums)

[4403.8154 4373.2617 5030.111  4821.6875 4480.207  4462.8506 4430.759
 4903.5947 4605.374  4391.5586 4518.998  4477.3296 4418.7295 4426.9805
 4440.826  4587.745  4496.8716 4769.081  4521.155  4347.325  5061.684
 4492.9014 4573.8604 4461.3315 4354.8325 4531.6543 4681.5166 4800.636
 4522.883  4448.2764 4510.116  4657.1797 4288.5605 4762.1426 4531.254
 5001.0996 4449.649  4513.418  4304.667  4430.7183 5285.005  4457.8584
 4485.1313 4503.919  4328.8647 4280.329  4695.3066 4574.0884 4325.503
 4413.0386 4561.5127 4452.7427 4527.6943 4407.958  4564.4927 4402.002
 5230.3613 4531.5933 4349.3896 4356.1846 4553.386  4524.814  4503.7344
 4638.745  4295.269  4802.258  5032.3438 4527.4453 4341.2456 4228.108
 4681.084  4355.909  4566.3623 4486.284  4533.4453 5058.3096 4357.0566
 4525.1904 4436.054  4330.2393 4330.739  4355.9404 5062.9404 4354.3076
 4704.7383 4483.291  4412.95   4575.141  4330.658  4327.959  4429.0713
 4446.8496 4398.3794 4543.5635 4382.04   4551.927  4725.2275 4434.4453
 4633.707  43