In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
from keras import layers
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from keras.models import Sequential
from tensorflow.keras.callbacks import TensorBoard
from sklearn import metrics

In [2]:
df = pd.read_csv('D:\\data_datn\\data_datn\\code_injection_6labels.csv')

In [3]:
permuted_df = df.sample(frac=1, random_state=42)
permuted_df = permuted_df.reset_index(drop=True)
permuted_df

Unnamed: 0.1,Unnamed: 0,text,000 - Normal,126 - Path Traversal,242 - Code Injection,274 - HTTP Verb Tampering,66 - SQL Injection,88 - OS Command Injection
0,67917,GET /blog/index.php/aut-velit-dicta-eaque%22+a...,0,0,0,0,1,0
1,45059,GET /blog/index.php%22+UNION+ALL+select+NULL+-...,0,0,0,0,1,0
2,68569,POST /blog/index.php/my-account/edit-profile/p...,0,1,0,0,0,0
3,19913,POST /blog/index.php/my-account/lost-password/...,0,1,0,0,0,1
4,3173,GET /blog/index.php/tag/pariatur-eum-illo/,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
74883,37194,POST /blog/%5C..%5C..%5C..%5C..%5C..%5C..%5C.....,0,1,0,0,0,0
74884,6265,GET /blog/index.php/sample-page/amet-et-deleni...,1,0,0,0,0,0
74885,54886,GET /blog/index.php/2020/04/04/%3Bprint%28chr%...,0,0,1,0,0,0
74886,860,GET /blog/index.php/2020/04/04/inventore-asper...,1,0,0,0,0,0


In [4]:
# Assuming df is your DataFrame
n = len(permuted_df)
split_index = int(0.8 * n)  # 80% training, 20% testing

df_train = permuted_df.iloc[:split_index]
df_test = permuted_df.iloc[split_index:]

In [5]:
df_test = df_test.reset_index(drop=True)

In [6]:
target_list = ["000 - Normal", '126 - Path Traversal',
               '242 - Code Injection', '274 - HTTP Verb Tampering',
               '66 - SQL Injection', '88 - OS Command Injection']

In [7]:
# 读取train.csv和test.csv数据
# df_train = pd.read_csv('./data/torpeda_train_test/train.csv')
# df_test = pd.read_csv('./data/torpeda_train_test/test.csv')

# 获取训练集中的url和label
url_train = df_train['text']
label_train = df_train[target_list]
# 获取测试集中的url和label
url_test = df_test['text']
label_test = df_test[target_list]

print("=================================================================================================")
print("训练集中第一条数据：")
print("url: %s" % url_train[0])
print("label: %s\n" % label_train)

print("测试集中第一条数据：")
print("url: %s" % url_test[0])
print("label: %s\n" % label_test)
print("=================================================================================================")

训练集中第一条数据：
url: GET /blog/index.php/aut-velit-dicta-eaque%22+and+0+in+%28select+sleep%2815%29+%29+--+/feed
label:        000 - Normal  126 - Path Traversal  242 - Code Injection  \
0                 0                     0                     0   
1                 0                     0                     0   
2                 0                     1                     0   
3                 0                     1                     0   
4                 1                     0                     0   
...             ...                   ...                   ...   
59905             0                     0                     0   
59906             0                     0                     0   
59907             1                     0                     0   
59908             1                     0                     0   
59909             0                     0                     1   

       274 - HTTP Verb Tampering  66 - SQL Injection  \
0                        

In [31]:
print(label_train)

       000 - Normal  126 - Path Traversal  242 - Code Injection  \
0                 0                     0                     0   
1                 0                     0                     0   
2                 0                     1                     0   
3                 0                     1                     0   
4                 1                     0                     0   
...             ...                   ...                   ...   
59905             0                     0                     0   
59906             0                     0                     0   
59907             1                     0                     0   
59908             1                     0                     0   
59909             0                     0                     1   

       274 - HTTP Verb Tampering  66 - SQL Injection  \
0                              0                   1   
1                              0                   1   
2                          

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
tokenizer = CountVectorizer()

In [14]:
import json
# 字符级分词，在训练集上拟合
# tokenizer = Tokenizer(filters='\t\n', char_level=True)
Xfeatures = tokenizer.fit_transform(url_train).toarray()
# 构建词典，并保存
num_words = len(tokenizer.get_feature_names_out())+1
vocab = tokenizer.get_feature_names_out()
print("字典的大小为%d" % num_words)
print("字典：")
print(vocab)
# with open("./tokenizer/vocab_capec.json", 'w') as f:
#     json.dump(vocab, f, ensure_ascii=False)

字典的大小为17810
字典：
['00' '001234' '0037ea74eec2' ... 'zzk' 'zzou9x7ebvyh7rn'
 'zzxr1es3voa09ju']


In [33]:
label_train['000 - Normal']

0        0
1        0
2        0
3        0
4        1
        ..
59905    0
59906    0
59907    1
59908    1
59909    0
Name: 000 - Normal, Length: 59910, dtype: int64

In [11]:
# 将label转化为one-hot形式
def get_one_hot_value(s):
    one_hot = []
    for target in target_list:
        one_hot.append(s[target])
    return one_hot

In [12]:
# 获取url的长度
url_train_lens = [len(u) for u in url_train]
url_test_lens = [len(u) for u in url_test]
# 查看97%长度的大小
print(np.percentile(np.array(url_train_lens),97))
print(np.percentile(np.array(url_test_lens),97))

308.0
311.0


In [13]:
# 观察后定义url最大的长度为600
max_len = 600
# 将文本转为序列
seq_train = tokenizer.texts_to_sequences(url_train)
seq_test = tokenizer.texts_to_sequences(url_test)
# 将序列的长度统一为max_len
X_train = sequence.pad_sequences(seq_train, maxlen=max_len)
X_test = sequence.pad_sequences(seq_test, maxlen=max_len)
# 将标签转化为one-hot
Y_train = [get_one_hot_value(l) for index, l in label_train.iterrows()]
Y_test =  [get_one_hot_value(l) for index, l in label_test.iterrows()]
print("=================================================================================================")
print("举例：")
print("向量化前：")
print("url: %s" % url_train[0])
# print("label: %s\n" % label_train[0])
print("向量化后：")
print("url_vec: " + str(X_train[0]))
print("label_one_hot: " + str(Y_train[0]))
print("=================================================================================================")

AttributeError: 'CountVectorizer' object has no attribute 'texts_to_sequences'

In [15]:
# 回调
tb_callback = TensorBoard(log_dir='./logs', embeddings_freq=1)

In [16]:
from tensorflow.keras.optimizers import Adam

# Define the learning rate
learning_rate = 0.00001  # Adjust this value as needed

# Create an Adam optimizer with the specified learning rate
optimizer = Adam(learning_rate=learning_rate)

In [17]:
# 搭建网络
model = Sequential()
model.add(layers.Embedding(num_words, 64, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.BatchNormalization(axis = 1, momentum = 0.99))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.BatchNormalization(axis = 1, momentum = 0.99))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(6, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
print(model.summary())



None


In [40]:
print(Y_train)
Y_train = np.array(Y_train)


[[0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 0, 0, 1, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 1, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 1],

In [18]:
X_train = tf.convert_to_tensor(Xfeatures)
Y_train = tf.convert_to_tensor(label_train)

In [20]:
# 训练
model.fit(Xfeatures, label_train, validation_split=0.25, epochs=10, batch_size=32, callbacks=[tb_callback])

MemoryError: Unable to allocate 5.96 GiB for an array with shape (44932, 17809) and data type int64

In [43]:
X_test = tf.convert_to_tensor(X_test)
Y_test = tf.convert_to_tensor(Y_test)

In [70]:
# 评估模型
model.evaluate(X_test, Y_test, batch_size=32)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8644 - loss: 0.9215


[0.9069932103157043, 0.8628655076026917]

In [45]:
# 将softmax的概率值转化为标签名
def props_to_labels(props_matrix):
    labels = []
    for props_vector in props_matrix:
        idx = np.argmax(props_vector)
        label = target_list[idx]
        labels.append(label)
    return labels

In [47]:
print(target_list)

['000 - Normal', '126 - Path Traversal', '242 - Code Injection', '274 - HTTP Verb Tampering', '66 - SQL Injection', '88 - OS Command Injection']


In [72]:
def convert_to_list(Y_test_pred):
    result = []
    for pred in Y_test_pred:
        p = []
        index = np.argmax(pred)
        for i in range(0,6):
            if i == index:
                p.append(1)
            else:
                p.append(0)
        result.append(p)
    return result

In [51]:
print(Y_test_pred)

[[0.999627   0.00405519 1.         0.9998208  0.9999957  0.9999621 ]
 [0.99935913 0.9991597  0.9999998  0.9945206  1.         0.9999986 ]
 [0.99580616 0.86299    0.54782236 0.7851317  0.96248317 0.8179355 ]
 ...
 [0.99907684 0.70681036 1.         0.7683864  0.99999    0.9999071 ]
 [0.99993205 0.999613   0.7695033  0.9164208  0.9973363  0.99868196]
 [0.98931324 0.07369643 0.13167763 0.98252535 0.43033636 0.6140852 ]]


In [54]:
Y_test_list = Y_test.numpy().tolist()

In [73]:
Y_test_pred = model.predict(X_test)
label_test_pred = convert_to_list(Y_test_pred)
print(label_test_pred)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
[[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 

In [62]:
print(Y_test)

tf.Tensor(
[[0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 ...
 [0 0 1 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]], shape=(14978, 6), dtype=int32)


In [74]:
print("混淆矩阵：")
# print(metrics.confusion_matrix(Y_test_list, label_test_pred))
print("f1-score:")
print(metrics.f1_score(Y_test_list, label_test_pred, average='micro'))
print("acc-score:")
print(metrics.accuracy_score(Y_test_list, label_test_pred))
print("recall-score:")
print(metrics.recall_score(Y_test_list, label_test_pred, average='micro'))
print("classification report:")
print(metrics.classification_report(Y_test_list, label_test_pred))

混淆矩阵：
f1-score:
0.7607701719363771
acc-score:
0.771731873414341
recall-score:
0.7347014925373134
classification report:
              precision    recall  f1-score   support

           0       0.56      0.93      0.70      2950
           1       0.87      0.80      0.84      4213
           2       0.93      0.78      0.85      3124
           3       0.99      0.91      0.95      1151
           4       0.92      0.68      0.78      3147
           5       0.36      0.03      0.06      1495

   micro avg       0.79      0.73      0.76     16080
   macro avg       0.77      0.69      0.70     16080
weighted avg       0.80      0.73      0.74     16080
 samples avg       0.79      0.78      0.78     16080



In [None]:
import pickle

# 保存模型
model.save_weights('./model/cnn_weights.h5')
model.save('./model/cnn_clf.h5')
with open('./model/cnn_clf.json', 'w') as f:
    f.write(model.to_json())

# 把tokenzier保存成pickle格式
with open('./tokenizer/tokenizer.pickle', 'wb') as handle: 
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) 