In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf
from keras import layers
from keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from keras.models import Sequential
from tensorflow.keras.callbacks import TensorBoard
from sklearn import metrics

In [2]:
import keras
print(keras.__version__)
print(tf.__version__)

3.3.3
2.16.1


In [2]:
# 读取train.csv和test.csv数据
df_train = pd.read_csv('./data/torpeda_train_test/train.csv')
df_test = pd.read_csv('./data/torpeda_train_test/test.csv')

# 获取训练集中的url和label
url_train = df_train['url']
label_train = df_train['label']
# 获取测试集中的url和label
url_test = df_test['url']
label_test = df_test['label']

print("=================================================================================================")
print("训练集中第一条数据：")
print("url: %s" % url_train[0])
print("label: %s\n" % label_train[0])

print("测试集中第一条数据：")
print("url: %s" % url_test[0])
print("label: %s\n" % label_test[0])
print("=================================================================================================")

训练集中第一条数据：
url: POST /tienda1/publico/pagar.jsp?precio=85&B1=Pasar%2Bpor%2Bcaja%27%2C%270%27%2C%270%27%29%3Bwaitfor+delay+%270%3A0%3A15%27%3B--
label: SQLi

测试集中第一条数据：
url: GET /tienda1/publico/caracteristicas.jsp?id=d%27z%220
label: SQLi



In [4]:
print(label_train)

0             SQLi
1             SQLi
2             SQLi
3             SQLi
4             SQLi
           ...    
51888    anomalous
51889    anomalous
51890          XSS
51891    anomalous
51892         SQLi
Name: label, Length: 51893, dtype: object


In [3]:
# 获得标签类型列表
labels_type = pd.Series(label_train).value_counts().keys().tolist()
# 构建标签类型字典
type_dict = dict([[labels_type[i], i] for i in range(len(labels_type))])
print("标签类型字典：")
print(type_dict)

标签类型字典：
{'SQLi': 0, 'anomalous': 1, 'normal': 2, 'XSS': 3, 'SSI': 4, 'BufferOverflow': 5, 'CRLFi': 6, 'XPath': 7, 'LDAPi': 8, 'FormatString': 9}


In [4]:
print(labels_type)

['SQLi', 'anomalous', 'normal', 'XSS', 'SSI', 'BufferOverflow', 'CRLFi', 'XPath', 'LDAPi', 'FormatString']


In [6]:
import json
# 字符级分词，在训练集上拟合
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(url_train)
# 构建词典，并保存
num_words = len(tokenizer.word_index)+1
vocab = tokenizer.word_index
print("字典的大小为%d" % num_words)
print("字典：")
print(vocab)
with open("./tokenizer/vocab.json", 'w') as f:
    json.dump(vocab, f, ensure_ascii=False)

字典的大小为72
字典：
{'%': 1, '2': 2, 'c': 3, '0': 4, 'i': 5, 'e': 6, 'r': 7, 'o': 8, '3': 9, 'a': 10, 'n': 11, '=': 12, 'l': 13, 'm': 14, '&': 15, 'd': 16, '1': 17, '7': 18, 't': 19, 's': 20, 'p': 21, '5': 22, '6': 23, 'u': 24, '9': 25, '8': 26, 'b': 27, '/': 28, ',': 29, '4': 30, 'g': 31, '.': 32, 'f': 33, 'j': 34, 'w': 35, ' ': 36, '?': 37, 'v': 38, 'h': 39, 'z': 40, 'x': 41, '+': 42, 'y': 43, '-': 44, 'k': 45, '#': 46, 'q': 47, ';': 48, '<': 49, '>': 50, '"': 51, '_': 52, '@': 53, '*': 54, ':': 55, '(': 56, ')': 57, "'": 58, '!': 59, '[': 60, ']': 61, '{': 62, '}': 63, '`': 64, '\r': 65, '$': 66, '~': 67, '|': 68, '\\': 69, '^': 70, '\n': 71}


In [7]:
# 将label转化为one-hot形式
def get_one_hot_value(s):
    print(type_dict[s])
    return [0 if i!= type_dict[s] else 1 for i in range(10)]

In [8]:
# 获取url的长度
url_train_lens = [len(u) for u in url_train]
url_test_lens = [len(u) for u in url_test]
# 查看97%长度的大小
print(np.percentile(np.array(url_train_lens),97))
print(np.percentile(np.array(url_test_lens),97))

603.239999999998
604.8299999999981


In [9]:
# 观察后定义url最大的长度为600
max_len = 600
# 将文本转为序列
seq_train = tokenizer.texts_to_sequences(url_train)
seq_test = tokenizer.texts_to_sequences(url_test)
# 将序列的长度统一为max_len
X_train = sequence.pad_sequences(seq_train, maxlen=max_len)
X_test = sequence.pad_sequences(seq_test, maxlen=max_len)
# 将标签转化为one-hot
Y_train = [get_one_hot_value(l) for l in label_train]
Y_test =  [get_one_hot_value(l) for l in label_test]
print("=================================================================================================")
print("举例：")
print("向量化前：")
print("url: %s" % url_train[0])
print("label: %s\n" % label_train[0])
print("向量化后：")
print("url_vec: " + str(X_train[0]))
print("label_one_hot: " + str(Y_train[0]))
print("=================================================================================================")

0
0
0
0
0
0
1
0
1
1
2
0
1
2
0
1
0
0
0
4
2
0
2
3
0
1
1
0
2
1
1
1
0
0
0
1
0
0
0
0
0
0
0
6
0
0
1
1
1
0
0
0
0
2
2
0
1
0
0
0
3
0
3
1
1
0
1
0
0
7
2
0
1
2
1
4
0
0
0
1
0
0
5
1
0
1
1
0
2
1
0
0
0
3
0
3
0
1
0
0
0
0
0
0
2
3
3
1
0
0
3
0
3
3
0
1
2
1
1
1
6
1
1
0
6
1
0
0
2
1
2
1
0
0
0
1
3
1
0
2
0
0
3
0
0
0
0
1
0
1
0
1
1
1
1
1
0
1
1
3
0
0
0
1
3
0
0
0
0
0
0
0
0
2
1
0
3
0
1
0
0
1
0
0
1
0
2
1
0
1
3
1
1
0
0
1
0
0
0
1
0
1
1
0
1
1
0
0
0
0
0
1
1
1
0
0
0
1
0
2
1
1
0
1
3
2
1
2
0
0
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
0
1
0
2
0
2
3
0
1
1
0
0
0
0
0
1
0
0
0
0
4
1
0
0
1
0
0
1
0
0
0
0
0
0
3
1
0
3
0
0
0
2
1
0
0
0
7
0
0
0
0
1
2
1
0
0
2
0
0
3
0
2
0
1
0
0
2
0
3
0
3
0
1
1
0
1
0
1
0
0
3
0
1
1
0
0
2
0
1
0
1
1
1
3
1
0
0
1
0
0
0
1
0
0
1
3
0
0
1
0
0
0
2
1
0
1
0
1
0
1
0
0
0
1
0
0
0
0
0
0
1
0
0
2
0
0
0
6
1
2
2
0
1
1
0
0
0
1
1
3
0
0
1
1
0
0
0
0
3
1
0
0
0
0
0
0
0
2
0
0
0
0
3
0
0
2
1
2
1
4
0
0
0
0
0
8
1
0
0
3
0
2
1
0
0
3
2
0
0
0
0
0
3
8
1
0
0
1
0
0
0
1
2
1
1
0
0
8
0
3
8
0
3
0
1
0
0
0
0
1
0
0
0
0
0
0
1
0
1
0
0
1
1
0
2
2
1
1
0
0
0


In [16]:
print("url_vec: " + str(X_train[0]))

url_vec: tf.Tensor(
[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 

In [10]:
# 回调
tb_callback = TensorBoard(log_dir='./logs', embeddings_freq=1)

In [11]:
# 搭建网络
model = Sequential()
model.add(layers.Embedding(num_words, 64, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())



None


In [12]:
print(Y_train)
Y_train = np.array(Y_train)


[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 

In [13]:
X_train = tf.convert_to_tensor(X_train)
Y_train = tf.convert_to_tensor(Y_train)

In [17]:
# 训练
model.fit(X_train, Y_train, validation_split=0.25, epochs=6, batch_size=128, callbacks=[tb_callback])

Epoch 1/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 79ms/step - accuracy: 0.9854 - loss: 0.0542 - val_accuracy: 0.9921 - val_loss: 0.0249
Epoch 2/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.9934 - loss: 0.0214 - val_accuracy: 0.9948 - val_loss: 0.0151
Epoch 3/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 77ms/step - accuracy: 0.9947 - loss: 0.0154 - val_accuracy: 0.9951 - val_loss: 0.0142
Epoch 4/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 90ms/step - accuracy: 0.9956 - loss: 0.0119 - val_accuracy: 0.9958 - val_loss: 0.0113
Epoch 5/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 102ms/step - accuracy: 0.9969 - loss: 0.0092 - val_accuracy: 0.9962 - val_loss: 0.0109
Epoch 6/6
[1m305/305[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 89ms/step - accuracy: 0.9969 - loss: 0.0079 - val_accuracy: 0.9961 - val_loss: 0.0102


<keras.src.callbacks.history.History at 0x24e932d5ab0>

In [None]:
# 评估模型
model.evaluate(X_test, Y_test, batch_size=128)



[0.01806632364939282, 0.9933902877697842]

In [None]:
# 将softmax的概率值转化为标签名
def props_to_labels(props_matrix):
    labels = []
    for props_vector in props_matrix:
        idx = np.argmax(props_vector)
        label = labels_type[idx]
        labels.append(label)
    return labels

In [None]:
Y_test_pred = model.predict(X_test)
label_test_pred = props_to_labels(Y_test_pred)
print("混淆矩阵：")
print(metrics.confusion_matrix(label_test, label_test_pred, labels_type))
print("f1-score:")
print(metrics.f1_score(label_test, label_test_pred, labels_type, average='micro'))
print("acc-score:")
print(metrics.accuracy_score(label_test, label_test_pred, labels_type))
print("recall-score:")
print(metrics.recall_score(label_test, label_test_pred, labels_type, average='micro'))
print("classification report:")
print(metrics.classification_report(label_test, label_test_pred, labels_type))

混淆矩阵：
[[12884     0     2     1     0     0     0    17     0     0]
 [    3  4923     8     4     0     0     0     0     0     0]
 [    3    53  2453     0     0     0     0     0     0     0]
 [    1     3     0  1426     1     1     0    12     0     1]
 [    1     7     1     5   121     0     0     0     0     0]
 [    0     0     0     2     0   122     0     0     0     0]
 [    0     1     0     0     0     0    97     0     0     0]
 [    1     1     0     7     0     0     0    44     0     0]
 [    0     0     0     1     0     0     0     0    21     0]
 [    0     0     0     2     7     0     0     1     0     2]]
f1-score:
0.9933902877697842
acc-score:
0.9933902877697842
recall-score:
0.9933902877697842
classification report:
                precision    recall  f1-score   support

          SQLi       1.00      1.00      1.00     12904
     anomalous       0.99      1.00      0.99      4938
        normal       1.00      0.98      0.99      2509
           XSS       0.

In [None]:
import pickle

# 保存模型
model.save_weights('./model/cnn_weights.h5')
model.save('./model/cnn_clf.h5')
with open('./model/cnn_clf.json', 'w') as f:
    f.write(model.to_json())

# 把tokenzier保存成pickle格式
with open('./tokenizer/tokenizer.pickle', 'wb') as handle: 
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) 