In [1]:
import sys
import collections
sys.path.insert(0, '../')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchtext

from utils import load_dataset, train

In [2]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else device)

In [3]:
dataset = load_dataset("../dataset/sqliv2.csv")
dataset_size = len(dataset)

In [4]:
tokenizer = torchtext.data.get_tokenizer("basic_english")

def create_vocab(dataset):
    counter = collections.Counter()
    for query, _ in dataset:
        for str in tokenizer(query):
            counter.update(list(str))
    return torchtext.vocab.vocab(counter, min_freq=1)

vocab = create_vocab(dataset)
vocab_size = len(vocab)

In [5]:
def hash(str):
    if "and" == str:
        str = "001"
    if "or" == str:
        str = "19"
    if "xp_" == str:
        str = "483"
    if "substr" == str:
        str = "1082"
    if "utl" == str:
        str = "292"
    if "benchmark" == str:
        str = "9282"
    if "shutdown" == str:
        str = "0902"
    if "hex" == str:
        str = "422"
    if "sqlmap" == str:
        str = "4990"
    if "md5" == str:
        str = "520"
    if "select" == str:
        str = "507"
    if "union" == str:
        str = "612"
    if "drop" == str:
        str = "629"
    if "delect" == str:
        str = "923"
    if "concat" == str:
        str = "309"
    if "orderby" == str:
        str = "981"
    if "exec" == str:
        str = "015"
    return str


query_length = 60
batch_size = 100

def process_str(query):
    query_strs = tokenizer(query)
    for i, str in enumerate(query_strs):
        query_strs[i] = hash(str)

    query_chars = []
    for str in query_strs:
        for char in list(str):
            query_chars += [vocab.get_stoi()[char]]

    # 最大長超えたら切る
    if len(query_chars) > query_length:
        query_chars = query_chars[:query_length]

    # 最大長に足りない分は埋める
    if len(query_chars) < query_length:
        query_chars.extend([vocab.get_stoi()["q"]] * (query_length - len(query_chars)))

    return query_chars

def process_batch(batch):
    queries = []
    labels = []

    for query, lable in batch:
        query_chars = process_str(query)
        queries.append(query_chars)
        labels.append(int(lable))

    return (torch.LongTensor(queries),
            torch.LongTensor(labels))

train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=True)

In [10]:
class CGRUClassifier(nn.Module):
    def __init__(self):
        super(CGRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 30)
        self.convs1 = nn.ModuleList([nn.Conv2d(1, 48, kernel_size=(2, 30)) for _ in range(59)])
        self.convs2 = nn.ModuleList([nn.Conv2d(1, 48, kernel_size=(4, 30)) for _ in range(57)])
        self.conv2f = nn.Conv2d(1, 48, kernel_size=(4, 30))
        self.conv2l = nn.Conv2d(1, 48, kernel_size=(4, 30))
        self.gru = nn.GRU(48, 24, batch_first=True, bias=True)
        self.dropuot = nn.Dropout(0.5)
        self.fc = nn.Linear(24, 2)

    def forward(self, x):
        x = self.embedding(x)

        x1 = []
        for i, conv in enumerate(self.convs1):
            y = x[:, i:i+2, :]
            y = y.unsqueeze(1)
            y = conv(y)
            x1.append(y.squeeze())
        x1 = torch.stack(x1, dim=1)

        x2 = []
        y = x[:, 0:3, :].unsqueeze(1)
        y = self.conv_cal(y, self.conv2f)
        x2.append(y.squeeze())
        for i, conv in enumerate(self.convs2):
            y = x[:, i:i+4, :]
            y = y.unsqueeze(1)
            y = conv(y)
            x2.append(y.squeeze())
        y = x[:, 57:60, :].unsqueeze(1)
        y = self.conv_cal(y, self.conv2l)
        x2.append(y.squeeze())
        x2 = torch.stack(x2, dim=1)

        x = torch.add(x1, x2)

        # 最後の隠れ層のみ取り出す
        x = self.gru(x)[1]
        x = x.squeeze(0)
        x = F.relu(x)

        x = self.dropuot(x)
        x = self.fc(x)

        return x

    def conv_cal(self, y, conv):
        if y.shape[1] != 100:
            y_pad = torch.zeros((100, 1, 3, 30)).to(device)
            y_pad[:y.size(0)] = y
            y = y_pad
        zero = torch.zeros((100, 1, 1, 30)).to(device)
        y = torch.cat([y, zero], dim=2)
        y = conv(y)
        return y.squeeze()


network = CGRUClassifier().to(device)

In [11]:
hyperparameters = {
    "learning_rate": 0.01,
    "epoch": 30,
    "optimizer": optim.SGD(network.parameters(), lr=0.01),
    "lr_scheduler": {
        "step_size": 5,
        "gamma": 0.5,
    },
    "loss_fn": nn.CrossEntropyLoss(),
}

loss, accurancy = train(network, train_loader, device, dataset_size, 20, hyperparameters)
print(f"loss={loss}, accurancy={accurancy}")

# Save the model
torch.save(network.state_dict(), 'model.pth')

traing...
2000: accuracy=0.6665
4000: accuracy=0.66775
6000: accuracy=0.666
8000: accuracy=0.6685
10000: accuracy=0.6662
12000: accuracy=0.6644166666666667
14000: accuracy=0.6646428571428571
16000: accuracy=0.6625
18000: accuracy=0.6606111111111111
20000: accuracy=0.66035
22000: accuracy=0.6606363636363637
24000: accuracy=0.6597916666666667
26000: accuracy=0.6619615384615385
28000: accuracy=0.6618571428571428
30000: accuracy=0.6612333333333333
32000: accuracy=0.66053125
torch.Size([61, 1, 3, 30])


AttributeError: 'NoneType' object has no attribute 'squeeze'

: 