In [1]:
import csv

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.autograd import Variable
import gensim.models

In [2]:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Use the CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
class MyCorpus:
    """Restartable iterable over the first CSV column, one character list per row.

    A new file handle is opened on every ``__iter__`` call, so the corpus can
    be traversed more than once.

    Args:
        dataset_path: CSV file whose first line is a header and whose first
            column holds the query text. Defaults to the notebook's dataset.

    Yields:
        list[str]: the characters of ``row[0]`` for every non-blank row.
    """

    def __init__(self, dataset_path='../../dataset/sqli1.csv'):
        self.dataset_path = dataset_path

    def __iter__(self):
        # newline='' hands newline handling to the csv module, so line breaks
        # embedded in quoted fields are preserved.
        with open(self.dataset_path, 'r', newline='') as file:
            reader = csv.reader(file)
            # Skip the header; the default keeps an empty file from raising
            # StopIteration inside this generator.
            next(reader, None)
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines.
                    continue
                # Emit an explicit token list (one token per character)
                # instead of a bare string.
                yield list(row[0])


# Fit Word2Vec on the corpus with 32-component vectors and keep only the
# trained vector table exposed through ``.wv``.
word_model = gensim.models.Word2Vec(
    sentences=MyCorpus(),
    vector_size=32,
).wv

In [4]:
# Constant 32-element placeholder vectors (same width as the Word2Vec
# vector_size used above): one of all ones, one of all zeros.
fill_one = [1] * 32
fill_zero = [0] * 32

In [5]:
dataset_path = "../../dataset/sqli1.csv"  # Add your url path here
dataset = []
# newline='' hands newline handling to the csv module, so line breaks
# embedded in quoted fields are preserved.
with open(dataset_path, 'r', newline='') as file:
    reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising.
    next(reader, None)
    for row in reader:
        # csv.reader yields [] for blank lines; keep only real records.
        if row:
            dataset.append(row)


In [None]:
class TrainData(Dataset):
    """Pre-batched, vectorised (query, label) samples read from a CSV file.

    The file is read once in ``__init__``. Every query becomes exactly
    ``word_vec_num`` vectors (one per character, truncated or zero-padded) and
    consecutive samples are grouped into batches of ``batch_size``.

    Item ``i`` is the pair ``(features[i], labels[i])``: one whole batch of
    shape ``(batch_size, word_vec_num, vec_size)`` and its ``(batch_size,)``
    int64 labels. ``len()`` is the number of complete batches. A trailing
    group smaller than ``batch_size`` is dropped because all batches are
    stacked into a single tensor.

    Args:
        dataset_path: CSV file with a header line, query text in column 0 and
            an integer label in column 1.
        word_model: mapping from a character to its vector; must raise
            ``KeyError`` for unknown characters.
        word_vec_num: number of vectors produced per query.
        batch_size: number of samples per stored batch.
        p: stored on the instance; not used by this class.
    """

    def __init__(self, dataset_path, word_model, word_vec_num, batch_size, p):
        self.dataset_path = dataset_path
        self.word_model = word_model
        self.word_vec_num = word_vec_num
        self.batch_size = batch_size
        self.p = p
        # Filler vectors are sized from the model when it advertises a
        # vector_size, otherwise the notebook's embedding width of 32 is used.
        vec_size = getattr(word_model, "vector_size", 32)
        self._unknown_vec = [1] * vec_size
        self._pad_vec = [0] * vec_size
        self.features, self.labels = self._load_data()
        # Kept for callers that read the raw [features, labels] pair directly.
        self.data = [self.features, self.labels]

    def __getitem__(self, index):
        """Return the ``index``-th batch as ``(features, labels)``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of complete batches."""
        return len(self.labels)

    def _load_data(self):
        """Vectorise the whole file and stack the complete batches.

        Returns:
            tuple: ``(features, labels)`` tensors whose first axis indexes batches.
        """
        batch_query_vecs = []
        batch_lables = []
        query_vecs = []
        lables = []

        for query, label in self._query_iter():
            query_vecs.append(self._get_char_vector(query))
            lables.append(int(label))

            if len(query_vecs) == self.batch_size:
                batch_query_vecs.append(query_vecs)
                batch_lables.append(lables)
                query_vecs = []
                lables = []

        feature = torch.Tensor(batch_query_vecs)
        # Class indices are stored as int64, the dtype class-index losses expect.
        lable = torch.tensor(batch_lables, dtype=torch.long)
        return feature, lable

    def _query_iter(self):
        """Yield ``(query, label)`` string pairs from the CSV, skipping the header."""
        # newline='' hands newline handling to the csv module, so line breaks
        # embedded in quoted fields are preserved.
        with open(self.dataset_path, 'r', newline='') as file:
            reader = csv.reader(file)
            # The default keeps an empty file from raising StopIteration
            # inside this generator.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    # Blank or truncated line: there is no label to pair with.
                    continue
                yield row[0], row[1]

    def _get_char_vector(self, query):
        """Map ``query`` to a list of exactly ``word_vec_num`` vectors.

        Characters missing from ``word_model`` become an all-ones vector and
        short queries are padded with all-zero vectors.
        """
        char_vec = []
        if len(query) == 1:
            # NOTE(review): an unknown single-character query is left empty
            # (so it ends up all padding) rather than getting the all-ones
            # vector used in the loop below; kept as in the original.
            try:
                char_vec.append(self.word_model[query])
            except KeyError:
                pass
        else:
            for char in query[:self.word_vec_num]:
                try:
                    char_vec.append(self.word_model[char])
                except KeyError:
                    # Only a missing key means "unknown character"; any other
                    # exception is a real error and must propagate.
                    char_vec.append(self._unknown_vec)

        char_vec.extend([self._pad_vec] * (self.word_vec_num - len(char_vec)))
        return char_vec

# Parameters
batch_size = 16
word_vec_num = 256
p = 128

# Bound to its own name so the raw CSV rows held in `dataset` are not
# overwritten: the DataLoader cell below feeds `dataset` to `process_batch`,
# which unpacks each item as a (query, label) pair.
train_dataset = TrainData(dataset_path, word_model, word_vec_num, batch_size, p)

In [6]:
# Parameters
p = 128             # divisor used by TextCNN's elastic pooling step
word_vec_num = 256  # number of character vectors kept per query
batch_size = 16     # mini-batch size

def get_char_vector(query, model=None, max_len=None):
    """Map ``query`` to a list of exactly ``max_len`` character vectors.

    Each character is looked up in ``model``. Characters the model does not
    know become an all-ones vector; queries shorter than ``max_len`` are padded
    with all-zero vectors and longer ones are truncated.

    Args:
        query: the text to vectorise.
        model: mapping from a character to its vector that raises ``KeyError``
            for unknown characters. Defaults to the module-level ``word_model``.
        max_len: number of vectors to return. Defaults to the module-level
            ``word_vec_num``.

    Returns:
        list: ``max_len`` vectors (model vectors and filler lists).
    """
    if model is None:
        model = word_model
    if max_len is None:
        max_len = word_vec_num

    # Filler vectors are sized from the model when it advertises a
    # vector_size, otherwise the notebook's embedding width of 32 is used.
    vec_size = getattr(model, "vector_size", 32)
    unknown_vec = [1] * vec_size
    pad_vec = [0] * vec_size

    char_vec = []
    if len(query) == 1:
        # NOTE(review): an unknown single-character query is left empty (so it
        # ends up all padding) rather than getting the all-ones vector used in
        # the loop below; kept as in the original.
        try:
            char_vec.append(model[query])
        except KeyError:
            pass
    else:
        for char in query[:max_len]:
            try:
                char_vec.append(model[char])
            except KeyError:
                # Only a missing key means "unknown character"; any other
                # exception is a real error and must propagate.
                char_vec.append(unknown_vec)

    char_vec.extend([pad_vec] * (max_len - len(char_vec)))
    return char_vec

def process_batch(batch):
    """Collate raw ``(query, label)`` rows into model-ready tensors.

    Args:
        batch: sequence of two-element rows, query text first and a label
            convertible with ``int()`` second.

    Returns:
        tuple: features viewed as ``(len(batch), 1, word_vec_num, 32)`` and an
        int64 label tensor of shape ``(len(batch),)``.
    """
    query_vec = [get_char_vector(query) for query, _ in batch]
    lables = [int(lable) for _, lable in batch]

    features = torch.Tensor(query_vec).view(len(batch), 1, word_vec_num, 32)
    # Class indices must be int64: NLLLoss / CrossEntropyLoss reject float
    # tensors as class-index targets.
    return features, torch.tensor(lables, dtype=torch.long)


# Take the batch size from the `batch_size` parameter defined above rather
# than repeating the literal, so the two cannot drift apart.
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=True)

In [7]:
class TextCNN(nn.Module):
    """Three-branch CNN (1x1, 3x3 and 5x5 stems) with a two-class log-softmax head.

    Each branch applies two convolutions of its own kernel size and a 2x2 max
    pool, then goes through a shared conv3/conv4/conv5 trunk. The flattened
    branch outputs are concatenated and classified by ``fc1`` -> ``fc2``.

    Args:
        input_shape: ``(height, width)`` of the single-channel input the
            network will receive. ``fc1`` is sized for this shape, so inputs
            of another size fail in ``fc1`` with a shape error.
        pool_divisor: divisor used by ``_elastic_pool``. ``None`` (default)
            reads the module-level ``p`` at construction time.
    """

    def __init__(self, input_shape=(256, 32), pool_divisor=None):
        super().__init__()
        self.pool_divisor = p if pool_divisor is None else pool_divisor

        self.conv1_1 = nn.Conv2d(1, 32, kernel_size=(1, 1), padding=1)
        self.conv1_3 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1)
        self.conv1_5 = nn.Conv2d(1, 32, kernel_size=(5, 5), padding=1)
        self.conv2_1 = nn.Conv2d(32, 64, kernel_size=(1, 1), padding=1)
        self.conv2_3 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1)
        self.conv2_5 = nn.Conv2d(32, 64, kernel_size=(5, 5), padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1)
        self.conv4 = nn.Conv2d(128, 256, kernel_size=(3, 3), padding=1)
        self.conv5 = nn.Conv2d(256, 512, kernel_size=(3, 3), padding=1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # fc1 is a registered layer: it is returned by parameters(), follows
        # .to(device) and keeps its weights between calls. Building it inside
        # forward() produced a fresh, untrained CPU layer on every call.
        self.fc1 = nn.Linear(self._flattened_size(input_shape), 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return log-probabilities of shape ``(N, 2)`` for input ``(N, 1, H, W)``."""
        x = self._features(x)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)

        # Normalise over the class axis explicitly; the implicit-dim form is
        # deprecated.
        return F.log_softmax(x, dim=1)

    def _features(self, x):
        """Run the three branches and concatenate their flattened outputs."""
        x_1 = F.relu(self.conv1_1(x))
        x_1 = F.relu(self.conv2_1(x_1))
        x_1 = F.max_pool2d(x_1, 2)
        x_1 = self._common_forward(x_1)

        x_3 = F.relu(self.conv1_3(x))
        x_3 = F.relu(self.conv2_3(x_3))
        x_3 = F.max_pool2d(x_3, 2)
        x_3 = self._common_forward(x_3)

        x_5 = F.relu(self.conv1_5(x))
        x_5 = F.relu(self.conv2_5(x_5))
        x_5 = F.max_pool2d(x_5, 2)
        x_5 = self._common_forward(x_5)

        return torch.cat([x_1, x_3, x_5], dim=1)

    def _flattened_size(self, input_shape):
        """Return the per-sample width of ``_features`` for ``input_shape``."""
        was_training = self.training
        # eval() disables dropout for the probe pass.
        self.eval()
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_shape)
            size = self._features(probe).shape[1]
        self.train(was_training)
        return size

    def _common_forward(self, x):
        """Shared trunk: conv3 -> pool -> conv4 -> elastic pool -> conv5 -> pool -> flatten."""
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2)

        x = F.relu(self.conv4(x))
        x = self._elastic_pool(x)

        x = F.relu(self.conv5(x))
        x = F.max_pool2d(x, 2)

        x = self.dropout1(x)
        return torch.flatten(x, 1)

    # Old (misspelled) name kept as an alias.
    _common_formard = _common_forward

    def _elastic_pool(self, x):
        """Max-pool ``x`` with a ``(x.shape[1] // pool_divisor, 2)`` window."""
        # NOTE(review): x is conv4's output, so shape[1] is its channel count
        # (256), not a row count, despite the name `num_rows`. Kept as in the
        # original; confirm whether shape[2] was intended.
        num_rows = x.shape[1] // self.pool_divisor

        return F.max_pool2d(x, (num_rows, 2))

# Create the model
network = TextCNN().to(device)
network.train()

# Define the loss function and optimizer
# TextCNN.forward already ends in F.log_softmax, so the matching criterion is
# NLLLoss (the default used by train_epoch below). CrossEntropyLoss would apply
# log-softmax a second time.
loss_function = nn.NLLLoss()
optimizer = optim.Adadelta(network.parameters())

In [None]:
# Train the model
for epoch in range(30):
    for inputs, labels in train_loader:
        # Batches have to be on the same device as the network, and the loss
        # needs int64 class indices. Plain tensors are used directly; the
        # deprecated Variable wrapper is not needed.
        inputs = inputs.to(device)
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = network(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()


In [None]:
def train_epoch(network, dataloader, learning_rate=0.01, optimizer=None, loss_fn=torch.nn.NLLLoss(), epoch_size=None, report_freq=200):
    """Train ``network`` for one pass over ``dataloader``.

    Args:
        network: module whose output has one score per class along dim 1.
        dataloader: iterable of ``(features, labels)`` tensor pairs.
        learning_rate: learning rate of the Adam optimizer that is created
            when ``optimizer`` is not given.
        optimizer: optimizer to use; ``None`` creates ``torch.optim.Adam``.
        loss_fn: criterion called as ``loss_fn(output, labels)``.
        epoch_size: when set, stop after more than this many samples.
        report_freq: print the running accuracy every ``report_freq`` batches.

    Returns:
        tuple[float, float]: ``(sum of per-batch losses / samples seen,
        correct predictions / samples seen)``; ``(0.0, 0.0)`` when the
        dataloader yields nothing.
    """
    print("traing...")

    if optimizer is None:
        optimizer = torch.optim.Adam(network.parameters(), lr=learning_rate)
    # Tell the network it is being trained.
    network.train()

    # Send batches to wherever the model's parameters live instead of relying
    # on a module-level `device` variable.
    first_param = next(network.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss, correct, count, i = 0.0, 0, 0, 0
    for features, labels in dataloader:
        labels = labels.long()
        features, labels = features.to(target_device), labels.to(target_device)

        optimizer.zero_grad()
        out = network(features)
        loss = loss_fn(out, labels)

        loss.backward()
        optimizer.step()
        # Accumulate plain Python numbers so the running totals do not hold
        # on to tensors or their autograd history.
        total_loss += loss.item()
        _, predicted = torch.max(out, 1)
        correct += (predicted == labels).sum().item()
        count += len(labels)

        i += 1
        # `i` counts batches within this pass, not epochs.
        print(f"batch: {i}")
        if i % report_freq == 0:
            print(f"{count}: accuracy={correct/count}")

        if epoch_size and count > epoch_size:
            print(epoch_size)
            print(count)
            break

    if count == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    return total_loss / count, correct / count

# One training pass: stop once more than 10000 samples have been seen and
# report the running accuracy every 20 batches.
train_epoch(
    network,
    train_loader,
    epoch_size=10000,
    report_freq=20,
)

# Save the model (weights only, via its state_dict)
torch.save(obj=network.state_dict(), f='ep-cnn-test.pth')