In [None]:
import collections
import csv
import re
import sys

sys.path.insert(0, '../')

from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from utils import load_dataset, train

In [None]:
# Choose the compute device in priority order: CUDA, then Apple MPS, then CPU.
# Both availability probes are evaluated up front, MPS first.
has_mps = torch.backends.mps.is_available()
has_cuda = torch.cuda.is_available()

if has_cuda:
    device_name = "cuda"
elif has_mps:
    device_name = "mps"
else:
    device_name = "cpu"

device = torch.device(device_name)

In [None]:
# Load the samples once; `dataset` feeds the DataLoader and `dataset_size`
# is handed to `train` further down.
DATASET_PATH = "../dataset/sqli1.csv"
dataset = load_dataset(DATASET_PATH)
dataset_size = len(dataset)

In [None]:
def tokenizer(query):
    """Split a query string into SQL-aware tokens.

    The pattern is tried left to right, so multi-character tokens win over
    their prefixes: comment markers (``/**/``, ``*/``, ``/*``), the operators
    ``||``, ``--+``, ``--``, ``&&``, ``!=`` and ``<>``, dotted-quad numbers,
    runs of word characters, and finally any other single character.
    Whitespace is kept as tokens; only empty strings are discarded.

    Args:
        query: Text to tokenize.

    Returns:
        list[str]: The non-empty pieces of ``query`` in their original order.
    """
    token_pattern = r"(\/\*\*\/|\*\/|\/\*|\|\||\-\-\+|\-\-|\&\&|\!\=|\<\>|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[\w]+|.)"
    # Splitting on a capturing group returns the matched tokens themselves;
    # text that nothing matches (e.g. newlines) survives as its own piece.
    pieces = re.split(token_pattern, query)
    return list(filter(None, pieces))

with open('../dataset/sqli1.csv', 'r') as file:
    lines = file.readlines()

# Parse rows with the csv module rather than `line.split(',')`: SQL text
# routinely contains commas, and a naive split cuts a quoted query off at its
# first comma. Blank rows parse to an empty list and are skipped.
queries = [row[0] for row in csv.reader(lines) if row]

# `token_pattern=None` states explicitly that the custom tokenizer replaces the
# default pattern, which avoids scikit-learn's "token_pattern will not be used"
# warning.
vectorizer = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
# The first row is skipped (treated as the CSV header).
vectorizer.fit(queries[1:])

In [None]:
# Sanity check: vectorize one sample query and display only the TF-IDF
# weights of the terms that actually occur in it.
sample_query = "' AND 1 = utl_inaddr.get_host_address  (  (  SELECT banner FROM v$version WHERE ROWNUM = 1  )  )   AND 'i' = 'i"
x = vectorizer.transform([sample_query])
nonzero_index = x.toarray().nonzero()
x[nonzero_index]

In [None]:
batch_size = 100


def process_batch(batch):
    """Collate ``(query, label)`` pairs into TF-IDF features and class ids.

    Uses the module-level fitted ``vectorizer``. The whole batch is vectorized
    in a single call and converted to a tensor; ``torch.cat`` only accepts
    tensors, so the NumPy ``nonzero()`` tuple cannot be concatenated directly.

    Args:
        batch: Iterable of ``(query, label)`` pairs; ``label`` must be
            convertible with ``int()``.

    Returns:
        tuple: ``(features, labels)``. ``features`` is a float32 tensor of
        shape ``(len(batch), 1, n_features)``: one TF-IDF vector per query,
        laid out as a length-1 sequence so it can be fed to a ``batch_first``
        LSTM. ``labels`` is an int64 tensor of shape ``(len(batch),)``.
    """
    texts = []
    labels = []
    for query, label in batch:
        texts.append(query)
        labels.append(int(label))

    # Dense (batch, n_features) float64 matrix from the sparse TF-IDF output.
    tfidf = vectorizer.transform(texts).toarray()
    # float32 matches the default dtype of nn.Module parameters.
    features = torch.from_numpy(tfidf).float().unsqueeze(1)

    return (features,
            torch.tensor(labels, dtype=torch.long))


train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=True)

In [None]:
class BiLSTMNN(nn.Module):
    """Bidirectional LSTM classifier over TF-IDF feature vectors.

    Args:
        input_size: Number of features per time step, i.e. the size of the
            TF-IDF vocabulary. Defaults to 10906.
        hidden_size: Hidden units per LSTM direction.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, input_size=10906, hidden_size=32, num_classes=2, dropout=0.2):
        super().__init__()
        # No embedding layer: the LSTM consumes the TF-IDF vectors directly.
        self.bilstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # Forward and backward states are concatenated -> 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Float tensor of shape ``(batch, seq_len, input_size)``. A 2-D
                tensor is interpreted as ``(batch, input_size)`` and treated
                as a batch of length-1 sequences.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalized logits.
        """
        if x.dim() == 2:
            # nn.LSTM would read a 2-D input as one unbatched sequence, which
            # would mix the samples of a batch; add the sequence axis instead.
            x = x.unsqueeze(1)

        # nn.LSTM returns (output, (h_n, c_n)). For this single-layer
        # bidirectional LSTM, h_n is (2, batch, hidden_size): index -2 is the
        # forward direction's final state, index -1 the backward one.
        _, (hidden, _) = self.bilstm(x)
        x = torch.cat((hidden[-2], hidden[-1]), dim=1)
        x = self.dropout(x)
        x = self.fc(x)

        return x


# NOTE(review): with one TF-IDF vector per query the LSTM only ever sees a
# single time step; real sequence modelling would need per-token inputs.
# Size the input from the fitted vectorizer so the model always matches the
# feature width produced for the current dataset.
network = BiLSTMNN(input_size=len(vectorizer.vocabulary_)).to(device)

In [None]:
# The step size is defined once and shared by the reported value and Adam.
learning_rate = 0.01
optimizer = optim.Adam(network.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

hyperparameters = {
    "learning_rate": learning_rate,
    "epoch": 50,
    "optimizer": optimizer,
    "loss_fn": loss_fn,
}

# NOTE(review): the meaning of the positional 130 is defined by utils.train
# and is not visible in this notebook; confirm and name it.
train(network, train_loader, device, dataset_size, 130, hyperparameters)