In [45]:
import sys
import collections
import re
sys.path.insert(0, '../')

import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from utils import load_dataset, train_epoch

In [46]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else device)

In [47]:
dataset = load_dataset("../dataset/sqli1.csv")
dataset_size = len(dataset)

In [72]:
def tokenizer(query):
    regex = r"(\/\*\*\/|\*\/|\/\*|\|\||\-\-\+|\-\-|\&\&|\!\=|\<\>|\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|[\w]+|.)"
    words = [word for word in re.split(regex, query) if word]
    return words

def create_vocab(dataset):
    counter = collections.Counter()
    for query, _ in dataset:
        counter.update(tokenizer(query))
    return torchtext.vocab.vocab(counter, min_freq=1)

vocab = create_vocab(dataset)
vocab_size = len(vocab)

def encode(x):
    encoded = []
    for str in tokenizer(x):
        encoded.append(vocab.get_stoi()[str])
    return encoded

def to_bow(query):
    bow = torch.zeros(vocab_size, dtype=torch.float32)
    for word_id in encode(query):
        bow[word_id] += 1
    return bow

In [93]:
def create_df(dataset):
    df = torch.zeros(vocab_size)
    for query, _ in dataset:
        for word_id in set(encode(query)):
            df[word_id] += 1
    return df

def create_tf_idf(input):
    df = create_df(dataset)
    bow = to_bow(input)
    return bow * torch.log((dataset_size+1)/(df+1))

In [128]:
tf_idf = create_tf_idf("1' or '1'='1")
print(tf_idf)

tensor([0.0000, 9.3175, 0.1116,  ..., 0.0000, 0.0000, 0.0000])


In [124]:
batch_size = 100

def process_batch(batch):
    querys = torch.Tensor()
    labels = []
    for query, label in batch:
        querys = torch.cat((querys, create_tf_idf(query)))
        labels.append(int(label))

    return (querys,
            torch.LongTensor(labels))

train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=process_batch, shuffle=True)

In [125]:
class BiLSTMNN(nn.Module):
    def __init__(self):
        super(BiLSTMNN, self).__init__()
        self.f_lstm = torch.nn.LSTM(10906, 32, batch_first=True)
        self.b_lstm = torch.nn.LSTM(10906, 32, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(64, 2)

    def forward(self, x):
        x = self.f_lstm(x)
        x = self.b_lstm(torch.flip(x))
        x = torch.add(x[0], x[1])
        x = self.dropout(x)
        x = self.fc(x)

        return x

network = BiLSTMNN().to(device)

In [126]:
learning_rate = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(network.parameters(), lr=learning_rate)
epoch = 50
train_epoch(
    network,
    train_loader,
    learning_rate,
    optimizer,
    loss_fn,
    epoch,
    device,
    dataset_size)

traing...


ValueError: LSTM: Expected input to be 2D or 3D, got 1D instead