### Import Thư Viện

In [1]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import json
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


### Load dataset

In [2]:
data_source = pd.read_csv('data_source.csv')
#data_source = data_source[data_source['label']=='153 - Input Data Manipulation']
#data_source, _ = train_test_split(data_source, test_size=0.75, random_state=42)
data_target  = pd.read_csv('data_target.csv')
#data_target, _ = train_test_split(data_target, test_size=0.9, random_state=42)

data_target = data_target.sample(frac=1, random_state=42)
#data_target = data_target[data_target['label']=='153 - Input Data Manipulation']
X_source = data_source['text'].str.replace('/',' ')
y_source = data_source['label']
X_target  = data_target ['text'].str.replace('/','')
y_target  = data_target ['label']
rlist =['194 - Fake the Source of Data', '66 - SQL Injection',
       '34 - HTTP Response Splitting', '126 - Path Traversal',
       '000 - Normal', '272 - Protocol Manipulation',
       '310 - Scanning for Vulnerable Software', '242 - Code Injection',
       '153 - Input Data Manipulation']
mapping = {l: i+1 for i, l in enumerate(rlist)}
y_source = [mapping[s] for s in y_source] 
y_target  = [mapping[r] for r in y_target ]
y_source = np.array(y_source)
y_target  = np.array(y_target )


### Trích xuất đặc trưng bẳng BERT

In [4]:
# model_name = 'jackaduma/SecBERT'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

# X_source = list(X_source)
# X_target = list(X_target)
# input_texts_source = X_source
# input_text_target = X_target

# X_train = []
# X_test = []

# for i , text in enumerate(input_texts_source):

#     tokens = tokenizer.tokenize(text)

#     token_ids = tokenizer.convert_tokens_to_ids(tokens)

#     input_tensor = torch.tensor([token_ids])
#     with torch.no_grad():
#         outputs = model(input_tensor)

#     last_hidden_state = outputs.last_hidden_state

#     first_token_features = last_hidden_state[0, 0, :]

#     X_train.append(first_token_features)
#     if i%1000==0 :
#         print(i/len(X_source))

# for j , text in enumerate(input_text_target):

#     tokens = tokenizer.tokenize(text)

#     token_ids = tokenizer.convert_tokens_to_ids(tokens)

#     input_tensor = torch.tensor([token_ids])
#     with torch.no_grad():
#         outputs = model(input_tensor)

#     last_hidden_state = outputs.last_hidden_state

#     first_token_features = last_hidden_state[0, 0, :]

#     X_test.append(first_token_features)
#     if j%1000==0 :
#         print(j/len(X_target))
# X_test = torch.stack(X_test)
# X_train = torch.stack(X_train)


### Convert Dữ liệu chưa biết có nhãn là 0, đã biết nhãn là 1

In [4]:
y_target = np.where(np.logical_or(y_target == 1, y_target == 3), 0, 1)

### Feature Extraction

In [5]:
class Vectorizer:
    def __init__(self, method='BOW', ngram_range=(1, 1), max_features=300, emb_fname='', word_index_fname=''):
        self.method = method
        if self.method == 'BOW':
            self.vectorizer = CountVectorizer(analyzer='word', input='content', ngram_range=ngram_range, max_features=max_features)
        elif self.method == 'TFIDF':
            self.vectorizer = TfidfVectorizer(analyzer='word', input='content', max_features=max_features)
        elif self.method == 'Word2Vec':
            self.max_features = max_features
            self.emb_fname = emb_fname
            self.word_index_fname = word_index_fname
        else:
            raise ValueError('Feature extraction method does not exist.')

    def feature_extraction(self, X_train, X_test):
        train_data = self.vectorizer.fit_transform(X_train).toarray()
        test_data = self.vectorizer.transform(X_test).toarray()
        return train_data, test_data

    def get_word_index(self):
        word2id = json.load(open(self.word_index_fname, 'r'))
        return word2id

    def get_embedding_matrix(self):
        np.random.seed(0)
        word2id = self.get_word_index()
        embedding_matrix = np.random.uniform(-0.25, 0.25, [len(word2id) + 1, self.max_features])
        with open(self.emb_fname, 'r', encoding='utf-8') as f:
            for line in f:
                content = line.split(' ')
                if content[0] in word2id:
                    embedding_matrix[word2id[content[0]]] = np.array(list(map(float, content[1:])))
        return embedding_matrix

### TFTDF

In [9]:
# vectorizer = Vectorizer('TFIDF')
# X_train, X_test = vectorizer.feature_extraction(X_source, X_target)

### WORD2VEC

In [6]:

sentences = [sentence.split() for sentence in X_source]
w2v_model = Word2Vec(sentences, window=5, min_count=5, workers=4)

def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train = np.array([vectorize(sentence) for sentence in X_source])
X_test = np.array([vectorize(sentence) for sentence in X_target])

### Sử dụng GPU

In [12]:
device = ""
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

### Set các giá trị đầu vào

In [13]:
input_layer = X_train.shape[1]
input_size = input_layer
hidden_size =64

### Hàm Discriminator

In [14]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_layer, 1024),
            # nn.ReLU(),
            # nn.Dropout(0.3),
            # nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = x.view(x.size(0), input_layer)
        output = self.model(x)
        return output
    
discriminator = Discriminator().to(device=device)

### Hàm Generator

In [16]:
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_layer, 256),
            nn.ReLU(),
            nn.Dropout(0.3), 
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, input_layer),
            nn.Tanh(),
        )

    def forward(self, x):
        output = self.model(x)
        output = output.view(x.size(0), input_layer)
        return output

generator = Generator().to(device=device)

### Khởi tạo các giá trị laerning rate, epochs, batch_size, hàm loss, optimization

In [17]:
lr = 0.0001
num_epochs = 2000
batch_size= 32
loss_function = nn.BCELoss()
optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr=lr)

### Dùng để chuyển đổi dạng đầu vào nếu dùng trích xuất là Bert

In [18]:
# X_source = X_train.to(device=device, dtype=torch.float32)
# X_target = X_test.to(device=device, dtype=torch.float32)

In [19]:
X_source = torch.from_numpy(X_train).to(device=device, dtype=torch.float32)
X_target = torch.from_numpy(X_test).to(device=device, dtype=torch.float32)

### Tranining

In [20]:
for epoch in range(num_epochs):
    idx = np.random.randint(0, X_source.shape[0], batch_size)
    real_samples  = X_source[idx]
    real_samples_labels = torch.ones((batch_size,1)).to(device=device)
    latent_space_samples = torch.rand((batch_size,input_layer)).to( device=device)
    
    generated_samples = generator(latent_space_samples)
    generated_samples_labels = torch.zeros((batch_size, 1)).to(
            device=device
        )
    all_samples = torch.cat((real_samples, generated_samples))
    all_samples_labels = torch.cat(
            (real_samples_labels, generated_samples_labels)
        )

        # Training the discriminator
    discriminator.zero_grad()
    
    output_discriminator = discriminator(all_samples)
    all_samples_labels = all_samples_labels.view(-1, 1)
    loss_discriminator = loss_function(
            output_discriminator, all_samples_labels
        )
    loss_discriminator.backward()
    optimizer_discriminator.step()

        # Data for training the generator
    latent_space_samples = torch.rand((batch_size,input_layer)).to(device=device)

        # Training the generator
    generator.zero_grad()
    generated_samples = generator(latent_space_samples)
    output_discriminator_generated = discriminator(generated_samples)
    loss_generator = loss_function(
            output_discriminator_generated, real_samples_labels
        )
    loss_generator.backward()
    optimizer_generator.step()
    if(epoch%100==0):
        predicted_labels = discriminator(X_target)
        new_tensor = torch.where(predicted_labels >= 0.5, torch.tensor(1), torch.tensor(0))
        new_label = new_tensor.numpy()
        
        # print(f"accuracy: {accuracy_score(y_target, new_label)}")
        print(f"Epoch: {epoch/100} Loss D.: {loss_discriminator}")
        print(f"Epoch: {epoch/100} Loss G.: {loss_generator}")
 

accuracy: 0.370782473082806
accuracy: 0.6291275530096272
accuracy: 0.6067840326305372
accuracy: 0.6239390576732747
accuracy: 0.6272381009507243
accuracy: 0.6294274660348499
accuracy: 0.6213598056563596
accuracy: 0.6293674834298053
accuracy: 0.6279578922112588
accuracy: 0.37855022043607356
accuracy: 0.378490237831029
accuracy: 0.6291275530096272
accuracy: 0.5028341780883544
accuracy: 0.6294274660348499
accuracy: 0.6045646762438893
accuracy: 0.6247188315388538
accuracy: 0.6293974747323277
accuracy: 0.5841106079237022
accuracy: 0.6295174399424167
accuracy: 0.5316258285097322


### chuyển tensor output về 0 và 1

In [22]:
predicted_labels = discriminator(X_target)
new_tensor = torch.where(predicted_labels >= 0.5, torch.tensor(1), torch.tensor(0))

In [23]:
new_label = new_tensor.numpy()

### Kết quả phân loại trên tập test

In [25]:
accuracy = accuracy_score(y_target, new_label)
accuracy

0.6285577182617041

In [26]:
print('accuracy =', accuracy_score(y_target, new_label))
print(classification_report(y_target,new_label))
print('Confusion Matrix: \n',confusion_matrix(y_target, new_label))

accuracy = 0.6285577182617041
              precision    recall  f1-score   support

           0       0.38      0.00      0.01     12353
           1       0.63      1.00      0.77     20990

    accuracy                           0.63     33343
   macro avg       0.50      0.50      0.39     33343
weighted avg       0.54      0.63      0.49     33343

Confusion Matrix: 
 [[   49 12304]
 [   81 20909]]
