In [1]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
data_source = pd.read_csv('dataset_capec_combine.csv')
data_source = data_source[data_source['label']=='242 - Code Injection']
data_target  = pd.read_csv('dataset_capec_transfer.csv')
data_target = data_target[data_target['label']=='242 - Code Injection']
X_source = data_source['text'].str.replace('/',' ')
y_source = data_source['label']
X_target  = data_target ['text'].str.replace('/','')
y_target  = data_target ['label']
rlist =['000 - Normal', '126 - Path Traversal',
       '153 - Input Data Manipulation', '194 - Fake the Source of Data',
       '242 - Code Injection', '310 - Scanning for Vulnerable Software',
       '34 - HTTP Response Splitting']
mapping = {l: i+1 for i, l in enumerate(rlist)}
y_source = [mapping[s] for s in y_source] 
y_target  = [mapping[r] for r in y_target ]
y_source = np.array(y_source)
y_target  = np.array(y_target )

In [3]:
data_source.value_counts()

text                                                                                                    label       
GET /blog/index.php/comments/feed/                                                                      000 - Normal    14245
GET /blog/index.php/2020/04/04/ipsa-ea-porro-distinctio/                                                000 - Normal     4367
GET /blog/index.php/2020/04/04/hic-porro-nihil-non-rerum/                                               000 - Normal     4243
GET /blog/index.php/feed/                                                                               000 - Normal     4132
GET /blog/index.php/2020/04/04/explicabo-qui-fuga-distinctio-dolores-voluptatibus-sit/                  000 - Normal     3249
                                                                                                                        ...  
GET /blog/index.php/2020/etc%2Fpasswd/29/distinctio-sequi-officiis-occaecati/embed                      000 - Normal        1
G

In [4]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_source)
X_source = tokenizer.texts_to_sequences(X_source)
X_source = pad_sequences(X_source, maxlen=40)

tokenizer.fit_on_texts(X_target )
X_target = tokenizer.texts_to_sequences(X_target )
X_target = pad_sequences(X_target , maxlen=40)

In [5]:
X_source.shape

(226509, 40)

In [6]:
device = ""
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [7]:
class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(40, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        x = x.view(x.size(0), 40)
        output = self.model(x)
        return output

In [8]:
discriminator = Discriminator().to(device=device)

In [9]:
class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(40, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 40),
            nn.Tanh(),
        )

    def forward(self, x):
        output = self.model(x)
        output = output.view(x.size(0), 40)
        return output

generator = Generator().to(device=device)

In [10]:
lr = 0.00001
num_epochs = 1000
batch_size=64
loss_function = nn.BCELoss()

optimizer_discriminator = torch.optim.Adam(discriminator.parameters(), lr=lr)
optimizer_generator = torch.optim.Adam(generator.parameters(), lr=lr)

In [11]:
def normalize_tensor(tensor):
    # Tính giá trị trung bình và độ lệch chuẩn của tensor
    tensor = tensor.float()
    mean = tensor.mean()
    std = tensor.std()

    # Chuẩn hóa Z-score
    normalized_tensor = (tensor - mean) / std

    return normalized_tensor

In [12]:
X_source = torch.from_numpy(X_source).to(device=device)
X_source = normalize_tensor(X_source)
X_target = torch.from_numpy(X_target).to(device=device)
X_target = normalize_tensor(X_target)

In [13]:
for epoch in range(num_epochs):
    idx = np.random.randint(0, X_source.shape[0], batch_size)
    real_samples  = X_source[idx]
    real_samples_labels = torch.ones((batch_size,1)).to(device=device)
    latent_space_samples = torch.rand((batch_size,40)).to( device=device)
    
    generated_samples = generator(latent_space_samples)
    generated_samples_labels = torch.zeros((batch_size, 1)).to(
            device=device
        )
    all_samples = torch.cat((real_samples, generated_samples))
    all_samples_labels = torch.cat(
            (real_samples_labels, generated_samples_labels)
        )

        # Training the discriminator
    discriminator.zero_grad()
    output_discriminator = discriminator(all_samples)
    loss_discriminator = loss_function(
            output_discriminator, all_samples_labels
        )
    loss_discriminator.backward()
    optimizer_discriminator.step()

        # Data for training the generator
    latent_space_samples = torch.rand((batch_size,40)).to(device=device)

        # Training the generator
    generator.zero_grad()
    generated_samples = generator(latent_space_samples)
    output_discriminator_generated = discriminator(generated_samples)
    loss_generator = loss_function(
            output_discriminator_generated, real_samples_labels
        )
    loss_generator.backward()
    optimizer_generator.step()
    if(epoch%100==0):
        print(f"Epoch: {epoch+100} Loss D.: {loss_discriminator}")
        print(f"Epoch: {epoch+100} Loss G.: {loss_generator}")
 

Epoch: 100 Loss D.: 0.6965636610984802
Epoch: 100 Loss G.: 0.6972460150718689
Epoch: 200 Loss D.: 0.6272693872451782
Epoch: 200 Loss G.: 0.6768917441368103
Epoch: 300 Loss D.: 0.6509124636650085
Epoch: 300 Loss G.: 0.5995656847953796
Epoch: 400 Loss D.: 0.617836058139801
Epoch: 400 Loss G.: 0.7476617693901062
Epoch: 500 Loss D.: 0.5930208563804626
Epoch: 500 Loss G.: 0.7631440758705139
Epoch: 600 Loss D.: 0.5204918384552002
Epoch: 600 Loss G.: 0.7584710121154785
Epoch: 700 Loss D.: 0.48490265011787415
Epoch: 700 Loss G.: 0.8943015933036804
Epoch: 800 Loss D.: 0.4583350718021393
Epoch: 800 Loss G.: 1.0644950866699219
Epoch: 900 Loss D.: 0.5489089488983154
Epoch: 900 Loss G.: 1.0157688856124878
Epoch: 1000 Loss D.: 0.4010257124900818
Epoch: 1000 Loss G.: 1.2658493518829346


In [14]:
k = int (X_source.shape[0]*0.2/64)
for i in range(k):
    latent_space_samples = torch.rand((batch_size,40)).to(device=device)
    X_source = torch.cat([X_source, generator(latent_space_samples)], dim=0)
torch.save(X_source, 'tensor_000.pt')

In [15]:
generator(latent_space_samples).shape

torch.Size([64, 40])

In [16]:
X_source.shape

torch.Size([271757, 40])

In [17]:
def count_elements_greater_than(tensor, threshold):
    # Áp dụng phép so sánh >= threshold
    mask = tensor >= threshold

    # Tính tổng các phần tử True trong mask
    count = torch.sum(mask)

    return count.item()
            

In [20]:
X_target

tensor([], size=(0, 40))

In [18]:
predicted_labels = discriminator(X_target)

cou = count_elements_greater_than(predicted_labels, 0.5)

cou/predicted_labels.shape[0]

ZeroDivisionError: division by zero