In [1]:
import torch
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
# --- Load the training frame and drop two classes ---------------------------
# The real samples of classes 242 and 153 are removed from the training data;
# rows for those two class ids are appended further down in this notebook from
# tensors stored under Data_GAN/.
data_train = pd.read_csv("dataset_capec_combine.csv")
dropped_labels = ['242 - Code Injection', '153 - Input Data Manipulation']
data_train = data_train[~data_train['label'].isin(dropped_labels)]

# --- Load the evaluation frame (kept unfiltered) -----------------------------
data_test = pd.read_csv("dataset_capec_transfer.csv")

# --- Raw texts: turn '/' into a space before tokenisation --------------------
X_train = data_train['text'].str.replace('/',' ')
X_test = data_test['text'].str.replace('/',' ')

# --- Label encoding ----------------------------------------------------------
# Class ids are 1-based and follow the order of `rlist`.
rlist =['000 - Normal', '126 - Path Traversal',
       '153 - Input Data Manipulation', '194 - Fake the Source of Data',
       '242 - Code Injection', '310 - Scanning for Vulnerable Software',
       '34 - HTTP Response Splitting']

mapping = {label: class_id for class_id, label in enumerate(rlist, start=1)}

# A label that is not listed in `rlist` raises KeyError here.
y_train = np.array([mapping[label] for label in data_train['label']])
y_test  = np.array([mapping[label] for label in data_test['label']])

In [3]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Build the vocabulary from the training texts only.
# fit_on_texts iterates over its argument, so it must receive the iterable of
# strings (X_train). Passing the DataFrame would iterate over its column
# names and the vocabulary would contain only the header words.
tokenizer = Tokenizer(num_words=5000)  # vocabulary capped by word frequency
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=40)  # fixed length of 40 token ids

# Encode the test texts with the vocabulary learned on the training data.
# The tokenizer is deliberately NOT fitted again: a second fit updates the
# word counts and rebuilds the word index, so train and test would no longer
# share one id mapping, and test vocabulary would leak into preprocessing.
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=40)

In [5]:
def normalize_tensor(tensor, mean=None, std=None):
    """Z-score normalise a tensor using global (all-element) statistics.

    Args:
        tensor: Input tensor of any shape; it is cast to float first.
        mean: Optional precomputed mean. When omitted, the mean over all
            elements of ``tensor`` is used.
        std: Optional precomputed standard deviation. When omitted,
            ``tensor.std()`` over all elements is used.

    Returns:
        A float tensor with the same shape as ``tensor`` holding
        ``(tensor - mean) / std``. If the standard deviation is zero or not
        finite (constant or single-element input), the centred values are
        returned undivided instead of NaN/inf.
    """
    tensor = tensor.float()

    # Fall back to the tensor's own statistics unless the caller supplies
    # them (e.g. to reuse statistics computed on another tensor).
    if mean is None:
        mean = tensor.mean()
    if std is None:
        std = tensor.std()

    # Guard the division: `std > 0` is False for 0 and for NaN, and the
    # isfinite check also rejects inf, so degenerate values become 1.
    std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device)
    safe_std = torch.where((std > 0) & torch.isfinite(std), std, torch.ones_like(std))

    # Z-score normalisation
    normalized_tensor = (tensor - mean) / safe_std

    return normalized_tensor

In [6]:
# Move the padded id matrices onto the selected device.
X_train = torch.from_numpy(X_train).to(device=device)
X_test = torch.from_numpy(X_test).to(device=device)

# Standardise BOTH splits with the statistics of the training split, so that a
# given token id maps to the same normalised value in train and test.
# (Normalising the test split with its own mean/std would shift and rescale
# it differently from the data the model is trained on.)
train_mean = X_train.float().mean()
train_std = X_train.float().std()
X_test = (X_test.float() - train_mean) / train_std

# The training split goes through the notebook's helper, which computes the
# same global mean/std from X_train itself.
X_train = normalize_tensor(X_train)

In [7]:
# Encoded training labels as a 1-D tensor (created on the CPU).
y_train = torch.tensor(y_train)

# Append the labels as the last column of the feature matrix.
# torch.cat requires all inputs on the same device, so the label column is
# moved to X_train's device (it may be CUDA) and cast to X_train's dtype
# explicitly instead of relying on implicit type promotion.
X_train = torch.cat(
    (X_train, y_train.to(device=X_train.device, dtype=X_train.dtype).unsqueeze(1)),
    dim=1,
)

X_train.shape

torch.Size([321602, 41])

In [8]:
# Display the label -> class-id lookup (ids are 1-based, in the order of rlist).
mapping

{'000 - Normal': 1,
 '126 - Path Traversal': 2,
 '153 - Input Data Manipulation': 3,
 '194 - Fake the Source of Data': 4,
 '242 - Code Injection': 5,
 '310 - Scanning for Vulnerable Software': 6,
 '34 - HTTP Response Splitting': 7}

In [11]:
# Load the stored feature tensors for the two classes that were removed from
# the CSV training data (presumably GAN-generated, judging by the directory
# name -- confirm). map_location puts them on the notebook's active device
# regardless of the device they were saved from.
X_train_153 = torch.load('Data_GAN/tensor_153.pt', map_location=device)
X_train_242 = torch.load('Data_GAN/tensor_242.pt', map_location=device)

# One label per loaded row. The class ids come from `mapping` rather than
# hard-coded numbers (they resolve to 3 and 5), and the label vectors are
# created on the same device as their features so they can be concatenated.
y_train_153 = torch.full(
    (X_train_153.shape[0],),
    mapping['153 - Input Data Manipulation'],
    dtype=torch.long,
    device=X_train_153.device,
)
y_train_242 = torch.full(
    (X_train_242.shape[0],),
    mapping['242 - Code Injection'],
    dtype=torch.long,
    device=X_train_242.device,
)

In [12]:
def _append_label_column(features, labels):
    """Return `features` with `labels` appended as one extra last column.

    The labels are moved to the features' device and cast to the features'
    dtype first, because torch.cat needs all inputs on a single device.
    """
    label_column = labels.to(device=features.device, dtype=features.dtype).unsqueeze(1)
    return torch.cat((features, label_column), dim=1)


# Attach the class-id column to each of the loaded per-class feature tensors.
X_train_153 = _append_label_column(X_train_153, y_train_153)
X_train_242 = _append_label_column(X_train_242, y_train_242)

In [13]:
# Stack the CSV-derived rows and the two per-class tensors along the row axis.
# NOTE: this cell overwrites X_train, so running it twice appends the
# per-class rows twice.
X_train = torch.cat([X_train, X_train_153, X_train_242], dim=0)

In [14]:
# Persist the combined training tensor (label stored in the last column).
train_tensor_path = 'Data_GAN/tensor_train.pt'
torch.save(X_train, train_tensor_path)

In [15]:
# Convert the encoded test labels (NumPy array) into a 1-D tensor.
y_test = torch.tensor(y_test)
# Display the tensor as a quick sanity check of the label values and dtype.
y_test

tensor([3, 3, 3,  ..., 5, 5, 5], dtype=torch.int32)

In [None]:
# Append the test labels as the last column of the test feature matrix.
# torch.cat requires all inputs on the same device, so the label column is
# moved to X_test's device (it may be CUDA) and cast to X_test's dtype
# explicitly instead of relying on implicit type promotion.
X_test = torch.cat(
    (X_test, y_test.to(device=X_test.device, dtype=X_test.dtype).unsqueeze(1)),
    dim=1,
)

In [None]:
# Persist the test tensor (label stored in the last column).
test_tensor_path = 'Data_GAN/tensor_test.pt'
torch.save(X_test, test_tensor_path)