In [1]:
!pip install transformers



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from numpy import newaxis
import math
import os
import pandas as pd
import torch.nn as nn
from scipy.stats import chi2
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel



In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# configuration for training, you should modify these values to get the best performance
config = {
    "num_labels": 7,
    "hidden_dropout_prob": 0.15,
    "hidden_size": 768,
    "max_length": 512,
}

training_parameters = {
    "batch_size": 16,
    "epochs": 5,
    "output_folder": "/kaggle/working",
    "output_file": "model.bin",
    "learning_rate": 2e-5,
    "print_after_steps": 100,
    "save_steps": 5000,

}

In [5]:
class ReviewDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')

    def __getitem__(self, index):
        review = self.df.iloc[index]["text"]
        attack = self.df.iloc[index]["label"]
        attack_dict = {'000 - Normal': 0,
          '126 - Path Traversal': 1,
          '242 - Code Injection': 2,
          '153 - Input Data Manipulation': 3,
          '310 - Scanning for Vulnerable Software': 4,
          '194 - Fake the Source of Data': 5,
          '34 - HTTP Response Splitting': 6}
        label = attack_dict[attack]
        encoded_input = self.tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length = 512,
                padding="max_length",
                return_overflowing_tokens=True,
                truncation = True,
            )
        if "num_truncated_tokens" in encoded_input and encoded_input["num_truncated_tokens"] > 0:
            # print("Attention! you are cropping tokens")
            pass

        input_ids = encoded_input["input_ids"]
        attention_mask = encoded_input["attention_mask"] if "attention_mask" in encoded_input else None

        token_type_ids = encoded_input["token_type_ids"] if "token_type_ids" in encoded_input else None



        data_input = {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "token_type_ids": torch.tensor(token_type_ids),
            "label": torch.tensor(label),
        }

        return data_input["input_ids"], data_input["attention_mask"], data_input["token_type_ids"], data_input["label"]



    def __len__(self):
        return self.df.shape[0]

In [6]:
def pinverse(difference, num_random_features):
    num_samples, _ = difference.shape
    sigma = torch.cov(difference.T)
    mu = torch.mean(difference, 0)
    if num_random_features == 1:
        stat = float(num_samples * torch.pow(mu, 2)) / float(sigma)
    else:
        sigma = torch.pinverse(sigma)
        right_side = torch.matmul(mu, torch.matmul(sigma, mu.T))
        stat = num_samples * right_side
    return chi2.sf(stat.detach().cpu(), num_random_features)


def unnorm(difference, num_random_features):
    num_samples, _ = difference.shape
    sigma = torch.cov(difference.T)
    mu = torch.mean(difference, 0)
    if num_random_features == 1:
        stat = float(num_samples * torch.pow(mu, 2)) / float(sigma)
    else:
        right_side = torch.matmul(mu, mu.T)
        stat = num_samples * right_side
    return chi2.sf(stat.detach().cpu(), num_random_features)


def smooth(data):
    w = torch.linalg.norm(data, dim=1)
    w = torch.exp(-w ** 2 / 2.0)
    return w[:, newaxis]


def smooth_cf(data, w, random_frequencies):
    n, _ = data.shape
    _, d = random_frequencies.shape
    mat = torch.matmul(data,random_frequencies)
    arr = torch.cat((torch.sin(mat) * w, torch.cos(mat) * w), dim = 1)
    n1, d1 = arr.shape
    assert n1 == n and d1 == 2 * d and w.shape == (n, 1)
    return arr


def smooth_difference(random_frequencies, X, Y):
    x_smooth = smooth(X)
    y_smooth = smooth(Y)
    characteristic_function_x = smooth_cf(X, x_smooth, random_frequencies)
    characteristic_function_y = smooth_cf(Y, y_smooth, random_frequencies)
    return characteristic_function_x - characteristic_function_y

In [7]:
class SmoothCFTest:

    def _gen_random(self, dimension):
        return torch.tensor(np.random.randn(dimension, self.num_random_features).astype(np.float32)).to(self.device)


    def __init__(self, data_x, data_y, scale, num_random_features, device, method):
        self.device = device
        self.method = method
        self.data_x = scale*data_x.to(self.device)
        self.data_y = scale*data_y.to(self.device)
        self.num_random_features = num_random_features

        _, dimension_x = np.shape(self.data_x)
        _, dimension_y = np.shape(self.data_y)
        assert dimension_x == dimension_y
        self.random_frequencies = self._gen_random(dimension_x)


    def compute_pvalue(self):
        difference = smooth_difference(self.random_frequencies, self.data_x, self.data_y)
        if self.method == "unnorm":
            return unnorm(difference, self.num_random_features)
        return pinverse(difference, self.num_random_features)

In [8]:
df_train = pd.read_csv('/kaggle/input/code-injection/dataset_capec_combine.csv')
df_train.head()

Unnamed: 0,text,label
0,GET /blog/index.php/2020/04/04/voluptatum-repr...,000 - Normal
1,GET /blog/xmlrpc.php?rsd,000 - Normal
2,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,000 - Normal
3,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal
4,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal


In [9]:
# Optional (not effect very much)
# for word tokenizer instead of character tokenizer
df_train['text'] = df_train['text'].str.replace('/',' ')
df_train.head()

Unnamed: 0,text,label
0,GET blog index.php 2020 04 04 voluptatum-repr...,000 - Normal
1,GET blog xmlrpc.php?rsd,000 - Normal
2,GET blog index.php 2020 04 04 nihil-tenetur-e...,000 - Normal
3,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal
4,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal


In [10]:
from sklearn.model_selection import train_test_split
## prepare for training
df_train = df_train[0:len(df_train)//training_parameters['batch_size']*training_parameters['batch_size']]
source_dataset = ReviewDataset(df_train)
source_dataloader = DataLoader(dataset = source_dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/378k [00:00<?, ?B/s]

In [11]:
df_transfer = pd.read_csv('/kaggle/input/code-injection/dataset_capec_transfer.csv')
df_transfer.head()

Unnamed: 0,text,label
0,POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,153 - Input Data Manipulation
1,POST /cgi-bin/ViewLog.asp remote_submit_Flag=...,153 - Input Data Manipulation
2,GET /.svn/wc.db,153 - Input Data Manipulation
3,GET /blog/.svn/wc.db,153 - Input Data Manipulation
4,GET /blog/index.php/my-account/.svn/wc.db,153 - Input Data Manipulation


In [12]:
# Optional (not effect very much)
# for word tokenizer instead of character tokenizer
df_transfer['text'] = df_transfer['text'].str.replace('/',' ')
df_transfer = df_transfer[0:len(df_transfer)//training_parameters['batch_size']*training_parameters['batch_size']]

In [13]:
target_dataset = ReviewDataset(df_transfer)
target_dataloader = DataLoader(dataset = target_dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim

class DomainAdaptationModel(nn.Module):
    def __init__(self):
        super(DomainAdaptationModel, self).__init__()

        num_labels = config["num_labels"]
        self.bert = AutoModel.from_pretrained('jackaduma/SecBERT') # model that we will use
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

        self.prj = nn.Linear(config["hidden_size"], config["hidden_size"]//2);

        self.attack_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"]//2, num_labels),
            nn.LogSoftmax(dim=1),
        )


#       Freeze bert layer
        modules = [self.bert.embeddings, self.bert.encoder.layer[:2]] #Replace value by what you want
        for module in modules:
            for param in module.parameters():
                param.requires_grad = False


    def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          labels=None,
#           grl_lambda = 1.0,
          ):

        outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

#         pooled_output = outputs[1] # For bert-base-uncase
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)

        pooled_output_prj = self.prj(pooled_output)

        attack_pred = self.attack_classifier(pooled_output_prj)

        return attack_pred.to(device), pooled_output_prj

In [15]:
def compute_accuracy(logits, labels):
    predicted_labels_dict = {
      0: 0,
      1: 0,
      2: 0,
      3: 0,
      4: 0,
      5: 0,
      6: 0,
    }

    predicted_label = logits.max(dim = 1)[1]

    for pred in predicted_label:
        # print(pred.item())
        predicted_labels_dict[pred.item()] += 1
    acc = (predicted_label == labels).float().mean()

    return acc, predicted_labels_dict

In [16]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score

def evaluate(model, dataset = "target", percentage = 80):
    with torch.no_grad():
        predicted_labels_dict = {
          0: 0,
          1: 0,
          2: 0,
          3: 0,
          4: 0,
          5: 0,
          6: 0,
        }
        model.eval()
        dev_df = pd.read_csv("/kaggle/input/code-injection/dataset_capec_" + dataset + ".csv")
        data_size = dev_df.shape[0]
        selected_for_evaluation = int(data_size*percentage/100)
        dev_df = dev_df.head(selected_for_evaluation)
        dataset = ReviewDataset(dev_df)
        dataloader = DataLoader(dataset = dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)

        true_labels = list()
        predicted_label = list()
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids" : token_type_ids.squeeze(axis=1),
                "labels": labels,
            }
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            attack_pred, _ = model(**inputs)
            true_labels.extend(attack_pred.max(dim = 1)[1].cpu().numpy())
            predicted_label.extend(inputs['labels'].cpu().numpy())
            _, predicted_labels = compute_accuracy(attack_pred, inputs["labels"])

            for i in range(7):
              predicted_labels_dict[i] += predicted_labels[i]

        score = f1_score(true_labels,predicted_label,average="macro")
        precision = precision_score(true_labels, predicted_label,average="macro")
        recall = recall_score(true_labels, predicted_label,average="macro")
        report = classification_report(true_labels,predicted_label,digits=4)
        acc= accuracy_score(true_labels, predicted_label)
        #classifaction_report_csv(report,precision,recall,score,0)
        print ('\n clasification report:\n', report)
        print ('F1 score:', score)
        print ('Recall:', recall)
        print ('Precision:', precision)
        print ('Acc:', acc)
        print('Confusion Matrix: \n',confusion_matrix(true_labels, predicted_label))
        print(predicted_labels_dict)

In [17]:
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]

In [18]:
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]

model = DomainAdaptationModel()
model.to(device)

optimizer = optim.Adam(model.parameters(), lr)

loss_fn_attack_classifier = torch.nn.NLLLoss()
# loss_fn_domain_classifier = torch.nn.NLLLoss()
# mkmmd_loss = MultipleKernelMaximumMeanDiscrepancy(
#         kernels=[GaussianKernel(alpha=2 ** k) for k in range(-3, 2)],
#         linear=True
#     )


'''
In one training step we will update the model using both the source labeled data and target unlabeled data
We will run it till the batches last for any of these datasets

In our case target dataset has more data. Hence, we will leverage the entire source dataset for training

If we use the same approach in a case where the source dataset has more data then the target dataset then we will
under-utilize the labeled source dataset. In such a scenario it is better to reload the target dataset when it finishes
This will ensure that we are utilizing the entire source dataset to train our model.
'''

max_batches = min(len(source_dataloader), len(target_dataloader))
best_acc = 0.
for i in range(1):
    for epoch_idx in range(10):
        source_iterator = iter(source_dataloader)
        target_iterator = iter(target_dataloader)
        loss_after_epoch = 0.
        for batch_idx in range(max_batches):

            p = float(batch_idx + epoch_idx * max_batches) / (training_parameters["epochs"] * max_batches)
            grl_lambda = 2. / (1. + np.exp(-10 * p)) - 1
            grl_lambda = torch.tensor(grl_lambda)

            model.train()
            # if(batch_idx%training_parameters["print_after_steps"] == 0 ):
            #     print("Training Step:", batch_idx)

            optimizer.zero_grad()

            # Souce dataset training update
            input_ids, attention_mask, token_type_ids, labels = next(source_iterator)
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids" : token_type_ids.squeeze(axis=1),
                "labels" : labels,
            }

            for k, v in inputs.items():
                inputs[k] = v.to(device)

            attack_pred, pooled_output_prj_source = model(**inputs)
            loss_s_attack = loss_fn_attack_classifier(attack_pred, inputs["labels"])


            # Target dataset training update
            input_ids, attention_mask, token_type_ids, labels = next(target_iterator)
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids" : token_type_ids.squeeze(axis=1),
                "labels" : labels,
            }

            for k, v in inputs.items():
                inputs[k] = v.to(device)
            _, pooled_output_prj_target = model(**inputs)


            # Combining the loss

            #transfer_loss = mkmmd_loss(pooled_output_prj_source, pooled_output_prj_target); ## mk mme loss

            scf_loss = SmoothCFTest(pooled_output_prj_source,  pooled_output_prj_target, scale = 1, num_random_features=5, method="pinverse" ,device=device)
            transfer_loss = scf_loss.compute_pvalue() #SCF Loss

#             mkme_loss = MeanEmbeddingTest(
#                   pooled_output_prj_source, pooled_output_prj_target, scale=1, number_of_random_frequencies=5, method="pinverse" ,device=device)
#             transfer_loss = mkme_loss.compute_pvalue()
            loss = loss_s_attack + transfer_loss*2.0
            loss.backward()
            optimizer.step()
            loss_after_epoch = loss
        print(loss_after_epoch)
    evaluate(model, dataset = "transfer", percentage = 100)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at jackaduma/SecBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  right_side = torch.matmul(mu, torch.matmul(sigma, mu.T))


tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0854, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1126, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0345, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.2029, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0149, device='cuda:0', grad_fn=<AddBackward0>)
tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.0043, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.1644, device='cuda:0', grad_fn=<AddBackward0>)

 clasification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      2600
           1     0.2156    0.0541    0.0865       665
           2     0.6815    0.9676    0.7997      6457
           3     0.0379    0.1104    0.0565       670
           4     0.0000    0.0000    0.0000        30
           5     0.0000    0.0000    0.0000       454
           6     0.9963    0.5642    0.7204  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
 torch.save(model.state_dict(), os.path.join(training_parameters["output_folder"], "epoch_" + str(n_epochs)  +  training_parameters["output_file"] ))

In [20]:
evaluate(model, dataset = "transfer", percentage = 100)


 clasification report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000      2600
           1     0.2156    0.0541    0.0865       665
           2     0.6815    0.9676    0.7997      6457
           3     0.0379    0.1104    0.0565       670
           4     0.0000    0.0000    0.0000        30
           5     0.0000    0.0000    0.0000       454
           6     0.9963    0.5642    0.7204       943

    accuracy                         0.5830     11819
   macro avg     0.2759    0.2423    0.2376     11819
weighted avg     0.4661    0.5830    0.5025     11819

F1 score: 0.23759287972517334
Recall: 0.2423388675306314
Precision: 0.27589616206749373
Acc: 0.5829596412556054
Confusion Matrix: 
 [[   0   38 1618  944    0    0    0]
 [   0   36  369  260    0    0    0]
 [   0   36 6248  171    0    0    2]
 [   0   54  542   74    0    0    0]
 [   0    3    0   27    0    0    0]
 [   0    0    0  454    0    0    0]
 [   0    0  391  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
evaluate(model, dataset = "combine", percentage = 10)


 clasification report:
               precision    recall  f1-score   support

           0     0.9881    0.9565    0.9721     24096
           1     0.9634    0.9994    0.9811     15676
           2     0.9913    0.6333    0.7729       720
           3     0.8015    0.5632    0.6615       380
           4     0.7649    0.9984    0.8662      1825
           5     0.9999    0.9998    0.9998     16482

    accuracy                         0.9748     59179
   macro avg     0.9182    0.8584    0.8756     59179
weighted avg     0.9768    0.9748    0.9745     59179

F1 score: 0.8755845075400156
Recall: 0.8584298970313324
Precision: 0.9181699054943576
Acc: 0.974771456090843
Confusion Matrix: 
 [[23049   467     0    30   548     2]
 [    9 15667     0     0     0     0]
 [  180    62   456    19     3     0]
 [   86    67     4   214     9     0]
 [    3     0     0     0  1822     0]
 [    0     0     0     4     0 16478]]
{0: 24096, 1: 15676, 2: 720, 3: 380, 4: 1825, 5: 16482, 6: 0}
