<a href="https://colab.research.google.com/github/leeeenammmmm/lab/blob/main/GAN_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
import os
import pandas as pd
import torch.nn as nn

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
# Model-side settings: number of output classes, dropout rate applied to the
# pooled encoder output, encoder hidden width, and the tokenized sequence length.
config = dict(
    num_labels=7,
    hidden_dropout_prob=0.15,
    hidden_size=768,
    max_length=512,
)

# Training-side settings read by the data loaders and the training cell.
training_parameters = dict(
    batch_size=2,
    epochs=1,
    output_folder="/content/drive/MyDrive/Code_Injection_Dataset",
    output_file="model.bin",
    learning_rate=2e-5,
    print_after_steps=100,
    save_steps=5000,
)

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
class ReviewDataset(Dataset):
    """Map-style dataset over a frame with a ``text`` and a ``label`` column.

    Each item is the tuple ``(input_ids, attention_mask, token_type_ids, label)``
    of tensors. Every text is padded or truncated to exactly
    ``config["max_length"]`` tokens, so the three sequence tensors are 1-D and
    have the same length for every row, which is what the default DataLoader
    collation needs. ``label`` is a 0-dim integer tensor.

    Raises:
        KeyError: from ``__getitem__`` when a row's label is not a key of
            ``LABEL_TO_ID``.
    """

    # Label string as stored in the CSV -> class index fed to the loss.
    # Built once for the class instead of on every __getitem__ call.
    LABEL_TO_ID = {'000 - Normal': 0,
          '126 - Path Traversal': 1,
          '242 - Code Injection': 2,
          '153 - Input Data Manipulation': 3,
          '310 - Scanning for Vulnerable Software': 4,
          '194 - Fake the Source of Data': 5,
          '34 - HTTP Response Splitting': 6}

    def __init__(self, df):
        """Keep a reference to ``df`` and load the SecBERT tokenizer."""
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors plus the encoded label."""
        row = self.df.iloc[index]
        label = self.LABEL_TO_ID[row["label"]]

        # padding/truncation replace the deprecated `pad_to_max_length` and the
        # implicit truncation the tokenizer used to warn about.
        # Overflowing tokens are deliberately not requested: they can come back
        # as extra rows for long texts, which gives items of different shapes
        # and breaks batching.
        # The mask and token-type ids are requested explicitly so they are
        # always present in the result.
        encoded_input = self.tokenizer(
            row["text"],
            add_special_tokens=True,
            max_length=config["max_length"],
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        return (
            torch.tensor(encoded_input["input_ids"]),
            torch.tensor(encoded_input["attention_mask"]),
            torch.tensor(encoded_input["token_type_ids"]),
            torch.tensor(label),
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

In [7]:
# Load the pretrained 'jackaduma/SecBERT' encoder on its own.
# NOTE(review): the training cell later rebinds `model` to a
# DomainAdaptationModel, which loads its own copy of the encoder.
model = AutoModel.from_pretrained("jackaduma/SecBERT")

Downloading (…)lve/main/config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the model checkpoint at jackaduma/SecBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Mount Google Drive at /content/drive; the dataset folder and the checkpoint
# output folder used by this notebook both live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [11]:
# Work from the dataset folder so the relative CSV file names used by the
# following cells (and by evaluate()) resolve.
%cd /content/drive/MyDrive/Code_Injection_Dataset

/content/drive/MyDrive/Code_Injection_Dataset


In [12]:
# Source-domain data: request text in `text`, class name in `label`.
df_train = pd.read_csv("dataset_capec_combine.csv")
df_train.head()

Unnamed: 0,text,label
0,GET /blog/index.php/2020/04/04/voluptatum-repr...,000 - Normal
1,GET /blog/xmlrpc.php?rsd,000 - Normal
2,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,000 - Normal
3,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal
4,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal


In [13]:
# Break URL paths into separate words by turning every "/" into a space.
# regex=False makes this a literal replacement no matter which default the
# installed pandas version uses for `regex`.
df_train['text'] = df_train['text'].str.replace('/', ' ', regex=False)
df_train.head()

Unnamed: 0,text,label
0,GET blog index.php 2020 04 04 voluptatum-repr...,000 - Normal
1,GET blog xmlrpc.php?rsd,000 - Normal
2,GET blog index.php 2020 04 04 nihil-tenetur-e...,000 - Normal
3,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal
4,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal


In [14]:
## Reduce data for testing
# Shuffle each of the two capped classes and keep at most 50,000 rows of it.
df_242 = df_train[df_train['label'] == '242 - Code Injection'].sample(frac=1).head(50000)
df_000 = df_train[df_train['label'] == '000 - Normal'].sample(frac=1).head(50000)

# Every other class is kept in full.
df_sub = df_train[~df_train['label'].isin(['000 - Normal', '242 - Code Injection'])]

# Rebuild the training frame from the reduced pieces. Concatenating the
# original df_train here (as before) kept every row and duplicated the sampled
# ones, so the data grew instead of shrinking.
df_train = pd.concat([df_sub, df_242, df_000], ignore_index=True)

In [15]:
## prep
# Wrap the source-domain frame in a dataset and serve it in shuffled batches.
source_dataset = ReviewDataset(df_train)
source_dataloader = DataLoader(
    source_dataset,
    batch_size=training_parameters["batch_size"],
    shuffle=True,
    num_workers=2,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/378k [00:00<?, ?B/s]

In [16]:
# Target-domain data with the same `text` / `label` columns as the source CSV.
df_transfer = pd.read_csv("dataset_capec_transfer.csv")
df_transfer.head()

Unnamed: 0,text,label
0,POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,153 - Input Data Manipulation
1,POST /cgi-bin/ViewLog.asp remote_submit_Flag=...,153 - Input Data Manipulation
2,GET /.svn/wc.db,153 - Input Data Manipulation
3,GET /blog/.svn/wc.db,153 - Input Data Manipulation
4,GET /blog/index.php/my-account/.svn/wc.db,153 - Input Data Manipulation


In [17]:
# Optional (does not affect results very much): apply the same "/" -> " "
# replacement that was applied to the source data.
# regex=False makes this a literal replacement no matter which default the
# installed pandas version uses for `regex`.
df_transfer['text'] = df_transfer['text'].str.replace('/', ' ', regex=False)
df_transfer.head()

Unnamed: 0,text,label
0,POST vendor phpunit phpunit src Util PHP eval...,153 - Input Data Manipulation
1,POST cgi-bin ViewLog.asp remote_submit_Flag=...,153 - Input Data Manipulation
2,GET .svn wc.db,153 - Input Data Manipulation
3,GET blog .svn wc.db,153 - Input Data Manipulation
4,GET blog index.php my-account .svn wc.db,153 - Input Data Manipulation


In [18]:
# Wrap the target-domain frame in a dataset and serve it in shuffled batches.
target_dataset = ReviewDataset(df_transfer)
target_dataloader = DataLoader(
    target_dataset,
    batch_size=training_parameters["batch_size"],
    shuffle=True,
    num_workers=2,
)

In [19]:
from torch.autograd import Function


class GradientReversalFn(Function):
    """Autograd function that is the identity going forward and negates and
    scales the gradient by ``alpha`` going backward."""

    @staticmethod
    def forward(ctx, x, alpha):
        """Return a view of ``x`` and remember ``alpha`` for the backward pass."""
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``-grad_output * alpha`` for ``x`` and no gradient for ``alpha``."""
        reversed_grad = -grad_output * ctx.alpha
        return reversed_grad, None

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

class DomainAdaptationModel(nn.Module):
    """SecBERT encoder with two heads on its pooled output.

    ``sentiment_classifier`` maps the pooled output to ``config["num_labels"]``
    log-probabilities. ``domain_classifier`` maps it to 2 log-probabilities and
    receives the pooled output through ``GradientReversalFn``.
    """

    def __init__(self):
        super().__init__()

        hidden_size = config["hidden_size"]

        self.bert = AutoModel.from_pretrained('jackaduma/SecBERT')
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

        # Label head: one linear layer followed by log-softmax over classes.
        self.sentiment_classifier = nn.Sequential(
            nn.Linear(hidden_size, config["num_labels"]),
            nn.LogSoftmax(dim=1),
        )
        # Domain head: same shape of head, two outputs.
        self.domain_classifier = nn.Sequential(
            nn.Linear(hidden_size, 2),
            nn.LogSoftmax(dim=1),
        )

    def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          labels=None,
          grl_lambda = 1.0,
          ):
        """Return ``(sentiment_pred, domain_pred)`` log-probabilities.

        ``labels`` is accepted but not used here. ``grl_lambda`` is the scale
        handed to the gradient-reversal function in front of the domain head.
        """
        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Dropout is applied once; both heads see the same dropped-out features.
        features = self.dropout(encoder_out.pooler_output)

        sentiment_pred = self.sentiment_classifier(features)
        domain_pred = self.domain_classifier(
            GradientReversalFn.apply(features, grl_lambda)
        )

        return sentiment_pred.to(device), domain_pred.to(device)

In [21]:
def compute_accuracy(logits, labels):
    """Return the batch accuracy and a per-class count of the predictions.

    The predicted class of each row is the index of its largest value along
    dim 1. The returned dict always has the keys 0..6; a prediction outside
    that range raises ``KeyError``.

    Returns:
        tuple: ``(acc, counts)`` where ``acc`` is a 0-dim float tensor and
        ``counts`` maps class index -> number of rows predicted as it.
    """
    counts = dict.fromkeys(range(7), 0)

    _, predicted_label = logits.max(dim = 1)

    for class_idx in predicted_label.tolist():
        counts[class_idx] += 1

    acc = predicted_label.eq(labels).float().mean()
    return acc, counts
    

In [22]:
def evaluate(model, dataset = "imdb", percentage = 5):
    """Measure label accuracy of ``model`` on the head of one of the CSV files.

    Reads ``"dataset_capec_" + dataset + ".csv"`` from the current directory,
    keeps the first ``percentage`` percent of its rows, and runs the model over
    them without gradients. The per-class prediction counts are printed.

    Args:
        model: module returning ``(sentiment_pred, domain_pred)``.
        dataset: file-name suffix. The default ``"imdb"`` resolves to
            ``dataset_capec_imdb.csv``; callers in this notebook pass
            ``"transfer"``.
        percentage: share of rows (from the top of the file) to evaluate.

    Returns:
        0-dim tensor holding the fraction of evaluated rows that were
        classified correctly, or NaN when no row was selected.
    """
    eval_df = pd.read_csv("dataset_capec_" + dataset + ".csv")
    selected_for_evaluation = int(eval_df.shape[0] * percentage / 100)
    eval_df = eval_df.head(selected_for_evaluation).copy()
    # Apply the same "/" -> " " preprocessing the training frames received, so
    # the model is evaluated on text in the form it was trained on.
    eval_df["text"] = eval_df["text"].str.replace("/", " ", regex=False)

    eval_dataset = ReviewDataset(eval_df)
    # Order does not matter for an accuracy figure, so no shuffling.
    dataloader = DataLoader(
        eval_dataset,
        batch_size=training_parameters["batch_size"],
        shuffle=False,
        num_workers=2,
    )

    predicted_labels_dict = dict.fromkeys(range(7), 0)
    correct = torch.zeros((), device=device)
    n_seen = 0

    # Switch to eval mode so dropout is disabled while scoring, then restore
    # whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for input_ids, attention_mask, token_type_ids, labels in dataloader:
                inputs = {
                    "input_ids": input_ids.squeeze(axis=1),
                    "attention_mask": attention_mask.squeeze(axis=1),
                    "token_type_ids" : token_type_ids.squeeze(axis=1),
                    "labels": labels,
                }
                inputs = {k: v.to(device) for k, v in inputs.items()}

                sentiment_pred, _ = model(**inputs)
                accuracy, predicted_labels = compute_accuracy(sentiment_pred, inputs["labels"])

                # Weight each batch by its size so a short final batch does
                # not count as much as a full one.
                batch_len = inputs["labels"].shape[0]
                correct += accuracy * batch_len
                n_seen += batch_len

                for i in range(7):
                    predicted_labels_dict[i] += predicted_labels[i]
    finally:
        model.train(was_training)

    print(predicted_labels_dict)

    if n_seen == 0:
        # Nothing was selected (e.g. tiny percentage); avoid dividing by zero
        # and still return a tensor so callers can use .item().
        return torch.tensor(float("nan"))
    return correct / n_seen

In [None]:
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]

model = DomainAdaptationModel()
model.to(device)

optimizer = optim.Adam(model.parameters(), lr)

# Both heads end in LogSoftmax, hence NLLLoss for each.
loss_fn_sentiment_classifier = torch.nn.NLLLoss()
loss_fn_domain_classifier = torch.nn.NLLLoss()

# In one training step we update the model using both the labeled source data
# and the unlabeled target data. We run until the batches of either dataset
# run out.
#
# In our case the target dataset has more data. Hence, we leverage the entire
# source dataset for training.
#
# If the same approach is used where the source dataset has more data than the
# target dataset, the labeled source dataset is under-utilized. In such a
# scenario it is better to reload the target dataset when it finishes, which
# ensures the entire source dataset is used to train the model.

max_batches = min(len(source_dataloader), len(target_dataloader))

for epoch_idx in range(n_epochs):

    source_iterator = iter(source_dataloader)
    target_iterator = iter(target_dataloader)

    for batch_idx in range(max_batches):

        # p runs from 0 towards 1 over the whole training run; grl_lambda
        # follows 2 / (1 + exp(-10 p)) - 1, so it starts at 0 and grows
        # towards 1.
        p = float(batch_idx + epoch_idx * max_batches) / (training_parameters["epochs"] * max_batches)
        grl_lambda = 2. / (1. + np.exp(-10 * p)) - 1
        grl_lambda = torch.tensor(grl_lambda)

        model.train()

        if(batch_idx%training_parameters["print_after_steps"] == 0 ):
            print("Training Step:", batch_idx)

        optimizer.zero_grad()

        # Source dataset training update
        input_ids, attention_mask, token_type_ids, labels = next(source_iterator)
        inputs = {
            "input_ids": input_ids.squeeze(axis=1),
            "attention_mask": attention_mask.squeeze(axis=1),
            "token_type_ids" : token_type_ids.squeeze(axis=1),
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)

        sentiment_pred, domain_pred = model(**inputs)
        loss_s_sentiment = loss_fn_sentiment_classifier(sentiment_pred, inputs["labels"])
        # Domain label 0 = source. Size it from the actual batch: the last
        # batch of an epoch can be smaller than the configured batch size, and
        # a fixed-size target would then not match domain_pred.
        y_s_domain = torch.zeros(domain_pred.shape[0], dtype=torch.long).to(device)
        loss_s_domain = loss_fn_domain_classifier(domain_pred, y_s_domain)


        # Target dataset training update
        input_ids, attention_mask, token_type_ids, labels = next(target_iterator)
        inputs = {
            "input_ids": input_ids.squeeze(axis=1),
            "attention_mask": attention_mask.squeeze(axis=1),
            "token_type_ids" : token_type_ids.squeeze(axis=1),
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)

        _, domain_pred = model(**inputs)

        # Note that we are not using the sentiment predictions here for updating the weights
        # Domain label 1 = target.
        y_t_domain = torch.ones(input_ids.shape[0], dtype=torch.long).to(device)
        # print(domain_pred.shape, y_t_domain.shape)
        loss_t_domain = loss_fn_domain_classifier(domain_pred, y_t_domain)

        # Combining the loss

        loss = loss_s_sentiment + loss_s_domain + loss_t_domain
        loss.backward()
        optimizer.step()

    # Save a checkpoint, then evaluate the model after every epoch

    torch.save(model.state_dict(), os.path.join(training_parameters["output_folder"], "epoch_" + str(epoch_idx)  +  training_parameters["output_file"] ))
#     accuracy = evaluate(model, dataset = "combine", percentage = 1).item()
#     print("Accuracy on amazon after epoch " + str(epoch_idx) + " is " + str(accuracy))

    accuracy = evaluate(model, dataset = "transfer", percentage = 100).item()
    print("Accuracy on transfer dataset after epoch " + str(epoch_idx) + " is " + str(accuracy))

Some weights of the model checkpoint at jackaduma/SecBERT were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longe

Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 600
Training Step: 700
Training Step: 800
Training Step: 900
Training Step: 1000
Training Step: 1100
