In [1]:
!pip install transformers



In [2]:

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys   

In [3]:
train_path = "/kaggle/input/code-injection-6labels-balance/code_injection_6labels_balance.csv"


In [4]:
df = pd.read_csv(train_path)

In [5]:
target_list = ["000 - Normal", '126 - Path Traversal', 
               '242 - Code Injection', '274 - HTTP Verb Tampering', 
               '66 - SQL Injection', '88 - OS Command Injection']

In [6]:
df

Unnamed: 0.1,Unnamed: 0,text,000 - Normal,126 - Path Traversal,242 - Code Injection,274 - HTTP Verb Tampering,66 - SQL Injection,88 - OS Command Injection
0,0,GET /,1,0,0,0,0,0
1,1,GET /blog/index.php/2020/04/04/voluptatum-repr...,1,0,0,0,0,0
2,2,GET /blog/xmlrpc.php?rsd,1,0,0,0,0,0
3,3,POST /blog/index.php/my-account/user-logout/?_...,0,0,0,0,0,1
4,4,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
69730,69730,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69731,69731,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69732,69732,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1
69733,69733,POST /blog/index.php/my-account/user-logout/?_...,0,1,0,0,0,1


In [7]:
# df1 = pd.read_csv(train_path)
df[df[target_list] == 1].count()

Unnamed: 0                       0
text                             0
000 - Normal                 14694
126 - Path Traversal         15839
242 - Code Injection         15747
274 - HTTP Verb Tampering     5437
66 - SQL Injection           15951
88 - OS Command Injection     7317
dtype: int64

In [6]:
train, validate, test = np.split(df.sample(frac=1, random_state=42),[int(.6*len(df)), int(.8*len(df))])

In [6]:
train[train[target_list] == 1].count()

Unnamed: 0                      0
text                            0
000 - Normal                 8755
126 - Path Traversal         9499
242 - Code Injection         9440
274 - HTTP Verb Tampering    3206
66 - SQL Injection           9654
88 - OS Command Injection    4402
dtype: int64

In [7]:
validate[validate[target_list] == 1].count()

Unnamed: 0                      0
text                            0
000 - Normal                 2942
126 - Path Traversal         3159
242 - Code Injection         3180
274 - HTTP Verb Tampering    1120
66 - SQL Injection           3168
88 - OS Command Injection    1452
dtype: int64

In [8]:
test[test[target_list] == 1].count()

Unnamed: 0                      0
text                            0
000 - Normal                 2997
126 - Path Traversal         3181
242 - Code Injection         3127
274 - HTTP Verb Tampering    1111
66 - SQL Injection           3129
88 - OS Command Injection    1463
dtype: int64

In [10]:
# hyperparameters
MAX_LEN = 256
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 1e-05

In [11]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
from transformers import BertTokenizer, BertModel



In [12]:
tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')


Downloading config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/378k [00:00<?, ?B/s]

In [13]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['text']
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        title = str(self.title[index])
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [8]:
test.to_csv('test.csv')

In [7]:
train = train.reset_index(drop=True)
validate = validate.reset_index(drop=True)
test = test.reset_index(drop=True)

train = CustomDataset(train, tokenizer, MAX_LEN)
validate= CustomDataset(validate, tokenizer, MAX_LEN)
test = CustomDataset(test, tokenizer, MAX_LEN)

In [15]:
train_data_loader = torch.utils.data.DataLoader(train, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(validate, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)
test_data_loader = torch.utils.data.DataLoader(test, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [16]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [17]:
def load_ckp(checkpoint_fpath, model):
    """
    checkpoint_path: path to save checkpoint
    model: model that we want to load checkpoint parameters into       
    optimizer: optimizer we defined in previous training
    """
    # load check point
    checkpoint = torch.load(checkpoint_fpath)
    # initialize state_dict from checkpoint to model
    model.load_state_dict(checkpoint['state_dict'])
    # initialize optimizer from checkpoint to optimizer
#     optimizer.load_state_dict(checkpoint['optimizer'])
    # initialize valid_loss_min from checkpoint to valid_loss_min
#     valid_loss_min = checkpoint['valid_loss_min']
    # return model, optimizer, epoch value, min validation loss 
    return model
# , optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint; min validation loss
    checkpoint_path: path to save checkpoint
    best_model_path: path to save best model
    """
    f_path = checkpoint_path
    # save checkpoint data to the path given, checkpoint_path
    torch.save(state, f_path)
    # if it is a best model, min validation loss
    if is_best:
        best_fpath = best_model_path
        # copy that checkpoint file to best path given, best_model_path
        shutil.copyfile(f_path, best_fpath)

In [18]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = AutoModel.from_pretrained('jackaduma/SecBERT')
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 6)
    
    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids, 
            attention_mask=attn_mask, 
            token_type_ids=token_type_ids
        )
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output

model = BERTClass()

model.to(device)

Downloading model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [19]:
model = load_ckp('/kaggle/input/bert-for-capec-dataset/pytorch/secbert/1/SecBert_6labels_balance.pt', model)

In [26]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [27]:
val_targets=[]
val_outputs=[]

In [28]:
def train_model(n_epochs, training_loader, validation_loader, model, 
                optimizer, checkpoint_path, best_model_path):
   
  # initialize tracker for minimum validation loss
    valid_loss_min = np.Inf
   
 
    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0
        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            #print('yyy epoch', batch_idx)
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)

            outputs = model(ids, mask, token_type_ids)

            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            #if batch_idx%5000==0:
             #   print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('before loss data in training', loss.item(), train_loss)
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
            #print('after loss data in training', loss.item(), train_loss)
            
    print('############# Epoch {}: Training End     #############'.format(epoch))
    
    print('############# Epoch {}: Validation Start   #############'.format(epoch))
    ######################    
    # validate the model #
    ######################
 
    model.eval()
   
    with torch.no_grad():
        for batch_idx, data in enumerate(validation_loader, 0):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))
            val_targets.extend(targets.cpu().detach().numpy().tolist())
            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # calculate average losses
        #print('before cal avg train loss', train_loss)
        if len(training_loader) > 0:
            train_loss = train_loss/len(training_loader)
        if len(validation_loader) > 0:
            valid_loss = valid_loss/len(validation_loader)
        # print training/validation statistics 
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))

        # create checkpoint variable and add important data
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        # save checkpoint
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)

        ## TODO: save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))
            # save checkpoint as best model
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))

        return model

In [29]:
ckpt_path = "/kaggle/working/curr_ckpt"
best_model_path = "/kaggle/working/best_model.pt"

In [30]:
t = time.time()
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)
print(time.time()-t)

############# Epoch 1: Training Start   #############
############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000009 	Average Validation Loss: 0.000026
Validation loss decreased (inf --> 0.000026).  Saving model ...
############# Epoch 2  Done   #############

1883.4724218845367


In [20]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score

In [21]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
#     y_true1 = y_true.to_numpy()
#     y_pred1 = y_pred.toarray()
    acc_list = []
    
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [22]:
# from sklearn.metrics import hamming_loss,classification_report
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score, hamming_loss

In [23]:
def save_classification(y_test, y_pred, labels):
    if isinstance(y_pred, np.ndarray) == False:
        y_pred = y_pred.toarray()

    def accuracy(y_true, y_pred):
        temp = 0
        for i in range(y_true.shape[0]):
            numerator = sum(np.logical_and(y_true[i], y_pred[i]))
            denominator = sum(np.logical_or(y_true[i], y_pred[i]))
            if denominator != 0:
                temp += numerator / denominator
        return temp / y_true.shape[0]

    out = classification_report(y_test,y_pred, output_dict=True, target_names=labels)
    total_support = out['samples avg']['support']

    mr = accuracy_score(y_test, y_pred)
    acc = accuracy(y_test,y_pred)
    hm = hamming_loss(y_test, y_pred)

    out['Exact Match Ratio'] = {'precision': mr, 'recall': mr, 'f1-score': mr, 'support': total_support}
    out['Hamming Loss'] = {'precision': hm, 'recall': hm, 'f1-score': hm, 'support': total_support}
    out['Accuracy'] = {'precision': acc, 'recall': acc, 'f1-score': acc, 'support': total_support}
    out_df = pd.DataFrame(out).transpose()
    print(out_df)

    

    return out_df

In [24]:
def predict(testing_loader, model):
    print("\nPredicting...")
    # deactivate dropout layers
    model.eval()

    # empty list to save the model predictions
    total_preds = []
    total_labels = []
    # iterate over batches
    for step, batch in enumerate(testing_loader):
        # push the batch to gpu
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        targets = batch['targets'].to(device)

        # deactivate autograd
        with torch.no_grad():
            # model predictions
            output = model(ids, mask, token_type_ids)
            final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
            
            total_preds += list(final_output)
            total_labels += targets.tolist()

    return total_labels, total_preds

In [25]:
import time
t = time.time()
total_labels, total_preds = predict(test_data_loader, model)
print(time.time()-t)



Predicting...
99.54562330245972


In [31]:
save_classification(y_test=np.array(np.nan_to_num(total_labels)), y_pred=np.array(np.round(total_preds)), labels=target_list)

  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score  support
000 - Normal                0.994811  0.959626  0.976902   2997.0
126 - Path Traversal        0.980818  0.980509  0.980663   3181.0
242 - Code Injection        1.000000  0.997761  0.998879   3127.0
274 - HTTP Verb Tampering   1.000000  1.000000  1.000000   1111.0
66 - SQL Injection          0.995718  0.966123  0.980697   3129.0
88 - OS Command Injection   0.980259  0.984279  0.982265   1463.0
micro avg                   0.992031  0.978745  0.985343  15008.0
macro avg                   0.991934  0.981383  0.986568  15008.0
weighted avg                0.992081  0.978745  0.985302  15008.0
samples avg                 0.977307  0.977630  0.977388  15008.0
Exact Match Ratio           0.976769  0.976769  0.976769  15008.0
Hamming Loss                0.005222  0.005222  0.005222  15008.0
Accuracy                    0.977235  0.977235  0.977235  15008.0


Unnamed: 0,precision,recall,f1-score,support
000 - Normal,0.994811,0.959626,0.976902,2997.0
126 - Path Traversal,0.980818,0.980509,0.980663,3181.0
242 - Code Injection,1.0,0.997761,0.998879,3127.0
274 - HTTP Verb Tampering,1.0,1.0,1.0,1111.0
66 - SQL Injection,0.995718,0.966123,0.980697,3129.0
88 - OS Command Injection,0.980259,0.984279,0.982265,1463.0
micro avg,0.992031,0.978745,0.985343,15008.0
macro avg,0.991934,0.981383,0.986568,15008.0
weighted avg,0.992081,0.978745,0.985302,15008.0
samples avg,0.977307,0.97763,0.977388,15008.0


In [32]:
checkpoint = torch.load('/kaggle/working/best_model.pt')
model = BERTClass()
# model.load_state_dict(torch.load('/kaggle/working/best_model.pt'))
model.load_state_dict(checkpoint['state_dict'])
model.eval()

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [33]:
total_labels, total_preds = predict(test_data_loader, model.to(device))
save_classification(y_test=np.array(np.nan_to_num(total_labels)), y_pred=np.array(np.round(total_preds)), labels=target_list)


Predicting...


  _warn_prf(average, modifier, msg_start, len(result))


                           precision    recall  f1-score  support
000 - Normal                0.994811  0.959626  0.976902   2997.0
126 - Path Traversal        0.980818  0.980509  0.980663   3181.0
242 - Code Injection        1.000000  0.997761  0.998879   3127.0
274 - HTTP Verb Tampering   1.000000  1.000000  1.000000   1111.0
66 - SQL Injection          0.995718  0.966123  0.980697   3129.0
88 - OS Command Injection   0.980259  0.984279  0.982265   1463.0
micro avg                   0.992031  0.978745  0.985343  15008.0
macro avg                   0.991934  0.981383  0.986568  15008.0
weighted avg                0.992081  0.978745  0.985302  15008.0
samples avg                 0.977307  0.977630  0.977388  15008.0
Exact Match Ratio           0.976769  0.976769  0.976769  15008.0
Hamming Loss                0.005222  0.005222  0.005222  15008.0
Accuracy                    0.977235  0.977235  0.977235  15008.0


Unnamed: 0,precision,recall,f1-score,support
000 - Normal,0.994811,0.959626,0.976902,2997.0
126 - Path Traversal,0.980818,0.980509,0.980663,3181.0
242 - Code Injection,1.0,0.997761,0.998879,3127.0
274 - HTTP Verb Tampering,1.0,1.0,1.0,1111.0
66 - SQL Injection,0.995718,0.966123,0.980697,3129.0
88 - OS Command Injection,0.980259,0.984279,0.982265,1463.0
micro avg,0.992031,0.978745,0.985343,15008.0
macro avg,0.991934,0.981383,0.986568,15008.0
weighted avg,0.992081,0.978745,0.985302,15008.0
samples avg,0.977307,0.97763,0.977388,15008.0


In [35]:
from IPython.display import FileLink

FileLink('/kaggle/working/best_model.pt')