In [1]:
import os

In [2]:
filenames = os.listdir("new_data")

In [3]:
clean_filenames = []
for f in filenames:
    if ".txt" in f:
        clean_filenames.append(f)

In [4]:
texts = []
for name in clean_filenames:
    with open("new_data/" + name, "rb") as f:
        text = f.readlines()
    texts.append({
        "class": name,
        "list_text": text
    })

In [5]:
final_data = []
for t in texts:
    str_text = []
    for tex in t['list_text']:
        try:
            str_text.append(tex.decode())
        except Exception:
            str_text.append(tex)
        final_data.append({
            "class": t['class'].split(".")[0],
            "list_text": str_text
        })

In [6]:
import pandas as pd

In [7]:
df = pd.DataFrame(final_data)

In [8]:
df

Unnamed: 0,class,list_text
0,select_version,"[UNION SELECT @@VERSION,SLEEP(5),3\n, UNION SE..."
1,select_version,"[UNION SELECT @@VERSION,SLEEP(5),3\n, UNION SE..."
2,select_version,"[UNION SELECT @@VERSION,SLEEP(5),3\n, UNION SE..."
3,select_version,"[UNION SELECT @@VERSION,SLEEP(5),3\n, UNION SE..."
4,select_version,"[UNION SELECT @@VERSION,SLEEP(5),3\n, UNION SE..."
...,...,...
166,select,"[UNION ALL SELECT 1\n, UNION ALL SELECT 1,2\n,..."
167,select,"[UNION ALL SELECT 1\n, UNION ALL SELECT 1,2\n,..."
168,select,"[UNION ALL SELECT 1\n, UNION ALL SELECT 1,2\n,..."
169,select,"[UNION ALL SELECT 1\n, UNION ALL SELECT 1,2\n,..."


In [9]:
df['class'].unique()

array(['select_version', 'select_version_user', 'select'], dtype=object)

In [10]:
import os, sys

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

In [11]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = len(df['class'].unique())

# Instantiate model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
from torch.utils.data import Dataset, DataLoader

In [13]:

index_dict = {    
}
label_2_index = {}
for i, d in enumerate(df['class'].unique()):
    index_dict[i] = d
    label_2_index[d] = i

In [14]:
label_2_index, index_dict

({'select_version': 0, 'select_version_user': 1, 'select': 2},
 {0: 'select_version', 1: 'select_version_user', 2: 'select'})

In [15]:
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(df, test_size=0.2)

In [16]:
class DocumentSentimentDataset(Dataset):
    # Static constant variable
    LABEL2INDEX = label_2_index
    INDEX2LABEL = index_dict
    NUM_LABELS = len(index_dict)
    
    def __init__(self, dataset, tokenizer, no_special_token=False, *args, **kwargs):
        dataset['class'] = dataset['class'].apply(lambda lab: self.LABEL2INDEX[lab])
        self.data = dataset
        self.tokenizer = tokenizer
        self.no_special_token = no_special_token
    
    def __getitem__(self, index):
        data = self.data.iloc[index, :]
#         print(data)
        text, classes = data['list_text'][0], data['class']
#         print(text, classes)
        subwords = self.tokenizer.encode(text, add_special_tokens=not self.no_special_token)
        return np.array(subwords), np.array(classes), data['list_text'][0]
    
    def __len__(self):
        return len(self.data)    

class DocumentSentimentDataLoader(DataLoader):
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(DocumentSentimentDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len
        
    def _collate_fn(self, batch):

        batch_size = len(batch)
        max_seq_len = max(map(lambda x: len(x[0]), batch))
        max_seq_len = min(self.max_seq_len, max_seq_len)
        
        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        sentiment_batch = np.zeros((batch_size, 1), dtype=np.int64)
        
        seq_list = []
        for i, (subwords, sentiment, raw_seq) in enumerate(batch):
            subwords = subwords[:max_seq_len]
            subword_batch[i,:len(subwords)] = subwords
            mask_batch[i,:len(subwords)] = 1
            sentiment_batch[i,0] = sentiment
#             print(sentiment_batch)
            
            seq_list.append(raw_seq)
        return subword_batch, mask_batch, sentiment_batch, seq_list

In [17]:
train_dataset = DocumentSentimentDataset(data_train, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(data_test, tokenizer, lowercase=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [21]:
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)  
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)  

  cpuset_checked))


In [22]:
def forward_sequence_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    # Unpack batch data
    if len(batch_data) == 3:
        (subword_batch, mask_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 4:
        (subword_batch, mask_batch, token_type_batch, label_batch) = batch_data
    
    # Prepare input & label
    subword_batch = torch.LongTensor(subword_batch)
    mask_batch = torch.FloatTensor(mask_batch)
    token_type_batch = torch.LongTensor(token_type_batch) if token_type_batch is not None else None
    label_batch = torch.LongTensor(label_batch)
            
    if device == "cuda":
        subword_batch = subword_batch.cuda()
        mask_batch = mask_batch.cuda()
        token_type_batch = token_type_batch.cuda() if token_type_batch is not None else None
        label_batch = label_batch.cuda()

    # Forward model
    outputs = model(subword_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]
    
    # generate prediction & label list
    list_hyp = []
    list_label = []
    hyp = torch.topk(logits, 1)[1]
    for j in range(len(hyp)):
        list_hyp.append(i2w[hyp[j].item()])
        list_label.append(i2w[label_batch[j][0].item()])
        
    return loss, list_hyp, list_label


In [23]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'select_version': 0, 'select_version_user': 1, 'select': 2}
{0: 'select_version', 1: 'select_version_user', 2: 'select'}


In [24]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [34]:
# Train
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model

        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss

        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label

        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []

    pbar = tqdm(test_loader, leave=True, total=len(test_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss

        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

  cpuset_checked))
(Epoch 1) TRAIN LOSS:0.5679 LR:0.00000300: 100%|██████████| 5/5 [00:02<00:00,  2.19it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

(Epoch 1) TRAIN LOSS:0.5679 ACC:0.97 F1:0.96 REC:0.97 PRE:0.96 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.5151 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00: 100%|██████████| 2/2 [00:00<00:00,  2.41it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

(Epoch 1) VALID LOSS:0.5151 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00


  cpuset_checked))
(Epoch 2) TRAIN LOSS:0.5262 LR:0.00000300: 100%|██████████| 5/5 [00:02<00:00,  2.21it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

(Epoch 2) TRAIN LOSS:0.5262 ACC:0.99 F1:0.99 REC:1.00 PRE:0.99 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.4487 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00: 100%|██████████| 2/2 [00:00<00:00,  2.35it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

(Epoch 2) VALID LOSS:0.4487 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00


  cpuset_checked))
(Epoch 3) TRAIN LOSS:0.4650 LR:0.00000300: 100%|██████████| 5/5 [00:02<00:00,  2.17it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

(Epoch 3) TRAIN LOSS:0.4650 ACC:0.99 F1:0.99 REC:1.00 PRE:0.99 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.4040 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00: 100%|██████████| 2/2 [00:00<00:00,  2.34it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

(Epoch 3) VALID LOSS:0.4040 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00


  cpuset_checked))
(Epoch 4) TRAIN LOSS:0.4276 LR:0.00000300: 100%|██████████| 5/5 [00:02<00:00,  2.09it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

(Epoch 4) TRAIN LOSS:0.4276 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000300


  cpuset_checked))
VALID LOSS:0.3800 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00: 100%|██████████| 2/2 [00:00<00:00,  2.46it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

(Epoch 4) VALID LOSS:0.3800 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00


  cpuset_checked))
(Epoch 5) TRAIN LOSS:0.3950 LR:0.00000300: 100%|██████████| 5/5 [00:02<00:00,  2.15it/s]
  cpuset_checked))


(Epoch 5) TRAIN LOSS:0.3950 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00 LR:0.00000300


VALID LOSS:0.3150 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00: 100%|██████████| 2/2 [00:00<00:00,  2.44it/s]

(Epoch 5) VALID LOSS:0.3150 ACC:1.00 F1:1.00 REC:1.00 PRE:1.00





In [35]:
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']


In [36]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def document_sentiment_metrics_fn(list_hyp, list_label):
    metrics = {}
    metrics["ACC"] = accuracy_score(list_label, list_hyp)
    metrics["F1"] = f1_score(list_label, list_hyp, average='macro')
    metrics["REC"] = recall_score(list_label, list_hyp, average='macro')
    metrics["PRE"] = precision_score(list_label, list_hyp, average='macro')
    return metrics

In [37]:
def metrics_to_string(metric_dict):
    string_list = []
    for key, value in metric_dict.items():
        string_list.append('{}:{:.2f}'.format(key, value))
    return ' '.join(string_list)

In [44]:
text = 'id or 1=1 UNION ALL SELECT @@VERSION,USER(),SLEEP(5)--'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: id or 1=1 UNION ALL SELECT @@VERSION,USER(),SLEEP(5)-- | Label : select_version (72.771%)
