In [1]:
import pandas as pd
import glob
from tqdm import tqdm, trange
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import json
import os

import transformers
from transformers import BertForTokenClassification, AdamW

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import get_linear_schedule_with_warmup


from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
# Directory whose files are read by load_data() to build the training set.
DATA_PATH = '../data/processed/'
# Directory where the label map, tokenizer, model and metric files are written.
LOG_PATH = '../models/BERT_baseline'
# Several later cells refer to the output directory as LOGS_PATH; define the
# alias here so they do not raise NameError on a fresh kernel.
LOGS_PATH = LOG_PATH

# Load data

In [3]:
def load_data(DATA_PATH):
    """Read every file matching ``DATA_PATH + '*'`` and split it into sentences.

    Each file is split into paragraphs on blank lines, all whitespace inside a
    paragraph is collapsed to single spaces, and every non-empty paragraph is
    passed to ``sent_tokenize``.

    Args:
        DATA_PATH: Path prefix; it is globbed with a trailing ``*``.

    Returns:
        A list with one dict per sentence, with the keys ``DocName``,
        ``SentNum`` (index within the document), ``SentNumOverall`` (running
        index across all documents) and ``Sentence``.
    """
    docs = []
    sent_num_dataset = 0
    # glob returns matches in filesystem order; sorting keeps SentNumOverall
    # (and everything derived from it) identical from run to run.
    for doc in sorted(glob.glob(DATA_PATH + '*')):
        # Sub-directories also match the pattern but cannot be opened as text.
        if not os.path.isfile(doc):
            continue
        # File name with its last two dot-separated parts removed
        # (e.g. "name.pdf.txt" -> "name").
        docname = ''.join(doc.split('/')[-1].split('.')[:-2])
        with open(doc) as f:
            raw_text = f.read()
        data = []
        for para in raw_text.strip().split('\n\n'):
            # split() without arguments also splits on newlines, so this
            # collapses every whitespace run to a single space.
            para = ' '.join(para.split())
            if para:
                data.extend(sent_tokenize(para))
        docs.extend(
            {'DocName': docname,
             'SentNum': i,
             'SentNumOverall': i + sent_num_dataset,
             'Sentence': sent}
            for i, sent in enumerate(data)
        )
        sent_num_dataset += len(data)
    return docs

In [4]:
# Build the per-sentence records for every file found under DATA_PATH.
docs = load_data(DATA_PATH)

In [5]:
# Preview the first sentence records (entity tags are still inline markup).
docs[:25]

[{'DocName': 'Group-IB_Lazarus',
  'SentNum': 0,
  'SentNumOverall': 0,
  'Sentence': '<THREAT_ACTOR>Lazarus</THREAT_ACTOR> ARISEN'},
 {'DocName': 'Group-IB_Lazarus',
  'SentNum': 1,
  'SentNumOverall': 1,
  'Sentence': 'ARCHITECTURE / TOOLS / ATTRIBUTION'},
 {'DocName': 'Group-IB_Lazarus',
  'SentNum': 2,
  'SentNumOverall': 2,
  'Sentence': '<THREAT_ACTOR>Lazarus</THREAT_ACTOR> arisen: architecture, tools, attribution2'},
 {'DocName': 'Group-IB_Lazarus',
  'SentNum': 3,
  'SentNumOverall': 3,
  'Sentence': 'INTRODUCTION'},
 {'DocName': 'Group-IB_Lazarus',
  'SentNum': 4,
  'SentNumOverall': 4,
  'Sentence': 'In <TIMESTAMP>February 2016</TIMESTAMP>, hackers reportedly attempted to steal approximately 1 billion USD from the <ORG>Central Bank of Bangladesh</ORG> through <SOFTWARE>SWIFT</SOFTWARE>.'},
 {'DocName': 'Group-IB_Lazarus',
  'SentNum': 5,
  'SentNumOverall': 5,
  'Sentence': 'In <TIMESTAMP>February 2017</TIMESTAMP>, several Polish <INDUSTRY>banks</INDUSTRY> were compromised.'}

## Parse sentences

In [6]:
from html.parser import HTMLParser

# Entity tag names used as inline markup in the annotated sentences.
# 'THEAT_ACTOR' is a misspelling of 'THREAT_ACTOR'.
# NOTE(review): presumably listed because the misspelled tag occurs in the
# annotated data (the label map further down contains 'B-theat_actor') - confirm.
tags = ['THREAT_ACTOR', 'SOFTWARE', 'INDUSTRY', 'ORG', 'TIMESTAMP',
       'MALWARE', 'COUNTRY', 'IOC', 'IDENTITY', 'CAMPAIGN', 'TOOL',
       'MITRE_ATTACK', 'THEAT_ACTOR', 'ATTACK_PATTERN', 'TECHNIQUE',
       'CITY']

# Lower-cased variants of the tag names (HTMLParser reports tag names in lower case).
# NOTE(review): neither list is referenced by DataParser in the visible code.
tags_small = [x.lower() for x in tags]

class DataParser(HTMLParser):
    """Turn one sentence with inline entity markup into labelled tokens.

    Text is tokenized with ``word_tokenize`` and collected in ``self.dataset``
    as ``(cur_doc, token, label)`` tuples. Tokens outside any tag get the label
    ``'O'``. Tokens inside a tag get the tag name as reported by HTMLParser,
    prefixed with ``B-`` for the first token and ``I-`` for the rest when
    ``IOB`` is true. Text inside a ``<docname>`` tag is not tokenized; it is
    stored as the current document name instead.
    """

    def __init__(self, IOB=True):
        super().__init__()
        self.IOB = IOB
        self.cur_tag = 'O'   # tag currently open, 'O' when outside any tag
        self.dataset = []    # accumulated (doc, token, label) tuples
        self.cur_doc = ''    # last value seen inside a <docname> tag

    def handle_starttag(self, tag, attrs):
        # Every opening tag becomes the active label; attributes are ignored.
        self.cur_tag = tag

    def handle_endtag(self, tag):
        # Any closing tag returns the parser to the "outside" state.
        self.cur_tag = 'O'

    def handle_data(self, data):
        if self.cur_tag == 'docname':
            self.cur_doc = data
            return

        words = word_tokenize(data)
        count = len(words)
        if self.IOB and self.cur_tag != 'O':
            token_labels = ['B-' + self.cur_tag] + ['I-' + self.cur_tag] * (count - 1)
        else:
            # Outside a tag, or IOB disabled: every token carries the raw tag.
            token_labels = [self.cur_tag] * count
        self.dataset.extend(zip([self.cur_doc] * count, words, token_labels))

In [7]:
# Parse every sentence with a fresh parser and append the sentence/document
# identifiers to each (doc, token, label) tuple it produced.
parsed_docs = []
for doc in docs:
    parser = DataParser(IOB=True)
    parser.feed(doc['Sentence'])
    # feed() may hold back trailing text (for example after an unterminated
    # '&' or '<'); close() forces the parser to process what is still buffered
    # so that those tokens are not lost.
    parser.close()
    parsed_docs.extend(
        token_row + (doc['SentNum'], doc['SentNumOverall'], doc['DocName'])
        for token_row in parser.dataset
    )

In [8]:
# One row per token. The first tuple element is the parser's own document
# field, which is dropped; DocName comes from the sentence record instead.
# Naming the columns at construction also yields a well-formed (empty) frame
# when parsed_docs is empty.
parsed_docs_df = pd.DataFrame(
    parsed_docs,
    columns=['parser_doc', 'text', 'intent', 'SentNum', 'SentNumOverall', 'DocName'],
).drop(columns='parser_doc')

In [9]:
# Preview the token-level table (one token and its label per row).
parsed_docs_df.head()

Unnamed: 0,text,intent,SentNum,SentNumOverall,DocName
0,Lazarus,B-threat_actor,0,0,Group-IB_Lazarus
1,ARISEN,O,0,0,Group-IB_Lazarus
2,ARCHITECTURE,O,1,1,Group-IB_Lazarus
3,/,O,1,1,Group-IB_Lazarus
4,TOOLS,O,1,1,Group-IB_Lazarus


# BERT model

In [10]:
# Length every token-id / label sequence is padded or truncated to.
MAX_LEN = 75
# Batch size used by the training and validation DataLoaders.
bs = 32

In [11]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    Args:
        data: DataFrame with the columns ``text``, ``intent``, ``DocName`` and
            ``SentNumOverall``; rows sharing a ``SentNumOverall`` value form
            one sentence.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, data):
        self.n_sent = 1      # 1-based position of the sentence get_next() returns next
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["text"].values.tolist(),
                                                           s["intent"].values.tolist(),
                                                           s["DocName"].values.tolist())]
        # Select the needed columns explicitly so the aggregation never
        # depends on how pandas treats the grouping column inside apply().
        self.grouped = (
            self.data
            .groupby('SentNumOverall')[['text', 'intent', 'DocName']]
            .apply(agg_func)
        )
        # One list of (text, intent, DocName) tuples per sentence.
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence, or ``None`` once all have been returned.

        The previous lookup used the key ``"Sentence: N"``, which never exists
        in an index of ``SentNumOverall`` values, so it always returned None.
        """
        if self.n_sent > len(self.sentences):
            return None
        s = self.sentences[self.n_sent - 1]
        self.n_sent += 1
        return s

In [12]:
# Group the token table into sentences (one list of tuples per SentNumOverall).
getter = SentenceGetter(parsed_docs_df)

In [13]:
# Split each sentence's (text, intent, DocName) tuples into parallel lists of
# words and of labels.
sentences = [[triple[0] for triple in sent] for sent in getter.sentences]
labels = [[triple[1] for triple in sent] for sent in getter.sentences]

In [25]:
# Sort the distinct labels: iterating a set of strings gives a different order
# in different interpreter runs, which would make the label indices (and the
# saved mapping) change between runs.
tag_values = sorted(set(parsed_docs_df["intent"].values))
# "PAD" is appended last, so it always gets the highest index.
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

# Persist the mapping next to the model; create the directory if needed.
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, 'tag2idx.json'), 'w') as f:
    json.dump(tag2idx, f)

In [16]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to torch (not used elsewhere in the visible code).
n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

In [17]:
# Cased WordPiece tokenizer matching the "bert-base-cased" checkpoint used below;
# do_lower_case=False keeps the original casing of the text.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [18]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize a sentence word by word and repeat each label per sub-word.

    Args:
        sentence: Iterable of words.
        text_labels: Iterable of labels, one per word.

    Returns:
        A ``(tokens, labels)`` pair of equally long lists: the sub-word tokens
        produced by the module-level ``tokenizer`` and, for every token, the
        label of the word it came from.
    """
    pieces_out, labels_out = [], []
    for word, label in zip(sentence, text_labels):
        word_pieces = tokenizer.tokenize(word)
        pieces_out += word_pieces
        # Every sub-word inherits the label of its source word.
        labels_out += [label] * len(word_pieces)
    return pieces_out, labels_out

In [27]:
# Sub-word tokenize every sentence, keeping the labels aligned with the tokens.
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(words, word_labels)
    for words, word_labels in zip(sentences, labels)
]

In [28]:
# Separate the (tokens, labels) pairs into two parallel lists.
# Note: `labels` now holds sub-word level labels instead of word level ones.
tokenized_texts = [pair[0] for pair in tokenized_texts_and_labels]
labels = [pair[1] for pair in tokenized_texts_and_labels]

In [29]:
# Map tokens to vocabulary ids, then pad with 0 / cut at the end so that every
# sequence has exactly MAX_LEN entries.
token_id_lists = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
input_ids = pad_sequences(
    token_id_lists,
    maxlen=MAX_LEN,
    dtype="long",
    value=0.0,
    truncating="post",
    padding="post",
)

In [30]:
# Map labels to indices, then pad with the "PAD" index / cut at the end so the
# label sequences line up with input_ids.
label_id_lists = [[tag2idx.get(l) for l in lab] for lab in labels]
tags = pad_sequences(
    label_id_lists,
    maxlen=MAX_LEN,
    value=tag2idx["PAD"],
    padding="post",
    dtype="long",
    truncating="post",
)

In [31]:
# 1.0 for real tokens, 0.0 for positions whose token id is 0 (the padding value).
attention_masks = [[float(token_id != 0.0) for token_id in row] for row in input_ids]

In [32]:
# Split ids, labels and masks in a single call: train_test_split applies the
# same row selection to every array it is given, so the three stay aligned.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

In [33]:
# Convert every split to a torch tensor.
tr_inputs, val_inputs = torch.tensor(tr_inputs), torch.tensor(val_inputs)
tr_tags, val_tags = torch.tensor(tr_tags), torch.tensor(val_tags)
tr_masks, val_masks = torch.tensor(tr_masks), torch.tensor(val_masks)

In [34]:
# Training batches are drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# Validation batches are read sequentially.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [39]:
# Pre-trained cased BERT with a token-classification head that has one output
# per entry of tag2idx (the "PAD" label included).
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [37]:
# The training loop moves every batch to `device`; the model has to be on the
# same device or the forward pass fails when a GPU is selected.
model.to(device);

In [40]:
# True: update every BERT parameter; False: train only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters excluded from weight decay. 'LayerNorm.weight' is included
    # because the 'gamma'/'beta' substrings do not occur in the parameter
    # names of current Hugging Face BERT models.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    # The optimizer reads the per-group key `weight_decay`; the previously used
    # key 'weight_decay_rate' is not recognised, so the intended decay values
    # were never applied.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [42]:
# Save the tokenizer files next to the model. LOG_PATH is the directory
# constant defined in the configuration cell (LOGS_PATH is not defined there).
os.makedirs(LOG_PATH, exist_ok=True)
tokenizer.save_pretrained(LOG_PATH)

('../models/BERT_baseline/vocab.txt',
 '../models/BERT_baseline/special_tokens_map.json',
 '../models/BERT_baseline/added_tokens.json')

In [43]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training set.
epochs = 10
# Maximum gradient norm used for clipping in the training loop.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear schedule, no warm-up steps).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

## Training

In [None]:
# Per-epoch history, kept so the learning curve can be plotted and saved.
loss_values, validation_loss_values = [], []
macro_prec, macro_rec, macro_f1 = [], [], []

# Batches are moved to `device` below, so the model has to live there too.
model.to(device)
# Everything produced by this cell is written to LOG_PATH (the directory
# constant from the configuration cell).
os.makedirs(LOG_PATH, exist_ok=True)

for epoch in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Because `labels` is passed, the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        # Clip the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        # Advance the learning-rate schedule once per optimizer step.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)

    # ========================================
    #               Validation
    # ========================================
    model.eval()
    eval_loss = 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Labels are provided here as well, so the outputs are
            # (loss, logits, ...): index 0 is the loss, index 1 the logits.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Keep only positions whose gold label is not "PAD"; predictions and gold
    # labels are filtered with the same condition so they stay aligned.
    pred_tags = [tag_values[pred_idx]
                 for pred_row, gold_row in zip(predictions, true_labels)
                 for pred_idx, gold_idx in zip(pred_row, gold_row)
                 if tag_values[gold_idx] != "PAD"]
    valid_tags = [tag_values[gold_idx]
                  for gold_row in true_labels
                  for gold_idx in gold_row
                  if tag_values[gold_idx] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))

    # Compute every score once and reuse it for printing and for the history.
    averages = ('weighted', 'macro', 'micro')
    precision = {avg: precision_score(valid_tags, pred_tags, average=avg) for avg in averages}
    recall = {avg: recall_score(valid_tags, pred_tags, average=avg) for avg in averages}
    f1 = {avg: f1_score(valid_tags, pred_tags, average=avg) for avg in averages}

    print("Validation Weighted Precision: {}".format(precision['weighted']))
    print("Validation Macro Precision: {}".format(precision['macro']))
    print("Validation Micro Precision: {}".format(precision['micro']))
    macro_prec.append(precision['macro'])

    print("Validation Weighted Recall: {}".format(recall['weighted']))
    print("Validation Macro Recall: {}".format(recall['macro']))
    print("Validation Micro Recall: {}".format(recall['micro']))
    macro_rec.append(recall['macro'])

    print("Validation Weighted F1-Score: {}".format(f1['weighted']))
    print("Validation macro F1-Score: {}".format(f1['macro']))
    print("Validation micro F1-Score: {}".format(f1['micro']))
    macro_f1.append(f1['macro'])

    # Confusion matrix over the labels present in the validation set. A
    # dedicated name is used so the notebook-level `labels` list is not
    # overwritten.
    cm_labels = np.unique(valid_tags)
    conf_matrix_df = pd.DataFrame(
        confusion_matrix(valid_tags, pred_tags, labels=cm_labels),
        index=cm_labels, columns=cm_labels,
    )
    plt.figure(figsize=(20, 14))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues', robust=True)
    plt.show()
    conf_matrix_df.to_csv(os.path.join(LOG_PATH, 'conf_matr_' + str(epoch) + '.csv'))

    # Learning curve up to and including this epoch.
    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)

    plt.plot(loss_values, 'b-o', label="training loss")
    plt.plot(validation_loss_values, 'r-o', label="validation loss")
    plt.title("Learning curve")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    # Checkpoint the model and rewrite the metric histories after every epoch.
    model.save_pretrained(LOG_PATH)
    for history_file, history in (('train_loss.txt', loss_values),
                                  ('val_loss.txt', validation_loss_values),
                                  ('macro_prec.txt', macro_prec),
                                  ('macro_rec.txt', macro_rec),
                                  ('macro_f1.txt', macro_f1)):
        with open(os.path.join(LOG_PATH, history_file), 'w') as f:
            f.write('\n'.join([str(x) for x in history]))

    print()
    

## Evaluation

In [56]:
# Location of the report to annotate: TEST_DATA_PATH/<YEAR>/<FILENAME>.
TEST_DATA_PATH = '../data/raw/'
YEAR = 2013
FILENAME = 'FTA 1008 - Darkseoul-Jokra Analysis and Recovery.pdf.txt'

In [57]:
import os

# Read the single test report and split it into sentence records with the same
# keys as the training records (DocName, SentNum, SentNumOverall, Sentence).
test_docs = []
sent_num_dataset = 0
docname = str(YEAR) + '/' + FILENAME
with open(os.path.join(TEST_DATA_PATH, str(YEAR), FILENAME)) as f:
    paragraphs = f.read().strip().split('\n\n')

data = []
for para in paragraphs:
    # Collapse all whitespace (newlines included) to single spaces.
    para = ' '.join(para.split())
    if para:
        data += sent_tokenize(para)

rows = [
    {'DocName': docname,
     'SentNum': i,
     'SentNumOverall': sent_num_dataset + i,
     'Sentence': sent}
    for i, sent in enumerate(data)
]
test_docs.extend(rows)
sent_num_dataset += len(data)

In [58]:
# One row per sentence of the test report.
test_docs_df = pd.DataFrame(test_docs)

### Load models

In [6]:
# Reload the tokenizer that was saved to the model directory during training.
tokenizer = BertTokenizer.from_pretrained(LOG_PATH, do_lower_case=False)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [9]:
# Label -> index mapping used to decode the model's predictions.
# NOTE(review): presumably a copy of the tag2idx.json written during training;
# it must match the checkpoint loaded from LOG_PATH - confirm, or load the JSON
# file instead of hard-coding the mapping.
tag2idx = {'B-appdata': 37,
 'B-attack_pattern': 5,
 'B-campaign': 34,
 'B-city': 18,
 'B-country': 15,
 'B-cve': 39,
 'B-excel': 6,
 'B-file': 20,
 'B-identity': 47,
 'B-industry': 14,
 'B-input': 24,
 'B-ioc': 32,
 'B-local': 26,
 'B-major': 44,
 'B-malware': 50,
 'B-mitre_attack': 36,
 'B-n': 25,
 'B-name': 29,
 'B-org': 23,
 'B-pid': 28,
 'B-program': 52,
 'B-software': 38,
 'B-technique': 53,
 'B-theat_actor': 3,
 'B-threat_actor': 30,
 'B-timestamp': 16,
 'B-tool': 17,
 'B-type': 27,
 'B-update': 8,
 'B-user': 1,
 'B-windows': 54,
 'I-appdata': 45,
 'I-campaign': 9,
 'I-city': 31,
 'I-country': 2,
 'I-cve': 21,
 'I-file': 22,
 'I-identity': 43,
 'I-industry': 48,
 'I-input': 49,
 'I-ioc': 41,
 'I-local': 40,
 'I-malware': 0,
 'I-mitre_attack': 7,
 'I-name': 4,
 'I-org': 55,
 'I-program': 10,
 'I-software': 11,
 'I-technique': 46,
 'I-threat_actor': 42,
 'I-timestamp': 12,
 'I-tool': 51,
 'I-type': 33,
 'I-user': 19,
 'I-windows': 35,
 'O': 13,
 'PAD': 56}

# Index -> label lookup derived from the mapping above, so label decoding in
# this session never depends on a `tag_values` list left over from training.
tag_values = [tag for tag, _ in sorted(tag2idx.items(), key=lambda item: item[1])]
# Positional lookup is only valid when the indices are exactly 0..N-1.
assert [tag2idx[tag] for tag in tag_values] == list(range(len(tag_values)))

In [13]:
# Show the directory the model is loaded from.
LOG_PATH

'../models/BERT_baseline'

In [None]:
# Load the fine-tuned token-classification model saved by the training loop.
model = BertForTokenClassification.from_pretrained(LOG_PATH)

### Annotate text

In [71]:
def _bio_to_markup(words, word_tags):
    """Wrap each run of consecutive same-type entity words in <TYPE>...</TYPE>."""
    pieces = []
    open_type = 'O'
    for word, tag in zip(words, word_tags):
        # Anything that is not a B-/I- tag (e.g. 'O' or 'PAD') is "outside".
        ent_type = tag[2:] if tag[:2] in ('B-', 'I-') else 'O'
        if ent_type != open_type:
            if open_type != 'O':
                pieces[-1] += '</' + open_type.upper() + '>'
            if ent_type != 'O':
                word = '<' + ent_type.upper() + '>' + word
            open_type = ent_type
        pieces.append(word)
    # Close an entity that runs up to the end of the sentence.
    if open_type != 'O':
        pieces[-1] += '</' + open_type.upper() + '>'
    return pieces


def annotate_text(data):
    """Annotate every sentence in ``data['Sentence']`` with predicted entities.

    Each sentence is encoded with the module-level ``tokenizer``, labelled by
    the module-level ``model``, its word pieces are merged back into words
    (a word keeps the label predicted for its first piece), and runs of
    consecutive words with the same entity type are wrapped in
    ``<TYPE>...</TYPE>``.

    Compared with the previous implementation this version reads the ``data``
    argument instead of the global ``test_docs_df``, always emits the closing
    tag (also at the end of a sentence and between two different entities),
    treats non-BIO predictions such as ``PAD`` as outside text, and decodes
    labels from ``tag2idx`` rather than from ``tag_values``.

    Args:
        data: Mapping or DataFrame with a ``'Sentence'`` entry holding the
            sentences to annotate.

    Returns:
        The annotated sentences joined with newlines ('' when there are none).
    """
    sentences = list(data['Sentence'])
    if not sentences:
        return ''

    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    # Run inference on whatever device the model currently lives on.
    model_device = next(model.parameters()).device

    annotation = []
    for test_sentence in sentences:
        # NOTE(review): encode() adds [CLS]/[SEP], which the training inputs
        # built earlier in this notebook do not contain - confirm this is intended.
        token_ids = tokenizer.encode(test_sentence)
        input_ids = torch.tensor([token_ids]).to(model_device)
        with torch.no_grad():
            output = model(input_ids)
        # No labels are passed, so the first output holds the logits.
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)

        words, word_tags = [], []
        for token, label_idx in zip(tokens, label_indices):
            if token in ('[CLS]', '[SEP]'):
                continue
            if token.startswith("##") and words:
                # Continuation piece: glue it onto the current word.
                words[-1] += token[2:]
            else:
                words.append(token)
                word_tags.append(idx2tag.get(int(label_idx), 'O'))

        annotation.append(
            tokenizer.convert_tokens_to_string(_bio_to_markup(words, word_tags))
        )
    return '\n'.join(annotation)

In [72]:
# Annotate every sentence of the test report.
text_an = annotate_text(test_docs_df)

In [73]:
# Write the annotated report next to the model. LOG_PATH is the directory
# constant defined in the configuration cell (LOGS_PATH is not defined there).
with open(os.path.join(LOG_PATH, 'test_' + str(FILENAME)), 'w') as f:
    f.write(text_an)