<a href="https://colab.research.google.com/github/hoangcuongnguyen2001/Honours_Repository/blob/main/SciBERT_train_single_label_reset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook documents the procedure for training the single-label models during the 2023 TRAM effort.

The `bootstrap-training-data` file contains the annotations that existed before the 2023 effort, as well as the annotations that were produced during it.

In [1]:
!pip install --upgrade --no-cache-dir gdown
#Upload the JSON file for fine-tuning (from cti-to-mitre-with-nlp)
import gdown

url = 'https://drive.google.com/file/d/1BCkEdKgmH49kjihmrxlXVUQvB0GsiQJW/view?usp=drive_link'
output_path = 'TRAM_fine_tuned_SciBERT.json'
gdown.download(url, output_path, quiet=False,fuzzy=True)

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1


Downloading...
From: https://drive.google.com/uc?id=1BCkEdKgmH49kjihmrxlXVUQvB0GsiQJW
To: /content/TRAM_fine_tuned_SciBERT.json
100%|██████████| 1.38M/1.38M [00:00<00:00, 119MB/s]


'TRAM_fine_tuned_SciBERT.json'

In [2]:
import pandas as pd
import json



# Parse the downloaded annotation file. The encoding is passed explicitly so
# the result does not depend on the platform's default locale encoding.
with open('TRAM_fine_tuned_SciBERT.json', encoding='utf-8') as f:
    records = json.load(f)

# Keep only the two fields used downstream: the sentence text and its label.
# The raw list keeps its own name (`records`) so `data` always refers to the
# DataFrame.
data = pd.DataFrame(
    [{'text': row['text'], 'label': row['label']} for row in records]
)
print(data)

                                                   text      label
0     Anchor has used cmd.exe to run its self deleti...  T1059.003
1     Zeus Panda can launch an interface where it ca...  T1059.003
2     Chimera has used the Windows Command Shell and...  T1059.003
3     Cuba has used cmd.exe /c and batch files for e...  T1059.003
4     MechaFlounder has the ability to run commands ...  T1059.003
...                                                 ...        ...
9806                           Kazuar can delete files.  T1070.004
9807  Cobalt Strike can exploit vulnerabilities such...      T1068
9808  QakBot can send stolen information to C2 nodes...      T1041
9809  Turla RPC backdoors have also searched for fil...      T1083
9810  Ramsay has created Registry Run keys to establ...  T1547.001

[9811 rows x 2 columns]


In [3]:
# Install the `transformers` and `torch` packages that the following cells import.
!pip install transformers torch

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
Coll

We then load the model and move it to the GPU.

In [4]:
import transformers
import torch

mode: 'bert or gpt' = 'bert'
cuda = torch.device('cuda')

# Reject unsupported modes before any checkpoint is loaded.
if mode not in ('bert', 'gpt'):
    raise ValueError(f"mode must be one of bert or gpt, but is {mode = !r}")

# Settings shared by both architectures: one output per distinct label, and
# no attention / hidden-state tensors in the model output.
classifier_kwargs = dict(
    num_labels=data['label'].nunique(),
    output_attentions=False,
    output_hidden_states=False,
)

if mode == 'bert':
    checkpoint = "allenai/scibert_scivocab_uncased"
    model = transformers.BertForSequenceClassification.from_pretrained(checkpoint, **classifier_kwargs)
    tokenizer = transformers.BertTokenizer.from_pretrained(checkpoint, max_length=512)
else:
    checkpoint = "gpt2"
    model = transformers.GPT2ForSequenceClassification.from_pretrained(checkpoint, **classifier_kwargs)
    tokenizer = transformers.GPT2Tokenizer.from_pretrained(checkpoint, max_length=512)
    # Reuse the end-of-sequence token for padding and tell the model which id it is.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

# Switch to training mode and move the weights to the selected device.
# Left as the last expression so the model summary is shown as cell output.
model.train().to(cuda)


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

We will represent the labels using one-hot encoding.

The `apply_attention_mask` function returns an attention mask (a tensor with the same shape as its input) in which the element for every non-padding token is `1` and the element for every padding token is `0`.

In [5]:
from sklearn.preprocessing import OneHotEncoder as OHE

# Fit a dense (non-sparse) one-hot encoder on the label column; it is used to
# turn labels into target vectors and, later, predictions back into labels.
encoder = OHE(sparse_output=False).fit(data[['label']])

def tokenize(samples: 'list[str]'):
    """Turn a list of text samples into a tensor of token ids.

    Every sample is truncated or padded to exactly 512 tokens, and only the
    ``input_ids`` field of the tokenizer output is returned (as a PyTorch
    tensor).
    """
    encoded = tokenizer(
        samples,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    return encoded.input_ids

def load_data(x, y, batch_size=10):
    """Yield consecutive ``(x, y)`` mini-batches moved to the ``cuda`` device.

    ``x`` and ``y`` must have the same length along their first dimension.
    The final batch may be smaller than ``batch_size``.
    """
    n_inputs, n_targets = x.shape[0], y.shape[0]
    assert n_inputs == n_targets
    for start in range(0, n_inputs, batch_size):
        stop = start + batch_size
        # Only the current slice is transferred to the device.
        yield x[start:stop].to(cuda), y[start:stop].to(cuda)

def apply_attention_mask(x):
    """Build an attention mask for a batch of token ids.

    Returns an integer tensor shaped like ``x`` that is 1 wherever the token
    id differs from the tokenizer's padding id and 0 elsewhere.
    """
    is_not_padding = x.ne(tokenizer.pad_token_id)
    return is_not_padding.to(int)


In [6]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the train/validation/test membership is the same on every
# run; without it each "Run All" evaluates on a different test set.
SPLIT_SEED = 42

# Hold out 20% of the data for testing, stratified by label.
train_validation, test = train_test_split(
    data,
    test_size=.2,
    stratify=data['label'],
    random_state=SPLIT_SEED,
)

# Split the remaining 80% into 75% train / 25% validation, i.e. 60% / 20% of
# the full data set, again stratified by label.
train, validation = train_test_split(
    train_validation,
    test_size=.25,
    stratify=train_validation['label'],
    random_state=SPLIT_SEED,
)
x_train = tokenize(train['text'].tolist())
x_train

tensor([[  102,  5157,  4430,  ...,     0,     0,     0],
        [  102,   461,   220,  ...,     0,     0,     0],
        [  102,   130, 15002,  ...,     0,     0,     0],
        ...,
        [  102,  4433, 13153,  ...,     0,     0,     0],
        [  102,  1542, 15852,  ...,     0,     0,     0],
        [  102,  6040,  1432,  ...,     0,     0,     0]])

In [7]:
# Number of training samples (rows of the token-id tensor).
print(x_train.shape[0])

5886


In [8]:
# Token ids for the validation texts.
validation_texts = validation['text'].tolist()
x_validation = tokenize(validation_texts)

# One-hot targets for the validation labels, as a tensor.
validation_one_hot = encoder.transform(validation[['label']])
y_validation = torch.Tensor(validation_one_hot)

In [9]:
# One-hot targets for the training labels, as a tensor.
train_one_hot = encoder.transform(train[['label']])
y_train = torch.Tensor(train_one_hot)
y_train

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

The hyperparameters shown here are those that we used, including the number of epochs and batch size.

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from statistics import mean

optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

for epoch in range(4):
    epoch_losses = []
    validation_loss = []

    # ---- training pass ----
    model.train()
    for x, y in tqdm(load_data(x_train, y_train, batch_size=10)):
        model.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        out = model(x, attention_mask=apply_attention_mask(x), labels=y)
        epoch_losses.append(out.loss.item())
        out.loss.backward()
        optim.step()
    print(f"epoch {epoch + 1} loss: {mean(epoch_losses)}")
    # Free up GPU memory
    torch.cuda.empty_cache()

    # ---- validation pass ----
    model.eval()
    # No parameters are updated here, so gradient tracking is disabled; this
    # avoids building an autograd graph for every validation batch.
    with torch.no_grad():
        for x, y in load_data(x_validation, y_validation, batch_size=10):
            validation_output = model(x, attention_mask=apply_attention_mask(x), labels=y)
            validation_loss.append(validation_output.loss.item())

    print(f"Validation for epoch {epoch + 1} loss: {mean(validation_loss)}")
    # Free up GPU memory
    torch.cuda.empty_cache()


589it [08:26,  1.16it/s]


epoch 1 loss: 0.13857505054344346
Validation for epoch 1 loss: 0.09592440100339464


464it [06:44,  1.15it/s]

In [None]:
import torch.nn.functional as F

model.eval()

preds = []
batch_size = 20

x_test = tokenize(test['text'].tolist())

# Inference only: run the test set through the model in batches without
# tracking gradients, collecting one logits row per sample on the CPU.
with torch.no_grad():
    for i in range(0, x_test.shape[0], batch_size):
        x = x_test[i : i + batch_size].to(cuda)
        out = model(x, attention_mask=apply_attention_mask(x))
        preds.extend(out.logits.to('cpu'))

# Take the number of classes from the fitted encoder instead of hard-coding
# it, so this cell keeps working if the label set changes.
num_classes = len(encoder.categories_[0])

# Pick the highest-scoring class per sample (argmax over the logits gives the
# same index as argmax over their softmax), one-hot encode it, and let the
# encoder map the one-hot rows back to label strings.
predicted_labels = (
    encoder.inverse_transform(
        F.one_hot(
            torch.vstack(preds).argmax(-1),
            num_classes=num_classes,
        )
        .numpy()
    )
    .reshape(-1)
)

predicted_labels

In [None]:
from sklearn.metrics import precision_recall_fscore_support as calculate_score

# Predicted vs. true labels for the held-out test set.
predicted = list(predicted_labels)
actual = test['label'].tolist()

# Score every label known to the data set, in sorted order.
labels = sorted(data['label'].unique())

# Per-label precision, recall, F1 and support: one row per label.
scores = calculate_score(actual, predicted, labels=labels)
scores_df = pd.DataFrame(scores).T
scores_df.columns = ['P', 'R', 'F1', '#']
scores_df.index = labels

# Append the aggregate rows '(micro)' and '(macro)'.
for average in ('micro', 'macro'):
    scores_df.loc[f'({average})'] = calculate_score(actual, predicted, average=average, labels=labels)

scores_df

In [None]:
# Write the fine-tuned weights and the tokenizer files to disk.
# `save_pretrained` is called for its side effect; its result must not be
# assigned back to `model` / `tokenizer`, or those names stop referring to the
# trained objects and cannot be used after this cell.
model.save_pretrained("scibert_model")
saved_tokenizer_files = tokenizer.save_pretrained("scibert_tokenizer")

In [None]:
!zip -r scibert_model.zip scibert_model/
!zip -r scibert_tokenizer.zip scibert_tokenizer/
from google.colab import files
files.download('scibert_model.zip')
files.download('scibert_tokenizer.zip')

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')