<a href="https://colab.research.google.com/github/hoangcuongnguyen2001/Honours_Repository/blob/main/single_label_k_fold_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook documents the procedure for training the single-label models during the 2023 TRAM effort.

The `bootstrap-training-data` file contains the annotations that existed prior, as well as the annotations that were produced during the 2023 effort.

In [1]:
!pip install --upgrade --no-cache-dir gdown
#Upload the JSON file for fine-tuning (from cti-to-mitre-with-nlp)
import gdown

url = 'https://drive.google.com/file/d/1BCkEdKgmH49kjihmrxlXVUQvB0GsiQJW/view?usp=drive_link'
output_path = 'TRAM_fine_tuned_SciBERT.json'
gdown.download(url, output_path, quiet=False,fuzzy=True)



Downloading...
From: https://drive.google.com/uc?id=1BCkEdKgmH49kjihmrxlXVUQvB0GsiQJW
To: /content/TRAM_fine_tuned_SciBERT.json
100%|██████████| 1.38M/1.38M [00:00<00:00, 134MB/s]


'TRAM_fine_tuned_SciBERT.json'

In [2]:
import pandas as pd
import json



with open('TRAM_fine_tuned_SciBERT.json') as f:
    data = json.loads(f.read())

data = pd.DataFrame(
    [
        {'text': row['text'], 'label': row['label']}
        for row in data

    ]
)
print(data)

                                                   text      label
0     Anchor has used cmd.exe to run its self deleti...  T1059.003
1     Zeus Panda can launch an interface where it ca...  T1059.003
2     Chimera has used the Windows Command Shell and...  T1059.003
3     Cuba has used cmd.exe /c and batch files for e...  T1059.003
4     MechaFlounder has the ability to run commands ...  T1059.003
...                                                 ...        ...
9806                           Kazuar can delete files.  T1070.004
9807  Cobalt Strike can exploit vulnerabilities such...      T1068
9808  QakBot can send stolen information to C2 nodes...      T1041
9809  Turla RPC backdoors have also searched for fil...      T1083
9810  Ramsay has created Registry Run keys to establ...  T1547.001

[9811 rows x 2 columns]


In [3]:
data.head()

Unnamed: 0,text,label
0,Anchor has used cmd.exe to run its self deleti...,T1059.003
1,Zeus Panda can launch an interface where it ca...,T1059.003
2,Chimera has used the Windows Command Shell and...,T1059.003
3,Cuba has used cmd.exe /c and batch files for e...,T1059.003
4,MechaFlounder has the ability to run commands ...,T1059.003


In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(data[['label']])
data['label_transform'] = encoder.transform(data['label'])


  y = column_or_1d(y, warn=True)


In [5]:
#adding
data['text']=data['text'].astype(str)

In [6]:
!pip install transformers torch



We then load the model and move it to the GPU.

In [7]:
import transformers
import torch

mode: 'bert or gpt' = 'bert'
cuda = torch.device('cuda')

if mode == 'bert':
    model = transformers.BertForSequenceClassification.from_pretrained(
        "allenai/scibert_scivocab_uncased",
        num_labels=data['label'].nunique(),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = transformers.BertTokenizer.from_pretrained("allenai/scibert_scivocab_uncased", max_length=512)
elif mode == 'gpt':
    model = transformers.GPT2ForSequenceClassification.from_pretrained(
        "gpt2",
        num_labels=data['label'].nunique(),
        output_attentions=False,
        output_hidden_states=False,
    )
    tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2", max_length=512)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
else:
    raise ValueError(f"mode must be one of bert or gpt, but is {mode = !r}")

model.train().to(cuda)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=.2, stratify=data['label'])

print(train['label'])

2072        T1083
3473    T1036.005
3031        T1105
8974        T1012
1380    T1548.002
          ...    
8394        T1041
3754        T1016
743     T1518.001
3963    T1573.001
7342        T1105
Name: label, Length: 7848, dtype: object


In [9]:
from sklearn.model_selection import StratifiedKFold
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)


In [10]:
from torch.utils.data import DataLoader, Dataset

# Define a custom PyTorch dataset
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}

# Convert dataframe to dataset
dataset = TextDataset(train['text'].tolist(), train['label_transform'].tolist())


The hyperparameters shown here are those that we used, including the number of epochs and batch size.

In [11]:
from torch.optim import AdamW
from tqdm import tqdm
from statistics import mean
from sklearn.metrics import precision_recall_fscore_support as calculate_score
from sklearn.metrics import accuracy_score

fold_accuracies = []


# Perform k-fold cross-validation
for fold, (train_indices, val_indices) in enumerate(skf.split(train['text'], train['label'])):
    print(f"Training Fold {fold+1}/{k_folds}")
    # Split dataset into train and validation sets for the current fold
    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    val_dataset = torch.utils.data.Subset(dataset, val_indices)
    print(train_dataset)


    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False)
    print(type(train_loader))
    # Training loop
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    criterion = torch.nn.CrossEntropyLoss()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    for epoch in range(6):
        epoch_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            epoch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
        print(f"epoch {epoch + 1} loss: {mean(epoch_losses)}")

     # Evaluation loop
    model.eval()
    val_predictions = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, predicted_labels = torch.max(outputs.logits, dim=1)
            val_predictions.extend(predicted_labels.tolist())
            val_labels.extend(labels.tolist())

    fold_accuracy = accuracy_score(val_labels, val_predictions)
    fold_accuracies.append(fold_accuracy)
    print(f"Accuracy for Fold {fold+1}: {fold_accuracy}")


Training Fold 1/5
<torch.utils.data.dataset.Subset object at 0x7e16a4e73370>
<class 'torch.utils.data.dataloader.DataLoader'>
epoch 1 loss: 2.4136637228119904
epoch 2 loss: 0.9446478704121083
epoch 3 loss: 0.46787297128919203
epoch 4 loss: 0.22973790789129817
epoch 5 loss: 0.11999403383986205
epoch 6 loss: 0.0956434787393423
Accuracy for Fold 1: 0.835031847133758
Training Fold 2/5
<torch.utils.data.dataset.Subset object at 0x7e16a4ebe290>
<class 'torch.utils.data.dataloader.DataLoader'>
epoch 1 loss: 0.23603135659982255
epoch 2 loss: 0.11860895715357297
epoch 3 loss: 0.06297612631438407
epoch 4 loss: 0.04206924722233348
epoch 5 loss: 0.04323633917485761
epoch 6 loss: 0.03553216006038829
Accuracy for Fold 2: 0.9681528662420382
Training Fold 3/5
<torch.utils.data.dataset.Subset object at 0x7e16a4e734f0>
<class 'torch.utils.data.dataloader.DataLoader'>
epoch 1 loss: 0.06500650573541132
epoch 2 loss: 0.04712580788285905
epoch 3 loss: 0.026301972077348253
epoch 4 loss: 0.03743160245858524
e

In [13]:

predicted_labels = (
    encoder.inverse_transform(val_predictions)
)
predicted_labels

array(['T1047', 'T1113', 'T1562.001', ..., 'T1070.004', 'T1219',
       'T1562.001'], dtype=object)

In [15]:
actual_labels = (
    encoder.inverse_transform(val_labels)
)
actual_labels

array(['T1047', 'T1113', 'T1562.001', ..., 'T1070.004', 'T1219',
       'T1562.001'], dtype=object)

In [16]:
from sklearn.metrics import precision_recall_fscore_support as calculate_score


labels = sorted(data['label'].unique())

scores = calculate_score(actual_labels, predicted_labels, labels=labels)

scores_df = pd.DataFrame(scores).T
scores_df.columns = ['P', 'R', 'F1', '#']
scores_df.index = labels
scores_df.loc['(micro)'] = calculate_score(actual_labels, predicted_labels, average='micro', labels=labels)
scores_df.loc['(macro)'] = calculate_score(actual_labels, predicted_labels, average='macro', labels=labels)

scores_df

Unnamed: 0,P,R,F1,#
T1003.001,1.0,1.0,1.0,29.0
T1005,1.0,1.0,1.0,18.0
T1012,1.0,1.0,1.0,30.0
T1016,0.97561,1.0,0.987654,40.0
T1021.001,1.0,0.961538,0.980392,26.0
T1027,1.0,1.0,1.0,48.0
T1033,1.0,1.0,1.0,30.0
T1036.005,1.0,1.0,1.0,49.0
T1041,1.0,1.0,1.0,32.0
T1047,1.0,1.0,1.0,17.0


In [18]:
model = model.save_pretrained("scibert_model")
tokenizer = tokenizer.save_pretrained("scibert_tokenizer")

!zip -r scibert_model.zip scibert_model/
!zip -r scibert_tokenizer.zip scibert_tokenizer/

  adding: scibert_model/ (stored 0%)
  adding: scibert_model/pytorch_model.bin (deflated 7%)
  adding: scibert_model/config.json (deflated 69%)
  adding: scibert_tokenizer/ (stored 0%)
  adding: scibert_tokenizer/special_tokens_map.json (deflated 42%)
  adding: scibert_tokenizer/vocab.txt (deflated 52%)
  adding: scibert_tokenizer/added_tokens.json (deflated 37%)
  adding: scibert_tokenizer/tokenizer_config.json (deflated 74%)
