In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!mkdir -p drive/MyDrive/data/{hdfs_tokenized,hdfs_bert}
!mkdir -p drive/MyDrive/data/torch_dataset/{train,test}


In [None]:
!pip freeze | grep keras

In [None]:
!pip install tensorflow==2.15.0 --upgrade

In [None]:
!pip install transformers[torch] datasets tokenizers evaluate --upgrade

In [2]:
from datasets import load_dataset, load_from_disk
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from transformers import BertTokenizerFast
import numpy as np
from transformers import AutoConfig, AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from itertools import chain
import pandas as pd
import pathlib
import re
import evaluate

2024-03-08 22:35:20.782250: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 22:35:20.782300: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 22:35:20.813528: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 22:35:20.961837: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
IS_COLAB = False
COLAB_PREFIX = 'drive/MyDrive/' if IS_COLAB else ''

MAX_LENGTH = 128
HDFS_UNCLEANED_PATH = f"{COLAB_PREFIX}data/hdfsv1.log"
HDFS_CLEANED_PATH = f"{COLAB_PREFIX}data/hdfsv1_regex.log"
TOKENIZER_PATH = f'{COLAB_PREFIX}data/hdfs_tokenized'

files = [HDFS_CLEANED_PATH]

# Pre-Processing

In [None]:
log = pathlib.Path(HDFS_UNCLEANED_PATH).read_text()

## Regex substitution

In [None]:
ipv4_pattern = '(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])'
ipv6_pattern = '(?:(?:[0-9A-Fa-f]{1,4}:){6}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|::(?:[0-9A-Fa-f]{1,4}:){5}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){4}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){3}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,2}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:){2}(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,3}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}:(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,4}[0-9A-Fa-f]{1,4})?::(?:[0-9A-Fa-f]{1,4}:[0-9A-Fa-f]{1,4}|(?:(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5]))|(?:(?:[0-9A-Fa-f]{1,4}:){,5}[0-9A-Fa-f]{1,4})?::[0-9A-Fa-f]{1,4}|(?:(?:[0-9A-Fa-f]{1,4}:){,6}[0-9A-Fa-f]{1,4})?::)(?:%25(?:[A-Za-z0-9\\-._~]|%[0-9A-Fa-f]{2})+)?'
number_pattern = '-?\d+'
datetime_pattern = '\d{2}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}'

log = re.sub(ipv4_pattern, 'IP', log)
log = re.sub(ipv6_pattern, 'IP', log)
log = re.sub(datetime_pattern, 'DATE', log)
log = re.sub(number_pattern, 'NUM', log)

with open('hdfsv1_regex.log', 'w') as f:
    f.write(log)

## Fitting WordPiece tokenizer

In [4]:
# tokenizer = BertWordPieceTokenizer()
# tokenizer.train(files)
# tokenizer.save_model(TOKENIZER_PATH)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

## Dataset preparation

In [6]:
dataset = load_dataset('text', data_files=['data/hdfsv1_regex.log'], split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
train_dataset = dataset.select(range(0, 1000))
test_dataset = dataset.select(range(1000, 1100))

In [None]:
# dataset = dataset.train_test_split(train_size=0.01, test_size=0.01, shuffle=False)

In [8]:
def encode(examples):
    """Mapping function to tokenize the sentences passed without truncation"""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH, return_special_tokens_mask=True)

# tokenizing the train dataset
train_dataset = train_dataset.map(encode, batched=True)
# tokenizing the testing dataset
test_dataset = test_dataset.map(encode, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= MAX_LENGTH:
        total_length = (total_length // MAX_LENGTH) * MAX_LENGTH
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + MAX_LENGTH] for i in range(0, total_length, MAX_LENGTH)]
        for k, t in concatenated_examples.items()
    }
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
train_dataset = train_dataset.map(group_texts, batched=True,
                                desc=f"Grouping texts in chunks of {MAX_LENGTH}")
test_dataset = test_dataset.map(group_texts, batched=True,
                                desc=f"Grouping texts in chunks of {MAX_LENGTH}")


Grouping texts in chunks of 128:   0%|          | 0/1000 [00:00<?, ? examples/s]

Grouping texts in chunks of 128:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
# convert them from lists to torch tensors
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [11]:
train_dataset.save_to_disk('drive/MyDrive/data/torch_dataset/train')
test_dataset.save_to_disk('drive/MyDrive/data/torch_dataset/test')

Saving the dataset (0/1 shards):   0%|          | 0/785 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/75 [00:00<?, ? examples/s]

# Training LAnoBERT

First, we'll load all our saved datasets and trained tokenizer.

In [5]:
train_dataset = load_from_disk('data/torch_dataset/train')
test_dataset = load_from_disk('data/torch_dataset/test')

In [13]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [14]:
# model_config = BertConfig(vocab_size=tokenizer.vocab_size, max_position_embeddings=MAX_LENGTH)
# model = BertForMaskedLM(config=model_config)

config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_config(config)

training_args = TrainingArguments(
    output_dir='data/hdfs_bert',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs =1,
    eval_accumulation_steps=2,
    # per_device_train_batch_size=5,
    # gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    # per_device_eval_batch_size=64,
    logging_steps=1,             # evaluate, log and save model checkpoints every 1000 step
    save_strategy='epoch',
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [20]:
trainer.train()

  0%|          | 0/99 [00:00<?, ?it/s]

{'loss': 3.9752, 'grad_norm': 11.026458740234375, 'learning_rate': 1.97979797979798e-05, 'epoch': 0.01}
{'loss': 3.9634, 'grad_norm': 18.617752075195312, 'learning_rate': 1.9595959595959596e-05, 'epoch': 0.02}
{'loss': 3.5273, 'grad_norm': 17.196779251098633, 'learning_rate': 1.9393939393939395e-05, 'epoch': 0.03}
{'loss': 3.6292, 'grad_norm': 11.02869701385498, 'learning_rate': 1.9191919191919194e-05, 'epoch': 0.04}
{'loss': 4.2963, 'grad_norm': 14.201175689697266, 'learning_rate': 1.8989898989898993e-05, 'epoch': 0.05}
{'loss': 4.0635, 'grad_norm': 12.82080078125, 'learning_rate': 1.8787878787878792e-05, 'epoch': 0.06}
{'loss': 4.288, 'grad_norm': 10.600249290466309, 'learning_rate': 1.8585858585858588e-05, 'epoch': 0.07}
{'loss': 2.9356, 'grad_norm': 10.66563892364502, 'learning_rate': 1.8383838383838387e-05, 'epoch': 0.08}
{'loss': 2.9949, 'grad_norm': 12.589249610900879, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.09}
{'loss': 4.0838, 'grad_norm': 11.869535446166992, 'lear

  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 1.1140453815460205, 'eval_runtime': 1.326, 'eval_samples_per_second': 56.562, 'eval_steps_per_second': 7.542, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


{'train_runtime': 42.3004, 'train_samples_per_second': 18.558, 'train_steps_per_second': 2.34, 'train_loss': 2.811822634754759, 'epoch': 1.0}


TrainOutput(global_step=99, training_loss=2.811822634754759, metrics={'train_runtime': 42.3004, 'train_samples_per_second': 18.558, 'train_steps_per_second': 2.34, 'train_loss': 2.811822634754759, 'epoch': 1.0})

# Inference

In [28]:
import torch

MODEL_PATH = 'data/hdfs_bert/checkpoint-99'
model = BertForMaskedLM.from_pretrained(MODEL_PATH)

In [30]:
from transformers import pipeline

mask_filler = pipeline('fill-mask', model, tokenizer=tokenizer)

In [34]:
#              'NUM NUM NUM INFO dfs.DataNode$PacketResponder: PacketResponder NUM for block blk_NUM terminating'
example_text = 'NUM NUM NUM INFO dfs.DataNode$PacketResponder: PacketResponder NUM for block blk_NUM [MASK]'
mask_filler(example_text, top_k=3)

[{'score': 0.4526243805885315,
  'token': 1013,
  'token_str': '/',
  'sequence': 'num num num info dfs. datanode $ packetresponder : packetresponder num for block blk _ num /'},
 {'score': 0.11271193623542786,
  'token': 1024,
  'token_str': ':',
  'sequence': 'num num num info dfs. datanode $ packetresponder : packetresponder num for block blk _ num :'},
 {'score': 0.0011259051971137524,
  'token': 16371,
  'token_str': 'nu',
  'sequence': 'num num num info dfs. datanode $ packetresponder : packetresponder num for block blk _ num nu'}]