In [1]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os
 
import matplotlib.pyplot as plt
import matplotlib as mpl
# import seaborn as sns
from pylab import rcParams
# from trl import SFTTrainer
 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [2]:
model_id = 'meta-llama/Llama-2-13b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)


# Need auth token for these
hf_auth = os.environ.get('hf_token')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# device_map = {"": 0}

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)

model.eval()
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
 
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
data = load_dataset("json", data_files="/home/hb/LLM-research/dataset/BGP/BGP_lift/lift_bgp_test1.json")
data["train"]

Found cached dataset json (/home/hb/.cache/huggingface/datasets/json/default-f5d2c046a3af28bd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['timestamp', 'nb_A', 'nb_W', 'nb_implicit_W', 'nb_dup_A', 'nb_dup_W', 'nb_A_prefix', 'nb_W_prefix', 'max_A_prefix', 'avg_A_prefix', 'max_A_AS', 'avg_A_AS', 'nb_orign_change', 'nb_new_A', 'nb_new_A_afterW', 'max_path_len', 'avg_path_len', 'max_editdist', 'avg_editdist', 'editdist_7', 'editdist_8', 'editdist_9', 'editdist_10', 'editdist_11', 'editdist_12', 'editdist_13', 'editdist_14', 'editdist_15', 'editdist_16', 'editdist_17', 'nb_tolonger', 'nb_toshorter', 'avg_interarrival', 'anomaly_status'],
    num_rows: 78
})

In [5]:
CUTOFF_LEN = 2048

def generate_prompt(data_point):
    # Exclude the 'anomaly_status' from the input data_point
    input_data = {key: value for key, value in data_point.items() if key != 'anomaly_status'}
    ground_truth = data_point['anomaly_status']
    
    return f"""Below is BGP update message data. The task is to determine whether the BGP data indicates an anomaly. The response should be an alert indicating whether it is an anomaly or normal.  # noqa: E501
### Input:
{input_data}
### Response:
{ground_truth}"""
 
 
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
 
    result["labels"] = result["input_ids"].copy()
 
    return result
 
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [6]:
train_val = data["train"].train_test_split(
    test_size=10, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Loading cached split indices for dataset at /home/hb/.cache/huggingface/datasets/json/default-f5d2c046a3af28bd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-eef7a50a59528d8e.arrow and /home/hb/.cache/huggingface/datasets/json/default-f5d2c046a3af28bd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-133408cb35f59f4e.arrow
Loading cached processed dataset at /home/hb/.cache/huggingface/datasets/json/default-f5d2c046a3af28bd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-b2c92f3b262e86c2.arrow


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
 
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

output_dir = "./hyonbo/NLP-BGP-LLaMA13b"
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 200
logging_steps = 500
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 2000
warmup_ratio = 0.05
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    num_train_epochs=3.0
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 52,428,800 || all params: 13,068,293,120 || trainable%: 0.40119087870597137


In [8]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, return_tensors="pt", padding=True
)

In [9]:
from trl import SFTTrainer

max_seq_length = 2048

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
    dataset_text_field="anomaly_status",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

Loading cached processed dataset at /home/hb/.cache/huggingface/datasets/json/default-f5d2c046a3af28bd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-3493477d09028cc1.arrow


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mmkkanhb[0m ([33mdnlab_2023[0m). Use [1m`wandb login --relogin`[0m to forc

Step,Training Loss
500,1.0079
1000,0.1723
1500,0.1584
2000,0.1558


TrainOutput(global_step=2000, training_loss=0.3736383399963379, metrics={'train_runtime': 2539.3028, 'train_samples_per_second': 3.15, 'train_steps_per_second': 0.788, 'total_flos': 2.018421644967936e+16, 'train_loss': 0.3736383399963379, 'epoch': 117.65})

In [10]:
new_model = "nl-llama13b-2k"

trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('nl-llama13b-2k/tokenizer_config.json',
 'nl-llama13b-2k/special_tokens_map.json',
 'nl-llama13b-2k/tokenizer.model',
 'nl-llama13b-2k/added_tokens.json',
 'nl-llama13b-2k/tokenizer.json')

In [1]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
import transformers
import torch
from torch import cuda, bfloat16
import os

model_id = 'meta-llama/Llama-2-13b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
new_model = "nl-llama13b-2k"

hf_auth = os.environ.get('hf_token')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    # quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)


# # Reload model in FP16 and merge it with LoRA weights
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto',
# )
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
import pandas as pd
from datetime import datetime, timedelta

# Generate timestamps
start_time = datetime(2017, 8, 25, 3, 15, 0)
timestamps = [start_time + timedelta(minutes=5 * i) for i in range(5)]

# Define normal data and one anomaly
data = [
    {
        "timestamp": "2017-08-25T03:25:00.000",
        "nb_A": 3080,
        "nb_W": 309,
        "nb_implicit_W": 2199,
        "nb_dup_A": 320,
        "nb_dup_W": 0,
        "nb_A_prefix": 686,
        "nb_W_prefix": 181,
        "max_A_prefix": 55,
        "avg_A_prefix": 4,
        "max_A_AS": 314,
        "avg_A_AS": 16,
        "nb_orign_change": 210,
        "nb_new_A": 391,
        "nb_new_A_afterW": 170,
        "max_path_len": 15,
        "avg_path_len": 6,
        "max_editdist": 11,
        "avg_editdist": 3,
        "editdist_7": 250,
        "editdist_8": 25,
        "editdist_9": 44,
        "editdist_10": 1,
        "editdist_11": 1,
        "editdist_12": 0,
        "editdist_13": 0,
        "editdist_14": 0,
        "editdist_15": 0,
        "editdist_16": 0,
        "editdist_17": 0,
        "nb_tolonger": 889,
        "nb_toshorter": 1310,
        "avg_interarrival": 1116,
        "anomaly_status": ""
    },
    {
        "timestamp": "2017-08-25T03:35:00.000",
        "nb_A": 86458,
        "nb_W": 356,
        "nb_implicit_W": 3790,
        "nb_dup_A": 7500,
        "nb_dup_W": 0,
        "nb_A_prefix": 75578,
        "nb_W_prefix": 203,
        "max_A_prefix": 46,
        "avg_A_prefix": 1,
        "max_A_AS": 19182,
        "avg_A_AS": 38,
        "nb_orign_change": 182,
        "nb_new_A": 75028,
        "nb_new_A_afterW": 140,
        "max_path_len": 21,
        "avg_path_len": 4,
        "max_editdist": 14,
        "avg_editdist": 3,
        "editdist_7": 243,
        "editdist_8": 58,
        "editdist_9": 75,
        "editdist_10": 69,
        "editdist_11": 9,
        "editdist_12": 13,
        "editdist_13": 0,
        "editdist_14": 1,
        "editdist_15": 0,
        "editdist_16": 0,
        "editdist_17": 0,
        "nb_tolonger": 1180,
        "nb_toshorter": 2610,
        "avg_interarrival": 1087,
        # "anomaly_status": "anomaly due to high value of nb_A, nb_A_prefix, avg_A_prefix, max_A_AS, avg_A_AS, nb_new_A, editdist_9"
    },
    {
        "timestamp": "2017-08-25T04:00:00.000",
        "nb_A": 4311,
        "nb_W": 401,
        "nb_implicit_W": 3069,
        "nb_dup_A": 625,
        "nb_dup_W": 0,
        "nb_A_prefix": 1030,
        "nb_W_prefix": 165,
        "max_A_prefix": 54,
        "avg_A_prefix": 4,
        "max_A_AS": 305,
        "avg_A_AS": 20,
        "nb_orign_change": 206,
        "nb_new_A": 367,
        "nb_new_A_afterW": 250,
        "max_path_len": 19,
        "avg_path_len": 6,
        "max_editdist": 17,
        "avg_editdist": 3,
        "editdist_7": 202,
        "editdist_8": 23,
        "editdist_9": 36,
        "editdist_10": 2,
        "editdist_11": 5,
        "editdist_12": 7,
        "editdist_13": 1,
        "editdist_14": 1,
        "editdist_15": 0,
        "editdist_16": 0,
        "editdist_17": 56,
        "nb_tolonger": 1318,
        "nb_toshorter": 1751,
        "avg_interarrival": 1112,
        "anomaly_status": ""
    },
        {
        "timestamp": "2017-08-25T03:20:00.000",
        "nb_A": 2462,
        "nb_W": 148,
        "nb_implicit_W": 1721,
        "nb_dup_A": 347,
        "nb_dup_W": 0,
        "nb_A_prefix": 434,
        "nb_W_prefix": 66,
        "max_A_prefix": 51,
        "avg_A_prefix": 6,
        "max_A_AS": 188,
        "avg_A_AS": 16,
        "nb_orign_change": 172,
        "nb_new_A": 285,
        "nb_new_A_afterW": 109,
        "max_path_len": 13,
        "avg_path_len": 6,
        "max_editdist": 11,
        "avg_editdist": 3,
        "editdist_7": 165,
        "editdist_8": 36,
        "editdist_9": 37,
        "editdist_10": 0,
        "editdist_11": 1,
        "editdist_12": 0,
        "editdist_13": 0,
        "editdist_14": 0,
        "editdist_15": 0,
        "editdist_16": 0,
        "editdist_17": 0,
        "nb_tolonger": 738,
        "nb_toshorter": 983,
        "avg_interarrival": 1133,
        "anomaly_status": ""
    },
    {
        "timestamp": "2017-08-25T03:25:00.000",
        "nb_A": 3080,
        "nb_W": 309,
        "nb_implicit_W": 2199,
        "nb_dup_A": 320,
        "nb_dup_W": 0,
        "nb_A_prefix": 686,
        "nb_W_prefix": 181,
        "max_A_prefix": 55,
        "avg_A_prefix": 4,
        "max_A_AS": 314,
        "avg_A_AS": 16,
        "nb_orign_change": 210,
        "nb_new_A": 391,
        "nb_new_A_afterW": 170,
        "max_path_len": 15,
        "avg_path_len": 6,
        "max_editdist": 11,
        "avg_editdist": 3,
        "editdist_7": 250,
        "editdist_8": 25,
        "editdist_9": 44,
        "editdist_10": 1,
        "editdist_11": 1,
        "editdist_12": 0,
        "editdist_13": 0,
        "editdist_14": 0,
        "editdist_15": 0,
        "editdist_16": 0,
        "editdist_17": 0,
        "nb_tolonger": 889,
        "nb_toshorter": 1310,
        "avg_interarrival": 1116,
        "anomaly_status": ""
    },
]

anomaly_status_list = [
    "",
    "anomaly due to high value of nb_A, nb_A_prefix, avg_A_prefix, max_A_AS, avg_A_AS, nb_new_A, editdist_9",
    "",
    "",
    ""
]

In [5]:
# Initialize the text generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)

# Loop through each instance in the dataset
for i, instance in enumerate(data):
    prompt = f"""
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no amomalies then state that no anomalies detected. 
    {instance}"""
    
    result = pipe(f"<s>[INST] {prompt} [/INST]")
    print(f"Result for instance {i}:")
    print(result[0]['generated_text'])
    print("\n")

Result for instance 0:
<s>[INST] 
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no amomalies then state that no anomalies detected. 
    {'timestamp': '2017-08-25T03:25:00.000', 'nb_A': 3080, 'nb_W': 309, 'nb_implicit_W': 2199, 'nb_dup_A': 320, 'nb_dup_W': 0, 'nb_A_prefix': 686, 'nb_W_prefix': 181, 'max_A_prefix': 55, 'avg_A_prefix': 4, 'max_A_AS': 314, 'avg_A_AS': 16, 'nb_orign_change': 210, 'nb_new_A': 391, 'nb_new_A_afterW': 170, 'max_path_len': 15, 'avg_path_len': 6, 'max_editdist': 11, 'avg_editdist': 3, 'editdist_7': 250, 'editdist_8': 25, 'editdist_9': 44, 'editdist_10': 1, 'editdist_11': 1, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 0, 'nb_tolonger': 889, 'nb_toshorter': 1310, 'avg_interarrival': 1116, 'anomaly_status': ''} [/INST]  Based on the provided BGP update message data, I detected t

In [4]:
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = f"""
Given BGP update message data, determine whether the BGP data indicates an anomaly: 
{data[0]}"""
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] 
Given BGP update message data, determine whether the BGP data indicates an anomaly: 
{'timestamp': '2017-08-25T03:25:00.000', 'nb_A': 3080, 'nb_W': 309, 'nb_implicit_W': 2199, 'nb_dup_A': 320, 'nb_dup_W': 0, 'nb_A_prefix': 686, 'nb_W_prefix': 181, 'max_A_prefix': 55, 'avg_A_prefix': 4, 'max_A_AS': 314, 'avg_A_AS': 16, 'nb_orign_change': 210, 'nb_new_A': 391, 'nb_new_A_afterW': 170, 'max_path_len': 15, 'avg_path_len': 6, 'max_editdist': 11, 'avg_editdist': 3, 'editdist_7': 250, 'editdist_8': 25, 'editdist_9': 44, 'editdist_10': 1, 'editdist_11': 1, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 0, 'nb_tolonger': 889, 'nb_toshorter': 1310, 'avg_interarrival': 1116, 'anomaly_status': ''} [/INST]  Based on the provided BGP update message data, there are several anomalies that can be identified:

1. High number of duplicate AS paths: nb_dup_A=320, nb_dup_W=0. This indicates that there are a large number of duplicate AS pat