In [1]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os
 
import matplotlib.pyplot as plt
import matplotlib as mpl
# import seaborn as sns
from pylab import rcParams
# from trl import SFTTrainer
 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [2]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)


# Need auth token for these
hf_auth = os.environ.get('hf_token')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# device_map = {"": 0}

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)

model.eval()
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
 
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
data = load_dataset("json", data_files="/home/hb/LLM-research/dataset/BGP/BGP_lift/lift_bgp_test1.json")
data["train"]

Downloading and preparing dataset json/default to /home/hb/.cache/huggingface/datasets/json/default-c0aa8670fe72e8e5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/hb/.cache/huggingface/datasets/json/default-c0aa8670fe72e8e5/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['timestamp', 'nb_A', 'nb_W', 'nb_implicit_W', 'nb_dup_A', 'nb_dup_W', 'nb_A_prefix', 'nb_W_prefix', 'max_A_prefix', 'avg_A_prefix', 'max_A_AS', 'avg_A_AS', 'nb_orign_change', 'nb_new_A', 'nb_new_A_afterW', 'max_path_len', 'avg_path_len', 'max_editdist', 'avg_editdist', 'editdist_7', 'editdist_8', 'editdist_9', 'editdist_10', 'editdist_11', 'editdist_12', 'editdist_13', 'editdist_14', 'editdist_15', 'editdist_16', 'editdist_17', 'nb_tolonger', 'nb_toshorter', 'avg_interarrival', 'anomaly_status'],
    num_rows: 545
})

In [5]:
CUTOFF_LEN = 3050

def generate_prompt(data_point):
    # Exclude the 'anomaly_status' from the input data_point
    input_data = {key: value for key, value in data_point.items() if key != 'anomaly_status'}
    ground_truth = data_point['anomaly_status']
    
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.  # noqa: E501
### Instruction:
Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. 
### BGP update message:
{input_data}
### Anomaly status:
{ground_truth}"""
 
 
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)
 
    result["labels"] = result["input_ids"].copy()
 
    return result
 
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

In [6]:
train_val = data["train"].train_test_split(
    test_size=54, shuffle=False, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [7]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64
 
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

output_dir = "./hyonbo/NLP-BGP-LLaMA7b"
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 200
logging_steps = 500
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 2000
warmup_ratio = 0.05
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    num_train_epochs=3.0
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 33,554,432 || all params: 6,771,970,048 || trainable%: 0.49548996469513035


In [8]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, return_tensors="pt", padding=True
)

In [9]:
from trl import SFTTrainer

max_seq_length = 3050

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
    dataset_text_field="anomaly_status",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

Map:   0%|          | 0/491 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-b

Step,Training Loss
500,0.78
1000,0.1717
1500,0.1352
2000,0.1087


Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-400 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-600 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyonbo/NLP-BGP-LLaMA7b/checkpoint-1200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hyo

TrainOutput(global_step=2000, training_loss=0.29891363143920896, metrics={'train_runtime': 1965.5036, 'train_samples_per_second': 4.07, 'train_steps_per_second': 1.018, 'total_flos': 4283060477853696.0, 'train_loss': 0.29891363143920896, 'epoch': 16.26})

In [10]:
new_model = "nl-llama7b-2k"

trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('nl-llama7b-2k/tokenizer_config.json',
 'nl-llama7b-2k/special_tokens_map.json',
 'nl-llama7b-2k/tokenizer.model',
 'nl-llama7b-2k/added_tokens.json',
 'nl-llama7b-2k/tokenizer.json')

In [1]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
import transformers
import torch
from torch import cuda, bfloat16
import os

model_id = 'meta-llama/Llama-2-7b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
new_model = "nl-llama7b-2k"

hf_auth = os.environ.get('hf_token')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    # quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)


# # Reload model in FP16 and merge it with LoRA weights
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto',
# )
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import pandas as pd
data = pd.read_json('/home/hb/dataset_bgp/bgp_nlp_dataset/test_taiwan.json')

test_data = data.iloc[0:9]

ground_truth = test_data["anomaly_status"]

test_data = test_data.drop(columns=['anomaly_status'])

In [20]:
import pandas as pd
data = pd.read_json('/home/hb/dataset_bgp/bgp_nlp_dataset/google_leak_main_with_anomalies.json')

test_data = data

ground_truth = test_data["anomaly_status"]

test_data = test_data.drop(columns=['anomaly_status'])


In [21]:
ground_truth

0                                 no anomalies detected
1                                 no anomalies detected
2                                 no anomalies detected
3     anomaly due to high value of avg_interarrival=...
4     anomaly due to high value of nb_A=86458, nb_A_...
5     anomaly due to high value of avg_A_AS=20, nb_n...
6     anomaly due to high value of nb_W=93395, nb_W_...
7     anomaly due to high value of nb_W=344, nb_impl...
8     anomaly due to high value of max_A_prefix=51, ...
9                                 no anomalies detected
10                                no anomalies detected
11          anomaly due to high value of avg_editdist=4
Name: anomaly_status, dtype: object

In [22]:
# Initialize the text generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1024)

# Loop through each instance in the dataset
for i, row in test_data.iterrows():
    # Convert the row to a dictionary
    instance = row.to_dict()
    
    # Create the prompt
    prompt = f"""
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
    {instance}"""
    
    # Generate text
    result = pipe(f"<s>[INST] {prompt} [/INST]\n")
    
    # Print the result
    print(f"Result for instance {i+1}:")
    print(result[0]['generated_text'])
    print("\n")

Result for instance 1:
<s>[INST] 
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
    {'timestamp': Timestamp('2017-08-25 03:15:00'), 'nb_A': 2727, 'nb_W': 275, 'nb_implicit_W': 1963, 'nb_dup_A': 378, 'nb_dup_W': 0, 'nb_A_prefix': 571, 'nb_W_prefix': 123, 'max_A_prefix': 54, 'avg_A_prefix': 5, 'max_A_AS': 256, 'avg_A_AS': 20, 'nb_orign_change': 184, 'nb_new_A': 220, 'nb_new_A_afterW': 166, 'max_path_len': 12, 'avg_path_len': 5, 'max_editdist': 9, 'avg_editdist': 3, 'editdist_7': 212, 'editdist_8': 18, 'editdist_9': 42, 'editdist_10': 0, 'editdist_11': 0, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 0, 'nb_tolonger': 674, 'nb_toshorter': 1289, 'avg_interarrival': 1103} [/INST]
Based on the provided BGP update message data, the f



Result for instance 11:
<s>[INST] 
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
    {'timestamp': Timestamp('2017-08-25 04:05:00'), 'nb_A': 4585, 'nb_W': 354, 'nb_implicit_W': 3625, 'nb_dup_A': 532, 'nb_dup_W': 0, 'nb_A_prefix': 1140, 'nb_W_prefix': 217, 'max_A_prefix': 49, 'avg_A_prefix': 4, 'max_A_AS': 296, 'avg_A_AS': 21, 'nb_orign_change': 162, 'nb_new_A': 271, 'nb_new_A_afterW': 157, 'max_path_len': 19, 'avg_path_len': 6, 'max_editdist': 17, 'avg_editdist': 3, 'editdist_7': 146, 'editdist_8': 13, 'editdist_9': 22, 'editdist_10': 0, 'editdist_11': 0, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 63, 'nb_tolonger': 1071, 'nb_toshorter': 2554, 'avg_interarrival': 1083} [/INST]

Based on the provided BGP update message data,

In [None]:
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = f"""
Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no amomalies then state that no anomalies detected. Keep your answers short.
{test_data[0]}"""
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2056)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] 
Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no amomalies then state that no anomalies detected. Keep your answers short.
{'timestamp': '2017-08-25T03:25:00.000', 'nb_A': 3080, 'nb_W': 309, 'nb_implicit_W': 2199, 'nb_dup_A': 320, 'nb_dup_W': 0, 'nb_A_prefix': 686, 'nb_W_prefix': 181, 'max_A_prefix': 55, 'avg_A_prefix': 4, 'max_A_AS': 314, 'avg_A_AS': 16, 'nb_orign_change': 210, 'nb_new_A': 391, 'nb_new_A_afterW': 170, 'max_path_len': 15, 'avg_path_len': 6, 'max_editdist': 11, 'avg_editdist': 3, 'editdist_7': 250, 'editdist_8': 25, 'editdist_9': 44, 'editdist_10': 1, 'editdist_11': 1, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 0, 'nb_tolonger': 889, 'nb_toshorter': 1310, 'avg_interarrival': 1116} [/INST]  Based on the provided BGP update message data, the following anomalies are detected:

1