In [1]:
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import os
 
# Prefer the GPU whenever PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [2]:
# Base checkpoint to fine-tune; the commented ids are alternatives kept for reference.
# NOTE(review): the later merge/inference cell sets model_id to
# "meta-llama/Meta-Llama-3-70B-Instruct" — confirm the adapter is trained on
# the same base model it is attached to afterwards.
model_id = 'meta-llama/Llama-2-13b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
# model_id = "meta-llama/Meta-Llama-3-70B-Instruct"

# Need auth token for these checkpoints; it is read from the environment and
# is None when the variable is unset.
hf_auth = os.environ.get('hf_token')

# Label used only in the status message printed at the end of this cell;
# weight placement itself is delegated to device_map="auto".
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantize the weights to 4 bits while loading.
bnb_config = transformers.BitsAndBytesConfig(load_in_4bit=True)

model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

# device_map = {"": 0}

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=hf_auth,
)

# Put the freshly loaded model into evaluation mode and report the device label.
model.eval()
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Model loaded on cuda:0


In [4]:
# Load the tokenizer for model_id, authenticating with the same token as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Location of the JSON training file on the local machine.
DATASET_PATH = "/home/hb/LLM-research/dataset/5G/network_analysis/network_analysis_main_new.json"

# Load the file with the "json" builder and display its "train" split.
data = load_dataset("json", data_files=DATASET_PATH)
data["train"]

Downloading and preparing dataset json/default to /home/hb/.cache/huggingface/datasets/json/default-a868141dc84e87dc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/hb/.cache/huggingface/datasets/json/default-a868141dc84e87dc/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['instruction', 'input', 'output', 'most_similar_instructions', 'avg_similarity_score'],
    num_rows: 3001
})

In [5]:
# Token budget per example: passed to the tokenizer as max_length in tokenize(),
# and also used there to decide whether an EOS token may still be appended.
CUTOFF_LEN = 3050

def generate_prompt(data_point):
    """Render one labelled record as an instruction-style training prompt.

    Args:
        data_point: Mapping of column name to value for a single example. It
            must contain an ``'anomaly_status'`` entry, which becomes the
            target text; every other entry is shown as the model input.

    Returns:
        The prompt string: a fixed preamble and instruction, the string form
        of the remaining fields, and the ground-truth anomaly status.

    Raises:
        KeyError: If ``data_point`` has no ``'anomaly_status'`` key.
    """
    # Keep the label out of the model-visible input so it only appears as the answer.
    input_data = {key: value for key, value in data_point.items() if key != 'anomaly_status'}
    ground_truth = data_point['anomaly_status']

    # The linter marker "# noqa: E501" previously sat inside this string literal
    # and was therefore emitted as part of every prompt; it is deliberately
    # absent from the text below.
    return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. 
### BGP update message:
{input_data}
### Anomaly status:
{ground_truth}"""
 
 
def tokenize(prompt, add_eos_token=True):
    """Tokenize ``prompt`` for causal-LM training.

    The text is truncated to ``CUTOFF_LEN`` tokens without padding. When the
    encoding does not already end with the EOS token, is shorter than
    ``CUTOFF_LEN``, and ``add_eos_token`` is true, an EOS token (with an
    attention-mask entry of 1) is appended. ``labels`` is a copy of the final
    ``input_ids``.

    Args:
        prompt: Text to encode.
        add_eos_token: Whether an EOS token may be appended.

    Returns:
        The tokenizer output extended with a ``"labels"`` entry.
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]

    missing_eos = token_ids[-1] != tokenizer.eos_token_id
    has_room = len(token_ids) < CUTOFF_LEN
    if missing_eos and has_room and add_eos_token:
        # token_ids aliases encoded["input_ids"], so this extends the encoding in place.
        token_ids.append(tokenizer.eos_token_id)
        encoded["attention_mask"].append(1)

    # Labels mirror the inputs: every token is a prediction target.
    encoded["labels"] = list(token_ids)
    return encoded
 
def generate_and_tokenize_prompt(data_point):
    """Build the training prompt for ``data_point`` and return its tokenized form."""
    return tokenize(generate_prompt(data_point))

In [6]:
# Hold out 54 examples for validation. shuffle=False is passed, so the split is
# not randomized (NOTE(review): presumably seed has no effect then — confirm).
train_val = data["train"].train_test_split(test_size=54, shuffle=False, seed=42)

# Turn every record of both splits into a tokenized prompt.
train_data = train_val["train"].map(generate_and_tokenize_prompt)
val_data = train_val["test"].map(generate_and_tokenize_prompt)

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [7]:
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments

# --- LoRA adapter hyper-parameters ---
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

# --- Trainer hyper-parameters ---
# NOTE(review): the output directory name mentions LLaMA3-70b; confirm it
# matches the base checkpoint selected in model_id.
output_dir = "./hyonbo/NLP-BGP-LLaMA3-70b"
per_device_train_batch_size = 4
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 200
logging_steps = 500
learning_rate = 1e-4
max_grad_norm = 0.3
max_steps = 2000
warmup_ratio = 0.05
lr_scheduler_type = "cosine"

# Training length is controlled solely by max_steps. The former
# num_train_epochs=3.0 argument was removed: a positive max_steps takes
# precedence over it (the recorded run stopped at step 2000, epoch 16.26), so
# keeping both only made the configuration misleading.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

# Wrap the base model with the LoRA adapter and report how many parameters train.
# NOTE: this rebinds `model`, so re-running the cell wraps an already wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 131,072,000 || all params: 70,684,778,496 || trainable%: 0.1854317192313438


In [8]:
# Collator with padding enabled that returns PyTorch tensors.
# NOTE(review): it is not passed to the SFTTrainer built in the following
# cell — confirm whether it is still needed.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt",
)

In [9]:
from trl import SFTTrainer

# Sequence-length limit handed to SFTTrainer (same value as CUTOFF_LEN).
max_seq_length = 3050

# Build the supervised fine-tuning trainer from the PEFT-wrapped model, the
# mapped train/validation splits and the training arguments defined earlier.
# NOTE(review): train_data/val_data were already tokenized by
# generate_and_tokenize_prompt, yet dataset_text_field points at the raw
# "anomaly_status" column. Depending on the installed trl version the trainer
# may re-tokenize only that column instead of the full prompt — confirm which
# text the model is actually trained on.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
    dataset_text_field="anomaly_status",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Map:   0%|          | 0/54 [00:00<?, ? examples/s]



Step,Training Loss
500,1.5327
1000,0.2507
1500,0.1515
2000,0.1007


TrainOutput(global_step=2000, training_loss=0.5088691596984863, metrics={'train_runtime': 10374.1869, 'train_samples_per_second': 0.771, 'train_steps_per_second': 0.193, 'total_flos': 3.072201020493005e+16, 'train_loss': 0.5088691596984863, 'epoch': 16.26})

In [10]:
# Directory name under which the fine-tuned model and tokenizer are stored.
new_model = "nl-llama3-70b-2k"

# Persist the trainer's (PEFT-wrapped) model and the tokenizer to the same
# directory; the tokenizer call's return value is shown as the cell output.
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

('nl-llama3-70b-2k/tokenizer_config.json',
 'nl-llama3-70b-2k/special_tokens_map.json',
 'nl-llama3-70b-2k/tokenizer.json')

In [2]:
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
import transformers
import torch
from torch import cuda, bfloat16
import os

# Base checkpoint the saved adapter is attached to, and the adapter directory.
# NOTE(review): the training cell above selects a different model_id — confirm
# the adapter in `new_model` was trained on this same base checkpoint.
# model_id = 'meta-llama/Llama-2-7b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
model_id = "meta-llama/Meta-Llama-3-70B-Instruct"
new_model = "nl-llama3-70b-2k"

# Hugging Face access token from the environment (None when unset).
hf_auth = os.environ.get('hf_token')

# The base model is loaded 8-bit quantized here.
# NOTE(review): training used load_in_4bit — confirm the precision change and
# merging into a quantized base are intended.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)


# # Reload model in FP16 and merge it with LoRA weights
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_id,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     device_map='auto',
# )
# Attach the saved LoRA adapter, then fold its weights into the base model.
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it. The access token is passed here as well so this
# call authenticates the same way as the config/model loads above instead of
# relying on a separately cached login.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_auth_token=hf_auth,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import pandas as pd
# Read the Taiwan test file and keep its first nine rows.
data = pd.read_json('/home/hb/dataset_bgp/bgp_nlp_dataset/test_taiwan.json')
first_rows = data.iloc[0:9]

# Labels go to ground_truth; the remaining columns are what the model will see.
ground_truth = first_rows["anomaly_status"]
test_data = first_rows.drop(columns=["anomaly_status"])

In [4]:
import pandas as pd
# Read the Google-leak test file and use all of its rows.
# NOTE: this rebinds data, ground_truth and test_data set by the preceding cell.
data = pd.read_json('/home/hb/dataset_bgp/bgp_nlp_dataset/google_leak_main_with_anomalies.json')

# Labels go to ground_truth; the remaining columns are what the model will see.
ground_truth = data["anomaly_status"]
test_data = data.drop(columns=["anomaly_status"])

In [5]:
# Display the expected anomaly_status labels for the selected test rows.
ground_truth

0                                 no anomalies detected
1                                 no anomalies detected
2                                 no anomalies detected
3     anomaly due to high value of avg_interarrival=...
4     anomaly due to high value of nb_A=86458, nb_A_...
5     anomaly due to high value of avg_A_AS=20, nb_n...
6     anomaly due to high value of nb_W=93395, nb_W_...
7     anomaly due to high value of nb_W=344, nb_impl...
8     anomaly due to high value of max_A_prefix=51, ...
9                                 no anomalies detected
10                                no anomalies detected
11          anomaly due to high value of avg_editdist=4
Name: anomaly_status, dtype: object

In [9]:
# Text-generation pipeline around the merged model.
# NOTE(review): the [INST] wrapper below differs from the "### Instruction:"
# layout produced by generate_prompt during training — confirm the intended
# inference format for this checkpoint.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1024)

# Query the model once per test row and print what it generates.
for i, row in test_data.iterrows():
    # The row is rendered into the prompt as a plain dict.
    instance = row.to_dict()

    prompt = f"""
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
    {instance}"""

    # Wrap the instruction in [INST] markers before handing it to the pipeline.
    wrapped_prompt = f"<s>[INST] {prompt} [/INST]\n"
    result = pipe(wrapped_prompt)
    generated_text = result[0]['generated_text']

    # Report the generation under a header derived from the row index.
    print(f"Result for instance {i+1}:")
    print(generated_text)
    print("\n")

Result for instance 1:
<s>[INST] 
   Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
    {'timestamp': Timestamp('2017-08-25 03:15:00'), 'nb_A': 2727, 'nb_W': 275, 'nb_implicit_W': 1963, 'nb_dup_A': 378, 'nb_dup_W': 0, 'nb_A_prefix': 571, 'nb_W_prefix': 123, 'max_A_prefix': 54, 'avg_A_prefix': 5, 'max_A_AS': 256, 'avg_A_AS': 20, 'nb_orign_change': 184, 'nb_new_A': 220, 'nb_new_A_afterW': 166, 'max_path_len': 12, 'avg_path_len': 5, 'max_editdist': 9, 'avg_editdist': 3, 'editdist_7': 212, 'editdist_8': 18, 'editdist_9': 42, 'editdist_10': 0, 'editdist_11': 0, 'editdist_12': 0, 'editdist_13': 0, 'editdist_14': 0, 'editdist_15': 0, 'editdist_16': 0, 'editdist_17': 0, 'nb_tolonger': 674, 'nb_toshorter': 1289, 'avg_interarrival': 1103} [/INST]
* [inst] * [inst] * [inst] * [inst] * [inst] * [inst

KeyboardInterrupt: 

In [None]:
# Silence everything below CRITICAL from the transformers logger.
logging.set_verbosity(logging.CRITICAL)

# Use the first test record, rendered as a dict like in the per-row loop.
# test_data is a DataFrame, so the former test_data[0] was a lookup of a
# column labelled 0 (a KeyError) rather than a selection of the first row.
instance = test_data.iloc[0].to_dict()

# Run text generation pipeline with our next model
# (the prompt wording now spells "anomalies" correctly, matching the loop prompt).
prompt = f"""
Below is BGP update message data. Your task is to determine the BGP anomaly status based on the data provided. If an anomaly is detected, specify the reason. If there are no anomalies then state that no anomalies detected. Keep your answers short.
{instance}"""
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1024)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])