In [1]:
# %pip install -qU transformers accelerate einops langchain xformers bitsandbytes
# %pip install scipy

In [1]:
from torch import cuda, bfloat16
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
import pandas as pd
import os

# Checkpoint under evaluation; the tokenizer cell loads from the same id.
model_id = 'meta-llama/Llama-2-70b-chat-hf'

# Device label for the status message printed at the end of this cell.
# NOTE(review): the model itself is placed via device_map='auto', so this
# label is informational only — confirm actual placement if it matters.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# 4-bit NF4 quantization with double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

# Hugging Face access token, read from the `hf_token` environment variable
# (None when the variable is not set).
hf_token = os.environ.get('hf_token')
hf_auth = hf_token

model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
    use_auth_token=hf_auth,
)
model.eval()  # evaluation mode: this notebook only runs inference
print(f"Model loaded on {device}")



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Model loaded on cuda:0


In [2]:
# Tokenizer for the same checkpoint (and with the same token) as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

# Pad on the right, reusing the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
import json
from tablular_eval_util import combine_csv_files, split_dataframe, preprocess_data, run_llm_inference

# Load the test-set CSV files from `directory` into one DataFrame.
directory = '/home/hb/dataset_bgp/bgp_tab_dataset_test'
combined_df = combine_csv_files(directory)

# Break the DataFrame into chunks of `split_size`.
split_size = 20
data_list = split_dataframe(combined_df, split_size)

# Convert every chunk into the format expected by the inference helper.
formatted_data = list(map(preprocess_data, data_list))

# Persist the formatted samples, then read them back so inference runs on
# exactly what was written to disk.
formatted_data_file = 'llm_table_bgp_data_test_20.json'
with open(formatted_data_file, 'w') as json_file:
    json.dump(formatted_data, json_file, indent=4)

with open(formatted_data_file, 'r') as json_file:
    formatted_data = json.load(json_file)

# Run the model over all samples; results go to `output_results_file`.
output_results_file = 'llm_70B_bgp_data_with_outputs_20.json'
run_llm_inference(
    formatted_data,
    model,
    tokenizer,
    max_length=3050,
    output_results_file=output_results_file,
)

Processed 1/3
Processed 2/3
Processed 3/3
[{'instruction': 'The goal for this task is to determine if the data indicates an anomaly. The context, section, and table columns provide important information for identifying the correct anomaly type.', 'input_seg': '[TLE] The context is about BGP data analysis for detecting anomalies. The section is related to a specific time period of BGP monitoring. [TAB] col: | timestamp | asn | num_routes | num_new_routes | num_withdrawals | num_origin_changes | num_route_changes | max_path_length | avg_path_length | max_edit_distance | avg_edit_distance | num_announcements | num_unique_prefixes_announced | anomaly_status | row 1: | 2017-12-12 04:00:00 | 39523 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | no anomalies detected | [SEP] row 2: | 2017-12-12 04:05:00 | 39523 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | no anomalies detected | [SEP] row 3: | 2017-12-12 04:10:00 | 39523 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 0.0 | 0 | 0 | no anomalies de

In [4]:
# `logging` here is transformers' logging module (imported in the first code
# cell); only CRITICAL messages are shown from now on.
logging.set_verbosity(logging.CRITICAL)

# Single-prompt smoke test of the loaded model via a text-generation pipeline.
prompt = "From the collector RRC06, gather BGP announcements for the IPv6 prefixes over a period from April 5, 2020, 12:00 to April 12, 2020, 12:00 and conduct a time-series analysis to observe trends. Use PyBGPStream for this task."
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=512)

# Wrap the request in the [INST] ... [/INST] instruction tags before generating.
wrapped_prompt = f"<s>[INST] {prompt} [/INST]"
result = pipe(wrapped_prompt)
first_generation = result[0]
print(first_generation['generated_text'])

<s>[INST] From the collector RRC06, gather BGP announcements for the IPv6 prefixes over a period from April 5, 2020, 12:00 to April 12, 2020, 12:00 and conduct a time-series analysis to observe trends. Use PyBGPStream for this task. [/INST]  Sure, here's an example of how you could use PyBGPStream to gather BGP announcements for IPv6 prefixes and conduct a time-series analysis to observe trends:
```
import pybgpstream
import pandas as pd
import matplotlib.pyplot as plt

# Set up the BGP collector
collector = pybgpstream.BGPCollector(RRC06)

# Define the time period for the analysis
start_time = '2020-04-05 12:00:00'
end_time = '2020-04-12 12:00:00'

# Gather BGP announcements for IPv6 prefixes during the defined time period
announcements = collector.get_announcements(start_time, end_time, afi='ipv6')

# Convert the announcements to a pandas DataFrame
df = pd.DataFrame(announcements)

# Remove any duplicate announcements
df = df.drop_duplicates()

# Group the announcements by day and co

In [None]:
import json
# Free-form QA set: a JSON list of entries that each carry a "question" key.
input_file_path = "/home/hb/LLM-research/evaluation/bgp_test_QA_val.json"

with open(input_file_path, "r") as input_file:
    questions_data = json.load(input_file)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=512)

# One generated text per question, in file order.
generated_answers_qa = []

n_question = 0
for n_question, q_dict in enumerate(questions_data, start=1):
    question = q_dict["question"]
    # Wrap the question in instruction tags and take the first generation.
    prompt = f"[INST]{question}[/INST]"
    outputs = pipe(prompt)
    generated_answer = outputs[0]['generated_text']
    print(f"Output: {generated_answer}")
    generated_answers_qa.append(generated_answer)
    print(f"---------------------{n_question}-----------------------------")

In [None]:
import json
import re  # used by _parse_choice; imported here so the cell runs on a fresh kernel

input_file_path = "/home/hb/fine-tuning-alpaca/evaluation/bgp_test_multiple_choice.json"

with open(input_file_path, "r") as input_file:
    questions_data = json.load(input_file)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)


def _parse_choice(generated_text):
    """Return the option letter chosen in ``generated_text``.

    Only the first line starting with ``"Answer: "`` is inspected; the letter
    is the single word character directly followed by ``)`` (e.g. ``a`` in
    ``"Answer: a) AS-Path"``).

    Returns:
        The letter as a string, or ``""`` when there is no ``"Answer: "`` line
        or that line contains no ``x)`` token.
    """
    for line in generated_text.splitlines():
        if line.startswith("Answer: "):
            match = re.search(r'\b(\w)\)', line)
            # A malformed answer line counts as "no answer" instead of
            # raising AttributeError on a failed match.
            return match.group(1) if match else ""
    return ""


# --- Generation: one parsed option letter per question, in file order ---
generated_answers = []
ground_truth = [q_dict["answer"] for q_dict in questions_data]

n_question = 0
for n_question, q_dict in enumerate(questions_data, start=1):
    question = q_dict["question"]
    options = q_dict["options"]

    # Construct prompt with question and options
    prompt = f"{question}\nChoose the correct answer:\n" + "\n".join(options)

    # Generate answer using the language model
    generated_answer = pipe(prompt)[0]['generated_text']
    print(f"Generated Answer: {generated_answer}")

    # Parse the generated answer ("" when nothing could be parsed)
    parsed_answer = _parse_choice(generated_answer)
    if parsed_answer:
        print(f"Parsed_answer: {parsed_answer}")
    generated_answers.append(parsed_answer)
    print(f"----------------{n_question}-----------------")
print(generated_answers)


# --- Scoring: case-insensitive comparison against the ground truth ---
total_questions = len(questions_data)
correct_answers = 0

for ground_answer, predicted_answer in zip(ground_truth, generated_answers):
    if predicted_answer.lower() == ground_answer.lower():
        print("Correct")
        correct_answers += 1
    else:
        print("Incorrect")

# An empty question file yields 0% rather than a ZeroDivisionError.
accuracy = (correct_answers / total_questions) * 100 if total_questions else 0.0

print(f"Accuracy: {accuracy:.2f}%")
print(f"Correct answers: {correct_answers}")
print(f"Incorrect answers: {total_questions - correct_answers}")

Generated Answer: When analyzing BGP route propagation, which attribute provides information about the path that the BGP update has traversed?
Choose the correct answer:
a) AS-Path
b) MED (Multi-Exit Discriminator)
c) Local Preference
d) Next Hop

Answer: a) AS-Path

Explanation:
The AS-Path attribute in BGP provides information about the path that the BGP update has traversed. It is a list of autonomous system numbers (ASNs) that the update has passed through, with the originating ASN at
Parsed_answer: a
----------------1-----------------
Generated Answer: Which of the following is a primary purpose of BGP?
Choose the correct answer:
a) Establishing communication between devices within a LAN
b) Exchanging routing information between autonomous systems
c) Optimizing local network traffic
d) Assigning IP addresses to devices within a network

Answer: b) Exchanging routing information between autonomous systems

Explanation: BGP (Border Gateway Protocol) is a protocol used for exchanging

In [None]:
import json
import re

input_file_path = "/home/hb/fine-tuning-alpaca/evaluation/bgp_test_fill_the_blank.json"

with open(input_file_path, "r") as input_file:
    questions_data = json.load(input_file)

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=128)

# Marker that stands for the blank inside each question.
BLANK_PLACEHOLDER = "________"


def _extract_response(full_text):
    """Return the text that follows the first ``[/INST]`` tag, stripped.

    When the tag is absent the whole stripped text is returned, so a missing
    tag no longer raises AttributeError on a failed regex match.
    """
    match = re.search(r'\[/INST\]\s*(.*)', full_text, re.DOTALL)
    return (match.group(1) if match else full_text).strip()


def _drop_preamble(answer_text):
    """Drop the first line of answers containing "Sure".

    A single-line answer is returned unchanged instead of raising IndexError.
    """
    if "Sure" in answer_text:
        _, newline, remainder = answer_text.partition('\n')
        if newline:
            return remainder
    return answer_text


generated_answers = []
filled_answers = []

n_question = 0
for n_question, q_dict in enumerate(questions_data, start=1):
    question = q_dict["question"]
    answer = q_dict["answer"]

    # Generate answer using the language model
    prompt = f"[INST]{question} \n Fill the blank: \n [/INST]"
    generated_answer = pipe(prompt)[0]['generated_text']

    generated_answer = _drop_preamble(_extract_response(generated_answer))
    print(f"Generated Answer: {generated_answer}")
    generated_answers.append(generated_answer)

    # Find the position of the placeholder in the question
    placeholder_position = question.find(BLANK_PLACEHOLDER)

    # Extract the content filled in at the placeholder's offset.
    # NOTE(review): assumes the model repeats the question verbatim with the
    # blank filled in, and that `answer` is a string — confirm against the data.
    if placeholder_position == -1:
        # No blank in the question: nothing to extract (previously a
        # negative-index slice of the answer).
        filled_content = ""
    else:
        # Use the expected answer's own length rather than the length of one
        # hard-coded sample answer.
        filled_content = generated_answer[placeholder_position:placeholder_position + len(answer)]
    filled_answers.append(filled_content)
    print(f"Filled content: {filled_content}")
    print(f"---------------------{n_question}-----------------------------")
