In [1]:
import transformers
from transformers import (
    pipeline,
    logging,
)
from typing import List
import torch
from torch import cuda, bfloat16
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
# import seaborn as sns
from pylab import rcParams
import os

In [2]:
# Hugging Face Hub repository of the checkpoint used throughout the notebook.
model_id = 'hyonbokan/llama_5G_pcap_5kEpoch'

# Hub access token read from the environment; None when `hf_token` is unset.
hf_auth = os.environ.get('hf_token')

# Label of the current CUDA device when one is available, otherwise 'cpu'.
# NOTE(review): within this cell `device` is only used in the status message
# below; actual weight placement is delegated to device_map='auto'.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Fetch the model configuration first so it can be handed to the loader.
model_config = transformers.AutoConfig.from_pretrained(model_id, use_auth_token=hf_auth)

# Load the causal-LM weights (quantization is intentionally not configured here).
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    device_map='auto',
    trust_remote_code=True,
    use_auth_token=hf_auth,
)

# Put the module in evaluation mode before running inference.
model.eval()
print(f"Model loaded on {device}")

Downloading config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model loaded on cuda:0


In [3]:
# Load the tokenizer published alongside the checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)

# Pad on the right-hand side and reuse the end-of-sequence token as padding.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

Downloading tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [4]:
# Only let CRITICAL-level transformers log records through.
logging.set_verbosity(logging.CRITICAL)

# Instruction asking the model for a PCAP-processing script.
prompt = (
    "Given a n3.pcap file containing network traffic data, create a Python script to process the PCAP. "
    "The script should extract the source IP, destination IP, source port, destination port, and the raw payload data for each packet in the PCAP. "
    "Store this data in a pandas DataFrame and display it."
)

# Text-generation pipeline around the already loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=712,
)

# Wrap the instruction in the [INST] ... [/INST] template and print the first result.
result = pipe(f"<s>[INST] {prompt} [/INST]")
first_generation = result[0]
print(first_generation['generated_text'])

<s>[INST] Given a n3.pcap file containing network traffic data, create a Python script to process the PCAP. The script should extract the source IP, destination IP, source port, destination port, and the raw payload data for each packet in the PCAP. Store this data in a pandas DataFrame and display it. [/INST]  Sure, here is a Python script that processes a PCAP file containing 5G network traffic and extracts the source IP, destination IP, source port, destination port, and the raw payload data for each packet:

import pandas as pd
import scapy

# Load the PCAP file
packets = scapy.rdpcap('5g_traffic.pcap')

# Initialize an empty DataFrame
df = pd.DataFrame(columns=['Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Payload'])

# Iterate over all the packets
for packet in packets:

    # Extract the source and destination IPs
    src_ip = packet[IP].src
    dst_ip = packet[IP].dst

    # Extract the source and destination ports
    src_port = packet[TCP].sport if TCP in

In [4]:
# Only let CRITICAL-level transformers log records through.
logging.set_verbosity(logging.CRITICAL)

# Instruction asking the model for KPI calculation and visualization code.
prompt = (
    "Generate Python code to calculate and visualize 5G network performance KPIs: Total Network Capacity, Capacity per Area, Capacity per Point, Cost per Capacity, Cost per Area, and Surplus per Area. "
    "Load data from '5G_Infrastructure/demand_driven_postcode_data_results.csv'. "
    "Use keywords: 'capacity', 'cost', 'area', 'numpoints' to identify relevant columns."
)

# Text-generation pipeline around the already loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=712,
)

# Wrap the instruction in the [INST] ... [/INST] template and print the first result.
result = pipe(f"<s>[INST] {prompt} [/INST]")
first_generation = result[0]
print(first_generation['generated_text'])

<s>[INST] Generate Python code to calculate and visualize 5G network performance KPIs: Total Network Capacity, Capacity per Area, Capacity per Point, Cost per Capacity, Cost per Area, and Surplus per Area. Load data from '5G_Infrastructure/demand_driven_postcode_data_results.csv'. Use keywords: 'capacity', 'cost', 'area', 'numpoints' to identify relevant columns. [/INST]  Sure! Here is an example of how you can use Python to calculate and visualize 5G network performance KPIs from a CSV file:

import pandas as pd
import matplotlib.pyplot as plt

# Load the CSV file
df = pd.read_csv('5G_Infrastructure/demand_driven_postcode_data_results.csv')

# Identify relevant columns
capacity_columns = [col for col in df.columns if 'capacity' in col.lower()]
cost_columns = [col for col in df.columns if 'cost' in col.lower()]
area_columns = [col for col in df.columns if 'area' in col.lower()]
numpoints_columns = [col for col in df.columns if 'numpoints' in col.lower()]

# Calculate total network capa

In [None]:
import json
import re

# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
input_file_path = "/home/hb/LLM-research/evaluation/5G/5G_analysis_multi_choice_v2.json"

# Number of tokens the model may generate per question. This is passed as
# `max_new_tokens` so the budget is not consumed by the prompt itself
# (`max_length` counts prompt tokens plus generated tokens).
MAX_NEW_TOKENS = 128

# Line prefixes that mark the model's final answer in its output.
ANSWER_PREFIXES = ("Answer: ", "Correct answer: ")

# Placeholder stored when an answer line exists but no option letter is found.
UNPARSED_ANSWER = "Couldn't parse the answer"


def _parse_answer(generated_text):
    """Extract the selected option from the generated text.

    Only lines starting with "Answer: " or "Correct answer: " are inspected.
    On such a line, the first single word character that starts a word and is
    immediately followed by ")" or "-" is taken as the answer.

    Args:
        generated_text: Text returned by the generation pipeline.

    Returns:
        The matched character; "Couldn't parse the answer" when at least one
        answer line was present but none matched; "" when the text contains
        no answer line at all.
    """
    fallback = ""
    for line in generated_text.splitlines():
        if not line.startswith(ANSWER_PREFIXES):
            continue
        match = re.search(r'\b(\w)[\)-]', line)
        if match is not None:
            return match.group(1)
        # Remember the failure but keep scanning: a later line may still parse.
        fallback = UNPARSED_ANSWER
    return fallback


# Each entry is read via its "question", "options" and "answer" keys.
with open(input_file_path, "r", encoding="utf-8") as input_file:
    questions_data = json.load(input_file)

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
)

ground_truth = [q_dict["answer"] for q_dict in questions_data]
generated_answers = []
n_question = 0

# NOTE(review): the generation cells wrap prompts in "<s>[INST] ... [/INST]";
# this prompt is sent without that template. Confirm this is intentional.
for n_question, q_dict in enumerate(questions_data, start=1):
    question = q_dict["question"]
    options = q_dict["options"]

    # Construct prompt with question and options (one option per line).
    prompt = f"Choose the correct answer. {question}\n" + "\n".join(options)

    # Generate answer using the language model.
    generated_answer = pipe(prompt)[0]['generated_text']
    print(f"Generated Answer: {generated_answer}")

    # Parse the generated answer; stays "" when no answer line was produced.
    parsed_answer = _parse_answer(generated_answer)
    if parsed_answer:
        print(f"Parsed_answer: {parsed_answer}")

    generated_answers.append(parsed_answer)
    print(f"----------------{n_question}-----------------")
print(generated_answers)


# Score the parsed answers against the ground truth (case-insensitive).
total_questions = len(questions_data)
correct_answers = 0

for ground_answer, generated_answer in zip(ground_truth, generated_answers):
    if generated_answer.lower() == ground_answer.lower():
        print("Correct")
        correct_answers += 1
    else:
        print(f"GPT answer: {generated_answer}. Correct answer: {ground_answer}")
        print("Incorrect")

# Guard against an empty question file, which would otherwise divide by zero.
accuracy = (correct_answers / total_questions) * 100 if total_questions else 0.0

print(f"Accuracy: {accuracy:.2f}%")
print(f"Correct answers: {correct_answers}")
print(f"Incorrect answers: {total_questions - correct_answers}")