In [6]:
import os
from openai import OpenAI

# Build the shared OpenAI client from the key stored in the environment.
# The saved traceback of this cell shows OpenAI() raising OpenAIError when
# OPENAI_API_KEY is not set, so export it before starting the kernel.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [2]:
import pandas as pd

# Load the CSV that the OpenAI example below queries.
# NOTE(review): absolute, machine-specific path; `df` is also rebound by the
# later BGP data cell, so results depend on which cell ran last.
df = pd.read_csv(
    "/home/hb/django_react/BGP-LLaMA-webservice/3356_real_time.csv"
)

In [3]:
def ask_openai_about_df(df, question, model="gpt-4o-mini", openai_client=None):
    """Ask an OpenAI chat model a question about the contents of a DataFrame.

    The whole frame is serialized with ``df.to_json()`` and embedded in the
    user message, so the prompt grows with the size of ``df``.

    Args:
        df: Object exposing ``to_json()`` (a pandas DataFrame in this notebook).
        question: Natural-language question to ask about the data.
        model: Name of the chat model to call. Defaults to "gpt-4o-mini",
            the value that used to be hard-coded.
        openai_client: Object exposing ``chat.completions.create``. When None,
            the notebook-level ``client`` is used.

    Returns:
        The text content of the first choice of the completion.
    """
    # Resolve the client lazily so a caller-supplied one never touches the global.
    chat_client = client if openai_client is None else openai_client

    # Convert DataFrame to JSON
    df_json = df.to_json()
    prompt = f"""
    Here is a DataFrame in JSON format:

    {df_json}

    Please answer the following question based on the data provided: 
    {question}
    """

    # Send prompt to OpenAI
    response = chat_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a data analysis assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    # The openai>=1.0 client returns a typed completion object, not a dict:
    # subscripting it (response['choices']) raises TypeError, so read the
    # answer through attributes instead.
    return response.choices[0].message.content

# Example query: send the loaded frame plus one question and print the answer.
result = ask_openai_about_df(
    df=df,
    question="List all prefixes announced by ASN",
)
print(result)


APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


# LLaMA agent

In [7]:
import re
import uuid
from llama_index.core.schema import MetadataMode
from tqdm.notebook import tqdm
import torch
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import (
    PandasInstructionParser,
)
from llama_index.core import PromptTemplate
from IPython.display import Markdown, display

In [8]:
# Model names used as Hugging Face repository IDs (make sure you have access on HF).
# In the visible cells only LLAMA3_8B_INSTRUCT is actually passed to the LLM.

# Llama 2 base and chat checkpoints.
LLAMA2_7B = "meta-llama/Llama-2-7b-hf"
LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat-hf"
LLAMA2_13B = "meta-llama/Llama-2-13b-hf"
LLAMA2_13B_CHAT = "meta-llama/Llama-2-13b-chat-hf"
LLAMA2_70B = "meta-llama/Llama-2-70b-hf"
LLAMA2_70B_CHAT = "meta-llama/Llama-2-70b-chat-hf"

# Llama 3 70B and Code Llama (Python) checkpoints.
LLAMA3_70B_INST = "meta-llama/Meta-Llama-3-70B-Instruct"
LLAMA3_70B = "meta-llama/Meta-Llama-3-70B"
CodeLlama_70b_py_hf = "meta-llama/CodeLlama-70b-Python-hf"
CodeLlama_13b_py_hf = "meta-llama/CodeLlama-13b-Python-hf"

# NOTE: despite the constant's name, the repository ID is the Llama 3.1 8B Instruct one.
LLAMA3_8B_INSTRUCT = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# presumably a fine-tuned BGP checkpoint - TODO confirm
custom_model = "hyonbokan/bgp-llama-knowledge-5k"

# SYSTEM_PROMPT = """\
#     Context information is below.
    
#     ---------------------
    
#     Given the context information and not prior knowledge.
#     generate only questions based on the below query.
    
#     You are a Teacher/ Professor. Your task is to setup \
#     questions for an upcoming \
#     quiz/examination. The questions should be diverse in nature \
#     across the document. Restrict the questions to the \
#     context information provided."
#     """

# System text placed in front of every query by the wrapper template below.
# The leading backslash suppresses the first newline; the indentation and the
# whitespace-only line are part of the string.
SYSTEM_PROMPT = """\
    Context information is below.
    
    ---------------------
    """

# Wraps each query as "[INST]<>\n" + system prompt + "<>\n\n" + query + "[/INST] ".
# NOTE(review): the bare "<>" markers look like "<<SYS>>" / "<</SYS>>" tags
# whose content was stripped, and the [INST] layout is the Llama 2 chat
# convention while the LLM cell loads a Llama 3.1 Instruct checkpoint - confirm
# which prompt format is intended.
query_wrapper_prompt = PromptTemplate(
    "[INST]<>\n" + SYSTEM_PROMPT + "<>\n\n{query_str}[/INST] "
)

# Load the Hugging Face checkpoint behind a LlamaIndex LLM wrapper. The same
# `llm` object serves both LLM stages ("llm1" and "llm2") of the query pipeline.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    # Sampling is switched off, so generation does not draw random tokens.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    # Tokenizer and weights come from the same repository ID.
    tokenizer_name=LLAMA3_8B_INSTRUCT,
    model_name=LLAMA3_8B_INSTRUCT,
    device_map="auto",
    # change these settings below depending on your GPU
    # NOTE(review): the saved output of this cell shows a deprecation warning
    # for `load_in_8bit`; it asks for a `BitsAndBytesConfig` object passed as
    # `quantization_config` instead.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

# Load BGP data

In [9]:
import pandas as pd
# df = pd.read_json("/home/hb/dataset_bgp/bgp_nlp_dataset/google_leak_main_with_anomalies.json")

# Load the BGP feature table used by the pandas query pipeline, then display it.
# NOTE(review): this rebinds the `df` loaded by the earlier OpenAI example cell.
df = pd.read_csv(
    "/home/hb/dataset_bgp/candidates/bgp_features_asn_1136_eu_leak.csv"
)
df

Unnamed: 0,Timestamp,Autonomous System Number,Total Routes,New Routes,Withdrawals,Origin Changes,Route Changes,Maximum Path Length,Average Path Length,Maximum Edit Distance,Average Edit Distance,Announcements,Unique Prefixes Announced,Graph Average Degree,Graph Betweenness Centrality,Graph Closeness Centrality,Graph Eigenvector Centrality
0,2019-06-06 00:00:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000
1,2019-06-06 00:05:00,1136,12,0,0,0,0,0,0.0,0,0.0,12,12,2.0,0.166667,0.750000,0.50000
2,2019-06-06 00:10:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000
3,2019-06-06 00:15:00,1136,4,0,0,0,0,0,0.0,0,0.0,4,4,1.6,0.333333,0.521905,0.43094
4,2019-06-06 00:20:00,1136,6,0,0,0,0,0,0.0,0,0.0,6,6,1.6,0.333333,0.521905,0.43094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,2019-06-06 23:40:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000
285,2019-06-06 23:45:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000
286,2019-06-06 23:50:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000
287,2019-06-06 23:55:00,1136,0,0,0,0,0,0,0.0,0,0.0,0,0,0.0,0.000000,0.000000,0.00000


In [10]:
# Numbered rules substituted for {instruction_str} in the pandas prompt: they
# ask the model to reply with nothing but a single eval()-able pandas expression.
instruction_str = (
    "1. Convert the query to executable Python code using Pandas.\n"
    "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
    "3. The code should represent a solution to the query.\n"
    "4. PRINT ONLY THE EXPRESSION.\n"
    "5. Do not quote the expression.\n"
)

# Template for the first LLM stage.
# Placeholders: {df_str} (preview of the frame), {instruction_str}, {query_str}.
pandas_prompt_str = (
    "You are working with a pandas dataframe in Python.\n"
    "The name of the dataframe is `df`.\n"
    "This is the result of `print(df.head())`:\n"
    "{df_str}\n\n"
    "Follow these instructions:\n"
    "{instruction_str}\n"
    "Query: {query_str}\n\n"
    "Expression:"
)
# Template for the second LLM stage, which turns the pandas result into prose.
# Placeholders: {query_str}, {pandas_instructions}, {pandas_output}.
response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results.\n"
    "Query: {query_str}\n\n"
    "Pandas Instructions (optional):\n{pandas_instructions}\n\n"
    "Pandas Output: {pandas_output}\n\n"
    "Response: "
)

# Parser bound to `df`; in the pipeline it receives the output of the first LLM stage.
# NOTE(review): the instructions request an eval()-able expression, so this
# presumably executes model-generated code against `df` - confirm against the
# library docs and only run on trusted setups.
pandas_output_parser = PandasInstructionParser(df)

# Pre-fill the two static template variables (a five-row preview of the frame
# and the instruction list); only {query_str} remains to be supplied per run.
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    df_str=df.head(5),
    instruction_str=instruction_str,
)

# Template for the answer-synthesis stage; all of its variables are filled by pipeline links.
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)

In [11]:
# Two-stage query pipeline: the first stage turns the question into a pandas
# expression and hands it to the pandas output parser; the second stage
# phrases the parser's result as a natural-language answer.
qp = QP(
    modules=dict(
        input=InputComponent(),
        pandas_prompt=pandas_prompt,
        llm1=llm,
        pandas_output_parser=pandas_output_parser,
        response_synthesis_prompt=response_synthesis_prompt,
        llm2=llm,
    ),
    verbose=True,
)

# First stage, wired as a straight chain: question -> prompt -> LLM -> parser.
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])

# Fill each variable of the synthesis prompt from the module that produces it.
qp.add_links(
    [
        Link(source, "response_synthesis_prompt", dest_key=template_var)
        for source, template_var in (
            ("input", "query_str"),
            ("llm1", "pandas_instructions"),
            ("pandas_output_parser", "pandas_output"),
        )
    ]
)

# Second stage: the filled synthesis prompt goes to the second LLM call.
qp.add_link("response_synthesis_prompt", "llm2")

In [13]:
# Run the full pipeline on a single question and keep the final LLM output.
response = qp.run(query_str="What is the highest number of announcements?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the highest number of announcements?

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the highest number of announcements?

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
prompt: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
             Timestamp  Autonomous System Number  Total Routes  New Rout...

[0m

KeyboardInterrupt: 

In [21]:
# LLama3_instruct
# Render the pipeline's final answer as Markdown.
# NOTE(review): the saved output of this cell discusses anomaly scores rather
# than the "highest number of announcements" query, so it appears to come from
# a different run - re-run the pipeline cell before trusting it.
display(Markdown(str(response)))


The output of the code will be a list of tuples, where each tuple contains a timestamp and an anomaly score. The timestamp is in the format 'YYYY-MM-DD HH:MM:SS', and the anomaly score is a floating-point number representing the difference between the count of unique prefixes announced for that timestamp and the mean of the count of unique prefixes announced for all timestamps.

For example, the output could be:

[('2022-01-01 00:00:00', 0.0), ('2022-01-01 00:01:00', 0.2), ('2022-01-01 00:02:00', -0.3), ...]

The anomaly score can be used to identify anomalies in the BGP updates. A positive anomaly score indicates that the count of unique prefixes announced for that timestamp is higher than the mean, which could indicate an anomaly. A negative anomaly score indicates that the count of unique prefixes announced for that timestamp is lower than the mean, which could also indicate an anomaly. The threshold for determining whether a score is an anomaly can be set based on the specific requirements of the problem.