In [1]:
from torch import cuda, bfloat16
import transformers
import os
from torch import cuda, bfloat16
import transformers
from transformers import (
    pipeline,
    logging,
)
# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

# Checkpoint to load; the commented lines are alternatives kept for quick switching.
# model_id = 'hyonbokan/BGPStream13-10k-cutoff-1024-max-2048'
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Only used for the status message at the end of this cell; weight placement
# itself is delegated to `device_map='auto'` in the model load call.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Hub access token, read from the `hf_token` environment variable
# (None when the variable is unset).
hf_auth = os.environ.get('hf_token')
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=hf_auth,  # `token` supersedes the deprecated `use_auth_token`
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    # NOTE(review): allows code shipped inside the model repo to run locally;
    # confirm it is actually required for the checkpoints used here.
    trust_remote_code=True,
    config=model_config,
    # quantization_config=bnb_config,
    device_map='auto',
    token=hf_auth,  # `token` supersedes the deprecated `use_auth_token`
)
# Switch to inference mode.
model.eval()
print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded on cuda:0


In [2]:
# Tokenizer matching `model_id`, fetched with the same Hub token as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    token=hf_auth,  # `token` supersedes the deprecated `use_auth_token`
)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = tokenizer.eos_token_id
# tokenizer.padding_side = "right"

In [3]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
# Text-generation pipeline over the already-loaded model and tokenizer.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=400)
# Wrap the pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)
# `invoke` replaces the deprecated direct-call form `llm(prompt=...)`; it returns the generated string.
llm.invoke("Use PyBGPStream to collect update messages from 2022-01-01 13:00:00 to 2022-01-01 13:30:00 and tell me the total number of annoucements.")

  llm = HuggingFacePipeline(pipeline=pipe)
  llm(prompt="Use PyBGPStream to collect update messages from 2022-01-01 13:00:00 to 2022-01-01 13:30:00 and tell me the total number of annoucements.")
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Use PyBGPStream to collect update messages from 2022-01-01 13:00:00 to 2022-01-01 13:30:00 and tell me the total number of annoucements. \n\nHere is the code:\n```python\nimport pybgpstream\nfrom datetime import datetime\n\n# Create a stream object\nstream = pybgpstream.BGPStream(\n    from_time=datetime(2022, 1, 1, 13, 0, 0),\n    to_time=datetime(2022, 1, 1, 13, 30, 0),\n    file="bgpstream_data.txt"\n)\n\n# Iterate over the stream\nfor record in stream:\n    # Check if the record is an update\n    if record.is_update():\n        # Get the update message\n        update = record.get_update()\n        # Print the total number of announcements\n        print("Total announcements:", len(update.get_announcements()))\n\n```\n\nHowever, I get an error message saying "TypeError: \'NoneType\' object has no attribute \'get_announcements\'". This is because `record.get_update()` returns `None` when the record is not an update message. \n\nHere\'s the corrected code:\n```python\nimport pybgpst

In [15]:
from langchain.llms.huggingface_pipeline import HuggingFacePipeline

# Text-generation pipeline; generation settings are passed alongside the
# model and tokenizer.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # return_full_text=True,  # langchain expects the full text
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    temperature=0.01,  # 'randomness' of outputs; kept close to zero
)

# LangChain-facing wrapper around the pipeline.
llm = HuggingFacePipeline(pipeline=generate_text)


In [16]:
# `invoke` replaces the deprecated direct-call form `llm(prompt=...)`; it returns the generated string.
llm.invoke("Use PyBGPStream to analyze the BGP table statistics for AS131072, focusing on the total number of advertisements and print the result.")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'Use PyBGPStream to analyze the BGP table statistics for AS131072, focusing on the total number of advertisements and print the result. \n\n```python\nimport pybgpstream\n\n# Create a reader object that reads from the BGP stream file\nreader = pybgpstream.BGPSReader(\'path_to_your_bgp_stream_file\')\n\n# Iterate over all records in the stream\nfor record in reader.records():\n    # Check if the record is an advertisement\n    if isinstance(record, pybgpstream.RecordAdvertisement):\n        # Get the AS path of the advertisement\n        as_path = record.as_path()\n        \n        # Check if the AS path contains AS131072\n        if 131072 in as_path:\n            # Print the total number of advertisements\n            print("Total number of advertisements:", len(as_path))\n```\n\nThis code will iterate through each record in the BGP stream file, check if it\'s an advertisement, get its AS path, and then check if AS131072 is present in the AS path. If it is, it prints the total number

# CSV Reader

In [13]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent

In [None]:
# CSV agent over the BGP feature table, using `llm` as its language model.
agent = create_csv_agent(
    llm,
    '/home/hb/dataset_bgp/new_candidates/bgp_features_asn_1136_eu_leak.csv',
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [6]:
# Post-construction overrides on the existing `agent` object: parsing-error
# handling, early-stopping method, intermediate-step reporting and iteration cap.
# NOTE(review): this cell needs `agent` to exist, so the cell that builds the
# agent must be run first.
agent.handle_parsing_errors = True
agent.early_stopping_method = "force"
agent.return_intermediate_steps = False
agent.max_iterations = 2

In [8]:
# TODO: needs a stop-output function (presumably a stopping criterion so that
# generation halts once the agent has produced its answer — confirm).
# NOTE(review): the recorded output of this cell is
# `NameError: name 'agent' is not defined`, i.e. it was executed before the
# cell that creates `agent`.
agent.run("Identify changes in routing, such as withdrawals, origin changes, and route changes.")

NameError: name 'agent' is not defined

# Custom Python

In [6]:
from io import StringIO
import sys
from langchain.agents import initialize_agent
from langchain.agents.tools import Tool
from langchain.agents import load_tools
from langchain_experimental.tools import PythonREPLTool

class PythonREPL:
    """Simulates a standalone Python REPL."""

    def __init__(self):
        pass

    def run(self, command: str) -> str:
        """Execute ``command`` and return anything it printed to stdout.

        Args:
            command: Python source code to execute.

        Returns:
            The text written to stdout while the code ran, or ``str(exc)``
            when the code raised an ``Exception``.

        Note:
            Exceptions that are not ``Exception`` subclasses (for example
            ``SystemExit`` or ``KeyboardInterrupt``) propagate to the caller,
            but stdout is restored before they do.
        """
        # SECURITY: exec() runs the given source unrestricted in this module's
        # global namespace. Only feed it code you are prepared to execute.
        # sys.stderr.write("EXECUTING PYTHON CODE:\n---\n" + command + "\n---\n")
        original_stdout = sys.stdout
        sys.stdout = captured = StringIO()
        try:
            exec(command, globals())
            output = captured.getvalue()
        except Exception as e:
            output = str(e)
        finally:
            # Restore stdout on every exit path; without this a SystemExit or
            # KeyboardInterrupt raised by the executed code would leave stdout
            # redirected into the discarded buffer.
            sys.stdout = original_stdout
        # sys.stderr.write("PYTHON OUTPUT: \"" + output + "\"\n")
        return output
    
# llm = HuggingFacePipeline(pipeline=generate_text)

# Expose the custom REPL's `run` method as a LangChain tool.
python_repl = Tool(
    description="""A Python shell. Use this to execute python commands. Input should be a valid python command.
        If you expect output it should be printed out.""",
    func=PythonREPL().run,
    name="Python REPL",
)

# The agent gets exactly one tool: the REPL above.
tools = [python_repl]
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="zero-shot-react-description",
    handle_parsing_errors=True,
    verbose=True,
)

In [None]:
# Ask the agent built in the previous cell to answer the query using its tools.
# NOTE(review): `run` is the older chain entry point; confirm whether the
# installed LangChain version prefers `invoke`.
agent.run("count the total number of BGP update announcements from 2022-01-01 13:00:00 to 2022-01-01 13:30:00")

In [13]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser
from langchain.prompts import StringPromptTemplate
from langchain.chains import LLMChain
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, OutputParserException
import re

In [14]:
# Base prompt template for the custom agent.
# Placeholders: {tools} and {tool_names} (tool listing), {input} (the user
# query) and {agent_scratchpad} (text appended after the question).
# The "Final Answer:" and "Action:" / "Action Input:" markers are the ones the
# output parser in this notebook searches for.
template = """Provide pybgpstream code script for the query. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: generate python script matching the task described in the query
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input query

Question: {input}
{agent_scratchpad}"""

In [15]:
# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    """Prompt template that fills in the scratchpad and tool listing at format time."""

    # Raw template text with the placeholders consumed by `format`
    template: str
    # Tools whose names and descriptions are rendered into the prompt
    tools: List[Tool]

    def format(self, **kwargs) -> str:
        """Render the template.

        ``intermediate_steps`` (a sequence of ``(action, observation)`` pairs)
        is removed from ``kwargs`` and turned into ``agent_scratchpad``;
        ``tools`` and ``tool_names`` are derived from ``self.tools``. The
        remaining ``kwargs`` are passed to ``str.format`` unchanged.

        Raises:
            KeyError: if ``intermediate_steps`` is not supplied.
        """
        steps = kwargs.pop("intermediate_steps")
        # One scratchpad entry per step: the action's log, then the
        # observation, then an open "Thought: " cue for the next turn.
        kwargs["agent_scratchpad"] = "".join(
            step_action.log + f"\nObservation: {step_result}\nThought: "
            for step_action, step_result in steps
        )
        # "name: description" lines, one per tool
        kwargs["tools"] = "\n".join(
            f"{entry.name}: {entry.description}" for entry in self.tools
        )
        # Comma-separated tool names
        kwargs["tool_names"] = ", ".join(entry.name for entry in self.tools)
        return self.template.format(**kwargs)

In [20]:
# Stock REPL tool from langchain_experimental; instantiated here but not
# included in `tools` (see the commented-out line).
python_repl_org = PythonREPLTool()
# tools = [python_repl_org]
# The custom `python_repl` tool is the one actually handed to the agent.
tools = [python_repl]

In [21]:
# Instantiate the custom prompt with the base template and the selected tools.
prompt = CustomPromptTemplate(
    template=template,
    tools=tools,
    # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
    # This includes the `intermediate_steps` variable because that is needed
    input_variables=["input", "intermediate_steps"],
)

In [22]:
class CustomOutputParser(AgentOutputParser):
    """Converts raw LLM text into an ``AgentFinish`` or an ``AgentAction``."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """Interpret one LLM completion.

        Returns:
            ``AgentFinish`` when the text contains ``Final Answer:`` (the
            output is everything after the last occurrence, stripped);
            otherwise an ``AgentAction`` built from the ``Action:`` /
            ``Action Input:`` pair found in the text.

        Raises:
            OutputParserException: if neither form can be found.
        """
        finish_marker = "Final Answer:"
        if finish_marker in llm_output:
            answer = llm_output.split(finish_marker)[-1].strip()
            # Return values are a dictionary with a single `output` key
            return AgentFinish(return_values={"output": answer}, log=llm_output)

        # Group 1: tool name; group 2: everything after "Action Input:"
        pattern = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        found = re.search(pattern, llm_output, re.DOTALL)
        if found is None:
            raise OutputParserException(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = found.group(1), found.group(2)
        return AgentAction(
            tool=tool_name.strip(),
            # Trim surrounding spaces first, then surrounding double quotes
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )


output_parser = CustomOutputParser()

In [23]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    # return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    # Deterministic (greedy) decoding. The previous `temperature=0.0` is not a
    # valid sampling temperature: it is rejected when sampling is active and
    # ignored when it is not, so greedy decoding is requested explicitly.
    do_sample=False,
    max_new_tokens=1028,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)
# LangChain-facing wrapper around the pipeline.
llm = HuggingFacePipeline(pipeline=generate_text)

In [24]:
# LLM chain consisting of the LLM and a prompt: `prompt` renders the text,
# `llm` completes it.
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [25]:
from langchain.agents import Tool, AgentExecutor, LLMSingleActionAgent, AgentOutputParser

# Names of the tools the agent is allowed to call.
tool_names = [tool.name for tool in tools]

# Single-action agent: the chain produces text, the parser turns it into an
# action or a final answer, and "\nObservation:" is passed as the stop sequence.
agent = LLMSingleActionAgent(
    allowed_tools=tool_names,
    llm_chain=llm_chain,
    output_parser=output_parser,
    stop=["\nObservation:"],
)

In [26]:
# Executor that pairs the custom agent with its tools.
agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent,
    tools=tools,
    handle_parsing_errors=True,
    verbose=True,
)

In [None]:
# Run the custom agent end-to-end on the announcement-count query.
# NOTE(review): `run` is the older chain entry point; confirm whether the
# installed LangChain version prefers `invoke`.
agent_executor.run("count the total number of announcements in BGP updates from 2022-01-01 13:00:00 to 2022-01-01 13:30:00")

In [4]:
from collections import defaultdict
import pybgpstream

# Stream of update records from one collector for the half-hour window
stream = pybgpstream.BGPStream(
    from_time="2022-01-01 13:00:00",
    until_time="2022-01-01 13:30:00",
    collectors=["route-views.linx"],
    record_type="updates",
)

# Total number of elements whose type is "A" (announcement) across all records
announcement_counts = sum(
    1
    for rec in stream.records()
    for elem in rec
    if elem.type == "A"
)

# Report the total
print(f"Total Announcements: {announcement_counts}")

Total Announcements: 1002005


In [None]:
from io import StringIO
import sys
from langchain.agents import initialize_agent
from langchain.agents.tools import Tool
from langchain.agents import load_tools

class PythonREPL:
    """Simulates a standalone Python REPL."""

    def __init__(self):
        pass

    def run(self, command: str) -> str:
        """Execute ``command`` and return anything it printed to stdout.

        The source being executed and the resulting output are also echoed to
        stderr for debugging.

        Args:
            command: Python source code to execute.

        Returns:
            The text written to stdout while the code ran, or ``str(exc)``
            when the code raised an ``Exception``.

        Note:
            Exceptions that are not ``Exception`` subclasses (for example
            ``SystemExit`` or ``KeyboardInterrupt``) propagate to the caller,
            but stdout is restored before they do.
        """
        # SECURITY: exec() runs the given source unrestricted in this module's
        # global namespace. Only feed it code you are prepared to execute.
        sys.stderr.write("EXECUTING PYTHON CODE:\n---\n" + command + "\n---\n")
        original_stdout = sys.stdout
        sys.stdout = captured = StringIO()
        try:
            exec(command, globals())
            output = captured.getvalue()
        except Exception as e:
            output = str(e)
        finally:
            # Restore stdout on every exit path; without this a SystemExit or
            # KeyboardInterrupt raised by the executed code would leave stdout
            # redirected into the discarded buffer.
            sys.stdout = original_stdout
        sys.stderr.write("PYTHON OUTPUT: \"" + output + "\"\n")
        return output
      
# Re-wrap the current `generate_text` pipeline as the LangChain LLM.
llm = HuggingFacePipeline(pipeline=generate_text)

# Expose the REPL's `run` method as a LangChain tool.
python_repl = Tool(
    name="Python REPL",
    func=PythonREPL().run,
    description="""A Python shell. Use this to execute python commands. Input should be a valid python command.
        If you expect output it should be printed out.""",
)

# Single-tool agent, then a smoke-test question.
tools = [python_repl]
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent="zero-shot-react-description",
    verbose=True,
)
agent.run("What is the 10th fibonacci number?")