In [12]:
# Read two dictionaries of AS paths to a given IP prefix and ask an LLM for a BGP event report
import pybgpstream
import networkx as nx
from itertools import groupby
from collections import defaultdict
import matplotlib.pyplot as plt
import openai
import os
import re
import spacy
import collections
import numpy as np
import copy
import time
import evaluate
from tqdm.notebook import trange, tqdm
import pandas as pd
import json
import random
from datetime import datetime, timedelta
# Do not commit a real key here: export OPENAI_API_KEY before starting the kernel instead.
# The placeholder is only used when the variable is not already set, so a key that is
# present in the environment is no longer overwritten by the placeholder string.
os.environ.setdefault("OPENAI_API_KEY", "YOUR OPENAI API KEY")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
# Candidate IP prefixes to study (per the original note, this list was obtained from an LLM)
Target_IPs = [
    "8.8.8.0/24", "1.1.1.0/24", "9.9.9.0/24", "17.0.0.0/8",
    "23.0.0.0/8", "31.13.24.0/21", "52.0.0.0/8", "64.233.160.0/19",
    "104.16.0.0/12", "172.217.0.0/16", "185.60.216.0/22", "199.16.156.0/22",
    "204.79.195.0/24", "205.251.192.0/19", "208.80.152.0/22", "216.58.192.0/19",
    "216.239.32.0/19", "69.63.176.0/20", "74.125.0.0/16", "131.253.21.0/24",
]

In [None]:
# Keep only the candidate prefixes that actually show up in a RIB dump of collector rrc00
Final_target_IPs = []
for IP_prefix in tqdm(Target_IPs):
    # Original note: RRC collectors dump their RIB every 8 hours; this time window selects one dump
    stream = pybgpstream.BGPStream(
        from_time="2022-02-03 01:04:00", until_time="2022-02-03 09:00:00",
        collectors=["rrc00"],
        record_type="ribs",
        filter = f"prefix exact {IP_prefix}"  # only routes whose prefix equals the target prefix exactly
    )
    n_rib_entries = 0
    for rec in tqdm(stream.records()):
        for ele in rec:
            # Count RIB ("R") elements that carry both an AS path and a prefix.
            # The previous test `'as-path' and "prefix" in ele.fields` only checked "prefix"
            # (a non-empty string literal is always truthy), and the AS path was read before
            # any check, which would raise KeyError on an element without that field.
            if str(ele.type) == "R" and "as-path" in ele.fields and "prefix" in ele.fields:
                n_rib_entries += 1
    if n_rib_entries > 0:
        Final_target_IPs.append(IP_prefix)

In [4]:
Final_target_IPs

['8.8.8.0/24',
 '1.1.1.0/24',
 '9.9.9.0/24',
 '17.0.0.0/8',
 '31.13.24.0/21',
 '64.233.160.0/19',
 '104.16.0.0/12',
 '172.217.0.0/16',
 '185.60.216.0/22',
 '199.16.156.0/22',
 '204.79.195.0/24',
 '216.58.192.0/19',
 '216.239.32.0/19',
 '69.63.176.0/20',
 '74.125.0.0/16',
 '131.253.21.0/24']

In [None]:
# For every validated prefix, record the AS path announced by each peer of each RRC collector
# Each collector is listed once: the previous list repeated rrc10/rrc11/rrc12, which queried
# those collectors twice and appended duplicate lines to Data/record_2.txt.
rcc_collector_lists = ["rrc00", "rrc01", "rrc03", "rrc04", "rrc05", "rrc06", "rrc07", "rrc10", "rrc11", "rrc12",
                       "rrc13", "rrc14", "rrc15", "rrc16", "rrc17", "rrc18", "rrc19", "rrc20", "rrc21", "rrc22",
                       "rrc23", "rrc24", "rrc25", "rrc26"]
# history_rib[collector][prefix][peer_asn] -> list of AS hops
history_rib = defaultdict(dict)
for IP_prefix in tqdm(Final_target_IPs):
    for collector in tqdm(rcc_collector_lists):
        # Original note: RRC collectors dump their RIB every 8 hours; this time window selects one dump
        stream = pybgpstream.BGPStream(
            from_time="2022-02-03 01:04:00", until_time="2022-02-03 09:00:00",
            collectors=[collector],
            record_type="ribs",
            filter = f"prefix exact {IP_prefix}"  # only routes whose prefix equals the target prefix exactly
        )
        as_path = defaultdict(dict)
        i = 0  # number of RIB entries found for this (prefix, collector) pair
        for rec in tqdm(stream.records()):
            for ele in rec:
                if str(ele.type) != "R":
                    continue
                # Both fields must be present. The previous test `'as-path' and "prefix" in ele.fields`
                # only checked "prefix", and the AS path was read before any check.
                if "as-path" not in ele.fields or "prefix" not in ele.fields:
                    continue
                # Get the peer ASn
                peer = str(ele.peer_asn)
                # groupby collapses consecutive repeats of the same AS in the path
                hops = [k for k, g in groupby(ele.fields['as-path'].split(" "))]
                IP = ele.fields["prefix"]
                as_path[IP][peer] = hops
                if i == 0:
                    # show one sample path per (prefix, collector) pair
                    print(hops)
                i += 1
        history_rib[collector].update(as_path)
        # append a "<prefix>, <collector>, <count>." line for bookkeeping
        with open("Data/record_2.txt", "a") as f:
            f.write(f"{IP_prefix}, {collector}, {i}.\n")

In [6]:
# Persist the collected per-collector routing tables as JSON for the later cells
history_rib_path = "Data/" + "synthetic_history_rib.json"
with open(history_rib_path, "w") as out_file:
    json.dump(history_rib, out_file)

In [19]:
# Shared OpenAI client instance; the chat() helper defined in this cell sends its requests through it
llm = openai.OpenAI()
def chat(messages, model="gpt-4o", n=1):
    '''
    Send `messages` to the chat-completions endpoint of the module-level `llm` client.

    Args:
        messages: list of {"role": ..., "content": ...} dictionaries forming the conversation
        model: name of the model to query
        n: number of completions to request
    Return:
        list with the text content of the first `n` returned choices
    '''
    completion = llm.chat.completions.create(model=model, messages=messages, n=n)

    replies = []
    for choice_idx in range(n):
        replies.append(completion.choices[choice_idx].message.content)
    return replies

# functions to generate synthetic BGP anomaly event data
def _extract_tagged(text, tag):
    """Return the whitespace-stripped text that follows the first `tag` in `text` (up to the next `tag`)."""
    parts = text.split(tag)
    if len(parts) < 2:
        raise ValueError(f"marker {tag} is missing from the LLM output")
    return parts[1].strip()


def generate_bgp_event(history_rib, target_ip, event_type = "Hijack"):
    """
    Generate data of a synthetic BGP anomaly event from the AS paths towards one IP prefix.

    The LLM (through chat()) describes the historical paths, invents an event, and then extracts
    the key facts between marker symbols; rule-based code rewrites the paths of randomly selected peers.

    Args:
        history_rib: dictionary {collector: {ip_prefix: {peer: [AS hops]}}} of history routing
            tables to target_ip; it is not modified
        target_ip: target ip prefix that is the victim of the synthetic BGP anomaly event
        event_type: "Hijack" for BGP Hijack; any other value (e.g. "Route Leak") is handled
            as a BGP Route Leak, for both the prompts and the path rewriting
    Return:
        rib_after_incident: deep copy of history_rib with the AS paths after the synthetic event
        event: text description of the event
        new_event_record: 8-item list [event_type, target AS, hijacker/leaker AS, target_ip,
            time stamp, '', '', '']; the last three items are left empty for the caller
    Raises:
        ValueError: if the LLM output lacks a marker or a usable AS path, if no peer can be
            selected (no peers for the prefix, or a detection percentage <= 0), or if no
            collector holds a path to target_ip
    """
    is_hijack = event_type == "Hijack"

    # 1. describe the as paths to target ip in history
    user_prompt = f"You are an expert in Border Gateway Protocol. Given a set of AS paths collected from multiple collectors,\
                to a specific IP prefix, describe patterns in them. List a few typical AS paths.\n \
                Here are the AS paths:\n \
                {history_rib}"
    message = [{"role":"user", "content":user_prompt}]
    output = chat(message)[0]

    # 2. prompts that differ between the two event types (prompt text is kept unchanged)
    if is_hijack:
        system_prompt_1 = "You are an expert in Border Gateway Protocol. Now you need to generate an BGP hijack event for a specific IP prefix.\
                    I will provide you the IP prefix and descriptions of AS paths to this IP prefix. You need randomly generate a hijacker AS,\
                    a few examples of the hijacked AS paths. Also, randomly decide if this is a\
                    sub-prefix hijack or not, if it is, name the sub-prefix that's been hijacks. Finally, decide how many percent of peers \
                    detect this hijack event."
        user_prompt_2 = "Extract hijacker AS number from the event description and add </h> symbol before and after the number.\n"
        user_prompt_2_1 = "Extract hijacked sub-prefix from the event description and add </s> symbol before and after the prefix. If these is no \
                            sub-prefix, return an 'NA' with </s> symbol before and after it."
        user_prompt_2_3 = "Extract hijacked AS path examples from the event description in one line. Output each path as a list, \
                            separate each AS number by ',', separate each path by a </p> symbol. No bracket.\
                            Add a </P> symbol before the first path and after the last path."
    else:
        system_prompt_1 = "You are an expert in Border Gateway Protocol. Now you need to generate an BGP route leak event for a specific IP prefix.\
                    I will provide you the IP prefix and descriptions of AS paths to this IP prefix. You need randomly generate a leaker AS,\
                    one example of the AS path after the route leak event. Also, randomly decide if this is a\
                    sub-prefix leak or not, if it is, name the sub-prefix that's been leak. Finally, decide how many percent of peers \
                    detect this route leak event."
        user_prompt_2 = "Extract leaker AS number from the event description and add </h> symbol before and after the number.\n"
        user_prompt_2_1 = "Extract leaked sub-prefix from the event description and add </s> symbol before and after the prefix. If these is no \
                            sub-prefix, return an 'NA' with </s> symbol before and after it."
        user_prompt_2_3 = "Extract leaked AS path examples from the event description in one line. Output each path as a list, \
                            separate each AS number by ',', separate each path by a </p> symbol. No bracket.\
                            Add a </P> symbol before the first path and after the last path."

    # 3. text description of the event
    user_prompt_1 = f"The target IP is {target_ip}\n \
                            The description of AS paths is: {output}"
    message_1 = [{"role":"system", "content":system_prompt_1}] + \
                [{"role":"user", "content":user_prompt_1}]
    event = chat(message_1)[0]

    # 4. extract key information from the event description
    user_prompt_2_4 = "Extract dectecting percentage from the event description. Output just the number without percentage sign. Add </c> symbol\
                            before and after the number."
    user_prompt_2_2 = f"The event description is: {event}"
    message_2 = [{"role":"user", "content":user_prompt_2}] + \
                [{"role":"user", "content":user_prompt_2_1}] + \
                [{"role":"user", "content":user_prompt_2_3}] + \
                [{"role":"user", "content":user_prompt_2_4}] + \
                [{"role":"user", "content":user_prompt_2_2}]
    event_info = chat(message_2)[0]

    # 5. parse the marked-up answer; every piece is stripped, because stray whitespace such as
    #    " NA " or " 1299" would otherwise be taken for a real sub-prefix / never match the AS
    AS_2 = _extract_tagged(event_info, "</h>")
    sub_ip = _extract_tagged(event_info, "</s>")
    post_aspath = []
    for str_path in _extract_tagged(event_info, "</P>").split("</p>"):
        hops = [hop.strip() for hop in str_path.split(',') if hop.strip()]
        # only paths that contain the hijacker/leaker AS can be spliced below
        if AS_2 in hops:
            post_aspath.append(hops)
    if not post_aspath:
        raise ValueError(f"no extracted AS path contains AS {AS_2}")
    # accept answers such as "30", "30.5" or "30%"
    detect_percentage = float(_extract_tagged(event_info, "</c>").rstrip('%'))

    # 6. create rib_after_incident based on the provided information
    rib_after_incident = copy.deepcopy(history_rib)
    operate_ip = target_ip
    if sub_ip != 'NA':
        # sub-prefix event: the sub-prefix starts from a copy of the paths to the target prefix
        operate_ip = sub_ip
        for collector_rib in rib_after_incident.values():
            if target_ip in collector_rib:
                collector_rib[operate_ip] = copy.deepcopy(collector_rib[target_ip])

    # collect list of peers per collector
    peer_dict = defaultdict(list)
    for collector, collector_rib in rib_after_incident.items():
        if operate_ip in collector_rib:
            peer_dict[collector] = list(collector_rib[operate_ip].keys())

    # the selection loop below repeats until at least one peer is picked, so refuse inputs
    # for which that can never happen instead of looping forever
    if detect_percentage <= 0 or not any(peer_dict.values()):
        raise ValueError("no peer can detect the event (no peers for the prefix or percentage <= 0)")

    # select peers that detected this event
    n_peer = 0
    select_peer_dict = defaultdict(list)
    while n_peer == 0:
        for collector in peer_dict.keys():
            select_peer = []
            for peer in peer_dict[collector]:
                r = random.randint(1, 100)
                if r <= detect_percentage:
                    n_peer += 1
                    select_peer.append(peer)
            select_peer_dict[collector] = select_peer

    # replace as paths from select peers to as paths after incident from gpt
    for collector, select_peers in select_peer_dict.items():
        for peer in select_peers:
            old_path = rib_after_incident[collector][operate_ip][peer]
            new_path = random.choice(post_aspath)
            idx = new_path.index(AS_2)
            if is_hijack:
                # keep the path up to and including the hijacker, then drop its first hop
                # unless the hijacker itself is that first hop
                new_path = new_path[:idx+1]
                if new_path[0] != AS_2:
                    new_path = new_path[1:]
            else:
                # keep the path from the leaker onwards
                new_path = new_path[idx:]
            # overwrite the tail of the old path, always keeping at least its first hop
            if len(old_path)-1 < len(new_path):
                new_new_path = old_path[:1] + new_path
            else:
                new_new_path = old_path[:-len(new_path)] + new_path
            rib_after_incident[collector][operate_ip][peer] = new_new_path

    # 7. event record for the csv file
    # the target AS is the last hop of the first historical path found; collectors without an
    # entry for target_ip are skipped (.get also avoids inserting keys into a defaultdict)
    target_as = None
    for collector_rib in history_rib.values():
        peer_paths = collector_rib.get(target_ip)
        if peer_paths:
            target_as = next(iter(peer_paths.values()))[-1]
            break
    if target_as is None:
        raise ValueError(f"history_rib holds no AS path to {target_ip}")

    user_prompt_time = f"Randomly generate a time stamp of the form 'yyyy-mm-dd hour:minute:second' and add a </t> symbol\
                        before and after it"
    message_time = [{"role":"user", "content":user_prompt_time}]
    # local name chosen so that the imported `time` module is not shadowed
    timestamp_reply = chat(message_time)[0]
    new_event_record = [event_type, target_as, AS_2, target_ip,
                        _extract_tagged(timestamp_reply, "</t>"), '', '', '']
    return rib_after_incident, event, new_event_record


In [28]:
# Upper bound on LLM attempts per event. The previous `while ...: try/except: pass` loop retried
# forever, hid every error and also swallowed KeyboardInterrupt, so the cell could not be stopped.
MAX_EVENT_ATTEMPTS = 10


def _generate_event_with_retry(history_rib, target_ip, event_type, max_attempts=MAX_EVENT_ATTEMPTS):
    """Call generate_bgp_event up to max_attempts times; return its result, or None if every attempt failed."""
    for attempt in range(1, max_attempts + 1):
        try:
            return generate_bgp_event(history_rib, target_ip, event_type = event_type)
        except Exception as err:
            # LLM answers are free text, so parsing failures are expected; report and retry
            print(f"[{target_ip}] {event_type} attempt {attempt}/{max_attempts} failed: {err!r}")
    return None


def _save_event(event_idx, history_rib, rib_after_incident, event_text, new_event_record, event_record):
    """Write the rib/event files of one event and store its record as row event_idx of event_record."""
    # the same history rib is written under both the "history" and "before_event" names
    with open(f"e_1/{event_idx}_history_rib.json", "w") as f:
        json.dump(history_rib, f)
    with open(f"e_1/{event_idx}_before_event_rib.json", "w") as f:
        json.dump(history_rib, f)
    with open(f"e_1/{event_idx}_after_event_rib.json", "w") as f:
        json.dump(rib_after_incident, f)
    with open(f"Data/event_{event_idx}.txt", "w") as f:
        json.dump(event_text, f)
    # the last field of the record holds the path of the event description file
    new_event_record[-1] = f"Data/event_{event_idx}.txt"
    event_record.loc[event_idx] = new_event_record


# read all history rib
with open("Data/synthetic_history_rib.json", "r") as f:
    all_history_rib = json.load(f)

# get all ip prefixes
ip_list = list(all_history_rib["rrc00"].keys())

# get event records
event_record = pd.read_csv('Data/BGP_explain_data.csv', na_filter=False)
# first row index used for the generated events
# NOTE(review): presumably rows 0-23 of the csv already hold other events - confirm
idx = 24

# generate bgp anomaly event data
for target_ip in tqdm(ip_list):
    # first extract history rib: every collector gets an entry, filled only when it saw target_ip
    history_rib = defaultdict(dict)
    for collector, collector_rib in all_history_rib.items():
        history_rib[collector] = defaultdict(dict)
        if target_ip in collector_rib:
            history_rib[collector][target_ip] = collector_rib[target_ip]

    # generate hijack event
    hijack_result = _generate_event_with_retry(history_rib, target_ip, "Hijack")
    if hijack_result is None:
        print(f"[{target_ip}] skipped: no hijack event could be generated")
        continue
    rib_after_incident_1, event_1, new_event_record_1 = hijack_result

    # generate route leak event
    leak_result = _generate_event_with_retry(history_rib, target_ip, "Route Leak")
    if leak_result is None:
        print(f"[{target_ip}] skipped: no route leak event could be generated")
        continue
    rib_after_incident_2, event_2, new_event_record_2 = leak_result

    # save hijack event, then route leak event, as two consecutive rows
    _save_event(idx, history_rib, rib_after_incident_1, event_1, new_event_record_1, event_record)
    _save_event(idx+1, history_rib, rib_after_incident_2, event_2, new_event_record_2, event_record)
    idx += 2
event_record.to_csv('Data/BGP_explain_data.csv', index=False)

  0%|          | 0/16 [00:00<?, ?it/s]

In [26]:
# NOTE(review): `event` is not assigned at notebook scope in the visible cells (the generation
# loop binds `event_1` / `event_2`), so this line relies on leftover kernel state - confirm
# which event description was meant to be shown.
print(event)
# Even though the event description might contain some minor errors, the rule-based code in
# generate_bgp_event is meant to ensure that the generated BGP data is legitimate for a
# BGP hijack or a BGP route leak.

To simulate a BGP route leak event for the IP prefix `8.8.8.0/24`, we'll randomly select a leaker AS, create an example of the AS path after the route leak, decide if it's a sub-prefix leak, and estimate the percentage of peers detecting this event.

### Random Leaker AS:
Let's randomly choose AS `199524` as the leaker AS. This AS is a regional provider that frequently appears in the paths and might unintentionally advertise incorrect routing information.

### Example of AS Path after Route Leak:
Original Path: `199524 -> 15169`
Leaked Path: `199524 -> 1299 -> 3549 -> 3356 -> 15169`

In this scenario, AS `199524` incorrectly leaks the route to a global transit provider, AS `1299`, which then spreads it through other major transit ASes, causing a significant propagation of the incorrect path through Tier 1 carriers.

### Sub-Prefix Leak Decision:
Let's decide if this is a sub-prefix leak. We'll randomly choose that this is not a sub-prefix leak. The entire prefix `8.8.8.0/24` is being l