In [1]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

In [2]:
import json

json_file_path = '/home/hb/LLM-research/finetuning_dataset/BGP/PyBGPStream/PyBGPStream_main10K.json'

with open(json_file_path, 'r') as file:
    data = json.load(file)

# Concatenate instruction and output for each item
docs = [item['instruction'] + " " + item['output'] for item in data]

embeddings = embed_model.embed_documents(docs)

print(f"We have {len(embeddings)} doc embeddings, each with "
      f"a dimensionality of {len(embeddings[0])}.")


We have 10110 doc embeddings, each with a dimensionality of 384.


In [6]:
from pinecone import Pinecone
from pinecone import PodSpec
from pinecone import ServerlessSpec, PodSpec
import os

api_key = os.environ.get('PINECONE_API')
environment = os.environ.get('PINECONE_ENVIRONMENT')

pc = Pinecone(api_key='PINECONE_API')
spec = PodSpec(environment='PINECONE_ENVIRONMENT')


In [7]:
import time

index_name = 'llama-2-rag'

if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# we create a new index
pc.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
        spec=spec
    )

# wait for index to be initialized
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [8]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [9]:
from datasets import load_dataset

data = load_dataset("json", data_files="/home/hb/LLM-research/finetuning_dataset/BGP/PyBGPStream/PyBGPStream_main10K.json", split="train")
data

Found cached dataset json (/home/hb/.cache/huggingface/datasets/json/default-ecefba89b33c753a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


Dataset({
    features: ['instruction', 'input', 'output', 'most_similar_instructions', 'avg_similarity_score'],
    num_rows: 10110
})

In [10]:
from datasets import DatasetDict

data_df = data.to_pandas()

batch_size = 32
total_rows = len(data_df)

for i in range(0, total_rows, batch_size):
    i_end = min(len(data_df), i+batch_size)
    batch = data_df.iloc[i:i_end]
    # ids = [f"{x['instruction']}-{x['output']}" for i, x in batch.iterrows()]
    ids = [f"id_{j+1}" for j in range(i, i_end)]

    instructions = [row['instruction'] for index, row in batch.iterrows()]
    embeds = embed_model.embed_documents(instructions)
    # get metadata to store in Pinecone
    metadata = [
        {'instruction': row['instruction'],
         'input': row['input'],
         'output': row['output']} for index, row in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

In [11]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'
# model_id = 'codellama/CodeLlama-7b-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

bnb_config = transformers.BitsAndBytesConfig(
    load_in_8bit=True,
)


# Need auth token for these
hf_auth = os.environ.get('hf_token')

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# device_map = {"": 0}

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=hf_auth
)

model.eval()
print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [12]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)
 
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

In [13]:
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    temperature=0.5,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # mex number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [14]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

In [15]:
from langchain.vectorstores import Pinecone

text_field = 'text'  # field in metadata that contains text content

vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

  warn_deprecated(


In [16]:
query = 'Generate Python code using PyBGPStream'

vectorstore.similarity_search(
    query=query,  # the search query
    k=3,  # returns top 3 most relevant chunks of text
    # namespace='my_namespace'
)

Found document with no `code` key. Skipping.
Found document with no `code` key. Skipping.
Found document with no `code` key. Skipping.


[]

In [17]:
from langchain.chains import RetrievalQA

rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=vectorstore.as_retriever()
)

In [18]:
llm("Develop a Python script utilizing PyBGPStream to analyze BGP advertisements from AS34549 within the timeframe of 13:00 to 14:00 on January 1, 2024 from collectors rrc00. The script's core objective is to compute metrics that illustrate the degree of similarity or divergence among the AS paths of these advertisements, in comparison to a broader, aggregate perspective of AS34549's routing behavior. The script should unveil any discernible trends or anomalies in AS path configurations, thereby showing the underlying routing strategies or policy adaptations employed by AS34549. Do not use PyBGPStream 'filter' parameter.")

  warn_deprecated(


"Develop a Python script utilizing PyBGPStream to analyze BGP advertisements from AS34549 within the timeframe of 13:00 to 14:00 on January 1, 2024 from collectors rrc00. The script's core objective is to compute metrics that illustrate the degree of similarity or divergence among the AS paths of these advertisements, in comparison to a broader, aggregate perspective of AS34549's routing behavior. The script should unveil any discernible trends or anomalies in AS path configurations, thereby showing the underlying routing strategies or policy adaptations employed by AS34549. Do not use PyBGPStream 'filter' parameter.\n\nInput:\n\n* A text file containing BGP advertisements from AS34549 during the specified time frame (e.g., as34549-jan1-2024.txt)\n* An output file where the results will be written (e.g., as34549-jan1-2024-analysis.txt)\n\nOutput:\n\n* A detailed analysis of the AS paths present in the input advertisements, including their lengths, types, and other relevant attributes.\

'\n\nI have attached the file "5G_Infrastructure/demand_driven_postcode_data_results.csv" which contains the data for the calculation of the 5G network performance KPIs.\n\nPlease provide me with the Python code to perform the calculations as mentioned above.\n\nThank you!\n\nHere is the file "5G_Infrastructure/demand_driven_postcode_data_results.csv":\n\n| Postcode | Latitude | Longitude | Demand (GB) | NumPoints |\n| --- | --- | --- | --- | --- |\n| SW1A 2AA | 51.498673 | -0.128874 | 200 | 10 |\n| SW1W 9TQ | 51.495795 | -0.135344 | 150 | 15 |\n| SW3 4RP | 51.481845 | -0.200892 | 300 | 20 |\n| SW3 5EA | 51.483893 | -0.206419 | 250 | 15 |\n| WC2N 5DU | 51.507346 | -0.126876 | 400 | 25 |\n| WC2H 7BG | 51.509888 | -0.134286 | 350 | 20 |\n\nNote that the demand values are in GB (gigabytes) and the number of points (NumPoints) is also provided for each postcode.\n\nPlease help me with the Python code to perform the calculations as mentioned above. Thank you!'

"Develop a Python script utilizing PyBGPStream to analyze BGP advertisements from AS34549 within the timeframe of 13:00 to 14:00 on January 1, 2024 from collectors rrc00. The script's core objective is to compute metrics that illustrate the degree of similarity or divergence among the AS paths of these advertisements, in comparison to a broader, aggregate perspective of AS34549's routing behavior. The script should unveil any discernible trends or anomalies in AS path configurations, thereby showing the underlying routing strategies or policy adaptations employed by AS34549. Do not use PyBGPStream 'filter' parameter.\n\nThe script should be able to accomplish the following tasks:\n\n1. Load and parse BGP advertisements from AS34549 during the specified time frame using PyBGPStream.\n2. Extract the AS paths from each advertisement and convert them into a standardized format for analysis.\n3. Compute similarity/dissimilarity metrics between the AS paths of individual advertisements and the overall distribution of AS paths observed across all advertisements.\n4. Visualize the results through appropriate visualization tools (e.g., matplotlib, seaborn) to facilitate interpretation and insights.\n5. Save the output to a file or database for future reference or further analysis.\n\nTo achieve this task, you can follow these steps:\n\nStep 1: Install required packages\n\nYou will need to install the following packages:\n\n* `pybgpstream`: This package provides a Python interface to BGP data structures.\n* `pandas`: This package is used for data manipulation and analysis.\n* `matplotlib` or `seaborn`: These packages are used for visualization.\n\nYou can install these packages using pip:\n```\npip install pybgpstream pandas matplotlib seaborn\n```\nStep 2: Load BGP data\n\nUse the `pybgpstream` module to load BGP data from the specified time frame. You can use the `load_rpc()` function to load data from a remote collector, like `collectors-rrc00`.\n```python\nimport pybgpstream\n\n# Load BGP data from collectors-rrc00\ndata = pybgpstream.load_rpc('collectors-rrc00', start='2024-01-01 13:00:00', end='2024-01-01 14:00:00')\n```\nStep 3: Extract AS paths\n\nUse the `parse_advertisement()` function provided by `pybgpstream` to extract the AS paths from each BGP advertisement.\n```python\n# Extract AS paths from each advertisement\nas_paths = [pybgpstream.parse_advertisement(advertisement).get('as_path') for advertisement in data"