## Installing required libraries

In [None]:
!pip install sentence-transformers lancedb groq



## Importing libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import networkx as nx
import re
import json
import lancedb
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from groq import Groq
from pprint import pprint

## Getting all links at certain depth

In [None]:
def get_links(url, depth=1):  # Setting the depth to be 1
    G = nx.DiGraph()          # Creating graph
    G.add_node(url, depth=0)  # Adding base node
    visited = set([url])
    queue = [url]             # Creating a queue and visited url set.

    while queue:
        current_url = queue.pop(0)                        # Getting 1st url
        current_depth = G.nodes[current_url]['depth']
        print(current_url,current_depth)
        if current_depth < depth:                         # Checking depth
            try:
                response = requests.get(current_url)      # Getting Scrapped data from URL
                soup = BeautifulSoup(response.content, 'html.parser')
                for a_tag in soup.find_all('a', href=True):
                    link = a_tag['href']                   
                    if re.match(r'^https?://', link):     # checking for correct http link
                        if link not in visited:
                            G.add_node(link, depth=current_depth + 1)      # Adding sub nodes and edges
                            G.add_edge(current_url, link)
                            queue.append(link)                             # Updating visited nodes 
                            visited.add(link)
            except Exception as e:
                print(f"Failed to retrieve {current_url}: {e}")

    return G    


## Scrapping URL Content

In [None]:
def scrape_content(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join([para.get_text() for para in paragraphs])
        return content
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return ""

## Storing URLs with scrapped data

In [None]:
base_url = "https://docs.nvidia.com/cuda/"    # Base URL for scrapping data
web_graph = get_links(base_url)               # Creating graph with URLs upto given depth

content_dict = {}

for url in web_graph.nodes:
    content_dict[url] = scrape_content(url)   # Scrapping content and storing it in dictionary 

with open('scraped_data.json', 'w') as f:     # Saving scrapped data
    json.dump(content_dict, f)


https://docs.nvidia.com/cuda/ 0
https://developer.nvidia.com/nvidia-video-codec-sdk 1
https://nvlabs.github.io/cub/ 1
https://nvidia.github.io/libcudacxx/ 1
https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html 1
https://nvidia.github.io/cccl/thrust/ 1
https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html 1
https://docs.nvidia.com/deploy/cuda-compatibility/index.html 1
https://docs.nvidia.com/cupti/index.html 1
https://docs.nvidia.com/gpudirect-storage/index.html 1
https://docs.nvidia.com/compute-sanitizer/index.html 1
https://docs.nvidia.com/nsight-systems/index.html 1
https://docs.nvidia.com/nsight-compute/index.html 1
https://docs.nvidia.com/nsight-visual-studio-edition/index.html 1
https://developer.nvidia.com/cuda-toolkit-archive 1
https://www.nvidia.com/en-us/about-nvidia/privacy-policy/ 1
https://www.nvidia.com/en-us/about-nvidia/privacy-center/ 1
https://www.nvidia.com/en-us/preferences/start/ 1
https://www.nvidia.com/en-us/about-nvidia/terms-of-

## Chunking data and storing it's embeddings using clustering

In [None]:
def chunk_data(data):
    model = SentenceTransformer('all-MiniLM-L6-v2')     # setting model for embeddings
    sentences = data.split(".")

    if len(sentences) < 5:
        sentences = data.split(',')

    print(f"Length of chuncked sentences : {len(sentences)}")

    if len(sentences) >= 5:
        embeddings = model.encode(sentences)           # encoding all sentences    

        num_clusters = 5  # Adjust this based on your needs
        clustering_model = KMeans(n_clusters=num_clusters)     # cluster diffterent sentences into different groups
        clustering_model.fit(embeddings)
        cluster_assignment = clustering_model.labels_

        clustered_data = [[] for i in range(num_clusters)]     # assigning different sentences to cluster
        for sentence_id, cluster_id in enumerate(cluster_assignment):
            clustered_data[cluster_id-1].append(sentences[sentence_id])

        clustered_data_new = []                               # joining sentences of same cluster
        for i in clustered_data:
            clustered_data_new.append(".".join(i))

        embed_data = []                                      # storing embedding of new joined sentences
        for i in clustered_data_new:
            emb = model.encode(i)
            embed_data.append(emb)

        return clustered_data_new, embed_data               # returning sentences and embeddings
    else:
        print("Empty data returning")
        return [],[]

# Storing chunks and embeddings of all URLs

In [None]:
  with open('scraped_data.json', 'r') as f:
      content_dict = json.load(f)             # Loading scrapped data

  all_chunks = []
  all_embeddings = []

  print(len(content_dict.keys()))

  for url, content in list(content_dict.items()):   # storing chunks and embeddings of all stored documents
      print(f"URL during Chunking : {url}")
      chunks, embeddings = chunk_data(content)
      for c,e in zip(chunks,embeddings):
          chk = c
          chk.replace(" ","")
          chk.replace('\n',"")
          if len(chk) < 10 :
              continue
          all_chunks.append(c)
          all_embeddings.append(e)



23
URL during Chunking : https://docs.nvidia.com/cuda/


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Length of chuncked sentences : 127


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://developer.nvidia.com/nvidia-video-codec-sdk
Length of chuncked sentences : 114


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://nvlabs.github.io/cub/
Length of chuncked sentences : 1
Empty data returning
URL during Chunking : https://nvidia.github.io/libcudacxx/
Length of chuncked sentences : 1
Empty data returning
URL during Chunking : https://docs.nvidia.com/gpudirect-storage/api-reference-guide/index.html
Length of chuncked sentences : 322


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://nvidia.github.io/cccl/thrust/
Length of chuncked sentences : 29


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html
Length of chuncked sentences : 368


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/deploy/cuda-compatibility/index.html
Length of chuncked sentences : 324


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/cupti/index.html
Length of chuncked sentences : 5


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/gpudirect-storage/index.html
Length of chuncked sentences : 2
Empty data returning
URL during Chunking : https://docs.nvidia.com/compute-sanitizer/index.html
Length of chuncked sentences : 1
Empty data returning
URL during Chunking : https://docs.nvidia.com/nsight-systems/index.html
Length of chuncked sentences : 9


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/nsight-compute/index.html
Length of chuncked sentences : 19


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://docs.nvidia.com/nsight-visual-studio-edition/index.html
Length of chuncked sentences : 18


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://developer.nvidia.com/cuda-toolkit-archive
Length of chuncked sentences : 7


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/about-nvidia/privacy-policy/
Length of chuncked sentences : 93


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/about-nvidia/privacy-center/
Length of chuncked sentences : 5


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/preferences/start/
Length of chuncked sentences : 1
Empty data returning
URL during Chunking : https://www.nvidia.com/en-us/about-nvidia/terms-of-service/
Length of chuncked sentences : 145


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/about-nvidia/accessibility/
Length of chuncked sentences : 5
URL during Chunking : https://www.nvidia.com/en-us/about-nvidia/company-policies/


  super()._check_params_vs_input(X, default_n_init=10)


Length of chuncked sentences : 7


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/product-security/
Length of chuncked sentences : 8


  super()._check_params_vs_input(X, default_n_init=10)


URL during Chunking : https://www.nvidia.com/en-us/contact/
Length of chuncked sentences : 11


  super()._check_params_vs_input(X, default_n_init=10)


In [None]:
df = pd.DataFrame({"vector":all_embeddings,"docs":all_chunks})  # Storing chunks and embeddings

In [None]:
df.head()

Unnamed: 0,vector,docs
0,"[-0.025895298, -0.027912818, -0.090532914, -0....",If you do not agree with the terms and condit...
1,"[-0.046788294, -0.05065362, -0.06595036, -0.04...",Installation Guides Programming Guides CUDA AP...
2,"[-0.06084559, 0.010138369, -0.060803737, -0.00...",Using built-in capabilities for distributing ...
3,"[-0.083506435, 0.022876907, -0.07854397, -0.03...",\r\nThe toolkit includes GPU-accelerated libra...
4,"[-0.103047416, 0.024801489, -0.06617258, -0.00...",Applications that follow the best practices f...


## Creating Vector Database with lancedb and storing data

In [None]:
uri = "data/url-scrap"
db = lancedb.connect(uri)

table = db.create_table("url_rag",data=df)  # Creating a vector db table from dataframe

In [None]:
table.to_pandas()

Unnamed: 0,vector,docs
0,"[-0.025895298, -0.027912818, -0.090532914, -0....",If you do not agree with the terms and condit...
1,"[-0.046788294, -0.05065362, -0.06595036, -0.04...",Installation Guides Programming Guides CUDA AP...
2,"[-0.06084559, 0.010138369, -0.060803737, -0.00...",Using built-in capabilities for distributing ...
3,"[-0.083506435, 0.022876907, -0.07854397, -0.03...",\r\nThe toolkit includes GPU-accelerated libra...
4,"[-0.103047416, 0.024801489, -0.06617258, -0.00...",Applications that follow the best practices f...
...,...,...
76,"[-0.02574309, 0.032440744, -0.021723332, 0.015...",Below is a list of published NVIDIA Security ...
77,"[-0.05397123, -0.069211304, 0.046810895, -0.00...",Get help with your existing NVIDIA products an...
78,"[0.12230451, 0.03236777, 0.0051685073, 5.24056...",Alabama Madison California Palo Alto Santa C...
79,"[0.05249632, -0.07003562, 0.010563484, 0.02913...","Find experienced, professional partners. More..."


## Searching top results related to query

In [None]:
def hybrid_retrieval(query, table):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query)             # Encoding query

    results = table.search(query_embedding).limit(5).to_pandas()    # Getting top 5 results

    return results

In [32]:
query = "What is Cuda ?"
res = hybrid_retrieval(query, table)   
res = list(res.docs)     # Retrieved top 5 documents

In [33]:
res

['CUDA Compatibility CUDA Compatibility describes the use of new CUDA toolkit components on systems with older base installations. The NVIDIA® CUDA® Toolkit enables developers to build NVIDIA GPU accelerated compute applications for desktop computers, enterprise, and data centers to hyperscalers. It consists of the CUDA compiler toolchain including the CUDA runtime (cudart) and various CUDA libraries and tools. To build an application, a developer has to install only the CUDA Toolkit and necessary libraries required for linking. In order to run a CUDA application, the system should have a CUDA enabled GPU and an NVIDIA display driver that is compatible with the CUDA Toolkit that was used to build the application itself. Figure 1 Components of CUDA\uf0c1 Every CUDA toolkit also ships with an NVIDIA display driver package for convenience. This driver supports all the features introduced in that version of the CUDA Toolkit. The driver package includes both the user mode CUDA driver (libcu

## Using llama-3.1-70b for results

In [34]:
# Calling llama 3.1 from groq to fetch the results using given query and documents.
client = Groq(
    api_key="gsk_Your_Grok_Key",
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"You have been given a query {query}. Based on the given top 5 document chunks {res} return the answer."
        }
    ],
    model="llama-3.1-70b-versatile",
)


In [37]:
out = chat_completion.choices[0].message.content
pprint(out,width = 120)     # Output of the given query

('CUDA stands for Compute Unified Device Architecture. It is a development environment created by NVIDIA for creating '
 'high-performance applications that can run on NVIDIA GPUs. The CUDA Toolkit provides a set of tools, libraries, and '
 'programming models for developers to build and optimize applications that can take advantage of the massively '
 'parallel processing capabilities of NVIDIA GPUs.')
