In [None]:
!pip install -qU "semantic-router[fastembed]" langchain langchain_community==0.2.6 fastembed==0.3.2 langchain_core openai pymilvus bs4 "grpcio<=1.63.0,>=1.49.1" 

In [None]:
!pip install -qU semantic-chunkers==0.0.3

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
from langchain_core.documents import Document
from urllib.parse import urlparse, urljoin
from openai import OpenAI
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, connections, utility
from typing import Any, List, Tuple, Dict, Literal, Optional
from pydantic import Field
from semantic_router.schema import DocumentSplit
from langchain_core.documents import Document
from semantic_router.splitters import RollingWindowSplitter
from semantic_router.utils.logger import logger
from semantic_router.encoders import FastEmbedEncoder
from semantic_router.encoders import OpenAIEncoder
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Define the Credentials
ZILLIZ_CLOUD_URI = ""         # os.environ['ZILLIZ_URI'] #"https://in03-c2cc7c5da8decab.api.gcp-us-west1.zillizcloud.com"
ZILLIZ_CLOUD_API_KEY = ""
COLLECTION_NAME="trial"

In [3]:
DENSE_EMBEDDING_MODEL = ""
SPARSE_EMBEDDING_MODEL = ""
SEMANTIC_ENCODER = ""
SEMANTIC_SCORE_THERESHOLD = 0.3
BASE_URL = 'https://docs.nvidia.com/cuda/'

In [4]:
# Check the Milvus connection and collection status First
def connection_status(collection_name):
    connections.connect(
        uri=ZILLIZ_CLOUD_URI,
        token=ZILLIZ_CLOUD_API_KEY
    )

    utility.get_server_version()
    if utility.has_collection(collection_name):
        utility.drop_collection(COLLECTION_NAME)
    else:
        print(f"New Collection -> {collection_name}")

In [5]:
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field, model_validator, validator
from langchain_core.embeddings import Embeddings

class SparseFastEmbedEmbeddings(BaseModel, Embeddings):
    """Qdrant FastEmbedding models.
    FastEmbed is a lightweight, fast, Python library built for embedding generation.

    To use this class, you must install the `fastembed` Python package.

    `pip install fastembed`
    Example:
        from langchain_community.embeddings import FastEmbedEmbeddings
        fastembed = FastEmbedEmbeddings()
    """

    model_name: str = "BAAI/bge-small-en-v1.5"
    """Name of the FastEmbedding model to use
    Defaults to "BAAI/bge-small-en-v1.5"
    Find the list of supported models at
    https://qdrant.github.io/fastembed/examples/Supported_Models/
    """

    cache_dir: Optional[str] = Field(default=None)
    """The path to the cache directory.
    Defaults to `local_cache` in the parent directory
    """

    threads: Optional[int] = Field(default=None)
    """The number of threads single onnxruntime session can use.
    Defaults to None
    """

    doc_embed_type: str = "default"
    """Type of embedding to use for documents
    The available options are: "default" and "passage"
    """

    model: Any = Field(default=None, exclude=True)  # Renamed to 'model' and marked as private

    class Config:
        """Configuration for this pydantic object."""
        extra = 'forbid'

    @model_validator(mode='before')
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that FastEmbed has been installed."""
        return values

    def __init__(self, **data):
        super().__init__(**data)
        self._initialize_model()

    def _initialize_model(self):
        """Initialize the FastEmbed model."""
        try:
            # >= v0.2.0
            from fastembed import SparseTextEmbedding

            self.model = SparseTextEmbedding(
                model_name=self.model_name,
                cache_dir=self.cache_dir,
                threads=self.threads,
            )
        except ImportError as ie:
            try:
                # < v0.2.0
                from fastembed.embedding import FlagEmbedding

                self.model = FlagEmbedding(
                    model_name=self.model_name,
                    cache_dir=self.cache_dir,
                    threads=self.threads,
                )
            except ImportError:
                raise ImportError(
                    "Could not import 'fastembed' Python package. "
                    "Please install it with `pip install fastembed`."
                ) from ie

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for documents using FastEmbed.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings: List[np.ndarray]
        if self.doc_embed_type == "passage":
            embeddings = self.model.passage_embed(texts)
        else:
            embeddings = self.model.embed(texts)
        return [
            {int(idx): float(val) for idx, val in zip(embed.indices, embed.values)}
            for embed in embeddings
        ]

    def embed_query(self, text: str) -> List[float]:
        """Generate query embeddings using FastEmbed.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        query_embeddings: np.ndarray = next(self.model.query_embed(text))
        return query_embeddings.tolist()




### Intelligent WebCrawling

In [6]:
def get_main_links(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')
  main_links = []

  # Extracting links with "index.html" considering nesting
  for a in soup.find_all('a', href=True):
    href = a['href']
    if href and href.endswith("index.html"):  # Check for ending with "index.html"
      full_url = urljoin(url, href)
      # Avoid duplicate links and links pointing to external domains
      if full_url not in main_links and full_url.startswith(url):
        main_links.append(full_url)

  return main_links

# Function to extract subsection links from a main link
def get_subsection_links(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    subsection_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        full_url = urljoin(url, href)
        if '#' in full_url and full_url not in subsection_links:
            subsection_links.append(full_url)
    return subsection_links

# Function to extract the main content by section ID and its subsections
def extract_section_content(cached_soup, section_id):
  section_content = []
  soup = cached_soup  # Use the cached soup object

  main_section = soup.find('section', {'id': section_id})

  if main_section:
    for element in main_section.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'pre']):
            if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                section_content.append('\n' + element.get_text().upper() + '\n')
            elif element.name == 'p':
                section_content.append(element.get_text() + '\n\n')
            elif element.name == 'li':
                section_content.append('* ' + element.get_text() + '\n')
            elif element.name == 'pre':  # for code blocks
                section_content.append('\n' + element.get_text() + '\n')

  return '\n'.join(section_content)

def extract_main_section_ids(url):
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')

  # Find all sections with an ID attribute
  sections_with_id = soup.find_all('section', {'id': True})

  # This list will store top-level section IDs (no nested sections)
  main_section_ids = []

  # Iterate through sections with ID
  for section in sections_with_id:
    # Check if the current section has a parent section with an ID
    if not section.parent or not section.parent.has_attr('id'):
      # If no parent ID, it's likely a top-level section
      main_section_ids.append(section['id'])

  return main_section_ids, soup

def create_document(content, main_link, section_id):
  doc_data = Document(page_content=content, metadata = {"source_link":main_link, "section_id":section_id})
  return doc_data

# Main function to orchestrate the extraction and saving process
def Scrape_data(base_url):
    base_url = base_url
    main_document_list = []

    # Step 1: Extract main links
    main_links = get_main_links(base_url)
    
    # Step 2: Extract subsection links and save content
    for main_link in main_links:
        # Extract main section IDs dynamically
        main_section_ids, soup = extract_main_section_ids(main_link)
        # Extract and save content grouped by main sections
        for section_id in main_section_ids:
            if section_id!="notices":
              content = extract_section_content(soup, section_id)
              if content:
                  sub_doc = create_document(content, main_link, section_id)
                  main_document_list.append(sub_doc)

    return main_document_list

In [7]:
def get_embedding_dim(model_name):
    embeddings = FastEmbedEmbeddings(model_name=model_name)
    document_embeddings = embeddings.embed_documents("Have a great day")
    return len(document_embeddings[0])

In [8]:
def create_collection(collection_name, model_name):
    dimension = get_embedding_dim(model_name)
    fields = [
        FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="source_link", dtype=DataType.VARCHAR, max_length=500),
        FieldSchema(
            name="text", dtype=DataType.VARCHAR, max_length=65535
        ),
        FieldSchema(
            name="prechunk", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=5, max_length=65535
            ),
        FieldSchema(
            name="postchunk", dtype=DataType.ARRAY, element_type=DataType.VARCHAR, max_capacity=5, max_length=65535
            ),
        FieldSchema(
            name="section_id", dtype=DataType.VARCHAR, max_length=65535
        ),
        FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
        FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension),
    ]

    schema = CollectionSchema(fields=fields)
    collection = Collection(name=collection_name, schema=schema, shards_num=1, consistency_level="Strong")

    dense_index_params = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {"nlist": 128},
    }

    sparse_index_params = {
        "index_type": "SPARSE_INVERTED_INDEX",
        "metric_type": "IP",
    }
    collection.create_index(field_name="sparse_vector", index_params=sparse_index_params)
    collection.create_index(field_name="dense_vector", index_params=dense_index_params)
    collection.load()
    return collection

In [9]:
def dense_encode_docs(texts: List[str], embed_model):
    embeddings = FastEmbedEmbeddings(model_name=embed_model)
    document_embeddings = embeddings.embed_documents(texts)
    return document_embeddings

In [10]:
def sparse_encode_docs(texts: List[str], embed_model):
    embeddings = SparseFastEmbedEmbeddings(model_name=embed_model)
    document_embeddings = embeddings.embed_documents(texts)
    return document_embeddings

In [11]:
def embed_insert(data: list, collection, sparse_embed_model, dense_embed_model):
    print(data[1])
    sparse_embeddings = sparse_encode_docs(data[1], sparse_embed_model)
    dense_embeddings = dense_encode_docs(data[1], dense_embed_model)
    collection.insert(
        [
            data[0], # source            
            data[1], # text, page_content
            data[2], # prechunk
            data[3], # postchunk
            data[4], #section_id
            sparse_embeddings, # sparse_embedding
            dense_embeddings, # dense_embeddings
        ]
    )

In [12]:
def build_metadata(doc_metadata:dict, splits: list[DocumentSplit]):
    source_link_meta = doc_metadata['source_link']
    section_id_meta = doc_metadata['section_id']
    final_doc = []
    for i, split in enumerate(splits):
        prechunk = [splits[i-2].content if i-2 >= 0 else "", splits[i-1].content if i-1 >= 0 else ""]
        postchunk = [splits[i+1].content if i+1 < len(splits) else "", splits[i+2].content if i+2 < len(splits) else ""]
        page_content = split.content 
        metadata = { 
            "prechunk": prechunk,
            "postchunk": postchunk,
            "source_link": source_link_meta,
            "section_id" : section_id_meta
        }
        doc_obj = Document(page_content=page_content, metadata=metadata)
        final_doc.append(doc_obj)

    return final_doc

In [13]:
def semantic_splitter(encoder_name, semantic_score_threshold):
    encoder = FastEmbedEncoder(name=encoder_name)

    encoder.score_threshold = semantic_score_threshold

    splitter = RollingWindowSplitter(
        encoder=encoder,
        dynamic_threshold=False,
        min_split_tokens=100,
        max_split_tokens=2000,
        window_size=3,
        plot_splits=False,  # set this to true to visualize chunking
        enable_statistics=True  # to print chunking stats
    )
    return splitter

In [14]:
def insert_data_db(prepared_data, collection_obj, collection_name, sparse_embed_model, dense_embed_model):
    data_batch = [[], [], [], [], []]
    overall_time_st = time.time()
    BATCH_SIZE = 2
    current_source = None
    section = 0
    current_title = None
    doc_title = None

    for content in prepared_data:
        source = content.metadata["source_link"]
        page_content = content.page_content
        prechunk = content.metadata["prechunk"]
        postchunk = content.metadata["postchunk"]
        section = content.metadata["section_id"]

        data_batch[0].append(source)
        data_batch[1].append(page_content)
        data_batch[2].append(prechunk)
        data_batch[3].append(postchunk)
        data_batch[4].append(section)
        
        if len(data_batch[0]) % BATCH_SIZE == 0:
            print("Inside Data embed")
            st = time.time()
            ins = embed_insert(data_batch, collection_obj, sparse_embed_model, dense_embed_model)
            print("Total time taken to  process each batch & insert data to milvus is: ", time.time() - st)
            data_batch = [[], [], [], [], []]
            print("="*100)
            # break

    print("overall time to prepare data for insertion: ", time.time() - overall_time_st)

In [15]:
def Store_data_Milvus():

    #Check the Connection status with Milvus
    connection_status(COLLECTION_NAME)

    # Create a collection
    collection = create_collection(COLLECTION_NAME, DENSE_EMBEDDING_MODEL)
    print("Successfully Loaded the Collection")
    
    # Scrape the Base-URL Data
    scraped_data_lst = Scrape_data(BASE_URL)

    # Prepare Data
    splitter = semantic_splitter(SEMANTIC_ENCODER, SEMANTIC_SCORE_THERESHOLD)

    # Prepare Metadata and Store in Milvus
    for data in scraped_data_lst[:3]:
        splits = splitter([data.page_content])
        doc_metadata = data.metadata
        prepared_data = build_metadata(doc_metadata, splits)
        insert_data_db(prepared_data, collection, COLLECTION_NAME, SPARSE_EMBEDDING_MODEL, DENSE_EMBEDDING_MODEL)
    return {"Successfully Store Data Into the Milvus"}, collection

In [16]:
# Store The Data Into The Milvus
message, collection = Store_data_Milvus()

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 62415.24it/s]


Successfully Loaded the Collection


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 77385.68it/s]
[32m2024-07-16 13:50:10 INFO semantic_router.utils.logger Single document exceeds the maximum token limit of 2000. Splitting to sentences before semantically splitting.[0m


Splitting Statistics:
  - Total Documents: 449
  - Total Splits: 3
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 2
  - Last Split: 1
  - Minimum Token Size of Split: 197
  - Maximum Token Size of Split: 1996
  - Similarity Split Ratio: 0.00
Inside Data embed
['1. CUDA 12.5 UPDATE 1 RELEASE NOTES\uf0c1 The release notes for the NVIDIA® CUDA® Toolkit can be found online at https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html. Note The release notes have been reorganized into two major sections: the general CUDA release notes, and the CUDA libraries release notes including historical information for 12.x releases. 1.1. CUDA TOOLKIT MAJOR COMPONENT VERSIONS\uf0c1 Starting with CUDA 11, the various components in the toolkit are versioned independently. For CUDA 12.5 Update 1, the table below indicates the versions: Component Name Version Information Supported Architectures Supported Platforms CUDA C++ Core Compute Libraries Thrust 2.4.0 x86_64, arm64-sbsa, aarch64-j

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 91180.52it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 67432.54it/s]
[32m2024-07-16 13:50:37 INFO semantic_router.utils.logger Single document exceeds the maximum token limit of 2000. Splitting to sentences before semantically splitting.[0m


Total time taken to  process each batch & insert data to milvus is:  4.527934551239014
overall time to prepare data for insertion:  4.528050899505615
Splitting Statistics:
  - Total Documents: 1204
  - Total Splits: 12
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 11
  - Last Split: 1
  - Minimum Token Size of Split: 1107
  - Maximum Token Size of Split: 2000
  - Similarity Split Ratio: 0.00
Inside Data embed
['2. CUDA LIBRARIES\uf0c1 This section covers CUDA Libraries release notes for 12.x releases. * CUDA Math Libraries toolchain uses C++11 features, and a C++11-compatible standard library (libstdc++ >= 20150422) is required on the host. CUDA Math Libraries toolchain uses C++11 features, and a C++11-compatible standard library (libstdc++ >= 20150422) is required on the host. 2.1. CUBLAS LIBRARY\uf0c1 2.1.1. CUBLAS: RELEASE 12.5 UPDATE 1\uf0c1 * New Features Performance improvement to matrix multiplication targeting large language models, specifically for small batch sizes

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 86778.70it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 54189.97it/s]


Total time taken to  process each batch & insert data to milvus is:  4.476426601409912
Inside Data embed
['If strict compliance with BLAS is required, the user may manually check for alpha value before invoking the functions or switch to CUBLAS_POINTER_MODE_HOST. BLAS level 2 and 3 functions might not treat alpha in a BLAS compliant manner when alpha is zero and the pointer mode is set to CUBLAS_POINTER_MODE_DEVICE. The expected behavior is that the corresponding computations would be skipped. You may encounter the following issues: (1) HER{,2,X,K,2K} may zero the imaginary part on the diagonal elements of the output matrix; and (2) HER{,2,X,K,2K}, SYR{,2,X,K,2K} and others may produce NaN resulting from performing computation on matrices A and B which would otherwise be skipped. If strict compliance with BLAS is required, the user may manually check for alpha value before invoking the functions or switch to CUBLAS_POINTER_MODE_HOST. * Resolved Issues cuBLASLt matmul operations might h

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 89240.51it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 74631.74it/s]


Total time taken to  process each batch & insert data to milvus is:  3.0369365215301514
Inside Data embed
['CUBLASLT_EPILOGUE_{RELU,GELU}_AUX * CUBLASLT_EPILOGUE_D{RELU,GELU} CUBLASLT_EPILOGUE_D{RELU,GELU} * Improved Hopper performance on arm64-sbsa by adding Hopper kernels that were previously supported only on the x86_64 architecture for Windows and Linux. Improved Hopper performance on arm64-sbsa by adding Hopper kernels that were previously supported only on the x86_64 architecture for Windows and Linux. * Known Issues There are no forward compatible kernels for single precision complex gemms that do not require workspace. Support will be added in a later release. Known Issues * There are no forward compatible kernels for single precision complex gemms that do not require workspace. Support will be added in a later release. There are no forward compatible kernels for single precision complex gemms that do not require workspace. Support will be added in a later release. * Resolved I

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 69905.07it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 79739.62it/s]


Total time taken to  process each batch & insert data to milvus is:  2.838268995285034
Inside Data embed


Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 91512.09it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 79739.62it/s]


Total time taken to  process each batch & insert data to milvus is:  3.5387067794799805
Inside Data embed


Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 79638.68it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 77385.68it/s]


Total time taken to  process each batch & insert data to milvus is:  2.680485725402832
Inside Data embed
['RELEASE 12.4\uf0c1 * Resolved Issues Host-specific code in cuda_fp16/bf16 headers is now free from type-punning and shall work correctly in the presence of optimizations based on strict-aliasing rules. Resolved Issues * Host-specific code in cuda_fp16/bf16 headers is now free from type-punning and shall work correctly in the presence of optimizations based on strict-aliasing rules. Host-specific code in cuda_fp16/bf16 headers is now free from type-punning and shall work correctly in the presence of optimizations based on strict-aliasing rules. 2.5.3. CUDA MATH: RELEASE 12.3\uf0c1 * New Features Performance of SIMD Integer CUDA Math APIs was improved. New Features * Performance of SIMD Integer CUDA Math APIs was improved. Performance of SIMD Integer CUDA Math APIs was improved. * Resolved Issues The __hisinf() Math APIs from cuda_fp16.h and cuda_bf16.h headers were silently produci

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 88301.14it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 76260.07it/s]


Total time taken to  process each batch & insert data to milvus is:  2.5525712966918945
overall time to prepare data for insertion:  19.124083518981934
Splitting Statistics:
  - Total Documents: 44
  - Total Splits: 1
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 718
  - Maximum Token Size of Split: 718
  - Similarity Split Ratio: 0.00
overall time to prepare data for insertion:  3.337860107421875e-06


In [47]:
# to check the data from Present in the Milvus
collection_query_source = collection.query(expr='source_link like "https://docs.nvidia.com/cuda%"', output_fields=["text"])

RPC error: [query], <MilvusException: (code=100, message=collection not found[collection=450745645481276887])>, <Time:{'RPC start': '2024-07-15 19:38:00.981352', 'RPC error': '2024-07-15 19:38:01.244442'}>


MilvusException: <MilvusException: (code=100, message=collection not found[collection=450745645481276887])>

In [42]:
collection_query_source

data: [] , extra_info: {'cost': 0}