In [1]:
pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install qdrant_client

Defaulting to user installation because normal site-packages is not writeable
Collecting qdrant_client
  Using cached qdrant_client-1.12.1-py3-none-any.whl (267 kB)
Collecting grpcio>=1.41.0
  Downloading grpcio-1.68.1-cp39-cp39-macosx_10_9_universal2.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 285 kB/s eta 0:00:01
[?25hCollecting portalocker<3.0.0,>=2.7.0
  Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Collecting grpcio-tools>=1.41.0
  Downloading grpcio_tools-1.68.1-cp39-cp39-macosx_10_9_universal2.whl (5.6 MB)
[K     |████████████████████████████████| 5.6 MB 29.8 MB/s eta 0:00:01
Collecting protobuf<6.0dev,>=5.26.1
  Downloading protobuf-5.29.1-cp38-abi3-macosx_10_9_universal2.whl (417 kB)
[K     |████████████████████████████████| 417 kB 14.2 MB/s eta 0:00:01
Collecting h2<5,>=3
  Downloading h2-4.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 22.1 MB/s eta 0:00:01
Collecting hyperframe<7,>=6.0
  Downloading hyperframe

In [None]:
import numpy as np
from numpy.typing import ndarray
from enum import enum
from uuid import uuid4
from typing import any, dict, list
from pydantic import basemodel, field

from pymongo import mongoclient
from qdrant_client import qdrantclient
from qdrant_client.models import vectorparams, distance, pointstruct

from sentence_transformers import sentencetransformer
import re

class datacategory(str, enum):
    """Names the four data categories used as string constants.

    Every member's value is the same text as its name.
    """
    posts = "posts"
    articles = "articles"
    repositories = "repositories"
    queries = "queries"

class basedocument(basemodel):
    """Base model for stored documents, identified by a string ``id``.

    When no ``id`` is supplied, a random UUID4 string is generated.
    """

    id: str = field(default_factory=lambda: str(uuid4()))

    @classmethod
    def from_mongo(cls, data: dict):
        """Build an instance from a MongoDB document.

        The document's ``_id`` key is exposed on the model as ``id``.

        Args:
            data: Mapping holding the document fields, including ``_id``.
                It is left unmodified.

        Returns:
            An instance of ``cls`` populated from ``data``.

        Raises:
            KeyError: If ``data`` has no ``_id`` key.
        """
        # Work on a shallow copy: popping from the caller's dict would strip
        # "_id" from it and make a second call on the same dict fail.
        fields = dict(data)
        # ``id`` is declared as ``str``; coerce explicitly in case ``_id`` is a
        # non-string key type (str() is a no-op for values that already are).
        fields["id"] = str(fields.pop("_id"))
        return cls(**fields)

class vectorbasedocument(basedocument):
    """Extends ``basedocument`` with a text ``content`` field and a ``metadata`` mapping."""
    # required text of the document
    content: str
    # optional string-keyed metadata; ``default_factory=dict`` supplies an empty dict when omitted
    metadata: dict[str, any] = field(default_factory=dict)

class cleaninghandler:
    """Turns raw scraped documents into flat dicts with a cleaned ``content`` string.

    Each ``clean_*`` method copies a fixed set of keys from the input and
    replaces ``content`` with a single normalised string (see ``_clean_text``).
    """

    def clean_repository(self, data: dict) -> dict:
        """Clean a repository document.

        Args:
            data: Raw document; must contain ``_id``. ``content``, ``name``,
                ``link`` and ``platform`` are optional.

        Returns:
            Dict with keys ``_id``, ``content``, ``name``, ``link``, ``platform``.

        Raises:
            KeyError: If ``_id`` is missing.
        """
        return {
            "_id": data["_id"],
            "content": self._clean_content(data),
            "name": data.get("name"),
            "link": data.get("link"),
            "platform": data.get("platform"),
        }

    def clean_post(self, data: dict) -> dict:
        """Clean a post document.

        Args:
            data: Raw document; must contain ``id``. ``content``, ``platform``
                and ``image`` are optional.

        Returns:
            Dict with keys ``id``, ``content``, ``platform``, ``image``.

        Raises:
            KeyError: If ``id`` is missing.
        """
        return {
            "id": data["id"],
            "content": self._clean_content(data),
            "platform": data.get("platform"),
            "image": data.get("image"),
        }

    def clean_article(self, data: dict) -> dict:
        """Clean an article document.

        Args:
            data: Raw document; must contain ``id``. ``content``, ``link`` and
                ``platform`` are optional.

        Returns:
            Dict with keys ``id``, ``content``, ``link``, ``platform``.

        Raises:
            KeyError: If ``id`` is missing.
        """
        return {
            "id": data["id"],
            "content": self._clean_content(data),
            "link": data.get("link"),
            "platform": data.get("platform"),
        }

    @classmethod
    def _clean_content(cls, data: dict) -> str:
        """Join and clean the ``content`` entry of ``data`` (shared by all ``clean_*``)."""
        return cls._clean_text(cls._join_content(data.get("content")))

    @staticmethod
    def _join_content(content) -> str:
        """Flatten a ``content`` value into one space-separated string.

        Accepts a dict (its values are joined, in insertion order), a list or
        tuple of fragments, a plain string, or ``None``. ``None`` fragments
        are skipped and non-string fragments are converted with ``str`` so a
        single odd value no longer aborts the whole document.
        """
        if content is None:
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, dict):
            fragments = content.values()
        elif isinstance(content, (list, tuple)):
            fragments = content
        else:
            return str(content)
        return " ".join(str(fragment) for fragment in fragments if fragment is not None)

    @staticmethod
    def _clean_text(text: str) -> str:
        """Replace every character that is not a word character, whitespace or
        one of ``. , ! ?`` with a space, then collapse whitespace runs and trim."""
        text = re.sub(r"[^\w\s.,!?]", " ", text)
        return re.sub(r"\s+", " ", text).strip()

def _overlap_tail(sentences: list[str], budget: int) -> list[str]:
    """Return the longest run of trailing ``sentences`` whose space-joined length is <= ``budget``."""
    tail = []
    used = 0
    for sentence in reversed(sentences):
        cost = len(sentence) + (1 if tail else 0)  # +1 for the joining space
        if used + cost > budget:
            break
        tail.append(sentence)
        used += cost
    tail.reverse()
    return tail


def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list[str]:
    """
    splits the input text into sentence-aligned chunks of limited size with overlap.

    The text is split into sentences at whitespace that follows ``.``, ``?``
    or ``!`` (but not after abbreviations such as ``Dr.`` or ``e.g.``).
    Sentences are packed greedily into chunks whose length does not exceed
    ``chunk_size`` characters. A single sentence longer than ``chunk_size`` is
    kept whole as its own chunk rather than being cut.

    Overlap is sentence-granular: each new chunk starts with as many trailing
    sentences of the previous chunk as fit in ``chunk_overlap`` characters
    (and still leave room for the next sentence), so no sentence is ever cut
    in the middle.

    Args:
        text: Text to split.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Maximum number of characters repeated from the end of
            the previous chunk. Values <= 0 disable overlap.

    Returns:
        The chunks in order. Empty or whitespace-only input yields ``[]``.
    """
    # The second lookbehind protects "Capital + lowercase + ." abbreviations
    # (Mr., Dr., ...); with a lowercase-only class it would suppress nearly
    # every real sentence boundary.
    pieces = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s", text)
    sentences = [piece.strip() for piece in pieces if piece.strip()]

    chunks = []
    current = []      # sentences of the chunk being built
    current_len = 0   # len(" ".join(current))
    for sentence in sentences:
        if current and current_len + 1 + len(sentence) > chunk_size:
            chunks.append(" ".join(current))
            # Carry over trailing context, but never so much that the new
            # sentence would no longer fit; this also guarantees the carried
            # tail is a strict subset of the chunk just emitted.
            budget = min(chunk_overlap, chunk_size - len(sentence) - 1)
            current = _overlap_tail(current, budget)
            current_len = len(" ".join(current))
        current_len += len(sentence) + (1 if current else 0)
        current.append(sentence)

    if current:
        chunks.append(" ".join(current))

    return chunks

class chunkinghandler:
    """Adapter that turns cleaned text into a list of chunk records."""

    def chunk(self, cleaned_content: str) -> list[dict]:
        """
        runs ``chunk_text`` on the cleaned content (default settings) and wraps
        every resulting chunk as ``{"content": <chunk>}``.
        """
        records = []
        for piece in chunk_text(cleaned_content):
            records.append({"content": piece})
        return records

class embeddingmodel:
    """Thin wrapper around a sentence-transformers model used to embed text."""

    def __init__(self, model_name: str = "sentence-transformers/all-minilm-l6-v2"):
        """Load the model.

        Args:
            model_name: Identifier passed to the sentence-transformers
                constructor. Defaults to the model this notebook has always used.
        """
        self.model_name = model_name
        self.model = sentencetransformer(model_name)

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed ``texts`` and return one vector per input, in input order.

        The result is whatever the model's ``encode`` returns with
        ``convert_to_tensor=False`` (presumably a numpy array rather than
        nested lists; confirm against the sentence-transformers docs), so
        callers should only rely on indexing and iteration.
        """
        # ``False`` must be the Python builtin; a lowercase ``false`` is an
        # undefined name and raises NameError on the first call.
        return self.model.encode(texts, convert_to_tensor=False)

# module-level embedder instance; constructing it loads the model once for the whole notebook
embedding_model = embeddingmodel()

class embeddinghandler:
    """Attaches embeddings to chunk records using the module-level ``embedding_model``."""

    def embed_chunks(self, chunks):
        """
        embeds the given chunks and returns them with their embeddings.

        Args:
            chunks: Iterable of dicts, each with a ``content`` string and an
                optional ``metadata`` entry.

        Returns:
            A list, in input order, of dicts with keys ``content``,
            ``embedding`` and ``metadata`` (``{}`` when the chunk had none).
            Empty input returns ``[]`` without calling the model.
        """
        chunks = list(chunks)
        if not chunks:
            return []

        # One batched call instead of one model invocation per chunk: the
        # result order matches the input order, so zip() pairs them back up.
        vectors = embedding_model.embed([chunk["content"] for chunk in chunks])
        return [
            {
                "content": chunk["content"],
                "embedding": vector,
                "metadata": chunk.get("metadata", {}),
            }
            for chunk, vector in zip(chunks, vectors)
        ]

def convert_numpy_to_list(data):
    """recursively converts numpy arrays and numpy scalars to plain python values.

    Args:
        data: Any value. Dicts, lists and tuples are walked recursively;
            everything else is returned unchanged.

    Returns:
        A structure of the same shape where every ``np.ndarray`` has become a
        (nested) list and every numpy scalar (e.g. ``np.float32``) a native
        Python scalar. Dict keys are left untouched; tuples stay tuples.
    """
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, np.generic):
        # numpy scalar types are not JSON-serialisable; .item() yields the
        # equivalent builtin int/float/bool.
        return data.item()
    if isinstance(data, dict):
        return {key: convert_numpy_to_list(value) for key, value in data.items()}
    if isinstance(data, list):
        return [convert_numpy_to_list(value) for value in data]
    if isinstance(data, tuple):
        # arrays nested in tuples used to slip through unconverted
        return tuple(convert_numpy_to_list(value) for value in data)
    return data

class datapipeline:
    """Clean -> chunk -> embed -> store pipeline from a MongoDB collection into Qdrant."""

    def __init__(self, mongo_collection, qdrant_client: qdrantclient, qdrant_collection_name: str,
                 vector_size: int = 384):
        """Wire up the handlers and make sure the target Qdrant collection exists.

        Args:
            mongo_collection: Collection object exposing ``find_one``.
            qdrant_client: Connected Qdrant client.
            qdrant_collection_name: Name of the Qdrant collection to write to.
            vector_size: Dimension used when the collection has to be created.
                It must equal the length of the vectors produced by
                ``embeddinghandler``; 384 is the value this notebook has
                always used.
        """
        self.mongo_collection = mongo_collection
        self.qdrant_client = qdrant_client
        self.qdrant_collection = qdrant_collection_name
        self.vector_size = vector_size
        self.cleaning_handler = cleaninghandler()
        self.chunking_handler = chunkinghandler()
        self.embedding_handler = embeddinghandler()

        # ensure collection exists in qdrant
        self._ensure_qdrant_collection()

    def _ensure_qdrant_collection(self):
        """Create the Qdrant collection (cosine distance) if it is not listed yet."""
        listing = self.qdrant_client.get_collections()
        existing_names = {described.name for described in listing.collections}
        if self.qdrant_collection not in existing_names:
            self.qdrant_client.create_collection(
                collection_name=self.qdrant_collection,
                vectors_config=vectorparams(size=self.vector_size, distance=distance.cosine)
            )

    def process_repository_by_id(self, repo_id: str):
        """fetch raw data from mongodb, process it, and store it in qdrant.

        Point ids are derived deterministically from ``repo_id`` and the chunk
        position, so processing the same repository again overwrites its
        points instead of adding duplicates. (If a repository later yields
        fewer chunks, the surplus older points are not removed.)

        Args:
            repo_id: Value of the document's ``_id`` in MongoDB.

        Returns:
            None. Progress is reported with ``print``.
        """
        from uuid import NAMESPACE_URL, uuid5  # local: only needed for stable point ids

        # fetch raw data from mongodb
        raw_data = self.mongo_collection.find_one({"_id": repo_id})
        if not raw_data:
            print(f"no repository found with id: {repo_id}")
            return

        # step 1: clean the data
        cleaned_data = self.cleaning_handler.clean_repository(raw_data)

        # step 2: chunk the content
        chunks = self.chunking_handler.chunk(cleaned_data["content"])

        # step 3: embed the chunks
        embedded_chunks = self.embedding_handler.embed_chunks(chunks)

        # step 4: build one point per chunk
        points = [
            pointstruct(
                # same (repo_id, position) -> same id, which makes upsert idempotent
                id=str(uuid5(NAMESPACE_URL, f"{repo_id}/{index}")),
                # hand Qdrant a plain list even when the embedder returns numpy data
                vector=convert_numpy_to_list(chunk["embedding"]),
                payload={
                    "repository_id": repo_id,
                    "content": chunk["content"]
                }
            )
            for index, chunk in enumerate(embedded_chunks)
        ]
        if not points:
            # nothing to write; avoid sending an empty upsert request
            print(f"repository {repo_id} produced no chunks; nothing stored in qdrant.")
            return

        # upsert points into qdrant collection
        self.qdrant_client.upsert(collection_name=self.qdrant_collection, points=points)
        print(f"repository {repo_id} processed and stored in qdrant successfully.")

# mongodb connection: local server, database "github_scraper", collection "repositories"
client = mongoclient("mongodb://localhost:27017/")
db = client["github_scraper"]
collection = db["repositories"]

# qdrant client connection (local HTTP endpoint); this name is reused by later cells
qdrant_client = qdrantclient(url="http://localhost:6333")

# initialize pipeline with qdrant; the constructor creates the collection if it is missing
pipeline = datapipeline(
    mongo_collection=collection,
    qdrant_client=qdrant_client,
    qdrant_collection_name="repository_chunks"  # qdrant collection that receives the chunks
)

# process one repository: looked up by "_id" in mongodb, then cleaned, chunked, embedded and upserted
repository_id = "d4b9ba47-79ef-4cac-ac5b-a00ecab94779"  # replace with the actual repository id
pipeline.process_repository_by_id(repository_id)

Repository d4b9ba47-79ef-4cac-ac5b-a00ecab94779 processed and stored in Qdrant successfully.


In [3]:
# sanity check: list the collections known to the qdrant server
collections = qdrant_client.get_collections()
print(collections)

collections=[CollectionDescription(name='repositories'), CollectionDescription(name='repository_chunks')]


In [None]:
from typing import list, dict
from qdrant_client import qdrantclient
from qdrant_client.models import scoredpoint

class retriever:
    """Embeds a query and looks up the nearest stored chunks in a Qdrant collection."""

    def __init__(self, qdrant_client: qdrantclient, collection_name: str, embedding_model: embeddingmodel):
        """Store the collaborators.

        Args:
            qdrant_client: Connected Qdrant client.
            collection_name: Collection to search.
            embedding_model: Object whose ``embed(list_of_texts)`` returns one
                vector per text.
        """
        self.qdrant_client = qdrant_client
        self.collection_name = collection_name
        self.embedding_model = embedding_model

    def retrieve(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to ``top_k`` search hits for ``query``.

        Args:
            query: Free-text query.
            top_k: Maximum number of hits requested from Qdrant.

        Returns:
            A list of dicts with keys ``id``, ``score`` and ``payload``, in
            the order returned by the Qdrant search.
        """
        # 1. embed the query (a batch of one; take its single vector)
        query_embedding = self.embedding_model.embed([query])[0]

        # 2. search in qdrant
        search_results: list[scoredpoint] = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding,
            limit=top_k,
            # payload and score are enough; ``False`` must be the Python
            # builtin (lowercase ``false`` is an undefined name)
            with_vectors=False,
        )

        # 3. format the results
        return [
            {"id": hit.id, "score": hit.score, "payload": hit.payload}
            for hit in search_results
        ]

# example usage:
# requires `qdrant_client` and `embedding_model` from the earlier cells.
# the instance is deliberately not named `retriever`: rebinding that name would
# replace the class with an instance and break the later
# `class retrieverwithollama(retriever)` definition.
repo_retriever = retriever(qdrant_client, "repository_chunks", embedding_model)

query = "what is ros"
results = repo_retriever.retrieve(query, top_k=5)
for res in results:
    print(res)

{'id': '885413fa-ce4e-49d6-989f-45f971ff3128', 'score': 0.6704952, 'payload': {'repository_id': 'd4b9ba47-79ef-4cac-ac5b-a00ecab94779', 'content': '235 https github.com ros controls ros2_controllers issues 235 _ Contributors Bence Magyar, Denis Štogl, bailaC 0.5.0 2021 08 30 Add auto declaration of parameters.'}}
{'id': '53a60d37-26cb-497a-8507-04a7361eedb1', 'score': 0.66536087, 'payload': {'repository_id': 'd4b9ba47-79ef-4cac-ac5b-a00ecab94779', 'content': '375 https github.com ros controls ros2_controllers issues 375 _ Contributors Denis Štogl 2.7.0 2022 07 03 2.6.0 2022 06 18 2.5.0 2022 05 13 2.4.0 2022 04 29 2.3.0 2022 04 21 2.2.0 2022 03 25 2.1.0 2022 02 23 Move test nodes from the ros2_control_demos repository.'}}
{'id': 'd91b9e43-4f24-4a67-8913-129ac0bba8d2', 'score': 0.6594087, 'payload': {'repository_id': 'd4b9ba47-79ef-4cac-ac5b-a00ecab94779', 'content': '235 https github.com ros controls ros2_controllers issues 235 _ refactor get_current_state to get_state 232 https github.

In [None]:
import requests

class ollamahandler:
    """Sends retrieval-augmented prompts to an Ollama ``/api/generate`` endpoint."""

    def __init__(self, ollama_url: str = "http://localhost:11434/api/generate",
                 model: str = "llama3.2", timeout: float = 300.0):
        """Store the connection settings.

        Args:
            ollama_url: Full URL of the generate endpoint.
            model: Model name sent in the request body.
            timeout: Seconds passed to ``requests`` as the request timeout, so
                a stalled server cannot hang the caller forever.
        """
        self.ollama_url = ollama_url
        self.model = model
        self.timeout = timeout

    def send_prompt(self, question: str, results: list[dict]) -> str:
        """
        sends the formatted prompt to ollama and returns the generated response.

        Args:
            question: The user's query.
            results: Retrieved hits; each must provide ``res["payload"]["content"]``
                and ``res["score"]``.

        Returns:
            The concatenated ``response`` fragments of the streamed reply, or
            ``"error: <status>, <body>"`` when the server does not answer 200.
        """
        import json  # kept local, as before, so the cell needs no extra top-level import

        # format the prompt
        formatted_results = "\n".join(
            f"result {position}:\ncontent: {res['payload']['content']}\nscore: {res['score']}\n"
            for position, res in enumerate(results, start=1)
        )
        # Spelled out line by line (instead of an indented triple-quoted
        # f-string) so the exact whitespace sent to the model is explicit.
        prompt = (
            "you are an expert assistant. answer the following query based on the provided information.\n"
            f"        query: {question}\n"
            "        \n"
            "        retrieved results:\n"
            f"        {formatted_results}\n"
            "        \n"
            "        please provide an insightful and concise answer."
        )

        # send the prompt to ollama; the context manager releases the streamed
        # connection even when we stop reading early
        with requests.post(
            url=self.ollama_url,  # honour the configured endpoint
            json={"model": self.model, "prompt": prompt},
            stream=True,
            timeout=self.timeout,
        ) as response:
            if response.status_code != 200:
                return f"error: {response.status_code}, {response.text}"

            fragments = []
            # process the stream line-by-line; each non-blank line is parsed as one JSON object
            for line in response.iter_lines(decode_unicode=True):
                if not line.strip():
                    continue
                data = json.loads(line)
                # tolerate lines without a "response" key
                fragments.append(data.get("response", ""))
                # if the chunk indicates it's done, stop reading
                if data.get("done", False):
                    break

        # combine all partial responses
        return "".join(fragments)

# extend the retriever class to include ollama
class retrieverwithollama(retriever):
    """Retriever that also forwards the query and its hits to an Ollama handler."""

    def __init__(self, qdrant_client: qdrantclient, collection_name: str, embedding_model: embeddingmodel, ollama_handler: ollamahandler):
        """Set up the base retriever, then remember the handler used for generation."""
        super().__init__(qdrant_client, collection_name, embedding_model)
        self.ollama_handler = ollama_handler

    def retrieve_and_ask_ollama(self, query: str, top_k: int = 5) -> str:
        """Fetch up to ``top_k`` hits for ``query`` and return the handler's answer built from them."""
        # step 1: look up the relevant chunks
        hits = self.retrieve(query, top_k=top_k)

        # step 2: let ollama answer using the query plus those chunks
        answer = self.ollama_handler.send_prompt(query, hits)
        return answer

# initialize ollama handler, pointing at the local generate endpoint
ollama_handler = ollamahandler(ollama_url="http://localhost:11434/api/generate")

# initialize the enhanced retriever; reuses `qdrant_client` and `embedding_model` from earlier cells
retriever_with_ollama = retrieverwithollama(qdrant_client, "repository_chunks", embedding_model, ollama_handler)

# example usage: retrieve the top 5 chunks for the query and print the generated answer
query = "what is ros"
ollama_response = retriever_with_ollama.retrieve_and_ask_ollama(query, top_k=5)
print(ollama_response)

Based on the retrieved results, ROS stands for Robot Operating System. It is an open-source software framework that enables communication between hardware components and provides a set of tools and libraries for building robot applications.

The specific result (Issue 235) mentioned in the query appears to be related to adding auto-declaration of parameters in the ROS2_controllers repository. However, this information doesn't provide a comprehensive overview of ROS.

A more detailed explanation can be provided based on publicly available knowledge:

ROS is an open-source project that aims to standardize the development of robot software. It provides a common framework for building robot applications and has become a widely-used platform in various industries such as robotics, autonomous systems, and artificial intelligence.

The main components of ROS include:

1. ROS Core: The core functionality of ROS, which includes package management, dependency resolution, and node communication.


In [12]:
pip install gradio

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl (18.1 MB)
[K     |████████████████████████████████| 18.1 MB 12.6 MB/s eta 0:00:01
Collecting fastapi<1.0
  Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 10.5 MB/s eta 0:00:01
[?25hCollecting tomlkit==0.12.0
  Downloading tomlkit-0.12.0-py3-none-any.whl (37 kB)
Collecting semantic-version~=2.0
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting uvicorn>=0.14.0
  Downloading uvicorn-0.32.1-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 11.3 MB/s eta 0:00:01
[?25hCollecting gradio-client==1.3.0
  Downloading gradio_client-1.3.0-py3-none-any.whl (318 kB)
[K     |████████████████████████████████| 318 kB 8.6 MB/s eta 0:00:01
[?25hCollecting typer<1.0,>=0.12
  Downloadin

In [None]:
import gradio as gr

# preset questions offered as the choices of the dropdown in the gradio interface
questions = [
    "what is ros?",
    "steps to install ros?",
    "what is the latest version of ros?",
    "tell me how can i navigate to a specific pose - include",
    "what are the benefits of cloud computing?"
]

def get_answer(query):
    """Answer ``query`` through ``retriever_with_ollama``; never raises.

    Used as the click callback of the search button, so failures are turned
    into a readable message instead of propagating.

    Args:
        query: The selected question; may be ``None`` or empty when nothing
            has been chosen in the dropdown yet.

    Returns:
        The generated answer, or a string starting with ``"error: "``.
    """
    if not query:
        # nothing selected: skip the retrieval/generation round trip
        return "error: please select a question first."
    try:
        return retriever_with_ollama.retrieve_and_ask_ollama(query, top_k=5)
    except Exception as e:  # builtin is ``Exception``; lowercase is an undefined name
        return f"error: {str(e)}"


# build the gradio interface
# gradio's layout/component classes are capitalised (Blocks, Markdown, Dropdown,
# Button, Textbox); the lowercase spellings are not attributes of the package.
with gr.Blocks() as demo:
    gr.Markdown("# ros query application")
    gr.Markdown("select a question from the dropdown and then click **search** to see the response.")

    question_dropdown = gr.Dropdown(choices=questions, label="select a question")
    search_button = gr.Button("search")
    answer_output = gr.Textbox(label="response", lines=10)

    # clicking the button feeds the dropdown value to get_answer and shows its result
    search_button.click(fn=get_answer, inputs=question_dropdown, outputs=answer_output)

# launch gradio app
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/gradio/queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/gradio/blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/gradio/blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/Users/krishmurjani/Library/Python/3.9/lib/python/site-packages/anyio/_backends/_asyncio.py"