In [None]:
from crawl4ai import *
import asyncio
import nest_asyncio
from utils import scrape_duckduckgo_content
import os
import uuid
from sentence_transformers import SentenceTransformer
import chromadb
from google.generativeai import genai
from dotenv import load_dotenv

load_dotenv()

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

class DeepSearchPipeline:
    def __init__(self, query):
        self.query = query
        self.links = []
        self.setup_gemini(api_key=os.environ["GEMINI_API_KEY"])
        self.knowledge_base_directory = str(uuid.uuid4())
        self.collection_name = f"knowledge_base_{uuid.uuid4().hex[:8]}"  # Create unique collection name
        self.scrape_all_contents()
        asyncio.run(self.crawl_and_create_repo())
        self.setup_vector_db()
        

    def scrape_all_contents(self):
        self.links = scrape_duckduckgo_content(self.query, 1)

    def setup_gemini(self, api_key):
        # Configure the Google AI client with your API key 
        genai.configure(api_key=api_key)

        # Get the Gemini Pro model
        self.model = genai.GenerativeModel('gemini-pro')

    def query_gemini(self, query_text):
        # Retrieve relevant context from the vector database
        relevant_context = self.query_vector_db(query_text)

        # Prepare the prompt with context
        parts = [query_text]
        if relevant_context['documents']:
            for doc in relevant_context['documents'][0]:
                parts.append(doc)
        else:
            parts.append("No relevant context found.")

        # Generate content with the Gemini model
        response = self.model.generate_content(parts)

        # Return the response text
        return response.text
    
    async def setupVectorDatabase(self):
        self.nn = 

    async def crawl_and_create_repo(self):
        os.makedirs(self.knowledge_base_directory)
        
        async def crawl():
            async with AsyncWebCrawler() as crawler:
                for link in self.links:
                    result = await crawler.arun(
                        url=link
                    )
                    with open(f"{self.knowledge_base_directory}/{self.links.index(link)}.md", "w+") as f:
                        f.write(result.markdown)
        
        await crawl()

    async def setupVectorDabtase(self):
        self.embedding_model = SentenceTransformer("all-mpnet-base-v2")

        self.chroma_client = chromadb.Collection();
    


    async def queryVectorDatabase(self, query_text):
        query_embedding = self.embedding_model.encode(query_text).tolist()
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=2
        )
        return results
    
    async def returnFunctionQualityAndValidateEverything(self):
        self.chroma_client = chromadb.Client()
        self.collection = self.chroma_client.create_collection(name=self.collection_name)

        self.collection.retrieve()




    def setup_vector_db(self):
        self.embedding_model = SentenceTransformer('all-mpnet-base-v2')
        # Initialize ChromaDB client
        self.chroma_client = chromadb.Client()
        
        # Delete collection if it exists and create a new one
        try:
            self.chroma_client.delete_collection(name=self.collection_name)
        except ValueError:
            pass  # Collection doesn't exist, nothing to delete
        
        # Create new collection with unique name
        self.collection = self.chroma_client.create_collection(name=self.collection_name)
        
        # Process and store embeddings
        for filename in os.listdir(self.knowledge_base_directory):
            if filename.endswith(".md"):
                filepath = os.path.join(self.knowledge_base_directory, filename)
                with open(filepath, "r") as f:
                    content = f.read()
                # Generate embedding for the content
                embedding = self.embedding_model.encode(content).tolist()
                # Add embedding to ChromaDB
                self.collection.add(
                    embeddings=[embedding],
                    documents=[content],
                    metadatas=[{"source": filename}],
                    ids=[filename]
                )

    def query_vector_db(self, query_text, n_results=2):
        # Generate embedding for the query
        query_embedding = self.embedding_model.encode(query_text).tolist()
        # Query ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results
        )
        return results

# Example usage
deepSearch = DeepSearchPipeline("Python")

ImportError: cannot import name 'genai' from 'google.generativeai' (/Users/imdigitalashish/Projects/Ashish/AgenticProjects/venv/lib/python3.13/site-packages/google/generativeai/__init__.py)

In [11]:
query_results = deepSearch.query_gemini("When python was released?")


AttributeError: 'DeepSearchPipeline' object has no attribute 'query_gemini'

In [9]:
query_results

{'ids': [['0.md', '1.md']],
 'embeddings': None,
 'documents': [['**Notice:** While JavaScript is not essential for this website, your interaction with the content will be limited. Please turn JavaScript on for the full experience. \n[Skip to content](https://www.python.org/<#content> "Skip to content")\n[ ▼ Close ](https://www.python.org/<#python-network>)\n  * [Python](https://www.python.org/</> "The Python Programming Language")\n  * [PSF](https://www.python.org/<https:/www.python.org/psf/> "The Python Software Foundation")\n  * [Docs](https://www.python.org/<https:/docs.python.org> "Python Documentation")\n  * [PyPI](https://www.python.org/<https:/pypi.org/> "Python Package Index")\n  * [Jobs](https://www.python.org/</jobs/> "Python Job Board")\n  * [Community](https://www.python.org/</community/>)\n\n[ ▲ The Python Network ](https://www.python.org/<#top>)\n#  [![python™](https://www.python.org/static/img/python-logo.png)](https://www.python.org/</>)\n[Donate](https://www.python.or