## Here we have defined a custom DSpy RetirverModelClient

The client uses OllamaEmbeddingFunction to  fetch  answers.

The OllamaEmbeddingFunction is also provided Here


This OllamaEmbeddingFunction  uses Ollama python library to build the interface between ollama embedder and retriever
with a collection that is created by initialize_chromadb_collection method

The initialize_chromadb_collection method depends on OllamaEmbeddingFunction for writing vectors to the chromadb server

DSPythonicRMClient uses OllamaEmbeddingFunction for reading passages from chromadb server

### OllamaEmbeddingFunction 

In [1]:
import random
import ollama
import chromadb
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from typing import Optional, Union, List
import pandas as pd



class OllamaEmbeddingFunction(EmbeddingFunction[Documents]):
    Documents = Union[str, List[str], pd.DataFrame]

    def __init__(self, model_name: str = "mxbai-embed-large", collection=None):
        """Initialize the embedding function."""
        self.model_name = model_name
        self.collection = collection  # Store the collection

    def __call__(self, input: Documents) -> List[List[float]]:
        """Embed the input documents."""
        return self._embed(input)

    def _embed(self, documents: Documents) -> List[List[float]]:
        """Generate embeddings for the input documents using Ollama."""
        embeddings = []

        # Handle different input types
        if isinstance(documents, str):
            # If input is a single string, convert it to a list
            documents = [documents]
        elif isinstance(documents, pd.DataFrame):
            # If input is a DataFrame, extract the first column as a list of strings
            documents = documents.iloc[:, 0].tolist()
        elif isinstance(documents, list):
            # If input is a list, ensure all elements are strings
            documents = [str(doc) for doc in documents]
        elif hasattr(documents, '__iter__'):
            # If input is any other iterable (like a set or tuple), convert to list of strings
            documents = [str(doc) for doc in documents]
        else:
            raise ValueError("Unsupported document type. Please provide a string, list, or pandas DataFrame.")

        # Generate embeddings for each document
        for doc in documents:
            response = ollama.embeddings(
                model=self.model_name,
                prompt=doc
            )
            embeddings.append(response["embedding"])

        return embeddings

    def _retrieve(self, query: Union[str, List[str]], k: int) -> List[str]:
        """Retrieve relevant documents based on a query using Ollama."""
        if self.collection is None:
            raise ValueError("Collection is not set. Please initialize the OllamaEmbeddingFunction with a valid collection.")

        # Handle different input types for query
        if isinstance(query, list):
            # If input is a list, join it into a single string
            query = ' '.join(query)

        response = ollama.embeddings(
            model=self.model_name,
            prompt=query
        )
        query_embedding = response["embedding"]

        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=k
        )

        return results['documents'] [:k]  # Return the top k documents in correct order



### Make the Robust ChromaDB Collection Method

In [2]:
import random
import chromadb
from typing import Optional
import random
import chromadb
from typing import Optional
import uuid

# Create a dictionary to store the last used collection name and serial number
global last_used_info

last_used_info = {}



def initialize_chromadb_collection(host: str = 'localhost', port: int = 8000, reset: Optional[bool] = False, create_new_collection: bool = True, last_used: Optional[dict] = None) -> chromadb.Collection:
    """
    Initializes a ChromaDB HTTP client and creates or retrieves a collection.

    Args:
        host (str): The host where the ChromaDB server is running. Defaults to 'localhost'.
        port (int): The port on which the ChromaDB server is listening. Defaults to 8000.
        reset (Optional[bool]): If True, resets the ChromaDB client before creating or using a collection. Defaults to False.
        create_new_collection (bool): If True, creates a new collection with a serial numbered name. If False, uses the last used collection name. Defaults to True.
        last_used (Optional[dict]): A dictionary to store the last used collection name and number. Defaults to None.

    Returns:
        chromadb.Collection: The created or existing ChromaDB collection.
    """
    # Initialize last_used if it is None
    if last_used is None:
        last_used = {'collection_name': None, 'serial_number': 0}
    elif 'serial_number' not in last_used:
        last_used['serial_number'] = 0

    # Create a ChromaDB HTTP client
    client = chromadb.HttpClient(host=host, port=port)
    
    # Reset the client if requested
    if reset:
        client.reset()
    
    if create_new_collection:
        # Increment the serial number for the new collection name
        last_used['serial_number'] += 1
        collection_name = f"docs{last_used['serial_number']}"
        
        # Use get_or_create_collection to avoid UniqueConstraintError
        collection = client.get_or_create_collection(name=collection_name)
        
        # Store the collection name for future use
        last_used['collection_name'] = collection_name
    else:
        # Use the last used collection name
        collection_name = last_used.get('collection_name')
        
        if collection_name is None:
            raise ValueError("No previous collection name found. Set create_new_collection to True to create a new collection.")
        
        # Get or create the collection with the last used name
        collection = client.get_or_create_collection(name=collection_name)

    # Print the name of the created or used collection
    print(f"Using collection: {collection.name}")
    
    return collection


### Now comes the RMC

This RMC cannot (currently) fetch from any url / port and it is dependent on OllamaEmbeddingFunction  for encoding decoding

In [3]:
from typing import Union, List, Optional
import dspy  # Ensure dspy is imported correctly

class DSPythonicRMClient(dspy.Retrieve):
    def __init__(self, embedding_function: OllamaEmbeddingFunction, k: int = 3):
        """
        Initialize the DSPythonicRMClient.

        Args:
            embedding_function (OllamaEmbeddingFunction): The embedding function to use for retrieval.
            k (int): The number of top passages to retrieve. Defaults to 3.
        """
        super().__init__(k=k)
        self.embedding_function = embedding_function

    def forward(self, query: Union[str, List[str]], k: Optional[int] = None) -> dspy.Prediction:
        """
        Retrieve passages based on the embedded query.

        Args:
            query (Union[str, List[str]]): The query string or list of queries for which to retrieve passages.
            k (Optional[int]): The number of results to return. Defaults to k.

        Returns:
            dspy.Prediction: An object containing the retrieved passages.
        """
        # Initialize k based on the provided k or fallback to self.k
        k = k if k is not None else self.k
        
        # Ensure that k is a valid integer
        if k <= 0:
            raise ValueError("k must be a positive integer.")
        
        # Retrieve documents using the embedding function
        retrieved_documents = self.embedding_function._retrieve(query, k=k)

        return dspy.Prediction(passages=retrieved_documents)

### Example use With Rag

In [4]:
import os
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# Make sure to download the punkt tokenizer if you haven't already
""" nltk.download('punkt') """

def load_documents(folder_path):
    """Recursively searches for .md, .docx, and .txt files in the given folder path and its subfolders."""
    documents = []
    data = []

    for root, _, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)

            if filename.endswith('.md'):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

            

            elif filename.endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()

            else:
                continue  # Skip files that are not .md or .txt

            # Split the content into sentences
            sentences = sent_tokenize(content)

            documents.extend(sentences)  # Add sentences to the documents list
            data.append({'index': len(documents) - len(sentences), 'filename': filename, 'content': sentences})
            #print(pd.DataFrame(data).to_markdown)

    return documents




# Step 1: Initialize ChromaDB Collection
collection = initialize_chromadb_collection(create_new_collection=True) #, last_used=last_used_info

# Step 2: Create an instance of OllamaEmbeddingFunction
embedding_function = OllamaEmbeddingFunction(model_name="mxbai-embed-large", collection=collection)

# Step 3: Embed Documents and Add Them to the Collection
documents = load_documents("/home/riju279/Documents/writings/Obsidian/Videodraft2/") #str(input("Enter the absoluter path to the folder"))
last_used_info = {}
# Embed documents and add them to the collection
embeddings = embedding_function(documents)
collection.add(
    ids=[str(uuid.uuid4()) for _ in range(len(documents))],
    embeddings=embeddings,
    documents=documents
)



Using collection: docs1


### Above code is a fully working example for making embeddings out of markdown files from a given Directory

getting proper infrastructure for doing large scale dspy experiments was important for us.


# Step 4: Create an instance of DSPythonicRMClient and LLM


In [5]:

lrm = DSPythonicRMClient(embedding_function=embedding_function, k=10)
olm=dspy.OpenAI(api_base="http://localhost:11434/v1/", api_key="ollama", model="mistral-nemo:latest", stop='\n\n', model_type='chat')

dspy.settings.configure(lm=olm,rm=lrm)

In [6]:
import dspy

class GenerateQuestion(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.OutputField(desc="ask questions about the facts")
    



import time

generate_question = dspy.ChainOfThoughtWithHint(GenerateQuestion)
questions=[]
for i in range(len(documents)):
    context=documents[i]
    
    hint= f"ask why, how and what about {documents[i]}"
    pred=generate_question(context=context, hint=hint)
    #print(f"({i})  **\n\n  Context: {context}\n\n")
    #print(f"Predicted Question: {pred.question}\n\n -----\n\n######\n\n")
    questions.append(pred.question)
    time.sleep(.1)



In [7]:

del questions[-1]

for i in range(len(questions)):
    print(questions[i])

What is biochemistry? How does it help us understand life processes?
Why are biochemistry and molecular biology important fields of study?
Why do these diseases require study in biochemistry?
Why do scientists study chemical reactions in cells?
What are the four major biomolecules that make up living organisms?
Why are stomata located primarily on the undersides of plant leaves?
Why are carbohydrates broken down?
How does cellular respiration occur?
What role do proteins play in this process?
Why do nucleic acids like DNA store genetic information?
How do lipids like cholesterol regulate cell membrane structure?
What makes these functions unique to each type of molecule?
Why do cells require multiple metabolic pathways like glycolysis, the citric acid cycle, and oxidative phosphorylation?
Why are interactions between these pathways crucial for cells?
Why does glycolysis occur?
Why does the citric acid cycle produce NADH and FADH2?
How do these molecules help generate ATP through oxidat

### Now that we have a lot of questions we will try to get answers from internet

In [None]:
import os
import json
import random
import uuid
import requests
import logging
import re
from pathlib import Path
from datetime import datetime
from typing import Optional, List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import dspy
import ollama
import chromadb

# Load environment variables from the .env file
load_dotenv()

# Get the Serper API key
SERPER_API_KEY = os.getenv('SERPER_API_KEY')

# Configure logging
logging.basicConfig(level=logging.INFO)

def get_search_results(query: str) -> List[dict]:
    """Search the internet for relevant content and return accurate results."""
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": query})
    headers = {
        'X-API-Key': SERPER_API_KEY,
        'Content-Type': 'application/json'
    }

    response = requests.post(url, headers=headers, data=payload)
    results = response.json()

    structured_results = []
    if 'organic' in results:
        for item in results['organic']:
            structured_results.append({
                'title': item.get('title'),
                'link': item.get('link'),
                'snippet': item.get('snippet')
            })

    return structured_results 

def clean_text(soup: BeautifulSoup) -> str:
    """
    Clean text extracted from a BeautifulSoup object.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object containing HTML content.

    Returns:
        str: The cleaned text.
    """
    # Extract text from specific tags
    relevant_tags = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'li'])
    text_content = ' '.join(tag.get_text() for tag in relevant_tags)
    
    # Clean the text to remove unwanted characters and patterns
    cleaned_text = re.sub(r'%PDF.*?endobj', '', text_content, flags=re.DOTALL)  # Remove PDF content
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s\! $ %]', ' ', cleaned_text)  # Remove non-ASCII characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Replace multiple whitespace characters and trim
    
    return cleaned_text

def scrape_multiple_links(structured_results: List[dict]) -> List[dict]:
    """
    Scrape multiple links and clean the extracted text.

    Args:
        structured_results (list): A list of dictionaries containing links to scrape.

    Returns:
        list: A list of cleaned texts extracted from the links.
    """
    extracted_texts = []  # List to store the extracted texts
    
    for item in structured_results:
        link = item['link']
        try:
            # Send a GET request to the URL
            response = requests.get(link)
            if response.status_code != 200:
                logging.warning(f"Rejected link due to non-2xx status code: {link} (Status code: {response.status_code})")
                continue  # Skip this link
            
            # Parse the HTML content using Beautiful Soup
            soup = BeautifulSoup(response.content, 'html.parser')
            cleaned_text = clean_text(soup)
            if cleaned_text:  # Only append if cleaned_text is not empty
                extracted_texts.append({'link': link, 'text': cleaned_text})
            
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching {link}: {e}")

    return extracted_texts  # Return the list of extracted texts


last_used_info={}
collection = initialize_chromadb_collection(create_new_collection=True, last_used=last_used_info) 

#last_used_info=last_used_info
# Step 2: Create an instance of OllamaEmbeddingFunction
embedding_function = OllamaEmbeddingFunction(model_name="mxbai-embed-large", collection=collection)

# Step 3: Perform Initial search on the questions and fetch cleaned long results and embed
questions =questions  # Example questions

for question in questions[:148]:
    # Search and scrape
    search_results = get_search_results(question)
    scraped_texts = scrape_multiple_links(search_results)
    
    # Tokenize and embed
    tokenized_texts = []
    for item in scraped_texts:
        tokenized = sent_tokenize(item['text'])
        tokenized_texts.extend(tokenized)
        #print(tokenized_texts[:len(tokenized_texts)])
        # Embed documents and add them to the collection
        embeddings = embedding_function(tokenized_texts)
    
    # Generate unique IDs using UUIDs
    ids = [str(uuid.uuid4()) for _ in range(len(tokenized_texts))]

    # Add documents, embeddings, and IDs to the collection
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=tokenized_texts
    )


In [10]:

# Initialize DSPythonicRMClient and OpenAI client
lrm = DSPythonicRMClient(embedding_function=embedding_function,k=10)
olm = dspy.OpenAI(api_base="http://localhost:11434/v1/", api_key="ollama", model="mistral-nemo:latest", stop='\n\n', model_type='chat')

# Configure DSPy settings
dspy.settings.configure(lm=olm, rm=lrm)


### now we will do RAG on the data we have collected

In [19]:
import dspy


class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")



import time

generate_answers = dspy.ChainOfThoughtWithHint(GenerateAnswer)


answers=[]
contexts=[]
for i in range(100):
    question=questions[i]
    context=embedding_function._retrieve(question, k=5)
    hint= f"{documents[i]}"
    pred=generate_answers(question=question, context=context, hint=hint)
    print(f"({i})  **\n\n The asked Question is:    {question}\n\n")
    #print(f"({i})  **\n\n  The Context of the question is: {context[:20]}\n\n")
    print(f"({i})  **\n\n  The Hint for the question is: {hint[:100]}\n\n")
    print(f"({i}) Predicted Answer: {pred.answer}\n\n -----\n\n######\n\n *********")
    contexts.append(context)
    #print("XXX",contexts[:20])
    answers.append(pred.answer)
    time.sleep(.1)


(0)  **

 The asked Question is:    What is biochemistry? How does it help us understand life processes?


(0)  **

  The Hint for the question is: Here are the rewritten 10 points with the added details:

**Point 1: Introduction**

* Briefly intro


(0) Predicted Answer: Biochemistry studies chemical processes in living organisms.

 -----

######

 *********
(1)  **

 The asked Question is:    Why are biochemistry and molecular biology important fields of study?


(1)  **

  The Hint for the question is: * Explain how this episode will explore the fascinating world of biochemistry and molecular biology,


(1) Predicted Answer: Understand Life Processes

 -----

######

 *********
(2)  **

 The asked Question is:    Why do these diseases require study in biochemistry?


(2)  **

  The Hint for the question is: * Example: Biochemistry is essential for understanding the mechanisms behind diseases such as cancer


(2) Predicted Answer: Biochemistry is crucial for studying diseases because

KeyboardInterrupt: 

In [18]:
print("questions",len(questions))
print("Contexts",len(contexts))
print("Answers",len(answers))

questions 155
Contexts 50
Answers 50
