In [1]:
import os
import cohere
import hnswlib
import json
import uuid
from typing import List, Dict
from unstructured.partition.text import partition_text
from unstructured.chunking.title import chunk_by_title

# co = cohere.Client("COHERE_API_KEY")

In [2]:
from dotenv import load_dotenv
load_dotenv()
COHERE_API_KEY=os.getenv('COHERE_API_KEY')

In [3]:
co = cohere.Client(COHERE_API_KEY)

In [4]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [5]:
class Datastore:
    """
    A class representing a collection of documents.

    Parameters:
    sources (list): A list of dictionaries representing the sources of the documents. Each dictionary should have 'title' and 'url' keys.

    Attributes:
    sources (list): A list of dictionaries representing the sources of the documents.
    docs (list): A list of dictionaries representing the documents, with 'title', 'content', and 'url' keys.
    docs_embs (list): A list of the associated embeddings for the documents.
    docs_len (int): The number of documents in the collection.
    index (hnswlib.Index): The index used for document retrieval.

    Methods:
    load_and_chunk(): Loads the data from the sources and partitions the HTML content into chunks.
    embed(): Embeds the documents using the Cohere API.
    index(): Indexes the documents for efficient retrieval.
    """

    def __init__(self, raw_documents: List[Dict[str, str]]):
        self.raw_documents = raw_documents  # raw documents
        self.chunks = []            # chunked version of documents
        self.chunks_embs = []       # embeddings of chunked documents
        self.retrieve_top_k = 10
        self.rerank_top_k = 3
        self.load_and_chunk()  # load raw documents and break into chunks
        self.embed() # generate embeddings for each chunk
        self.index() # store embeddings in an index


    def load_and_chunk(self) -> None:
        """
        Loads the text from the sources and chunks the HTML content.
        """
        print("Loading documents...")

        for source in self.raw_documents:
            elements = partition_text(filename=source["filename"])
            chunks = chunk_by_title(elements)
            for chunk in chunks:
                self.chunks.append(
                    {
                        "title": source["title"],
                        "text": str(chunk),
                        "url": source["filename"],
                    }
                )

    def embed(self) -> None:
        """
        Embeds the document chunks using the Cohere API.
        """
        print("Embedding document chunks...")

        batch_size = 90
        self.chunks_len = len(self.chunks)

        for i in range(0, self.chunks_len, batch_size):
            batch = self.chunks[i : min(i + batch_size, self.chunks_len)]
            texts = [item["text"] for item in batch]
            chunks_embs_batch = co.embed(
                texts=texts, model="embed-english-v3.0", input_type="search_document"
            ).embeddings
            self.chunks_embs.extend(chunks_embs_batch)

    def index(self) -> None:
        """
        Indexes the document chunks for efficient retrieval.
        """
        print("Indexing documents...")

        self.idx = hnswlib.Index(space="ip", dim=1024)
        self.idx.init_index(max_elements=self.chunks_len, ef_construction=512, M=64)
        self.idx.add_items(self.chunks_embs, list(range(len(self.chunks_embs))))

        print(f"Indexing complete with {self.idx.get_current_count()} documents.")

        return self.idx

    def search_and_rerank(self, query: str) -> List[Dict[str, str]]:
        # SEARCH
        query_emb = co.embed(
                  texts=[query], model="embed-english-v3.0", input_type="search_query"
              ).embeddings

        chunk_ids = self.idx.knn_query(query_emb, k=self.retrieve_top_k)[0][0]

        # RERANK
        chunks_to_rerank = [self.chunks[chunk_id]["text"] for chunk_id in chunk_ids]

        rerank_results = co.rerank(
            query=query,
            documents=chunks_to_rerank,
            top_n=self.rerank_top_k,
            model="rerank-english-v2.0",
        )

        chunk_ids_reranked = [chunk_ids[result.index] for result in rerank_results]

        chunks_retrieved = []
        for chunk_id in chunk_ids_reranked:
            chunks_retrieved.append(
                {
                "title": self.chunks[chunk_id]["title"],
                "text": self.chunks[chunk_id]["text"],
                "filename": self.chunks[chunk_id]["url"],
                }
            )

        return chunks_retrieved

In [6]:
sources = [
    {
        "title": "Data Quarkle", 
        "filename": "/home/notebook-user/response-eval-ai/sample-docs/concept-dataquarkle.md"},
    {
        "title": "RAGs", 
        "filename": "/home/notebook-user/response-eval-ai/sample-docs/undestanding-rags.md"},
        {
        "title": "sample", 
        "filename": "/home/notebook-user/response-eval-ai/sample-docs/sample.md"},    
        {
        "title": "de-concepts", 
        "filename": "/home/notebook-user/response-eval-ai/sample-docs/understanding-de-concepts.md"}, 
]

# Create an instance of the Datastore class with the given sources
datastore = Datastore(sources)

Loading documents...
Embedding document chunks...
Indexing documents...
Indexing complete with 41 documents.


In [7]:
partition_text(filename='/home/notebook-user/response-eval-ai/sample-docs/concept-dataquarkle.md')

[<unstructured.documents.elements.NarrativeText at 0x7f9f038002b0>,
 <unstructured.documents.elements.NarrativeText at 0x7f9f03802530>,
 <unstructured.documents.elements.NarrativeText at 0x7f9f03802a70>,
 <unstructured.documents.elements.NarrativeText at 0x7f9f03800fd0>,
 <unstructured.documents.elements.NarrativeText at 0x7f9f03803a90>,
 <unstructured.documents.elements.NarrativeText at 0x7f9f03803790>]

In [8]:
datastore.search_and_rerank("quarkle")

[{'title': 'Data Quarkle',
  'text': '# Data Quarkle - a new concept Introducing "DataQuarkle" – a fantastical approach to data engineering that transcends conventional boundaries and unleashes the power of cosmic data manipulation.',
  'filename': '/home/notebook-user/response-eval-ai/sample-docs/concept-dataquarkle.md'},
 {'title': 'Data Quarkle',
  'text': 'In the realm of DataQuarkle, innovation knows no bounds. From "Datafusion" spells that merge disparate data sources into a harmonious symphony of insights to "Datamancy" rituals that foresee future trends through the cosmic currents, DataQuarkle pushes the boundaries of what is possible in the realm of data engineering.',
  'filename': '/home/notebook-user/response-eval-ai/sample-docs/concept-dataquarkle.md'},
 {'title': 'Data Quarkle',
  'text': 'DataQuarkle harnesses the mysterious energy of "Gigavortex," a celestial phenomenon that swirls through the cosmos, weaving together streams of data from across the universe. Powered by

In [9]:
class Chatbot:
    def __init__(self, datastore: Datastore):
        """
        Initializes an instance of the Chatbot class.

        Parameters:
        storage (Storage): An instance of the Storage class.

        """
        self.datastore = datastore
        self.conversation_id = str(uuid.uuid4())

    def run(self):
        """
        Runs the chatbot application.

        """
        while True:
            # Get the user message
            message = input()

            # Typing "quit" ends the conversation
            if message.lower() == "quit":
                print("Ending chat.")
                break
            else:
                print(f"User: {message}")

            # Generate search queries, if any
            response_queries = co.chat(message=message, search_queries_only=True)

            if response_queries.search_queries:
                print("Retrieving information...", end="")

                # Get the query(s)
                queries = []
                for search_query in response_queries.search_queries:
                    queries.append(search_query["text"])

                # Retrieve documents for each query
                chunks = []
                for query in queries:
                    chunks.extend(self.datastore.search_and_rerank(query))
            
                response = co.chat(
                    message=message,
                    documents=chunks,
                    conversation_id=self.conversation_id,
                    stream=True,
                )

            else:
                response = co.chat(
                    message=message,
                    conversation_id=self.conversation_id,
                    stream=True,
                )

            # Print the chatbot response
            print("\nChatbot:")
            
            citations_flag = False
            
            for event in response:
                                
                # Text
                if event.event_type == "text-generation":
                    print(event.text, end="")

                # Citations
                if event.event_type == "citation-generation":
                    if not citations_flag:
                        print("\n\nCITATIONS:")
                        citations_flag = True
                    print(event.citations[0])
            
            # Documents
            if citations_flag:
                print("\n\nDOCUMENTS:")
                documents = [{'id': doc['id'],
                                'text': doc['text'][:50] + '...',
                                'title': doc['title'],
                                'url': doc['filename']} 
                                for doc in response.documents]
                for doc in documents:
                    print(doc)

            print(f"\n{'-'*100}\n")
            return response

In [10]:
# Create an instance of the Chatbot class with the Datastore instance
chatbot = Chatbot(datastore)

# # Run the chatbot
# chatbot.run()

In [11]:

chatbot.run()


User: What is data warehousing and how is it different from traditional database?
Retrieving information...
Chatbot:
Data warehousing involves the process of collecting, storing and managing data from various sources in order to aid decision-making processes. 

Data warehouses are different from traditional databases because they are optimised for analytical queries rather than transactional operations. They enable businesses to analyse historical data trends.

CITATIONS:
{'start': 41, 'end': 51, 'text': 'collecting', 'document_ids': ['doc_0']}
{'start': 53, 'end': 60, 'text': 'storing', 'document_ids': ['doc_0']}
{'start': 65, 'end': 99, 'text': 'managing data from various sources', 'document_ids': ['doc_0']}
{'start': 112, 'end': 142, 'text': 'aid decision-making processes.', 'document_ids': ['doc_0']}
{'start': 219, 'end': 251, 'text': 'optimised for analytical queries', 'document_ids': ['doc_0']}
{'start': 264, 'end': 289, 'text': 'transactional operations.', 'document_ids': ['doc_

cohere.StreamingChat {
	response: <Response [200]>
	texts: ['Data warehousing involves the process of collecting, storing and managing data from various sources in order to aid decision-making processes. \n\nData warehouses are different from traditional databases because they are optimised for analytical queries rather than transactional operations. They enable businesses to analyse historical data trends.']
	response_id: b6b0ca05-72eb-4e83-acca-c8a4f62d09d3
	conversation_id: e00efd1c-f468-4ed5-8b57-79b71f78d86e
	generation_id: 0026e4e4-3abf-416c-b785-4bd008ea3ea7
	preamble: None
	prompt: None
	chat_history: [{'role': 'USER', 'message': 'What is data warehousing and how is it different from traditional database?'}, {'role': 'CHATBOT', 'message': 'Data warehousing involves the process of collecting, storing and managing data from various sources in order to aid decision-making processes. \n\nData warehouses are different from traditional databases because they are optimised for analyti