In [7]:
import logging
import os
import uuid
from typing import Any, Dict, List, Tuple

import cohere
import hnswlib
from cohere import ChatConnector
from dotenv import find_dotenv, load_dotenv
from rich import print as rprint
from tqdm import tqdm
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.html import partition_html
import pandas as pd

# Configure root logging at INFO and create the notebook's own "RAGgy" logger,
# which the classes below use for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("RAGgy")
# Raise the httpx logger's threshold to WARNING so its INFO records are dropped.
logging.getLogger("httpx").setLevel(logging.WARNING)

# Load variables from the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())

%load_ext rich


# Shared Cohere client used by every cell below. No API key is passed here;
# presumably it is picked up from the environment loaded above — TODO confirm.
co = cohere.Client()

The rich extension is already loaded. To reload it, use:
  %reload_ext rich


## Creating the vectorstore

In [2]:
# Pages to ingest into the vector store. Each entry is a dict with a
# human-readable "title" and the "url" the HTML is fetched from.
raw_documents = [
    dict(title=page_title, url=page_url)
    for page_title, page_url in (
        ("Text Embeddings", "https://docs.cohere.com/docs/text-embeddings"),
        (
            "Similarity Between Words and Sentences",
            "https://docs.cohere.com/docs/similarity-between-words-and-sentences",
        ),
        (
            "The Attention Mechanism",
            "https://docs.cohere.com/docs/the-attention-mechanism",
        ),
        (
            "Transformer Models",
            "https://docs.cohere.com/docs/transformer-models",
        ),
    )
]

In [46]:
class Vectorstore:
    """In-memory vector store over a list of web pages.

    Constructing an instance runs the whole ingestion pipeline eagerly:
    every page is fetched and split into chunks, the chunks are embedded
    through the module-level ``co`` client, and the embeddings are added to
    an hnswlib index. ``retrieve`` then answers a query with dense retrieval
    followed by reranking.

    Attributes:
        raw_documents: The input dicts, each with "title" and "url" keys.
        docs: One dict per chunk with "title", "url" and "text" keys.
        embeddings: One embedding per entry of ``docs``, in the same order.
        retrieve_top_k: Number of nearest neighbours requested from the index.
        rerank_top_k: Number of documents kept after reranking.
    """

    # Model identifiers and index dimension, kept in one place so the
    # document side and the query side cannot drift apart.
    EMBED_MODEL = "embed-english-v3.0"
    RERANK_MODEL = "rerank-english-v3.0"
    # Dimension given to the hnswlib index; it has to match the length of the
    # vectors returned by EMBED_MODEL.
    EMBEDDING_DIM = 1024

    def __init__(self, raw_documents):
        """Ingest ``raw_documents`` (dicts with "title" and "url") and build the index."""
        self.raw_documents = raw_documents

        self.docs = []
        self.embeddings = []

        self.retrieve_top_k = 10
        self.rerank_top_k = 3
        self.load_and_chunk()
        self.embed()
        self.index()

    def load_and_chunk(self) -> None:
        """Fetch every page in ``self.raw_documents`` and append its chunks to ``self.docs``."""
        logger.info("Loading documents to vectorstore...")

        # Use the documents given to this instance, not whatever global
        # happens to be called ``raw_documents`` in the notebook.
        total = len(self.raw_documents)

        for i, raw_document in enumerate(self.raw_documents):
            elements = partition_html(url=raw_document["url"])

            logger.info(f"Chunking document #{i+1}/{total}...")
            chunks = chunk_by_title(elements)

            self.docs.extend(
                {
                    "title": raw_document["title"],
                    "url": raw_document["url"],
                    "text": str(chunk),
                }
                for chunk in chunks
            )

    def embed(self) -> None:
        """Embed the text of every chunk in ``self.docs``, in batches, into ``self.embeddings``."""
        logger.info("Embedding document chunks...")

        batch_size = 64

        # Also read by index() to size the hnswlib index.
        self.num_docs = len(self.docs)

        for start in tqdm(range(0, self.num_docs, batch_size)):
            # Slicing past the end is safe, so no explicit min() is needed.
            batch = self.docs[start : start + batch_size]
            texts = [doc["text"] for doc in batch]

            emb_batch = co.embed(
                texts=texts,
                model=self.EMBED_MODEL,
                input_type="search_document",
            ).embeddings

            self.embeddings.extend(emb_batch)

    def index(self) -> None:
        """Build the inner-product hnswlib index; item ids are positions in ``self.docs``."""
        logger.info("Indexing document chunks...")

        self.idx = hnswlib.Index(space="ip", dim=self.EMBEDDING_DIM)
        self.idx.init_index(max_elements=self.num_docs, ef_construction=512, M=64)
        self.idx.add_items(self.embeddings, list(range(len(self.embeddings))))

        logger.info(
            f"Indexing complete! {self.idx.get_current_count()} documents indexed."
        )

    def retrieve(self, query: str) -> List[Dict[str, str]]:
        """Return the best chunks for ``query``.

        The query is embedded, up to ``retrieve_top_k`` nearest chunks are
        pulled from the index, and those candidates are reranked down to at
        most ``rerank_top_k`` results. The raw rerank response is kept on
        ``self.rerank_results``.

        Returns:
            A list of dicts with "title", "url" and "text" keys, in rerank order.
        """
        # Never ask the index for more neighbours than it holds.
        k = min(self.retrieve_top_k, self.idx.get_current_count())
        if k == 0:
            return []

        # Dense retrieval
        query_embedding = co.embed(
            texts=[query],
            model=self.EMBED_MODEL,
            input_type="search_query",
        ).embeddings

        # knn_query returns (labels, distances); [0][0] is the label row of
        # the single query.
        knn_doc_ids = self.idx.knn_query(query_embedding, k=k)[0][0]

        candidates = [self.docs[doc_id] for doc_id in knn_doc_ids]

        self.rerank_results = co.rerank(
            query=query,
            documents=candidates,
            top_n=self.rerank_top_k,
            model=self.RERANK_MODEL,
            rank_fields=["title", "text"],
        )

        # result.index is a position within ``candidates``, not within self.docs.
        reranked_docs = [
            candidates[result.index] for result in self.rerank_results.results
        ]

        # Hand back fresh dicts so callers cannot mutate the stored chunks.
        return [
            {
                "title": doc["title"],
                "url": doc["url"],
                "text": doc["text"],
            }
            for doc in reranked_docs
        ]

In [47]:
# Build the store: the constructor loads, chunks, embeds and indexes the pages.
v = Vectorstore(raw_documents)

INFO:RAGgy:Loading documents to vectorstore...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:RAGgy:Chunking document #1/4...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:RAGgy:Chunking document #2/4...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:RAGgy:Chunking document #3/4...
INFO:unstructured:Reading document from string ...
INFO:unstructured:Reading document ...
INFO:RAGgy:Chunking document #4/4...
INFO:RAGgy:Embedding document chunks...
100%|██████████| 3/3 [00:01<00:00,  1.68it/s]
INFO:RAGgy:Indexing document chunks...
INFO:RAGgy:Indexing complete! 134 documents indexed.


In [51]:
# Sanity-check retrieval with a sample query; returns the reranked chunks.
ans = v.retrieve("transformer circuits")

INFO:httpx:HTTP Request: POST https://api.cohere.ai/v1/rerank "HTTP/1.1 200 OK"


In [52]:
# Pretty-print the retrieved chunks with rich.
rprint(ans)

## Chatbot

In [58]:
class Chatbot:
    """Interactive RAG chat loop grounded on a local ``Vectorstore``.

    For every user message the model is first asked whether any search
    queries are needed. If so, each query is run against the vector store and
    the retrieved chunks are passed to the chat call as documents; otherwise
    the message is answered directly. All turns share one conversation id.

    Relies on the module-level ``co`` client and ``logger``.
    """

    def __init__(self, vectorstore: "Vectorstore"):
        """Store the vector store and start a new conversation id."""
        self.vectorstore = vectorstore
        self.conversation_id = str(uuid.uuid4())

    def run(self):
        """Read messages from stdin and stream answers until the user types "quit"."""
        while True:
            message = input("User: ")

            if message.lower() == "quit":
                print("Goodbye!")
                break

            print(f"User: {message}")

            # First pass: only ask which search queries (if any) are needed.
            response = co.chat(message=message, search_queries_only=True)

            if response.search_queries:
                logger.info("Retrieving documents...")

                documents = []
                for query in response.search_queries:
                    documents.extend(self.vectorstore.retrieve(query.text))

                # Send the user's actual message, not the literal string
                # "message", together with the retrieved documents.
                response = co.chat_stream(
                    message=message,
                    model="command-r",
                    documents=documents,
                    conversation_id=self.conversation_id,
                )

            else:
                response = co.chat_stream(
                    message=message,
                    model="command-r",
                    conversation_id=self.conversation_id,
                )

            print("RAGgy:")

            citations = []
            cited_documents = []

            for event in response:
                if event.event_type == "text-generation":
                    print(event.text, end="")

                elif event.event_type == "citation-generation":
                    citations.extend(event.citations)

                elif event.event_type == "stream-end":
                    # The final response carries no documents when nothing
                    # was retrieved; treat that as an empty list.
                    cited_documents.extend(event.response.documents or [])

            if citations:
                print("\n\nCitations:")
                for citation in citations:
                    print(citation)

            if cited_documents:
                print("\n\nCited Documents:")
                for document in cited_documents:
                    print(document)

            print(f'\n{"-"*100}\n')

In [63]:
# Start the interactive loop on the vector store built above; type "quit" to exit.
Chatbot(v).run()

User: explain the concept of transformers


INFO:RAGgy:Retrieving documents...


RAGgy:
Transformer models are a recent development in machine learning; they can be used for various tasks like generating text, answering questions, and translating between languages. The architecture of these models is simple and isn't as complex as one might think. Each transformer block present in the model architecture is composed of a neural network, an attention component, and a feedforward component.

Citations:
start=25 end=63 text='recent development in machine learning' document_ids=['doc_2']
start=77 end=99 text='used for various tasks' document_ids=['doc_2']
start=105 end=177 text='generating text, answering questions, and translating between languages.' document_ids=['doc_2']
start=182 end=194 text='architecture' document_ids=['doc_2']
start=204 end=220 text='models is simple' document_ids=['doc_2']
start=225 end=241 text="isn't as complex" document_ids=['doc_2']
start=262 end=284 text='Each transformer block' document_ids=['doc_0']
start=322 end=350 text='composed of a n

## Connectors

In [74]:
class Chatbot:
    """Interactive chat loop that grounds answers through Cohere connectors.

    Retrieval is delegated to the connectors passed on every chat call, so no
    local vector store is involved. All turns share one conversation id.

    Relies on the module-level ``co`` client.
    """

    def __init__(self, connectors: List[str]):
        """Start a new conversation and wrap each connector id in a ``ChatConnector``."""
        self.conversation_id = str(uuid.uuid4())
        self.connectors = [ChatConnector(id=connector) for connector in connectors]

    def run(self):
        """Read messages from stdin and stream answers until the user types "quit"."""
        while True:
            message = input("User: ")

            if message.lower() == "quit":
                print("Goodbye!")
                break

            print(f"User: {message}")

            response = co.chat_stream(
                message=message,
                model="command-r-plus",
                conversation_id=self.conversation_id,
                connectors=self.connectors,
            )

            print("RAGgy:")

            citations = []
            cited_documents = []

            for event in response:
                if event.event_type == "text-generation":
                    print(event.text, end="")

                elif event.event_type == "citation-generation":
                    citations.extend(event.citations)

                elif event.event_type == "stream-end":
                    # The final response has no documents when the model
                    # answered without searching; treat that as an empty list.
                    cited_documents.extend(event.response.documents or [])

            if citations:
                print("\n\nCitations:")
                for citation in citations:
                    print(citation)

            if cited_documents:
                print("\n\nCited Documents:")
                for document in cited_documents:
                    print(document)

            print(f'\n{"-"*100}\n')


In [75]:
from cohere import ChatConnector

# Quick check of the built-in "web-search" connector.
# NOTE(review): the returned stream is not iterated in this cell, so nothing
# is printed here.
response = co.chat_stream(
    message="What is Pokemon?", connectors=[ChatConnector(id="web-search")]
)


In [76]:
# Run the connector-based chatbot with web search only; type "quit" to exit.
connectors = ['web-search']

chatbot = Chatbot(connectors)

chatbot.run()

User: what is 3blue1brown
RAGgy:
3Blue1Brown is a YouTube channel about discovery and creativity in math, with an emphasis on visualizations. The channel was created and is run by Grant Sanderson, who graduated from Stanford University in 2015 with a bachelor's degree in mathematics. The channel covers topics such as linear algebra, neural networks, calculus, Fourier transforms, quantum mechanics, and more.

Citations:
start=17 end=32 text='YouTube channel' document_ids=['web-search_0', 'web-search_2']
start=39 end=71 text='discovery and creativity in math' document_ids=['web-search_0', 'web-search_2']
start=81 end=108 text='emphasis on visualizations.' document_ids=['web-search_0', 'web-search_2']
start=125 end=162 text='created and is run by Grant Sanderson' document_ids=['web-search_0']
start=183 end=202 text='Stanford University' document_ids=['web-search_0', 'web-search_2', 'web-search_5']
start=206 end=210 text='2015' document_ids=['web-search_0']
start=218 end=251 text="bachelor

## Quickstart connector

In [2]:
# Registration of the Hacker News quickstart connector, left commented out.
# NOTE(review): presumably this was run once and the connector id used below
# ("hnews-pz7r8e") came from that call — confirm before re-running.
# co.connectors.create(
#     name="hnews",
#     url="https://hnews.onrender.com/search",
#     service_auth={"type": "bearer", "token": os.getenv("HACKERNEWS_CONNECTOR_API_KEY")},
# )

In [3]:
class Chatbot:
    """Connector-grounded chat loop that also keeps a history of sources.

    Each turn streams an answer using the configured connectors, then appends
    that turn's citations and cited documents to ``citation_history`` and
    ``cited_document_history`` so they can be inspected after the loop ends.

    Relies on the module-level ``co`` client.
    """

    def __init__(self, connectors: List[str]):
        """Start a new conversation, wrap the connector ids and reset the histories."""
        self.conversation_id = str(uuid.uuid4())
        self.connectors = [ChatConnector(id=connector) for connector in connectors]
        # Accumulated across every turn of run().
        self.citation_history = []
        self.cited_document_history = []

    def run(self):
        """Read messages from stdin and stream answers until the user types "quit"."""
        while True:
            message = input("User: ")

            if message.lower() == "quit":
                print("Goodbye!")
                break

            print(f"User: {message}")

            response = co.chat_stream(
                message=message,
                model="command-r-plus",
                conversation_id=self.conversation_id,
                connectors=self.connectors,
            )

            print("RAGgy:")

            citations = []
            cited_documents = []

            for event in response:
                if event.event_type == "text-generation":
                    print(event.text, end="")

                elif event.event_type == "citation-generation":
                    citations.extend(event.citations)

                elif event.event_type == "stream-end":
                    # The final response has no documents when the model
                    # answered without searching; treat that as an empty list.
                    cited_documents.extend(event.response.documents or [])

            self.citation_history.extend(citations)
            self.cited_document_history.extend(cited_documents)

            if citations:
                print("\n\nCitations:")
                for citation in citations:
                    print(citation)

            if cited_documents:
                print("\n\nCited Documents:")
                for document in cited_documents:
                    print(document)

            print(f'\n{"-"*100}\n')


In [10]:
# Run the history-keeping chatbot with two connectors: a custom one and web search.
# NOTE(review): "hnews-pz7r8e" looks like an account-specific connector id —
# replace it with your own before running.
connectors = ["hnews-pz7r8e", "web-search"]

chatbot = Chatbot(connectors)

chatbot.run()



User: what happened with apple stock today?
RAGgy:
Sorry, I cannot answer this question as I do not have access to today's date.
----------------------------------------------------------------------------------------------------

User: apple stock news
RAGgy:
Here is a list of news items related to Apple stock:
- Apple stock has received a consensus rating of "buy".
- Apple's stock price could fluctuate by 1% and the news would report a $27 billion change in market cap.
- Apple's stock price fell after it announced it would give $750 million to workers in its factories.
- Apple's stock price fell after it announced a lower-priced iPad.
- Apple's stock price fell after it warned on Q1 results.
- Apple's stock price fell after reports that iPhone sales in China had plummeted by 19% in Q1.
- Apple's stock price fell after reports that it would be cutting iPhone X production targets.
- Apple's stock price rose after Microsoft announced it was investing $150 million in the company.
- Apple

In [12]:
# Tabulate every document cited during the session, sorted by id descending.
# The history accumulates across turns, so the same id can appear more than once.
cited_docs = pd.DataFrame(chatbot.cited_document_history).sort_values(by='id', ascending=False)
cited_docs

Unnamed: 0,id,snippet,timestamp,title,url,author,comment_text,created_at,created_at_i,objectID,parent_id,story_id,story_title,story_url,text,updated_at,num_comments,points
0,web-search_5,"Apple Inc. Stock , AAPL\n\nPlus500. 81% of ret...",2024-04-23T18:13:45,"Apple Stock Price | AAPL Stock Quote, News, an...",https://markets.businessinsider.com/stocks/aap...,,,,,,,,,,,,,
15,web-search_4,Accessibility Log In Help Join The Motley Fool...,2024-04-22T21:13:53,4 Reasons to Buy Apple Stock Like There's No T...,https://www.fool.com/investing/2024/01/31/4-re...,,,,,,,,,,,,,
11,web-search_3,IBD Digital: 2 Months for $20\n\nPsychological...,2024-04-21T14:58:08,Apple Stock A Buy Right Now? AAPL Stock Chart ...,https://www.investors.com/research/apple-stock...,,,,,,,,,,,,,
17,web-search_3,Skip to main content\n\nData is currently not ...,2024-04-22T21:13:53,4 Things You Need to Know Before You Rush to B...,https://www.nasdaq.com/articles/4-things-you-n...,,,,,,,,,,,,,
14,web-search_2,Accessibility Log In Help Join The Motley Fool...,2024-04-24T18:27:41,2 Reasons to Buy Apple Stock Like There's No T...,https://www.fool.com/investing/2024/03/10/2-re...,,,,,,,,,,,,,
10,web-search_1,Skip to Main Content\n\nSkip to Related Conten...,2024-04-14T16:16:58,"Apple Inc. (AAPL) Stock Price, News, Quote & H...",https://finance.yahoo.com/quote/AAPL/,,,,,,,,,,,,,
13,web-search_1,Skip to main content\n\n2 Reasons to Buy Apple...,2024-04-17T15:33:10,2 Reasons to Buy Apple Stock for a 21% Potenti...,https://www.thestreet.com/apple/stock/2-reason...,,,,,,,,,,,,,
16,web-search_0,IBD Digital: 2 Months for $20\n\nPsychological...,2024-04-21T14:58:08,Apple Stock A Buy Right Now? AAPL Stock Chart ...,https://www.investors.com/research/apple-stock...,,,,,,,,,,,,,
2,hnews-pz7r8e_8,,,,,tgv,"In unrelated news, Apple Stock went down a sha...",2021-08-27T11:31:33Z,1630063893.0,28326446.0,28326355.0,28325856.0,Apple chief executive Tim Cook gets $750m payout,https://www.bbc.com/news/business-58352098,"In unrelated news, Apple Stock went down a sha...",2023-09-07T09:34:08Z,,
6,hnews-pz7r8e_25,,,,,nl,MS buying Apple stock involved a pretty compli...,2023-05-02T23:53:07Z,1683071587.0,35795888.0,35795433.0,35789963.0,Fakespot Is Acquired by Mozilla,https://www.fakespot.com/post/fakespot-acquire...,MS buying Apple stock involved a pretty compli...,2023-09-07T13:12:41Z,,
