#### Setup

In [1]:
%pip install -r requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# take environment variables from .env.
load_dotenv(override=True) 

search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.getenv("AZURE_AI_SEARCH_API_KEY", "")) if len(os.getenv("AZURE_AI_SEARCH_API_KEY", "")) > 0 else DefaultAzureCredential()
index_name = os.getenv("AZURE_SEARCH_INDEX", "recommendationidx")

#### Embed data using Cohere embed V3 without Vector DB
- Use embed-english-v3.0 model to embed the data with 1024 Dimentions and 512 Context window size

In [3]:
import cohere

# Initialize Cohere client
co = cohere.ClientV2()

In [4]:
# Chunk and load documents into AI search

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DirectoryLoader('data/', glob="*.md", loader_cls=TextLoader, loader_kwargs={'autodetect_encoding': True})
docs = loader.load()
documents = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(docs)
print(f"Loaded {len(documents)} documents")

Loaded 704 documents


In [5]:
# Extract page_content from each Document object
document_texts = [doc.page_content for doc in documents]

In [6]:
%store -r embeddings
print (len(embeddings), "Document embeddings generated")

704 Document embeddings generated


In [None]:
def generate_embeddings(texts, input_type="search_document"):
    model = "embed-english-v3.0"
    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    response = co.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["int8"],
    )
    return [embedding for embedding in response.embeddings.int8]


In [None]:
import time

# Limit to 80 calls per minute as Free Embed API has a limitation of 100 calls per min
batch_size = 80
embeddings = []
for i in range(0, len(document_texts), batch_size):
    batch = document_texts[i:i + batch_size]
    embeddings.extend(generate_embeddings(batch))
    if i + batch_size < len(document_texts):
        time.sleep(60)  # Sleep for 60 seconds to respect the rate limit

print (len(embeddings), "Document embeddings generated")

#### RAG with Cohere client and Cohere Command R+ with Citations

In a basic RAG application, the steps involved are:
- Transforming the user message into search queries
- Retrieving relevant documents for a given search query & rerank them for semantic relavance
- Generating the response and citations


In [7]:
from typing import List
import numpy as np
import json

def generate_search_queries(message: str) -> List[str]:
    
    # Define the query generation tool
    query_gen_tool = [
        {
            "type": "function",
            "function": {
                "name": "internet_search",
                "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "queries": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "a list of queries to search the internet with.",
                        }
                    },
                    "required": ["queries"],
                },
            },
        }
    ]


    # Define a preamble to optimize search query generation
    system_message = "Write a search query that will find helpful information for answering the user's question accurately. If you need more than one search query, write a list of search queries. If you decide that a search is very unlikely to find information that would be useful in constructing a response to the user, you should instead directly answer."

    # Generate search queries (if any)
    search_queries = []
    
    res = co.chat(
        # model="command-r-08-2024",
        model="command-r-plus-08-2024",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": message},
        ],
        tools=query_gen_tool
    )
    
    if res.message.tool_calls:
        for tc in res.message.tool_calls:
            queries = json.loads(tc.function.arguments)["queries"]
            search_queries.extend(queries)

    return search_queries


In [8]:
def generate_cohere_embeddings(search_query):
    # Embed the search query
    return co.embed(
        model="embed-english-v3.0",
        input_type="search_query",
        texts=[search_query],
        embedding_types=["int8"]).embeddings.int8

In [9]:
def search_relevant_documents(query_emb, embeddings, document_texts, queries_for_search, n=5):
    retrieved_documents = []
    for idx, emb in enumerate(query_emb):
        scores = np.dot(emb, np.transpose(embeddings))
        max_idx = np.argsort(-scores)[:n]
        retrieved_documents.extend([document_texts[item] for item in max_idx])
        print(f"Search query: {queries_for_search[idx]}")
        for rank, idx in enumerate(max_idx):
            print(f"Rank: {rank+1}")
            print(f"Score: {scores[idx]}")
            print(f"Document: {document_texts[idx]}\n")
    return retrieved_documents

In [None]:
def rerank_documents(queries_for_search, retrieved_documents):
    retrieved_documents_str = [str(doc) for doc in retrieved_documents]
    reranked_documents = []
    for idx, doc in enumerate(queries_for_search):
        results = co.rerank(query=queries_for_search[idx],
                            documents=retrieved_documents_str,
                            top_n=2,
                            model='rerank-english-v3.0')
        # Display the reranking results
        for idx, result in enumerate(results.results):
            print(f"Rank: {idx+1}") 
            print(f"Score: {result.relevance_score}")
            print(f"Document: {retrieved_documents[result.index]}\n")
        reranked_documents.extend([retrieved_documents_str[result.index] for result in results.results])
    return reranked_documents

#### Product specific search

In [None]:
query = "What are the features of the TrailMaster X4 Tent?"
queries_for_search = generate_search_queries(query)
print(queries_for_search)

print("--------------------------")

#### Embed search Query
query_emb = []
for query in queries_for_search:
    print("Search query: ", queries_for_search)
    query_emb.extend(generate_cohere_embeddings(query))

print(len(query_emb), "Search query embeddings generated")

print("--------------------------")

retrieved_documents = search_relevant_documents(query_emb, embeddings, document_texts, queries_for_search, n=5)

print("--------------------------")

reranked_documents = rerank_documents(queries_for_search, retrieved_documents)

['features of the TrailMaster X4 Tent']
--------------------------
Search query:  ['features of the TrailMaster X4 Tent']
1 Search query embeddings generated
--------------------------
Search query: features of the TrailMaster X4 Tent
Rank: 1
Score: 1201942
Document: 4) Are there any additional accessories included with the TrailMaster X4 Tent?
   The TrailMaster X4 Tent includes a rainfly, tent stakes, guy lines, and a carry bag for easy transport.

5) Can the TrailMaster X4 Tent be easily carried during hikes?
   Yes, the TrailMaster X4 Tent weighs just 12lbs, and when packed in its carry bag, it can be comfortably carried during hikes.

Rank: 2
Score: 1178961
Document: 3) **Rating:** 5
   **Review:** The TrailMaster X4 Tent is a fantastic investment for any serious camper. The easy setup and spacious interior make it perfect for extended trips, and the waterproof design kept us dry in heavy rain.

4) **Rating:** 4
   **Review:** I like the TrailMaster X4 Tent, but I wish it came in 

In [13]:
# Generate the response
response = co.chat(model="command-r-plus-08-2024",
                   messages=[{'role': 'user', 'content': query}],
                   documents=reranked_documents)

# Display the response
print(response.message.content[0].text)

# Display the citations and source documents
if response.message.citations:
    print("\nCITATIONS:")
    for citation in response.message.citations:
        print(citation, "\n")


The TrailMaster X4 Tent is a great tent for serious campers. It is easy to set up and has a spacious interior, making it perfect for extended trips. It is also waterproof, which is great for heavy rain. The tent is also lightweight, weighing only 12lbs, and comes with a carry bag for easy transport. It also comes with a rainfly, tent stakes, and guy lines.

CITATIONS:
start=29 end=60 text='great tent for serious campers.' sources=[DocumentSource(type='document', id='doc:0', document={'content': "3) **Rating:** 5\n   **Review:** The TrailMaster X4 Tent is a fantastic investment for any serious camper. The easy setup and spacious interior make it perfect for extended trips, and the waterproof design kept us dry in heavy rain.\n\n4) **Rating:** 4\n   **Review:** I like the TrailMaster X4 Tent, but I wish it came in more colors. It's comfortable and has many useful features, but the green color just isn't my favorite. Overall, it's a good tent.", 'id': 'doc:0'})] 

start=67 end=81 text='ea

In [16]:
query = "Find hiking shoes under $100?"
queries_for_search = generate_search_queries(query)
print(queries_for_search)

print("--------------------------")

#### Embed search Query
query_emb = []
for query in queries_for_search:
    print("Search query: ", queries_for_search)
    query_emb.extend(generate_cohere_embeddings(query))

print(len(query_emb), "Search query embeddings generated")

print("--------------------------")

retrieved_documents = search_relevant_documents(query_emb, embeddings, document_texts, queries_for_search, n=5)

print("--------------------------")

reranked_documents = rerank_documents(queries_for_search, retrieved_documents)

['hiking shoes under $100']
--------------------------
Search query:  ['hiking shoes under $100']
1 Search query embeddings generated
--------------------------
Search query: hiking shoes under $100
Rank: 1
Score: 1002070
Document: # Information about product item_number: 11
TrailWalker Hiking Shoes, price $110

## Brand
TrekReady

## Category
Hiking Footwear

Rank: 2
Score: 935213
Document: **Review:** The TrailWalker Hiking Shoes are hands down the best hiking shoes I've ever owned. From the moment I put them on, they felt like a perfect fit. The traction is outstanding, allowing me to confidently navigate challenging trails without slipping. The shoes provide excellent ankle support, which is crucial on uneven terrain. They are also durable and show no signs of wear, even after multiple hikes. I can't recommend these shoes enough for avid hikers. They are worth every penny.

Rank: 3
Score: 934393
Document: # Information about product item_number: 4
TrekReady Hiking Boots, price $140

In [17]:
# Generate the response
response = co.chat(model="command-r-plus-08-2024",
                   messages=[{'role': 'user', 'content': query}],
                   documents=reranked_documents)

# Display the response
print(response.message.content[0].text)

# Display the citations and source documents
if response.message.citations:
    print("\nCITATIONS:")
    for citation in response.message.citations:
        print(citation, "\n")


I'm sorry, I could not find any hiking shoes under $100. However, I did find the TrailWalker Hiking Shoes by TrekReady, which cost $110. They are described as the best hiking shoes the reviewer has ever owned, with outstanding traction, excellent ankle support, and durability.

CITATIONS:
start=81 end=105 text='TrailWalker Hiking Shoes' sources=[DocumentSource(type='document', id='doc:0', document={'content': '# Information about product item_number: 11\nTrailWalker Hiking Shoes, price $110\n\n## Brand\nTrekReady\n\n## Category\nHiking Footwear', 'id': 'doc:0'}), DocumentSource(type='document', id='doc:1', document={'content': "**Review:** The TrailWalker Hiking Shoes are hands down the best hiking shoes I've ever owned. From the moment I put them on, they felt like a perfect fit. The traction is outstanding, allowing me to confidently navigate challenging trails without slipping. The shoes provide excellent ankle support, which is crucial on uneven terrain. They are also durable and s

#### Personalized search

In [14]:
query = "Suggest best reviewed products from the product catalogue that might be of interest to Jane Doe, based on their recent purchase history?"
queries_for_search = generate_search_queries(query)
print(queries_for_search)

print("--------------------------")

#### Embed search Query
query_emb = []
for query in queries_for_search:
    print("Search query: ", queries_for_search)
    query_emb.extend(generate_cohere_embeddings(query))

print(len(query_emb), "Search query embeddings generated")

print("--------------------------")

retrieved_documents = search_relevant_documents(query_emb, embeddings, document_texts, queries_for_search, n=5)

print("--------------------------")

reranked_documents = rerank_documents(queries_for_search, retrieved_documents)

['Jane Doe recent purchase history']
--------------------------
Search query:  ['Jane Doe recent purchase history']
1 Search query embeddings generated
--------------------------
Search query: Jane Doe recent purchase history
Rank: 1
Score: 539487
Document: ## Customer_Info

First Name: Jane 
Last Name: Doe 
Age: 28 
Email Address: janedoe@example.com 
Phone Number: 555-987-6543 
Shipping Address: 456 Oak St, Another City USA, 67890 
Membership: Gold 

## Recent_Purchases

order_number: 6 
date: 2023-01-10 
item:
- description:  Adventurer Pro Backpack, quantity 1, price $90 
  item_number: 2 
 
order_number: 15 
date: 2023-01-20 
item:
- description:  TrekReady Hiking Boots, quantity 1, price $140 
  item_number: 4

Rank: 2
Score: 397703
Document: ## Customer_Info

First Name: John 
Last Name: Smith 
Age: 35 
Email Address: johnsmith@example.com 
Phone Number: 555-123-4567 
Shipping Address: 123 Main St,  Anytown USA, 12345 
Membership: None 

## Recent_Purchases

order_number: 1 
dat

In [15]:
# Generate the response
response = co.chat(model="command-r-plus-08-2024",
                   messages=[{'role': 'user', 'content': query}],
                   documents=reranked_documents)

# Display the response
print(response.message.content[0].text)

# Display the citations and source documents
if response.message.citations:
    print("\nCITATIONS:")
    for citation in response.message.citations:
        print(citation, "\n")


Jane Doe's recent purchase history is as follows:

- **Order Number 6**: Adventurer Pro Backpack, quantity 1, price $90, purchased on 2023-01-10.
- **Order Number 15**: TrekReady Hiking Boots, quantity 1, price $140, purchased on 2023-01-20.

CITATIONS:
start=55 end=69 text='Order Number 6' sources=[DocumentSource(type='document', id='doc:0', document={'content': '## Customer_Info\n\nFirst Name: Jane \nLast Name: Doe \nAge: 28 \nEmail Address: janedoe@example.com \nPhone Number: 555-987-6543 \nShipping Address: 456 Oak St, Another City USA, 67890 \nMembership: Gold \n\n## Recent_Purchases\n\norder_number: 6 \ndate: 2023-01-10 \nitem:\n- description:  Adventurer Pro Backpack, quantity 1, price $90 \n\xa0 item_number: 2 \n \norder_number: 15 \ndate: 2023-01-20 \nitem:\n- description:  TrekReady Hiking Boots, quantity 1, price $140 \n\xa0 item_number: 4', 'id': 'doc:0'})] 

start=73 end=96 text='Adventurer Pro Backpack' sources=[DocumentSource(type='document', id='doc:0', document={'conte