#### Setup

In [1]:
# Install the packages listed in requirements.txt into the kernel's environment (-q keeps pip quiet)
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Load settings from the local .env file; override=True lets .env values replace variables already set.
load_dotenv(override=True)

# Settings this notebook reads from the environment
cohere_api_key = os.environ["COHERE_API_KEY"]
search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Use key-based auth when an API key is configured; otherwise fall back to DefaultAzureCredential.
_search_api_key = os.getenv("AZURE_AI_SEARCH_API_KEY", "")
if _search_api_key:
    credential = AzureKeyCredential(_search_api_key)
else:
    credential = DefaultAzureCredential()

index_name = os.getenv("AZURE_SEARCH_INDEX", "recommendationidx")
search_service_name = os.environ["AZURE_AI_SEARCH_SERVICE_NAME"]

#### Analyze and Clean data

In [3]:
import glob

# Collect every markdown file in the dataset directory
md_files = glob.glob('data/*.md')

# Preview each file: print its path, then its first 500 characters
for md_path in md_files:
    with open(md_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    preview = text[:500]
    print(f"Content of {md_path}:")
    print(preview)

Content of data/product_info_2.md:
# Information about product item_number: 2
Adventurer Pro Backpack, price $90,

## Brand
HikeMate

## Category
Backpacks

## Features
- 40L capacity for ample storage space
- Ergonomic design for comfortable carrying
- Durable nylon material for long-lasting performance
- Multiple compartments and pockets for organized storage
- Hydration system compatibility with a dedicated hydration bladder sleeve and tube port
- Adjustable and padded shoulder straps for a customized fit and enhanced comfort

Content of data/product_info_10.md:
# Information about product item_number: 10
TrailBlaze Hiking Pants, price $75,

## Brand
MountainStyle

## Category
Hiking Clothing

## Features
- **Material**: Made of high-quality nylon fabric
- **Color**: Khaki
- **Size Options**: Available in M, L, and XL sizes
- **Weight**: Lightweight design, weighing approximately 1lb
- **Quick-Drying**: Designed to dry quickly, allowing for enhanced comfort during outdoor activities

#### AI Search as Vector Store

In [4]:
import cohere
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)
from azure.core.credentials import AzureKeyCredential

def create_or_update_index(client, index_name):
    """Define the search index schema and create or update it through ``client``.

    The schema has three fields: a string key ``id``, a searchable string
    ``content``, and an ``embedding`` vector field of 1024 dimensions typed as
    ``Collection(Edm.SByte)`` and bound to an HNSW vector search profile.

    Args:
        client: Object exposing ``create_or_update_index(index=...)``.
        index_name: Name given to the index definition.
    """
    profile_name = "my-vector-config"
    algorithm_name = "my-hnsw"

    id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
    content_field = SearchField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    )
    embedding_field = SearchField(
        name="embedding",
        type="Collection(Edm.SByte)",  # OData syntax for 8-bit signed integer
        vector_search_dimensions=1024,
        vector_search_profile_name=profile_name,
        # hidden=False, Use hidden=False if you want to return the embeddings in the search results
    )

    # The profile points at the HNSW algorithm configuration by name.
    vector_profile = VectorSearchProfile(
        name=profile_name,
        algorithm_configuration_name=algorithm_name,
    )
    hnsw_config = HnswAlgorithmConfiguration(
        name=algorithm_name,
        kind=VectorSearchAlgorithmKind.HNSW,
    )

    index_definition = SearchIndex(
        name=index_name,
        fields=[id_field, content_field, embedding_field],
        vector_search=VectorSearch(profiles=[vector_profile], algorithms=[hnsw_config]),
    )
    client.create_or_update_index(index=index_definition)

In [5]:
# Initialize the Azure AI Search index client. SearchIndexClient operates on the
# service as a whole, so the index name is supplied per operation, not here.
search_index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=credential,
)

# Create or update the search index to include the embedding field
create_or_update_index(search_index_client, index_name)

#### Embed data using Cohere embed V3
- Use the embed-english-v3.0 model to embed the data with 1024 dimensions and a context window size of 512

In [6]:
# Load the markdown files and split them into chunks for indexing in AI Search

# Import the loaders from langchain_community: the old `langchain.document_loaders`
# path is deprecated, and the notebook already uses langchain_community elsewhere.
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DirectoryLoader('data/', glob="*.md", loader_cls=TextLoader, loader_kwargs={'autodetect_encoding': True})
docs = loader.load()
# Split into chunks of at most 500 characters with a 20-character overlap
documents = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20).split_documents(docs)
print(f"Loaded {len(documents)} documents")

Loaded 699 documents


In [7]:
# Keep only the raw text of every chunk; this is what gets embedded and indexed
document_texts = [chunk.page_content for chunk in documents]

In [8]:
def generate_embeddings(texts, input_type="search_document"):
    """Embed one or more texts with Cohere ``embed-english-v3.0`` as int8 vectors.

    Relies on the module-level Cohere client ``co``, which must be defined
    before this function is called.

    Args:
        texts: A single string, or an iterable of strings, to embed.
        input_type: Forwarded as the embed call's ``input_type``;
            defaults to ``"search_document"``.

    Returns:
        A list with one int8 embedding per input text. An empty input yields
        an empty list without calling the API.
    """
    model = "embed-english-v3.0"
    # A bare string is treated as a single text; any other iterable is materialised.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to embed: skip the request instead of sending an empty payload.
    if not texts:
        return []

    response = co.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["int8"],
    )
    return list(response.embeddings.int8)


In [None]:
import time

# Initialize the Cohere client with the key loaded in the setup cell, rather than
# relying on the SDK's implicit environment-variable lookup.
co = cohere.ClientV2(api_key=cohere_api_key)

# Each request embeds up to `batch_size` texts, and the loop pauses between requests
# to stay under the free-tier Embed API rate limit.
batch_size = 80
pause_seconds = 60
embeddings = []
for i in range(0, len(document_texts), batch_size):
    batch = document_texts[i:i + batch_size]
    embeddings.extend(generate_embeddings(batch))
    if i + batch_size < len(document_texts):
        time.sleep(pause_seconds)  # no pause is needed after the final batch

print(f"{len(embeddings)} Document embeddings generated")

#### Upload documents to Vector Store

In [14]:
def index_documents(search_client, documents, embeddings, batch_size=1000):
    """Upload texts and their embeddings to the search index in batches.

    Each uploaded record has the shape
    ``{"id": <position as str>, "content": <text>, "embedding": <vector>}``.

    Args:
        search_client: Object exposing ``upload_documents(documents=...)``.
        documents: Texts to index.
        embeddings: One embedding per text, in the same order as ``documents``.
        batch_size: Maximum number of records sent per upload call. The default
            of 1000 is meant to match the service's per-batch document limit
            (TODO confirm against the current Azure AI Search limits).

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length, or
            ``batch_size`` is smaller than 1.
        RuntimeError: If the service reports any record as not indexed.
    """
    documents = list(documents)
    embeddings = list(embeddings)
    # zip() would silently drop the surplus items, so reject mismatched inputs up front.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"documents and embeddings must have the same length "
            f"(got {len(documents)} and {len(embeddings)})"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    documents_to_index = [
        {"id": str(idx), "content": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]

    failed_keys = []
    for start in range(0, len(documents_to_index), batch_size):
        batch = documents_to_index[start:start + batch_size]
        results = search_client.upload_documents(documents=batch)
        # Individual records can be rejected even when the request itself succeeds.
        failed_keys.extend(
            getattr(result, "key", None)
            for result in (results or [])
            if not getattr(result, "succeeded", True)
        )

    if failed_keys:
        raise RuntimeError(f"Failed to index {len(failed_keys)} document(s): {failed_keys}")

In [15]:
# Build a SearchClient bound to the target index
search_client = SearchClient(
    endpoint=search_service_endpoint,
    credential=credential,
    index_name=index_name,
)

# Upload the chunk texts together with their embeddings
index_documents(search_client, document_texts, embeddings)

#### Query

In [24]:
# Build a LangChain retriever over the Azure AI Search index (top_k=5 results per query)

from langchain_community.retrievers import AzureAISearchRetriever

ai_search_retriever = AzureAISearchRetriever(
    index_name=index_name,
    content_key="content",
    top_k=5,
)

In [26]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt text with {context} and {question} placeholders
RECOMMENDER_TEMPLATE = """You are a ecommerce system that help users to find products that match their preferences. 
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}"""

prompt = ChatPromptTemplate.from_template(RECOMMENDER_TEMPLATE)

In [27]:
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [33]:
from langchain_cohere import ChatCohere
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Cohere chat model used to generate the answer
llm = ChatCohere(model="command-r-plus-08-2024")

# Chain inputs: retrieved documents formatted as context, plus the question passed through unchanged
rag_inputs = {
    "context": ai_search_retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: inputs -> prompt -> LLM -> plain string
chain = rag_inputs | prompt | llm | StrOutputParser()

#### Simple Product and Customer Queries

In [34]:
# Ask the chain a single-product price question
chain.invoke("What is the price for TrailMaster X4 Tent?")

'The price of the TrailMaster X4 Tent is $250.'

In [36]:
# Ask the chain about one customer's purchases
chain.invoke("What are some of the purchases made by Jane Doe?")

'Jane Doe has made the following purchases:\n\n- Adventurer Pro Backpack: Quantity 1, priced at $90, on January 10, 2023.\n- TrekReady Hiking Boots: Quantity 1, priced at $140, on January 20, 2023.\n\nThese purchases are part of her recent order history.'

In [37]:
# Ask the chain to list a product's features
chain.invoke("What are the features of the TrailMaster X4 Tent?")

'The TrailMaster X4 Tent is a spacious and durable tent designed for outdoor enthusiasts. Here are some of its key features:\n\n- **Capacity:** It can comfortably sleep up to 4 people, providing ample space for a family or a group of friends. The tent also has room for gear, ensuring a comfortable camping experience.\n- **Weight and Portability:** Weighing only 12 lbs, this tent is lightweight and easy to carry during hikes or outdoor adventures. The included carry bag makes it convenient to transport and store.\n- **Weather Resistance:** While it is primarily a 3-season tent, it comes with a rainfly to protect against rain and harsh weather conditions.\n- **Convenience:** It has a convenient storage pocket inside, allowing campers to keep their essentials organized and easily accessible.\n- **Easy Setup:** The tent is designed for quick and straightforward setup, making it user-friendly for campers.\n- **Warranty:** OutdoorLiving provides a 2-year limited warranty, ensuring peace of m

In [38]:
# Ask the chain about purchases grouped by customer location
chain.invoke("What are some products bought by people in Suburbia?")

'The people in Suburbia seem to be interested in outdoor gear and camping equipment. Here are some products they have purchased:\n\n- SkyView 2-Person Tent\n- TrekReady Hiking Boots\n- TrekMaster Camping Chair\n- CompactCook Camping Stove\n\nThese products cater to various outdoor activities like camping, hiking, and cooking outdoors. The reviews indicate that these items generally meet or exceed expectations, with some minor criticisms noted for specific aspects of the products.'

#### Chat History

Sample questions:
- Can you suggest products based on my previous purchases and browsing history?
- How can I find similar products to the ones I've recently viewed, but with specific modifications or upgrades?
- Given my past reviews and ratings, what are some products that might interest me and that I haven't considered before?
- Is it possible to get personalized recommendations that take into account not only my preferences but also those of my family members or friends, for a more holistic shopping experience?
- In what ways can I refine my search results to ensure I'm only seeing products that are currently in stock and available for immediate purchase?
- Can you provide a feature that allows me to compare products side by side, highlighting their key differences and similarities, to help me make an informed decision?
- Given my past purchases and the current season, can you suggest some relevant products that might enhance my wardrobe or home decor?
- How can I receive personalized notifications or alerts when new products are launched or when there are special offers or discounts on items that match my preferences?
- Can the system learn from my feedback and adjust its recommendations accordingly, ensuring a more dynamic and responsive shopping experience?
- With an emphasis on data security, how does the system ensure that my personal information and browsing habits are kept private and secure, especially when making recommendations?

Things to explore
- Plan and Solve
- Reranking 
- Chat history