#### Setup

In [1]:
# Quietly (-q) install the packages listed in requirements.txt using the %pip magic.
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [23]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Load settings from the .env file; override=True lets .env values replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Required settings: a missing variable raises KeyError right here.
cohere_api_key = os.environ["COHERE_API_KEY"]
search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Use key-based auth when an admin key is configured (non-empty); otherwise
# fall back to DefaultAzureCredential.
_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")
if len(_admin_key) > 0:
    credential = AzureKeyCredential(_admin_key)
else:
    credential = DefaultAzureCredential()
del _admin_key  # temporary only; do not leave an extra copy of the key in the namespace

# Index name is optional and defaults to "recommendationidx".
index_name = os.getenv("AZURE_SEARCH_INDEX", "recommendationidx")
search_service_name = os.environ["AZURE_AI_SEARCH_SERVICE_NAME"]

#### Analyze and Clean data

In [3]:
import pandas as pd
from IPython.display import display

# The purchase history is stored as a markdown table, so parse it as "|"-delimited text:
#   - header=0       -> column names come from the first row
#   - index_col=1    -> the second column becomes the index
#   - dropna(axis=1) -> removes the left-most and right-most all-NaN columns
#   - iloc[1:]       -> skips the header underline row
# Only the first rows are displayed.
display(
    pd.read_table(
        'dataset/customer-purchase-history.md',
        sep="|",
        header=0,
        index_col=1,
        skipinitialspace=True,
    )
    .dropna(axis=1, how='all')
    .iloc[1:]
    .head()
)

Unnamed: 0_level_0,Age,Gender,Location,Purchase History,Browsing Behavior
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C001,32,Male,NYC,"Sports Gear, Running Shoes, Camping Equipment","Sports, Outdoor Activities"
C002,28,Female,SF,"Yoga Mats, Activewear, Hiking Backpack","Fitness, Adventure Travel"
C003,40,Male,LA,"Golf Clubs, Polo Shirts, Sun Hats","Golfing, Summer Fashion"
C004,24,Female,Chicago,"Skateboards, Streetwear, Headphones","Urban Lifestyle, Music"
C005,36,Male,Miami,"Surfboards, Swimwear, Beach Towels","Water Sports, Beach Essentials"


In [4]:
# Preview the product catalogue: parse the markdown table as "|"-delimited text,
# index by the second column, drop all-NaN columns and skip the first data row
# before showing the head.
display(
    pd.read_table(
        'dataset/product-data.md',
        sep="|",
        header=0,
        index_col=1,
        skipinitialspace=True,
    )
    .dropna(axis=1, how='all')
    .iloc[1:]
    .head()
)

Unnamed: 0_level_0,Category,Sub-Category,Product Name,Description
Product ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P001,Sports,Gear,Running Shoes,"Lightweight, breathable running shoes with exc..."
P002,Fitness,Equipment,Yoga Mats,"Non-slip, eco-friendly yoga mats for comfortab..."
P003,Outdoor,Camping,Camping Tents,"Spacious, weather-resistant tents for outdoor ..."
P004,Sports,Apparel,Sports Bras,High-support sports bras for various activities.
P005,Adventure,Water Sports,Surfboards,High-performance surfboards for surfing enthus...


In [5]:
# Preview the product reviews: parse the markdown table as "|"-delimited text,
# index by the second column, drop all-NaN columns and skip the first data row
# before showing the head.
display(
    pd.read_table(
        'dataset/product-reviews.md',
        sep="|",
        header=0,
        index_col=1,
        skipinitialspace=True,
    )
    .dropna(axis=1, how='all')
    .iloc[1:]
    .head()
)

Unnamed: 0_level_0,Product ID,Customer ID,Rating,Review Text
Review ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R001,P001,C001,5,These running shoes are amazing! They provide ...
R002,P002,C002,4,The yoga mat is great for my practice. It's st...
R003,P003,C003,5,Spacious and durable camping tent. It withstoo...
R004,P004,C004,3,"Sports bras are functional, but I wish they ha..."
R005,P005,C005,5,High-quality surfboard. It's responsive and al...


#### AI Search as Vector Store

In [6]:
import cohere
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
)
from azure.core.credentials import AzureKeyCredential

def create_or_update_index(client, index_name):
    """Create the search index, or update it if it already exists.

    The index is defined with three fields: a string key ``id``, a searchable
    string ``text``, and an ``embedding`` vector field declared as
    ``Collection(Edm.SByte)`` with 1024 dimensions. The vector field is bound
    to a vector-search profile that points at an HNSW algorithm configuration.

    Args:
        client: Object exposing ``create_or_update_index(index=...)``; the
            notebook passes a ``SearchIndexClient``.
        index_name: Name of the index to create or update.
    """
    # The profile and algorithm names tie the vector field to its search configuration.
    profile_name = "my-vector-config"
    algorithm_name = "my-hnsw"

    key_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
    text_field = SearchField(
        name="text",
        type=SearchFieldDataType.String,
        searchable=True,
    )
    embedding_field = SearchField(
        name="embedding",
        type="Collection(Edm.SByte)",  # OData syntax for 8-bit signed integer
        vector_search_dimensions=1024,
        vector_search_profile_name=profile_name,
        # hidden=False, Use hidden=False if you want to return the embeddings in the search results
    )

    profile = VectorSearchProfile(
        name=profile_name,
        algorithm_configuration_name=algorithm_name,
    )
    hnsw = HnswAlgorithmConfiguration(
        name=algorithm_name,
        kind=VectorSearchAlgorithmKind.HNSW,
    )

    client.create_or_update_index(
        index=SearchIndex(
            name=index_name,
            fields=[key_field, text_field, embedding_field],
            vector_search=VectorSearch(profiles=[profile], algorithms=[hnsw]),
        )
    )

In [7]:
# Initialize the Azure AI Search index-management client.
# SearchIndexClient operates on the whole search service, so it is built from the
# endpoint and credential only; the index is named on each operation instead
# (here, inside create_or_update_index).
search_index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=credential,
)

# Create or update the search index to include the embedding field
create_or_update_index(search_index_client, index_name)

#### Embed data using Cohere embed V3
- Use the embed-english-v3.0 model to embed the data with 1024 dimensions and a 512-token context window

In [8]:
# Chunk and load documents into AI search

# The loaders are imported from langchain_community (the package this notebook
# also uses for its retriever); the langchain.document_loaders path is deprecated.
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load every markdown file under dataset/ as text, letting TextLoader detect each file's encoding.
loader = DirectoryLoader('dataset/', glob="*.md", loader_cls=TextLoader, loader_kwargs={'autodetect_encoding': True})
docs = loader.load()

# Split the loaded files into chunks (chunk_size=400, chunk_overlap=20) so each
# chunk can be embedded and indexed on its own.
documents = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20).split_documents(docs)
print(f"Loaded {len(documents)} documents")

Loaded 59 documents


In [9]:
# Keep only the raw text of each chunk; this is what gets embedded and indexed.
document_texts = [chunk.page_content for chunk in documents]

# Show the first two chunks as a quick sanity check.
print(document_texts[:2])

['| Customer ID | Age | Gender | Location | Purchase History | Browsing Behavior |\n|-------------|------|--------|----------|------------------|-------------------|\n| C001        | 32  | Male   | NYC      | Sports Gear, Running Shoes, Camping Equipment | Sports, Outdoor Activities |\n| C002        | 28  | Female | SF       | Yoga Mats, Activewear, Hiking Backpack | Fitness, Adventure Travel |', '| C003        | 40  | Male   | LA       | Golf Clubs, Polo Shirts, Sun Hats | Golfing, Summer Fashion |\n| C004        | 24  | Female | Chicago  | Skateboards, Streetwear, Headphones | Urban Lifestyle, Music |\n| C005        | 36  | Male   | Miami    | Surfboards, Swimwear, Beach Towels | Water Sports, Beach Essentials |']


In [10]:
def generate_embeddings(texts, input_type="search_document", client=None, batch_size=96):
    """Embed texts with Cohere ``embed-english-v3.0`` and return int8 vectors.

    Texts are sent in batches so that inputs larger than a single request
    allows can still be embedded; results are concatenated in input order.

    Args:
        texts: A single string, or an iterable of strings, to embed.
        input_type: Value forwarded as ``input_type`` to the embed call
            (defaults to ``"search_document"``).
        client: Object exposing ``embed(texts=..., model=..., input_type=...,
            embedding_types=...)``. When omitted, the notebook-level ``co``
            client is used.
        batch_size: Maximum number of texts per embed request. The default of
            96 is the per-call limit documented for the Cohere embed endpoint.

    Returns:
        A list containing one int8 embedding per input text. An empty input
        yields an empty list without calling the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    model = "embed-english-v3.0"

    # Accept a bare string as a one-element batch; materialise other iterables
    # so they can be sliced into batches.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if client is None:
        # Resolved at call time, so the client may be created in a later cell.
        client = co

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = client.embed(
            texts=texts[start:start + batch_size],
            model=model,
            input_type=input_type,
            embedding_types=["int8"],
        )
        embeddings.extend(response.embeddings.int8)
    return embeddings


In [None]:
# Initialize the Cohere client with the key loaded in the setup cell, rather than
# relying on the SDK's implicit environment-variable lookup.
co = cohere.ClientV2(api_key=cohere_api_key)

# Generate embeddings
embeddings = generate_embeddings(document_texts)

#### Upload documents to Vector Store

In [12]:
def index_documents(search_client, documents, embeddings):
    """Upload texts together with their embeddings to the search index.

    Each text is paired with the embedding at the same position and uploaded
    as ``{"id": <position as string>, "text": ..., "embedding": ...}``.

    Args:
        search_client: Object exposing ``upload_documents(documents=...)``.
        documents: Texts to index.
        embeddings: One embedding per text, in the same order as ``documents``.

    Returns:
        The per-document results returned by ``upload_documents``, or an empty
        list when there is nothing to upload.

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length
            (``zip`` would otherwise silently drop the unmatched items).
        RuntimeError: If any returned result reports ``succeeded`` as false.
    """
    documents = list(documents)
    embeddings = list(embeddings)
    if len(documents) != len(embeddings):
        raise ValueError(
            f"documents and embeddings must have the same length "
            f"(got {len(documents)} and {len(embeddings)})"
        )
    if not documents:
        return []

    documents_to_index = [
        {"id": str(idx), "text": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]
    results = search_client.upload_documents(documents=documents_to_index)

    # An upload call can return normally while individual documents were
    # rejected, so inspect every result instead of discarding them.
    failed = [r for r in (results or []) if not getattr(r, "succeeded", True)]
    if failed:
        details = ", ".join(
            f"{getattr(r, 'key', '?')}: {getattr(r, 'error_message', None)}"
            for r in failed
        )
        raise RuntimeError(f"Failed to index {len(failed)} document(s): {details}")
    return results

In [15]:
# Build the client bound to the target index; it is the one used to upload documents.
search_client = SearchClient(
    credential=credential,
    endpoint=search_service_endpoint,
    index_name=index_name,
)

# Upload every text chunk together with its embedding.
index_documents(search_client, document_texts, embeddings)

#### Create a Retriever

In [27]:
from langchain_community.retrievers import AzureAISearchRetriever

# Retrieve the top 5 matches from the index, reading each document's content
# from its "text" field. The service name loaded in the setup cell is passed
# explicitly instead of being picked up implicitly from the environment.
# NOTE(review): no API key is passed here, so the retriever must obtain it from
# its own configuration/environment -- confirm it matches the key behind `credential`.
# NOTE(review): this retriever appears to send plain text queries without a query
# vector, so the int8 embeddings indexed earlier may not be used at query time -- confirm.
retriever = AzureAISearchRetriever(
    service_name=search_service_name,
    index_name=index_name,
    content_key="text",
    top_k=5,
)

In [37]:
from langchain.tools.retriever import create_retriever_tool

# Wrap the retriever as a tool; the name and description are what the agent
# sees when deciding whether to call it.
retriever_tool = create_retriever_tool(
    retriever=retriever,
    name="vector_database",
    description="Search for information about Products and Customers. For any questions about e-commerce, you must use this tool!",
)

In [29]:
from langchain_cohere import ChatCohere

# Chat model that drives the agent, configured with temperature 0.
llm = ChatCohere(
    temperature=0,
    model="command-r-plus-08-2024",
)

In [31]:
from langchain_core.prompts import ChatPromptTemplate

# Preamble: instruction text supplied alongside the user's question when the agent is invoked.
# The value keeps its leading and trailing newline.
preamble = (
    "\n"
    "You are an expert who answers the user's question with the most relevant datasource. "
    "You are equipped with a special vectorstore of information about customer purchases and product database."
    "\n"
)
# Prompt template: consists solely of the "{input}" placeholder.
prompt = ChatPromptTemplate.from_template(template="{input}")

In [38]:
from langchain_cohere.react_multi_hop.agent import create_cohere_react_agent

# Assemble the Cohere ReAct agent from the prompt, the chat model and the
# retriever tool (its only tool).
agent = create_cohere_react_agent(
    prompt=prompt,
    llm=llm,
    tools=[retriever_tool],
)

In [39]:
from langchain.agents import AgentExecutor

# Executor that runs the agent with the retriever tool; verbose=True prints the
# intermediate steps of each run.
agent_executor = AgentExecutor(
    agent=agent,
    tools=[retriever_tool],
    verbose=True,
)

In [40]:
# Ask the agent a question, supplying the preamble next to the user input,
# then print the final answer.
response = agent_executor.invoke(
    dict(
        input="What is the most popular product?",
        preamble=preamble,
    )
)
print(response["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
I will search for the most popular product.
{'tool_name': 'vector_database', 'parameters': {'query': 'What is the most popular product?'}}
[0m[36;1m[1;3m| R037      | P037       | C037        | 5      | "Extra-long yoga mat. It provides ample space for my practice and is comfortable to use." |
| R038      | P038       | C038        | 4      | "Quick-pitch tent. It's easy to set up and provides good protection from the elements." |

| R002      | P002       | C002        | 4      | "The yoga mat is great for my practice. It's sticky enough to prevent slipping and provides good cushioning." |
| R003      | P003       | C003        | 5      | "Spacious and durable camping tent. It withstood the windy conditions during my last camping trip." |

| R029      | P029       | C029        | 5      | "Bright and portable lantern. It illuminates my campsite and is easy to carry." |
| R030      | P030       | C030        | 4      | "S

Sample questions:
- Can you suggest products based on my previous purchases and browsing history?
- How can I find similar products to the ones I've recently viewed, but with specific modifications or upgrades?
- Given my past reviews and ratings, what are some products that might interest me and that I haven't considered before?
- Is it possible to get personalized recommendations that take into account not only my preferences but also those of my family members or friends, for a more holistic shopping experience?
- In what ways can I refine my search results to ensure I'm only seeing products that are currently in stock and available for immediate purchase?
- Can you provide a feature that allows me to compare products side by side, highlighting their key differences and similarities, to help me make an informed decision?
- Given my past purchases and the current season, can you suggest some relevant products that might enhance my wardrobe or home decor?
- How can I receive personalized notifications or alerts when new products are launched or when there are special offers or discounts on items that match my preferences?
- Can the system learn from my feedback and adjust its recommendations accordingly, ensuring a more dynamic and responsive shopping experience?
- With an emphasis on data security, how does the system ensure that my personal information and browsing habits are kept private and secure, especially when making recommendations?