#### Setup Notebook

In [1]:
# Install the packages listed in requirements.txt (quiet output); restart the kernel afterwards if prompted.
%pip install -r requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Pull configuration from the .env file into the process environment.
load_dotenv(override=True)

# The endpoint is mandatory: a missing variable raises KeyError right here.
search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Use key-based auth when a non-empty API key is configured; otherwise use DefaultAzureCredential.
_search_api_key = os.getenv("AZURE_AI_SEARCH_API_KEY", "")
if len(_search_api_key) > 0:
    credential = AzureKeyCredential(_search_api_key)
else:
    credential = DefaultAzureCredential()

# The index name falls back to "recommendationidx" when AZURE_SEARCH_INDEX is unset.
index_name = os.getenv("AZURE_SEARCH_INDEX", "recommendationidx")

#### Create Search Index in Azure AI Search

In [3]:
import cohere
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SemanticSearch,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    HnswParameters,
    VectorSearchAlgorithmMetric
)
from azure.core.credentials import AzureKeyCredential

def create_or_update_index(client, index_name):
    """Build the index definition and send it with ``client.create_or_update_index``.

    The definition contains a string key field ``id``, searchable ``title`` and
    ``content`` string fields, and a 1024-dimension ``embedding`` vector field
    typed ``Collection(Edm.SByte)``. The vector field uses an HNSW algorithm
    configuration with the dot-product metric, and a semantic configuration
    ranks on ``title`` and ``content``.

    Args:
        client: Object exposing ``create_or_update_index(index=...)``; the
            notebook passes a ``SearchIndexClient``.
        index_name: Name assigned to the index definition.

    Returns:
        Whatever ``client.create_or_update_index`` returns.
    """
    # Names shared between the vector field, its profile and the algorithm.
    vector_profile_name = "my-vector-config"
    hnsw_config_name = "my-hnsw"

    id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
    title_field = SearchField(name="title", type=SearchFieldDataType.String, searchable=True)
    content_field = SearchField(name="content", type=SearchFieldDataType.String, searchable=True)
    embedding_field = SearchField(
        name="embedding",
        type="Collection(Edm.SByte)",  # 8-bit signed integer (int8) vectors
        vector_search_dimensions=1024,
        vector_search_profile_name=vector_profile_name,
        hidden=False,  # keep False so embeddings can be returned in search results
        searchable=True,
        filterable=False,
    )

    # Semantic configuration: "title" is the title field, "content" the content field.
    prioritized_fields = SemanticPrioritizedFields(
        title_field=SemanticField(field_name="title"),
        content_fields=[SemanticField(field_name="content")],
    )
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=prioritized_fields,
    )

    # Vector search: one profile pointing at one HNSW configuration (dot product).
    hnsw_config = HnswAlgorithmConfiguration(
        name=hnsw_config_name,
        kind=VectorSearchAlgorithmKind.HNSW,
        parameters=HnswParameters(metric=VectorSearchAlgorithmMetric.DOT_PRODUCT),
    )
    vector_profile = VectorSearchProfile(
        name=vector_profile_name,
        algorithm_configuration_name=hnsw_config_name,
    )

    index_definition = SearchIndex(
        name=index_name,
        fields=[id_field, title_field, content_field, embedding_field],
        vector_search=VectorSearch(profiles=[vector_profile], algorithms=[hnsw_config]),
        semantic_search=SemanticSearch(configurations=[semantic_config]),
    )
    return client.create_or_update_index(index=index_definition)

In [4]:
from azure.core.exceptions import ResourceNotFoundError

# Initialize Azure Search Index Client.
# Only endpoint and credential are passed: the index name is supplied per call
# below rather than to the client constructor.
search_index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=credential,
)

# Create the index only when the service reports that it does not exist.
# Any other failure (bad credentials, wrong endpoint, network problems) is
# allowed to propagate instead of being mistaken for a missing index.
try:
    search_index_client.get_index(index_name)
except ResourceNotFoundError:
    print(f"Index '{index_name}' does not exist. Creating a new one.")
    create_or_update_index(search_index_client, index_name)
    print(f"Search index '{index_name}' created or updated successfully.")
else:
    print(f"Index '{index_name}' already exists.")


Index 'recommendationidx' already exists.


#### Analyze and Clean data, if needed  

In [5]:
import glob

# Collect every markdown file in the data directory.
md_files = glob.glob('data/*.md')

# Preview each file: print its path, then its first 100 characters.
for md_path in md_files:
    with open(md_path, 'r', encoding='utf-8') as handle:
        preview = handle.read()[:100]
    print(f"Content of {md_path}:")
    print(preview)

Content of data/product_info_2.md:
# Information about product item_number: 2
Adventurer Pro Backpack, price $90,

## Brand
HikeMate

#
Content of data/product_info_10.md:
# Information about product item_number: 10
TrailBlaze Hiking Pants, price $75,

## Brand
MountainSt
Content of data/customer_12.md:
## Customer_Info

First Name: Karen 
Last Name: Williams 
Age: 29 
Email Address: karenw@example.com
Content of data/product_info_6.md:
# Information about product item_number: 6
EcoFire Camping Stove, price $80,

## Brand
EcoFire

## C
Content of data/product_info_14.md:
# Information about product item_number: 14
MountainDream Sleeping Bag, price $130,

## Brand
Mounta
Content of data/product_info_20.md:
# Information about product item_number: 20
CompactCook Camping Stove, price $60,

## Brand
CompactC
Content of data/customer_4.md:
## Customer_Info

First Name: Sarah 
Last Name: Lee 
Age: 38 
Email Address: sarahlee@example.com 
P
Content of data/product_info_7.md:
# Information abo

#### Chunk documents to be indexed

In [6]:
# Load the markdown files and split them into chunks (they are uploaded to AI Search in a later cell)
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load every *.md file under data/ as text, with encoding autodetection enabled.
loader = DirectoryLoader(
    'data/',
    glob="*.md",
    loader_cls=TextLoader,
    loader_kwargs={'autodetect_encoding': True},
)
docs = loader.load()

# Split the loaded files into overlapping chunks (chunk_size=450, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
documents = text_splitter.split_documents(docs)
print(f"Loaded {len(documents)} documents")

Loaded 809 documents


In [7]:
# Pair each chunk's text with the source path stored in its metadata.
document_tuples = [
    (chunk.page_content, chunk.metadata['source'])
    for chunk in documents
]

# Show the first two chunks, each followed by its source path.
for chunk_text, chunk_source in document_tuples[:2]:
    print(chunk_text, chunk_source, sep="\n")

# Information about product item_number: 2
Adventurer Pro Backpack, price $90,

## Brand
HikeMate

## Category
Backpacks
data/product_info_2.md
## Features
- 40L capacity for ample storage space
- Ergonomic design for comfortable carrying
- Durable nylon material for long-lasting performance
- Multiple compartments and pockets for organized storage
- Hydration system compatibility with a dedicated hydration bladder sleeve and tube port
- Adjustable and padded shoulder straps for a customized fit and enhanced comfort
data/product_info_2.md


#### Generate Embeddings using Cohere embed V3
- Use the embed-english-v3.0 model to embed the data with 1024 dimensions and a 512-token context window

In [8]:
def generate_embeddings(texts, input_type="search_document"):
    """Embed one or more texts as int8 vectors with Cohere ``embed-english-v3.0``.

    Args:
        texts: A single string or a list of strings. A lone string is wrapped
            in a one-element list before the request is made.
        input_type: Forwarded unchanged as ``input_type`` to ``co.embed``;
            defaults to ``"search_document"``.

    Returns:
        A list of the int8 embeddings taken from the response, or an empty
        list when ``texts`` is empty (no request is sent in that case).

    Note:
        Relies on the module-level Cohere client ``co``, which must exist
        before the first non-empty call.
    """
    model = "embed-english-v3.0"
    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    # Nothing to embed: skip the API round trip instead of sending an empty request.
    if not texts:
        return []

    response = co.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["int8"],
    )
    return list(response.embeddings.int8)


In [10]:
import time

# Create the Cohere client (no arguments are passed here; presumably the API key comes from the environment — TODO confirm).
co = cohere.ClientV2()

# Each request carries at most 80 texts and the loop waits 60 seconds between
# requests; the original note cites a Free Embed API limit of 100 calls per minute.
batch_size = 80
document_texts = [entry[0] for entry in document_tuples]
print(f"Generating embeddings for {len(document_texts)} documents")

embeddings = []
total_texts = len(document_texts)
for start in range(0, total_texts, batch_size):
    stop = start + batch_size
    embeddings.extend(generate_embeddings(document_texts[start:stop]))
    # Pause after every batch except the last one.
    if stop < total_texts:
        time.sleep(60)

print(len(embeddings), "Document embeddings generated")

Generating embeddings for 809 documents
809 Document embeddings generated


#### Upload documents and embeddings in Azure AI Search

In [13]:
def index_documents(search_client, documents, embeddings, batch_size=1000):
    """Upload text chunks and their embeddings to the search index.

    Each uploaded record has the keys ``id`` (the chunk's position as a
    string), ``title`` (taken from the module-level ``document_tuples`` at the
    same position), ``content`` and ``embedding``.

    Args:
        search_client: Object exposing ``upload_documents(documents=...)``;
            the notebook passes a ``SearchClient``.
        documents: Sequence of chunk texts.
        embeddings: Sequence of embedding vectors, one per entry of
            ``documents`` and in the same order.
        batch_size: Maximum number of records sent per upload request.
            Defaults to 1000.

    Raises:
        ValueError: If ``documents`` and ``embeddings`` differ in length, or
            if ``batch_size`` is smaller than 1.
        RuntimeError: If the service reports any record as not indexed.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    # zip() would silently drop the unmatched tail, leaving chunks unindexed.
    if len(documents) != len(embeddings):
        raise ValueError(
            f"documents ({len(documents)}) and embeddings ({len(embeddings)}) must have the same length"
        )

    documents_to_index = [
        {"id": str(idx), "title": document_tuples[idx][1], "content": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]

    # Upload in slices so a large corpus never exceeds the per-request limit,
    # and collect the keys of any records the service did not index.
    failed_keys = []
    for start in range(0, len(documents_to_index), batch_size):
        results = search_client.upload_documents(
            documents=documents_to_index[start:start + batch_size]
        )
        failed_keys.extend(result.key for result in (results or []) if not result.succeeded)

    if failed_keys:
        raise RuntimeError(f"Failed to index {len(failed_keys)} document(s): {failed_keys}")

    print(documents_to_index[:2])

In [14]:
# Client bound to the target index; it is used below to upload the documents.
search_client = SearchClient(
    endpoint=search_service_endpoint,
    credential=credential,
    index_name=index_name,
)

# Upload every chunk together with its embedding.
index_documents(search_client, documents=document_texts, embeddings=embeddings)

[{'id': '0', 'title': 'data/product_info_2.md', 'content': '# Information about product item_number: 2\nAdventurer Pro Backpack, price $90,\n\n## Brand\nHikeMate\n\n## Category\nBackpacks', 'embedding': [-108, -52, -32, 29, -89, 39, -110, -8, -75, -5, 9, 26, -79, -56, 43, -82, 31, -24, -25, -10, 0, -2, -31, 69, -1, 54, -70, -16, -19, -22, -8, 8, 6, -12, 73, 58, -3, 26, 45, -38, 46, -20, -18, 4, -25, -4, 21, -21, 46, 14, 34, 44, 22, -7, -10, 41, 68, -56, -1, -76, -22, 40, 66, 71, -1, 0, 52, 19, 33, -33, -20, 28, 27, 56, -24, -96, 72, 17, 2, 37, -2, -23, -30, -53, 54, -2, -14, -11, -18, -58, -38, 4, -60, -23, -17, -20, 78, 61, 45, -39, 36, -95, -20, -23, -12, -84, 53, 21, 24, 28, -103, 127, 5, -62, 52, -45, -93, 110, 44, 26, -40, 0, 59, 1, 23, 45, -24, -30, -17, 39, 21, 34, -95, -37, -49, 3, -2, 63, -1, -45, 35, 21, 29, -23, 84, 20, 33, 25, 36, -20, 65, 13, 11, 4, 19, -28, -37, -13, 35, 3, -36, -37, 19, 3, -37, 82, -2, -16, 6, -46, -6, -12, 43, -40, -35, 1, -17, 34, 54, -94, -50, -54, 75