#### Setup Notebook

In [1]:
%pip install -r requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# take environment variables from .env.
load_dotenv(override=True) 

search_service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.getenv("AZURE_AI_SEARCH_API_KEY", "")) if len(os.getenv("AZURE_AI_SEARCH_API_KEY", "")) > 0 else DefaultAzureCredential()
index_name = os.getenv("AZURE_SEARCH_INDEX", "recommendationidx")

#### Create Search Index in Azure AI Search

In [3]:
import cohere
from dotenv import load_dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    VectorizedQuery,
)
from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SemanticSearch,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    HnswParameters,
    VectorSearchAlgorithmMetric
)
from azure.core.credentials import AzureKeyCredential

def create_or_update_index(client, index_name):
    fields = [
        SimpleField(
            name="id", 
            type=SearchFieldDataType.String, 
            key=True
        ),
        SearchField(
            name="title",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="content",
            type=SearchFieldDataType.String,
            searchable=True,
        ),
        SearchField(
            name="embedding",
            # 8-bit signed integer (int8)
            type="Collection(Edm.SByte)",  
            vector_search_dimensions=1024,
            vector_search_profile_name="my-vector-config",
            # Use hidden=False if you want to return the embeddings in the search results
            hidden=False, 
            searchable=True,
            filterable=False,
        ),
    ]
    
    semantic_search = SemanticSearch(
                configurations=[
                    SemanticConfiguration(
                        name="my-semantic-config",
                        prioritized_fields=SemanticPrioritizedFields(
                            title_field=SemanticField(field_name="title"),
                            content_fields=[
                                SemanticField(field_name="content"),
                            ],
                        ),
                    )
                ]
            )

    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name="my-vector-config",
                algorithm_configuration_name="my-hnsw",
            )
        ],
        algorithms=[
            HnswAlgorithmConfiguration(
                name="my-hnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    metric=VectorSearchAlgorithmMetric.DOT_PRODUCT
                )
            )
        ],
    )

    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
    return client.create_or_update_index(index=index)

In [4]:
# Initialize Azure Search Index Client
search_index_client = SearchIndexClient(
    endpoint=search_service_endpoint,
    credential=credential,
    index_name=index_name
)

# Check if the index exists
try:
    search_index_client.get_index(index_name)
    print(f"Index '{index_name}' already exists.")
except Exception as e:
    print(f"Index '{index_name}' does not exist. Creating a new one.")
    create_or_update_index(search_index_client, index_name)
    print(f"Search index '{index_name}' created or updated successfully.")


Index 'recommendationidx' already exists.


#### Analyze and Clean data, if needed  

In [None]:
import glob

# Read all markdown files in the data directory
md_files = glob.glob('data/*.md')

# Read and display the first few rows of each markdown file
for file in md_files:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"Content of {file}:")
    # Display the first 100 characters of the file
    print(content[:100]) 

#### Chunk documents to be indexed

In [None]:
# Chunk and load documents into AI search
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DirectoryLoader('data/', glob="*.md", loader_cls=TextLoader, loader_kwargs={'autodetect_encoding': True})
docs = loader.load()
titles = [doc.metadata['source'] for doc in docs]
docs_with_titles = list(zip(titles, docs))
documents = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50).split_documents(docs)
print(f"Loaded {len(documents)} documents")

Loaded 809 documents


In [6]:
print(titles)

['data/product_info_2.md', 'data/product_info_10.md', 'data/customer_12.md', 'data/product_info_6.md', 'data/product_info_14.md', 'data/product_info_20.md', 'data/customer_4.md', 'data/product_info_7.md', 'data/product_info_15.md', 'data/customer_5.md', 'data/product_info_3.md', 'data/product_info_11.md', 'data/customer_13.md', 'data/customer_1.md', 'data/product_info_8.md', 'data/product_info_9.md', 'data/product_info_18.md', 'data/customer_8.md', 'data/product_info_19.md', 'data/customer_9.md', 'data/product_info_4.md', 'data/product_info_16.md', 'data/customer_6.md', 'data/product_info_12.md', 'data/customer_10.md', 'data/customer_2.md', 'data/product_info_1.md', 'data/product_info_13.md', 'data/customer_11.md', 'data/customer_3.md', 'data/product_info_5.md', 'data/product_info_17.md', 'data/customer_7.md']


In [None]:
# Extract page_content from each Document object
document_texts = [doc.page_content for doc in documents]

#### Generate Embeddings using Cohere embed V3
- Use embed-english-v3.0 model to embed the data with 1024 Dimentions and 512 Context window size

In [None]:
def generate_embeddings(texts, input_type="search_document"):
    model = "embed-english-v3.0"
    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    response = co.embed(
        texts=texts,
        model=model,
        input_type=input_type,
        embedding_types=["int8"],
    )
    return [embedding for embedding in response.embeddings.int8]


In [None]:
import time

# Initialize Cohere client
co = cohere.ClientV2()

# Limit to 80 calls per minute as Free Embed API has a limitation of 100 calls per min
batch_size = 80
embeddings = []
for i in range(0, len(document_texts), batch_size):
    batch = document_texts[i:i + batch_size]
    embeddings.extend(generate_embeddings(batch))
    if i + batch_size < len(document_texts):
        time.sleep(60)  # Sleep for 60 seconds to respect the rate limit
        
print (len(embeddings), "Document embeddings generated")

#### Upload documents and embeddings in Azure AI Search

In [None]:
def index_documents(search_client, documents, embeddings):
    documents_to_index = [
        {"id": str(idx), "title": titles[idx], "content": doc, "embedding": emb}
        for idx, (doc, emb) in enumerate(zip(documents, embeddings))
    ]
    search_client.upload_documents(documents=documents_to_index)

In [None]:
# Initialize the SearchClient
search_client = SearchClient(
    endpoint=search_service_endpoint, 
    index_name=index_name, 
    credential=credential
)

# Index the documents and their embeddings
index_documents(search_client, document_texts, embeddings)