# Document Indexing Workflow

## Important Note
The document indexing process should be implemented using the **indexer feature** and executed in **multiple steps** for optimal performance and maintainability. 

This notebook demonstrates the step-by-step approach to document indexing, breaking down the process into manageable stages that can be:
- Monitored individually
- Debugged more easily
- Rerun selectively if needed
- Scaled appropriately based on document volume

Each step in this notebook represents a distinct phase of the indexing pipeline, ensuring a structured and systematic approach to document processing.

In [None]:
import asyncio
import json
import os
from typing import List, Optional

import pandas as pd
from azure.ai.inference.aio import EmbeddingsClient
from azure.ai.textanalytics.aio import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.aio import SearchClient
from dotenv import load_dotenv

from models.document import Document

In [30]:
# Load settings from the .env file; override=True lets .env values replace
# variables that are already set in the process environment.
load_dotenv(override=True)

# Language service (used for language detection)
language_endpoint = os.getenv('LANGUAGE_ENDPOINT')
language_api_key = os.getenv('LANGUAGE_KEY')

# OpenAI
openai_endpoint = os.getenv('OPENAI_ENDPOINT')
openai_key = os.getenv('OPENAI_KEY')
openai_embedding_deployment = os.getenv('EMBEDDING_OPENAI_DEPLOYMENT')

region = "westus"

# Translation Service
translation_endpoint = os.getenv('TRANSLATION_ENDPOINT')
translation_key = os.getenv('TRANSLATION_KEY')
translation_region = os.getenv('TRANSLATION_REGION')

# Cohere embeddings; the list of available models is documented at
# https://docs.cohere.com/docs/cohere-embed
cohere_endpoint = os.getenv('COHERE_ENDPOINT')
cohere_key = os.getenv('COHERE_KEY')
cohere_model = os.getenv('COHERE_MODEL')

# Search
search_endpoint = os.getenv('SEARCH_ENDPOINT')
search_api_key = os.getenv('SEARCH_API_KEY')

### Load supported languages

We use [Cohere](https://docs.cohere.com/docs/cohere-embed) because it supports embeddings for multiple languages.

We load the JSON file that lists all the supported languages.

In [None]:
# Official list of the languages supported by Cohere
path = os.path.join("cohere", "supported_languages.json")

with open(path, mode="r", encoding="utf-8") as f:
    supported_languages = json.loads(f.read())

print(supported_languages)

# Lookup table keyed by language code: {code: description}
language_dict = dict(
    (lang["code"], lang["description"]) for lang in supported_languages
)

In [47]:
# Name of the search index the documents are uploaded to
index_name = "multilanguage"

# Model passed along with the embedding requests
model_name = cohere_model

# Language detection client
text_analytics_client = TextAnalyticsClient(
    language_endpoint,
    AzureKeyCredential(language_api_key),
)

# Embeddings client pointed at the Cohere endpoint
client = EmbeddingsClient(
    endpoint=cohere_endpoint,
    credential=AzureKeyCredential(cohere_key),
)

# Search client bound to the target index
credential = AzureKeyCredential(search_api_key)
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=credential,
)

Create two custom functions to validate whether the language is supported before
creating the embedding, along with the other helper functions used in the following steps

In [33]:
# Function to check if a language code is supported
def is_language_supported(language_code: str) -> bool:
    """Return True when ``language_code`` is a key of the supported-language lookup table."""
    supported_codes = language_dict.keys()
    return language_code in supported_codes

# Function to get language description
def get_language_description(language_code: str) -> Optional[str]:
    """Look up the description registered for a language code.

    Args:
        language_code: Code to look up in the module-level ``language_dict``.

    Returns:
        The description stored for ``language_code``, or ``None`` when the
        code is not present in ``language_dict``.
    """
    return language_dict.get(language_code)


def reached_size_limit(docs:list, max_documents:int = 950, max_size_bytes:int = 700 * 1024) -> bool:
    """
    Tell the caller whether the current batch should be sent before more documents are added.

    The batch is meant for Text Analytics language detection, which this notebook
    treats as limited to 1000 documents and 1 MB per request. The default
    thresholds (950 documents, 700 KB) leave headroom below those limits.

    Args:
        docs: Documents accumulated so far; must be JSON serializable.
        max_documents: Document count at which the batch is considered full.
        max_size_bytes: UTF-8 size of the JSON payload above which the batch is considered full.

    Returns:
        True when ``docs`` holds at least ``max_documents`` entries or its JSON
        payload is larger than ``max_size_bytes``; False otherwise.
    """
    # No upper bound on either check: a batch that has already passed the hard
    # limits must still be reported as full, otherwise it would keep growing.
    if len(docs) >= max_documents:
        return True

    # Measure the encoded payload; non-ASCII text takes several bytes per character
    payload_size_bytes = len(json.dumps(docs, ensure_ascii=False).encode('utf-8'))

    return payload_size_bytes > max_size_bytes
    
def csv_to_json_array(csv_file:str, output_file:str):
    """Convert a CSV or Excel file to a JSON array of objects with snake_case field names.

    Args:
        csv_file: Path of the input file. Files ending in ``.xlsx`` or ``.xls``
            (case-insensitive) are read with ``pd.read_excel``; anything else
            is read with ``pd.read_csv``.
        output_file: Path of the JSON file to write (UTF-8, indented).

    The function prints a short summary and writes the records to
    ``output_file``. An input with no data rows produces an empty JSON array.
    """

    # Pick the reader from the extension, ignoring case so ".XLSX" also works
    if csv_file.lower().endswith(('.xlsx', '.xls')):
        df = pd.read_excel(csv_file)
    else:
        df = pd.read_csv(csv_file)

    # Replace NaN values with empty strings
    df = df.fillna('')

    def to_snake_case(name):
        # Headers are not guaranteed to be strings, so coerce before renaming
        return str(name).replace(' ', '_').lower()

    # Rename all columns from "Title Case" to "snake_case"
    df.columns = [to_snake_case(col) for col in df.columns]

    # Convert DataFrame to list of dictionaries (JSON objects)
    data = df.to_dict(orient='records')

    # Print the result
    print(f"Converted {len(data)} records from {csv_file} to JSON array")
    print(f"Converted column names: {list(df.columns)}")
    if data:
        print("\nFirst record example:")
        print(json.dumps(data[0], indent=2))
    else:
        # A header-only file has no first record to show
        print("\nNo records found in the input file")

    # Save JSON array to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\nJSON array saved to: {output_file}")


async def get_language_documents(docs:list):
    """
    Detect the language of a batch of documents with the module-level
    ``text_analytics_client``.

    Args:
        docs: Batch passed unchanged to ``text_analytics_client.detect_language``.

    Returns:
        A list with one dictionary per detection result:
        - ``id``: identifier reported by the result
        - ``language_code``: detected ``iso6391_name`` (successful results only),
          with ``"zh_chs"`` rewritten to ``"zh"`` because the code used for
          Simplified Chinese differs between the language service and Cohere
        - ``is_error`` / ``error``: set to ``True`` and the reported error
          (failed results only)
    """
    detection_results = await text_analytics_client.detect_language(docs)

    def _summarize(detection) -> dict:
        """Reduce one detection result to the fields the notebook uses."""
        summary = {"id": detection.id}

        if detection.is_error:
            summary['is_error'] = True
            summary['error'] = detection.error
            return summary

        detected_code = detection.primary_language.iso6391_name
        # The language service and Cohere use different codes for Simplified Chinese
        summary['language_code'] = "zh" if detected_code == "zh_chs" else detected_code
        return summary

    return [_summarize(detection) for detection in detection_results]


def load_documents_from_jsonl(file_path: str) -> List[Document]:
    """
    Read a JSONL file and build one Document per non-blank line.

    Args:
        file_path: Path to the JSONL file (read as UTF-8)

    Returns:
        List of Document objects, in file order
    """
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        # Whitespace-only lines are skipped; every other line must be a JSON
        # object whose keys are passed to Document as keyword arguments
        return [
            Document(**json.loads(raw_line))
            for raw_line in jsonl_file
            if raw_line.strip()
        ]

async def create_embeddings(documents:List[str]) -> List[List[float]]:
    """Create one embedding vector per input text.

    The texts are sent in a single request through the module-level
    embeddings ``client``, using the model named by ``cohere_model``.

    Args:
        documents: Texts to embed.

    Returns:
        The ``embedding`` value of each item in the response data, in response
        order. An empty input returns an empty list without calling the service.
    """
    # Nothing to embed: skip the network call entirely
    if not documents:
        return []

    response = await client.embed(input=documents,
                                  model=cohere_model)

    return [item['embedding'] for item in response.data]

## Step 1: Convert File to JSON

We convert the XLSX file to JSON. We can then load the JSON into a dictionary and add the new columns needed to load the documents into the index.

In [None]:
# Convert the multilingual car-problem workbook into a JSON array file
csv_to_json_array(
    csv_file="car_problems_multilingual.xlsx",
    output_file="car_problems_multilingual.json",
)

In [None]:
# Read the converted JSON array back into memory
with open("car_problems_multilingual.json", mode="r", encoding="utf-8") as f:
    documents = json.loads(f.read())

print(f"{len(documents)} loaded")

## Step 2: Detect Languages in Documents

In this step, we'll process the documents to identify their languages using Azure Text Analytics. This is crucial for:

- **Language validation**: Determining if the language is supported by our Cohere multilingual embedding model
- **Batch processing**: The Azure Text Analytics service has limits (max 1000 documents, 1MB total size), so we process documents in batches
- **Language code mapping**: Converting Azure's language codes to match Cohere's expected format (e.g., "zh_chs" → "zh")

The cell below will:
1. Iterate through all documents and prepare them for language detection
2. Check batch size limits to ensure we don't exceed Azure's constraints
3. Call the language detection API to identify each document's language
4. Map the detected languages back to the original documents for further processing

In [36]:
docs = []
processed_documents = []

for document in documents:

    # Detect the language of the documents, here the maximum is 1000 documents with a size of 1 MB
    docs.append({
        "id": document["id"],
        "text": document['fault']
    })

    if reached_size_limit(docs):
        print(f"Reached size limits {len(docs)}")
        # Add something here
        break

if len(docs) > 0:
    results = await get_language_documents(docs)
    # Map the language results back to the original documents
    for result in results:
        # Find the matching document by id
        matching_doc = next((d for d in documents if d['id'] == result['id']), None)
        if matching_doc and not result.get('is_error'):
            matching_doc['language_code'] = result['language_code']
            processed_documents.append(matching_doc)

print("all documents processed")       
print(json.dumps(processed_documents,indent=4))

all documents processed
[
    {
        "id": "a4103b4e-38a2-4817-9394-18a52cd0bb06",
        "brand": "\u4e30\u7530",
        "model": "\u51ef\u7f8e\u745e",
        "fault": "\u5239\u8f66\u6709\u566a\u97f3",
        "fix": "\u9700\u8981\u65b0\u5239\u8f66\u7247",
        "language_code": "zh"
    },
    {
        "id": "22984e73-ad39-4b6f-b3ec-8f8400804a7d",
        "brand": "Volkswagen",
        "model": "Golf",
        "fault": "\u05e1\u05d5\u05dc\u05dc\u05d4 \u05e0\u05d2\u05de\u05e8\u05ea \u05de\u05d4\u05e8",
        "fix": "\u05e1\u05d5\u05dc\u05dc\u05d4 \u05d7\u05d3\u05e9\u05d4",
        "language_code": "he"
    },
    {
        "id": "cc9871f0-1eb3-41d9-a29f-381be8b0ee1a",
        "brand": "Ford",
        "model": "Focus",
        "fault": "\u30d6\u30ec\u30fc\u30ad\u304c\u30ad\u30fc\u30ad\u30fc\u9cf4\u308b",
        "fix": "\u30d1\u30c3\u30c9\u66ff\u3048\u308b",
        "language_code": "ja"
    },
    {
        "id": "addfc4c4-4497-42c6-8585-1aa8fb9926c7",
        "brand": "Nis

Loop over the documents and check whether each language is supported by the embedding model. If a language is not supported, you will need to add a translation step.

In [37]:
documents_not_supported = []
documents_to_embeds = []

for doc in processed_documents:

    if is_language_supported(doc['language_code']):
        # Collect supported documents so the embeddings can be created as a
        # batch job, which performs better for the first indexing run
        documents_to_embeds.append({
            "id": doc['id'],
            "language_code": doc['language_code'],
            "text": doc['fault'],  # Important: this is the text the embedding is created from
        })
    else:
        # Keep track of the documents that cannot be embedded as-is
        documents_not_supported.append({
            "id": doc['id'],
            "language_code": doc['language_code'],
            "language_description": get_language_description(doc['language_code']),
        })

print(f"Not supported documents {len(documents_not_supported)}")
print(f"Documents supported {len(documents_to_embeds)}")
   

Not supported documents 0
Documents supported 60


Save the documents to embed to a JSONL file for batch processing

In [38]:
# Write one JSON object per line (JSON Lines), keeping non-ASCII text readable
with open("documents_to_embed.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(doc, ensure_ascii=False) + '\n'
        for doc in documents_to_embeds
    )

In [39]:
# Reload the saved batch through load_documents_from_jsonl.
# NOTE: this rebinds `documents`, which until now held the list of dictionaries
# loaded from car_problems_multilingual.json. Re-running the earlier cells that
# index `documents` like dictionaries requires reloading that file first.
documents = load_documents_from_jsonl("documents_to_embed.jsonl")

# Display how many documents were loaded
len(documents)

60

## Step 3: Generate Embeddings for Documents

This step creates vector embeddings for all the documents that have supported languages. The embeddings are essential for enabling semantic search capabilities in the search index.

**What this process does:**

1. **Batch Processing**: Documents are processed in batches of 10 for optimal performance with the Cohere multilingual embedding model. This prevents timeout issues and manages API rate limits effectively.

2. **Vector Generation**: For each batch of documents:
    - Extracts the text content (the 'fault' field describing car problems)
    - Calls the Azure AI Inference endpoint with Cohere's multilingual model
    - Receives back 1024-dimensional vectors that capture the semantic meaning of each text

3. **Vector Assignment**: The generated vectors are then attached to their corresponding Document objects, creating a complete representation that includes both the original text and its semantic embedding.

4. **Progress Tracking**: The process uses index counters (`idx` and `idx_document`) to:
    - Track progress through the document list
    - Ensure vectors are correctly matched to their source documents
    - Handle any remaining documents that don't fill a complete batch

This embedding step is crucial for the indexing workflow as it transforms human-readable text into numerical representations that can be used for similarity searches, allowing users to find relevant car troubleshooting information across multiple languages using semantic meaning rather than exact keyword matches.

In [42]:
idx = 0       
number_of_documents = len(documents) - 1
documents_to_embed:List[str] = []

doc_test = [ documents[0],documents[1],documents[2] ]

idx_document = 0

print(len(doc_test))

while idx < len(documents):
            
    idx+=1   
    #print(idx)                     
    documents_to_embed.append(documents[idx-1].text)

    if idx % 10 == 0:
        vectors = await create_embeddings(documents_to_embed)

        for v in vectors:
            documents[idx_document].vector = v
            idx_document+=1            

        documents_to_embed.clear()

if len(documents_to_embed) > 0:
    vectors = await create_embeddings(documents_to_embed)

    for v in vectors:
        documents[idx_document].vector = v
        idx_document+=1

3


In [43]:
with open("documents_with_vectors.json", 'w', encoding='utf-8') as f:
    # Turn every document into a plain dictionary so it can be serialized
    json_data = [doc.model_dump() for doc in documents]
    f.write(json.dumps(json_data, indent=2, ensure_ascii=False))

Now add the vector to the original document

In [44]:
# vector_fix
# Load the embedded documents and the original car records
with open("documents_with_vectors.json", mode="r", encoding="utf-8") as f:
    doc_with_vectors = json.loads(f.read())

with open("car_problems_multilingual.json", mode="r", encoding="utf-8") as f:
    cars = json.loads(f.read())

# Lookup table {id: vector}
vector_dict = {}
for doc in doc_with_vectors:
    vector_dict[doc["id"]] = doc["vector"]

# Attach each vector to the car record with the same id and count the
# records that end up carrying a vector
cars_with_vector = 0
for car in cars:
    car_id = car["id"]
    if car_id in vector_dict:
        car["vector"] = vector_dict[car_id]
    cars_with_vector += 'vector' in car

print(f"Added vectors to {cars_with_vector} out of {len(cars)} documents")

Added vectors to 60 out of 60 documents


In [45]:
# One JSON object per line (JSON Lines layout), even though the file name ends in .json
with open("car_problems_multilingual_with_vectors.json", 'w', encoding='utf-8') as f:
    for car in cars:
        print(json.dumps(car, ensure_ascii=False), file=f)

Now upload the documents to the index

In [None]:
try:
    result = await search_client.upload_documents(cars)
    print("Upload of new document succeeded: {}".format(result[0].succeeded))
except Exception as ex:
    print(ex)
  