In [20]:
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics.aio import TextAnalyticsClient
from typing import List
from models.document import Document
from azure.ai.inference.aio import EmbeddingsClient
from azure.core.credentials import AzureKeyCredential
from dotenv import load_dotenv
from azure.search.documents.aio import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.translation.text import TranslatorCredential
from azure.ai.translation.text.models import InputTextItem
from azure.ai.translation.text.aio import TextTranslationClient
from openai import AsyncAzureOpenAI
from typing import Dict
import os
import pandas as pd
import json
import asyncio

In [18]:
# Load settings from the local .env file before reading any of them.
load_dotenv(override=True)

# --- Azure AI Language (language detection) ---
language_endpoint = os.environ.get('LANGUAGE_ENDPOINT')
language_api_key = os.environ.get('LANGUAGE_KEY')
# NOTE(review): `region` is not referenced by any cell visible in this notebook.
region = "westus"

# --- Azure AI Translator ---
translation_endpoint = os.environ.get('TRANSLATION_ENDPOINT')
translation_key = os.environ.get('TRANSLATION_KEY')
translation_region = os.environ.get('TRANSLATION_REGION')

# --- Cohere embeddings served through Azure AI Inference ---
cohere_key = os.environ.get('COHERE_KEY')
cohere_model = os.environ.get('COHERE_MODEL')
cohere_endpoint = os.environ.get('COHERE_ENDPOINT')

# --- Azure AI Search ---
search_endpoint = os.environ.get('SEARCH_ENDPOINT')
search_api_key = os.environ.get('SEARCH_API_KEY')

In [19]:
# Name of the Azure AI Search index that receives the documents.
index_name = "translated_dual"

# Azure AI Search client bound to that index.
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(search_api_key),
)

# Language detection client.
text_analytics_client = TextAnalyticsClient(
    language_endpoint,
    AzureKeyCredential(language_api_key),
)

# Translator client; its credential carries both the key and the region.
credential = TranslatorCredential(translation_key, translation_region)
text_translator = TextTranslationClient(
    endpoint=translation_endpoint,
    credential=credential,
)

# Embeddings client for the Cohere deployment.
model_name = cohere_model
client = EmbeddingsClient(
    endpoint=cohere_endpoint,
    credential=AzureKeyCredential(cohere_key),
)

In [9]:
async def get_language_documents(docs:list):
    """Detect the primary language of a batch of documents with Azure Text Analytics.

    Args:
        docs: Batch of documents, forwarded unchanged to
            ``text_analytics_client.detect_language``. The notebook passes
            dicts with ``id`` and ``text`` keys.

    Returns:
        A list with one dict per result returned by the service:

        - ``id``: the document identifier reported by the service.
        - ``language_code``: the detected ``iso6391_name`` (successful results
          only). ``"zh_chs"`` is rewritten to ``"zh"`` because the Chinese
          Simplified code differs between the AI service and Cohere.
        - ``is_error`` / ``error``: present only when detection failed for
          that document; ``error`` is the error object from the service.

        An empty (or ``None``) ``docs`` returns ``[]`` without calling the
        service.
    """
    # The SDK rejects an empty batch, so short-circuit instead of raising.
    if not docs:
        return []

    detections = await text_analytics_client.detect_language(docs)
    processed_documents = []

    for detection in detections:
        entry = {"id": detection.id}

        if detection.is_error:
            entry['is_error'] = True
            entry['error'] = detection.error
        else:
            code = detection.primary_language.iso6391_name
            # Only the Chinese Simplified code is normalised; every other code
            # is passed through as reported.
            entry['language_code'] = "zh" if code == "zh_chs" else code

        processed_documents.append(entry)

    return processed_documents

async def create_embeddings_cohere(documents:List[str]) -> List[List[float]]:
    """Embed texts through the Azure AI Inference endpoint with the configured Cohere model.

    Args:
        documents: Texts to embed; sent in a single ``client.embed`` request.

    Returns:
        One embedding per input text, in the order of ``response.data``.
        An empty ``documents`` returns ``[]`` without calling the endpoint.
    """
    # Nothing to embed: skip the remote call entirely.
    if not documents:
        return []

    response = await client.embed(input=documents,
                                  model=cohere_model)

    return [item['embedding'] for item in response.data]

def csv_to_json_array(csv_file:str, output_file:str):
    """Convert a CSV or Excel file to a JSON array with snake_case field names.

    Args:
        csv_file: Path of the input file. Paths ending in ``.xlsx`` or ``.xls``
            (any letter case) are read with ``pandas.read_excel``; anything
            else is read with ``pandas.read_csv``.
        output_file: Path of the JSON file to write (UTF-8, non-ASCII
            characters kept as-is).

    Side effects:
        Prints a short conversion summary and writes ``output_file``. Missing
        values are written as empty strings. A file without data rows produces
        an empty JSON array.
    """

    def to_snake_case(name):
        # str() guards against non-string column labels (e.g. numeric headers).
        return str(name).replace(' ', '_').lower()

    # Pick the reader from the file extension, ignoring letter case.
    if csv_file.lower().endswith(('.xlsx', '.xls')):
        df = pd.read_excel(csv_file)
    else:
        df = pd.read_csv(csv_file)

    # Replace NaN values with empty strings
    df = df.fillna('')

    # Rename all columns from "Title Case" to "snake_case"
    df.columns = [to_snake_case(col) for col in df.columns]

    # Convert DataFrame to list of dictionaries (JSON objects)
    data = df.to_dict(orient='records')

    print(f"Converted {len(data)} records from {csv_file} to JSON array")
    print(f"Converted column names: {list(df.columns)}")
    if data:
        print("\nFirst record example:")
        print(json.dumps(data[0], indent=2))
    else:
        # No rows: there is no first record to preview.
        print("\nNo records found; writing an empty JSON array")

    # Save JSON array to file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"\nJSON array saved to: {output_file}")

In [None]:
# Convert the workbook to a JSON file, then read the records back into memory.
csv_to_json_array(
    output_file="car_problems_multilingual.json",
    csv_file="car_problems_multilingual.xlsx",
)

with open("car_problems_multilingual.json", mode="r", encoding="utf-8") as f:
    documents = json.loads(f.read())

In [12]:
docs = []
processed_documents = []

for document in documents:

    # Detect the language of the documents, here the maximum is 1000 documents with a size of 1 MB
    docs.append({
        "id": document["id"],
        "text": document['fault']
    })


results = await get_language_documents(docs)
# Map the language results back to the original documents
for result in results:
    # Find the matching document by id
    matching_doc = next((d for d in documents if d['id'] == result['id']), None)
    if matching_doc and not result.get('is_error'):
        matching_doc['language_code'] = result['language_code']
        processed_documents.append(matching_doc)

In [None]:
# Inspect the documents that now carry a detected language_code.
print(processed_documents)

In [None]:
enriched_documents:List[Dict] = []

text_to_embed:List[str] = []
text_to_translate:List[InputTextItem] = []

for doc in processed_documents:
   
   doc['original_language'] = doc['language_code']
   text_to_embed.append(doc['fault'])

   if doc['language_code'] == 'en':      
      enriched_documents.append(doc)
      continue

   text_to_translate.append(InputTextItem(text=doc['brand']))
   text_to_translate.append(InputTextItem(text=doc['model']))
   text_to_translate.append(InputTextItem(text=doc['fault']))
   text_to_translate.append(InputTextItem(text=doc['fix']))
   
   texts = await text_translator.translate(content=text_to_translate,to=['en'])
      
   doc['brand_en'] = texts[0].translations[0].text
   doc['model_en'] = texts[1].translations[0].text
   doc['fault_en'] = texts[2].translations[0].text
   doc['fix_en'] = texts[3].translations[0].text

   enriched_documents.append(doc)
   text_to_translate.clear()

print(f"{len(enriched_documents)} documents to enrich")

60 documents to enrich


In [None]:
# Preview the enriched documents. ensure_ascii=False keeps non-English text
# readable instead of \uXXXX escapes, matching how the JSON files are written.
print(json.dumps(enriched_documents, indent=4, ensure_ascii=False))

In [None]:
vectorized_documents:List[Dict] = []

for doc in enriched_documents:

    # Here only one field to vectorize
    if doc['language_code'] == 'en':
      vector = await create_embeddings_cohere([doc['fault']])
      doc['vector'] = vector[0]
      vectorized_documents.append(doc)
      # Add a small delay to avoid rate limiting
      await asyncio.sleep(5)      
      continue

    vectors = await create_embeddings_cohere([doc['fault'],doc['fault_en']])
    doc['vector'] = vectors[0]
    doc['vector_en'] = vectors[1]

    vectorized_documents.append(doc)

    # Add a small delay to avoid rate limiting
    await asyncio.sleep(5)

print(f"{len(vectorized_documents)} vectorized")


60 vectorized


In [36]:
# Remove the field that is not needed in the output. New dicts are built instead
# of popping in place: these dict objects are shared with enriched_documents and
# processed_documents, and the embedding cell reads doc['language_code'], so an
# in-place pop would make that cell fail when it is re-run.
vectorized_documents = [
    {key: value for key, value in doc.items() if key != 'language_code'}
    for doc in vectorized_documents
]

with open("documents_hybrid.json", 'w', encoding='utf-8') as f:
    json.dump(vectorized_documents, f, indent=2, ensure_ascii=False)

In [None]:
try:
    result = await search_client.upload_documents(vectorized_documents)
    print("Upload of new document succeeded: {}".format(result[0].succeeded))
except Exception as ex:
    print(ex)