# Azure Cognitive Search Integrated Vectorization Sample
This code demonstrates how to use Azure Cognitive Search as a Vector store by automatically chunking and generating embeddings using the Azure OpenAI Embedding skill as part of the skillset pipeline in Azure Cognitive Search. 
## Prerequisites
To run the code, install the following packages. 

In [13]:
! pip uninstall azure_search_documents --yes

Found existing installation: azure-search-documents 11.4.0b11
Uninstalling azure-search-documents-11.4.0b11:
  Successfully uninstalled azure-search-documents-11.4.0b11


In [14]:
! pip install azure_search_documents --quiet 

In [9]:
! python --version

Python 3.10.9


In [1]:
! pip freeze

aiohttp==3.8.4
aiosignal==1.3.1
asttokens @ file:///opt/conda/conda-bld/asttokens_1646925590279/work
async-timeout==4.0.2
attrs==22.2.0
azure-common==1.1.28
azure-core==1.29.5
azure-search-documents==11.4.0
azure-storage-blob==12.19.0
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
certifi @ file:///C:/b/abs_85o_6fm0se/croot/certifi_1671487778835/work/certifi
cffi==1.16.0
charset-normalizer==3.0.1
colorama @ file:///C:/b/abs_a9ozq0l032/croot/colorama_1672387194846/work
comm @ file:///C:/b/abs_1419earm7u/croot/comm_1671231131638/work
contourpy==1.0.7
cryptography==41.0.5
cycler==0.11.0
debugpy @ file:///C:/ci_310/debugpy_1642079916595/work
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
docopt==0.6.2
entrypoints @ file:///C:/ci/entrypoints_1649926676279/work
executing @ file:///opt/conda/conda-bld/executing_1646925071911/work
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
huggingface-hub==0.12.1
idna==3.4
ipykernel @ file:///C:/b/abs_b4f07tbsy

## Import required libraries and environment variables

In [5]:
# Import required libraries  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,

    QueryType,
    VectorizedQuery,
    
    VectorFilterMode,    
)
#from azure.search.documents.models import (
   # RawVectorQuery,
 #   VectorizedQuery,
   # QueryLanguage,
#)
from azure.search.documents.indexes.models import (  
    #AzureOpenAIEmbeddingSkill,  
    #AzureOpenAIParameters,  
    #AzureOpenAIVectorizer,  
    ExhaustiveKnnParameters,  
    #ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    FieldMapping,  
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,  
    #IndexProjectionMode,  
    InputFieldMappingEntry,  
    OutputFieldMappingEntry,  
    #PrioritizedFields,    
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    SearchIndexer,  
    SearchIndexerDataContainer,  
    SearchIndexerDataSourceConnection,  
    #SearchIndexerIndexProjectionSelector,  
    #SearchIndexerIndexProjections,  
    #SearchIndexerIndexProjectionsParameters,  
    SearchIndexerSkillset,  
    SemanticConfiguration,  
    SemanticField,  
    #SemanticSettings,  
    SplitSkill,
    SqlIntegratedChangeTrackingPolicy,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchAlgorithmMetric,  
    VectorSearchProfile,  
)  
from azure.storage.blob import BlobServiceClient  
import openai  
from dotenv import load_dotenv  
import os  
  
# Configure environment variables
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME_01")  
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")  
model: str = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL")
blob_connection_string = os.getenv("BLOB_CONNECTION_STRING")  
container_name = os.getenv("BLOB_CONTAINER_NAME")  
credential = AzureKeyCredential(key)  
EMBEDDING_LENGTH = 1536     # 768 for GPT-2, 1024 for GPT-3
server = os.getenv("MYSQL_SERVER")
database = os.getenv("MYSQL_DATABASE")
username = os.getenv("MYSQL_USERNAME")
password = os.getenv("MYSQL_PASSWORD")
table_name = os.getenv("MYSQL_TABLE_NAME")

ImportError: cannot import name 'HnswVectorSearchAlgorithmConfiguration' from 'azure.search.documents.indexes.models' (c:\Users\jherna01\miniconda3\envs\env-openai\lib\site-packages\azure\search\documents\indexes\models\__init__.py)

## Connect to MySQL to a data source in Cognitive Search

In [9]:
ds_conn_str = f'Server={server}; Port=3306; Database={database}; Uid={username}; Pwd={password}; SslMode=Preferred;'

# Create a data source
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
container = SearchIndexerDataContainer(name=table_name)

#change_detection_policy = SqlIntegratedChangeTrackingPolicy()

data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-ds",
    type="mysql",
    connection_string=ds_conn_str,
    container=container
    #,data_change_detection_policy=change_detection_policy
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

HttpResponseError: () Data source type 'mysql' is not supported for this API version
Code: 
Message: Data source type 'mysql' is not supported for this API version

## Create a search index

In [4]:
# Create a search index  
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)  
fields = [  
  # Properties of individual chunk
    SearchField(name="Id", type=SearchFieldDataType.String, key=True,
                sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), 
                vector_search_dimensions=EMBEDDING_LENGTH, vector_search_profile="myHnswProfile"),
    # Properties of original row in DB that the chunk belonged to
    SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="parent_numero", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="parent_descripcion_breve", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="parent_descripcion", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True)

]  
  
# Configure the vector search configuration  
vector_search = VectorSearch(  
    algorithms=[  
        HnswVectorSearchAlgorithmConfiguration(  
            name="myHnsw",  
            kind=VectorSearchAlgorithmKind.HNSW,  
            parameters=HnswParameters(  
                m=4,  
                ef_construction=400,  
                ef_search=500,  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
        ExhaustiveKnnVectorSearchAlgorithmConfiguration(  
            name="myExhaustiveKnn",  
            kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,  
            parameters=ExhaustiveKnnParameters(  
                metric=VectorSearchAlgorithmMetric.COSINE,  
            ),  
        ),  
    ],  
    profiles=[  
        VectorSearchProfile(  
            name="myHnswProfile",  
            algorithm="myHnsw",  
            vectorizer="myOpenAI",  
        ),  
        VectorSearchProfile(  
            name="myExhaustiveKnnProfile",  
            algorithm="myExhaustiveKnn",  
            vectorizer="myOpenAI",  
        ),  
    ],  
    vectorizers=[  
        AzureOpenAIVectorizer(  
            name="myOpenAI",  
            kind="azureOpenAI",  
            azure_open_ai_parameters=AzureOpenAIParameters(  
                resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),  
                deployment_id=model,  
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
            ),  
        ),  
    ],  
)  
  
semantic_config = SemanticConfiguration(  
    name="my-semantic-config",  
    prioritized_fields=PrioritizedFields(  
        prioritized_content_fields=[SemanticField(field_name="Id")]  
    ),  
)  
  
# Create the semantic settings with the configuration  
semantic_settings = SemanticSettings(configurations=[semantic_config])  
  
# Create the search index with the semantic settings  
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_settings=semantic_settings)  
result = index_client.create_or_update_index(index)  
print(f"{result.name} created")  


vector_search_profile is not a known attribute of class <class 'azure.search.documents.indexes.models._index.SearchField'> and will be ignored


NameError: name 'HnswVectorSearchAlgorithmConfiguration' is not defined

## Create a skillset

In [5]:
# Create a skillset  
skillset_name = f"{index_name}-skillset"  
  
split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document",  
    maximum_page_length=300,  
    page_overlap_length=20,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/descripcion"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ],  
)  
  
embedding_skill = AzureOpenAIEmbeddingSkill(  
    description="Skill to generate embeddings via Azure OpenAI",  
    context="/document/pages/*",  
    resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),  
    deployment_id=model,  
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/pages/*"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="embedding", target_name="vector")  
    ],  
)  
  
index_projections = SearchIndexerIndexProjections(  
    selectors=[  
        SearchIndexerIndexProjectionSelector(  
            target_index_name=index_name,  
            parent_key_field_name="parent_id",  
            source_context="/document/pages/*",  
            mappings=[  
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
               # InputFieldMappingEntry(name="Id", source="/document/numero"),
                InputFieldMappingEntry(name="parent_numero", source="/document/numero"),
                InputFieldMappingEntry(name="parent_descripcion_breve", source="/document/descripcion_breve"),
                InputFieldMappingEntry(name="parent_descripcion", source="/document/descripcion")
            ],  
        ),  
    ],  
    parameters=SearchIndexerIndexProjectionsParameters(  
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
    ),  
)  
  
skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to chunk documents and generating embeddings",  
    skills=[split_skill, embedding_skill],  
    index_projections=index_projections,  
)  
  
client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
client.create_or_update_skillset(skillset)  
print(f"{skillset.name} created")  


ticket-mysql-vector-02-skillset created


## Create an indexer

In [6]:
# Create an indexer  
indexer_name = f"{index_name}-indexer"  
  
indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index documents and generate embeddings",  
    skillset_name=skillset_name,  
    target_index_name=index_name,  
    #data_source_name=data_source.name,  
    data_source_name="ticket-mysql-ds",
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results  
    field_mappings=[FieldMapping(source_field_name="numero", target_field_name="Id")]  
)  
  
indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} created')  


 ticket-mysql-vector-02-indexer created


## Perform a vector similarity search

This example shows a pure vector search using the vectorizable text query, all you need to do is pass in text and your vectorizer will handle the query vectorization.

In [7]:
# Pure Vector Search
query = "que se requiere para liberar la app afore?"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizedQuery(text=query, k=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["parent_id", "Id", "chunk"],
    top=1
)  
  
for result in results:  
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['Id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['chunk']}")   


parent_id: CHG0032603
chunk_id: 13a128904827_CHG0032603_pages_0
Score: 0.8629935
Content: SE REQUIERE LIBERAR LA APP DE MODIFICACION DE DATOS  EN ATENCION A FT RITM0123459 PARA LA CORRECCION DE LAS SIGUIENTES INCIDENCIAS.

*SE ENCUENTRAN CON EL CAMPO VACÍO DE PAÍS EN LOS DATOS DE DOMICILIO PARTICULAR Y LABORAL.

*TRAMITES DE APODERADO LEGAL LLEGABAN COMO BENEFICIARIO A PORTAL MIT


## Perform a hybrid search

In [28]:
# Hybrid Search
query = "que se requiere para liberar la app afore?"  
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k=1, fields="vector", exhaustive=True)
  
results = search_client.search(  
    search_text=query,  
    vector_queries= [vector_query],
    select=["parent_id", "Id", "chunk"],
    top=1
)  
  
for result in results:  
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['Id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['chunk']}")  


parent_id: CHG0032603
chunk_id: 69685ba19050_CHG0032603_pages_0
Score: 0.03253968432545662
Content: SE REQUIERE LIBERAR LA APP DE MODIFICACION DE DATOS  EN ATENCION A FT RITM0123459 PARA LA CORRECCION DE LAS SIGUIENTES INCIDENCIAS.

*SE ENCUENTRAN CON EL CAMPO VACÍO DE PAÍS EN LOS DATOS DE DOMICILIO PARTICULAR Y LABORAL.

*TRAMITES DE APODERADO LEGAL LLEGABAN COMO BENEFICIARIO A PORTAL MIT


## Perform a hybrid search + semantic reranking

In [8]:
# Semantic Hybrid Search
query = "que se requiere para liberar la app afore?"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizableTextQuery(text=query, k=2, fields="vector", exhaustive=True)

results = search_client.search(  
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "Id", "chunk"],
    query_type=QueryType.SEMANTIC, query_language=QueryLanguage.EN_US, semantic_configuration_name='my-semantic-config', query_caption=QueryCaptionType.EXTRACTIVE, query_answer=QueryAnswerType.EXTRACTIVE,
    top=2
)

semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['Id']}")  
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['chunk']}")  

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


parent_id: CHG0032603
chunk_id: 13a128904827_CHG0032603_pages_0
Reranker Score: 1.6374846696853638
Content: SE REQUIERE LIBERAR LA APP DE MODIFICACION DE DATOS  EN ATENCION A FT RITM0123459 PARA LA CORRECCION DE LAS SIGUIENTES INCIDENCIAS.

*SE ENCUENTRAN CON EL CAMPO VACÍO DE PAÍS EN LOS DATOS DE DOMICILIO PARTICULAR Y LABORAL.

*TRAMITES DE APODERADO LEGAL LLEGABAN COMO BENEFICIARIO A PORTAL MIT
Caption: 

parent_id: CHG0032599
chunk_id: 2ab08323ce1b_CHG0032599_pages_0
Reranker Score: 1.6374846696853638
Content: SE REQUIERE LA ACTUALIZACIÓN DE LOS COMPONENTES DE ACTIVIDAD COMERCIAL, ESTUDIO DE RETIRO PERSONALIZADO Y PROSPECTA QUE INCLUYE LAS MEJORAS DE CORRECCIÓN DE HORARIO EN LA ZONA CST Y CAMBIO DE SQL LITE A SECURE STORAGE, MODIFICACIÓN IOS MENÚ PRINCIPAL.
Caption: 

