In [10]:
# AZURE AI Search index

In [1]:
# !pip install -U -q -r ../requirements.txt

In [2]:
aoai_connection_name = "open_ai_connection"
aoai_connection_name_emb = "open_ai_connection"
acs_connection_name = "acs-connection"
# data_set_name = "papers"
data_set_name = "row_data_ds"
asset_name = "aoai_acs_mlindex"
doc_intelligence_connection_name = "doc-intelligence-connection"
# data_set_name = "sample-data"
asset_name = f"{data_set_name}_aoai_acs_emb3_large_mlindex_v2_clean"
vision_deploy_name = "gpt-4o"
# aoai_embedding_model_name = "text-embedding-ada-002"
aoai_embedding_model_name = "text-embedding-3-large"

acs_config = {
    "index_name": asset_name,
}

experiment_name = f"{data_set_name}-acs-embedding"

In [3]:
import sys

sys.path.append('../chunk_caption_component/')
sys.path.append('../enhanced_doc_analyzer_component/')

In [4]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /config.json


In [5]:

from azure.ai.ml import load_component, load_environment

ml_registry = MLClient(credential=DefaultAzureCredential(), registry_name="azureml")

# Reads input folder of files containing chunks and their metadata as batches, in parallel, and generates embeddings for each chunk. Output format is produced and loaded by `azureml.rag.embeddings.EmbeddingContainer`.
generate_embeddings_component = ml_registry.components.get(
    "llm_rag_generate_embeddings", label="latest"
)
# Reads an input folder produced by `azureml.rag.embeddings.EmbeddingsContainer.save()` and pushes all documents (chunk, metadata, embedding_vector) into an Azure Cognitive Search index. Writes an MLIndex yaml detailing the index and embeddings model information.
update_acs_index_component = ml_registry.components.get(
    "llm_rag_update_acs_index", label="latest"
)
# Takes a uri to a storage location where an MLIndex yaml is stored and registers it as an MLIndex Data asset in the AzureML Workspace.
register_mlindex_component = ml_registry.components.get(
    "llm_rag_register_mlindex_asset", label="latest"
)

# Load components and environment
analyzer_component = load_component(source="./enhanced_doc_analyzer_component/doc_analyzer_component.yaml")
chunk_caption_index = load_component(source="./chuck_caption_component/chuck_caption_component.yaml")


In [6]:
print(update_acs_index_component.description)

Uploads `embeddings` into Azure Cognitive Search instance specified in `acs_config`. The Index will be created if it doesn't exist.

The Index will have the following fields populated:
- "id", String, key=True
- "content", String
- "contentVector", Collection(Single)
- "category", String
- "url", String
- "filepath", String
- "content_hash", String
- "meta_json_string", String

"meta_json_string" contains all metadata for a document serialized as a JSON string.



In [7]:

from azureml.rag.utils.deployment import infer_deployment
from azureml.rag.utils.connections import get_connection_by_id_v2
from azure.ai.ml import dsl, Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import Input, Output
from azure.ai.ml.entities._job.pipeline._io import PipelineInput
from typing import Optional, List
import json


aoai_connection_id = ml_client.connections.get(aoai_connection_name).id
aoai_connection_emb_id = ml_client.connections.get(aoai_connection_name_emb).id
aoai_connection = get_connection_by_id_v2(aoai_connection_id)
aoai_connection_emb = get_connection_by_id_v2(aoai_connection_emb_id)



embeddings_model_uri = f"azure_open_ai://deployment/{aoai_embedding_model_name}/model/{aoai_embedding_model_name}"
# embeddings_model = "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
embeddings_model = embeddings_model_uri


doc_intelligence_connection = ml_client.connections.get(doc_intelligence_connection_name)
acs_connection = ml_client.connections.get(acs_connection_name)

# Get the data asset with version
raw_papers = ml_client.data.get(data_set_name, version="1")
# Create Input object for the data
pdf_input = Input(type=AssetTypes.URI_FOLDER, path=raw_papers.path)


In [8]:

def optional_pipeline_input_provided(input: Optional[PipelineInput]):
    """Checks if optional pipeline inputs are provided."""
    return input is not None and input._data is not None

def use_automatic_compute(component, instance_count=1, instance_type="Standard_E8s_v3"):
    """Configure input `component` to use automatic compute with `instance_count` and `instance_type`.

    This avoids the need to provision a compute cluster to run the component.
    """
    component.set_resources(
        instance_count=instance_count,
        instance_type=instance_type,
        properties={"compute_specification": {"automatic": True}},
    )
    return component

@dsl.pipeline(
    description="Combined document analysis and azure AI search indexing pipeline",
    default_compute="serverless"
)
def document_processing_pipeline(
    # Document Analyzer inputs
    
    pdf_folder,
    asset_name: str,
    acs_config: str, 
    acs_connection_id: str,
    doc_intel_connection_id: str,
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    top_margin_percent: float = 0.05,  # as a ration of page height
    bottom_margin_percent: float = 0.05,  # as a percentage of page height
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    ocr_elements: str = 'formula,table',
    vision_deployment_name: str = "gpt-4o",
    embeddings_model: str = "hugging_face://model/sentence-transformers/all-mpnet-base-v2",
    embeddings_container=None,
    aoai_connection_id: str = None,
    aoai_connection_emb_id: str = None,
    # Compute settings
    analyzer_compute: str = "gpu-cluster",
    indexer_compute: str = "cpu-cluster"

):
    # Document Analyzer step
    analysis_job = analyzer_component(
        input_folder=pdf_folder,
        doc_intel_connection_id=doc_intel_connection_id,
        confidence_threshold=confidence_threshold,
        min_length=min_length,
        top_margin_percent=top_margin_percent,
        bottom_margin_percent=bottom_margin_percent,
        overlap_threshold=overlap_threshold,
        ignore_roles=ignore_roles,
        ocr_elements=ocr_elements
    )
    analysis_job.compute = analyzer_compute

    # Chunk Caption Index step
    # Using the output from document analyzer as input
    chunk_caption_job = chunk_caption_index(
        input_folder=analysis_job.outputs.output_dir,
        azure_openai_connection_id=aoai_connection_id,
        vision_deployment_name=vision_deployment_name,
    )
    chunk_caption_job.compute = indexer_compute

    generate_embeddings = generate_embeddings_component(
        chunks_source=chunk_caption_job.outputs.output_folder,
        embeddings_container=embeddings_container,
        embeddings_model=embeddings_model,
    )
    # use_automatic_compute(generate_embeddings)
    generate_embeddings.compute = indexer_compute
    if optional_pipeline_input_provided(aoai_connection_emb_id):
        generate_embeddings.environment_variables[
            "AZUREML_WORKSPACE_CONNECTION_ID_AOAI"
        ] = aoai_connection_emb_id
    if optional_pipeline_input_provided(embeddings_container):
        # If provided, `embeddings_container` is expected to be a URI to folder, the folder can be empty.
        # Each sub-folder is generated by a `create_embeddings_component` run and can be reused for subsequent embeddings runs.
        generate_embeddings.outputs.embeddings = Output(
            type="uri_folder", path=f"{embeddings_container.path}/{{name}}"
        )

    # `update_acs_index` takes the Embedded data produced by `generate_embeddings` and pushes it into an Azure Cognitive Search index.
    update_acs_index = update_acs_index_component(
        embeddings=generate_embeddings.outputs.embeddings, acs_config=acs_config
    )
    # use_automatic_compute(update_acs_index)
    update_acs_index.compute = indexer_compute
    if optional_pipeline_input_provided(acs_connection_id):
        update_acs_index.environment_variables[
            "AZUREML_WORKSPACE_CONNECTION_ID_ACS"
        ] = acs_connection_id

    register_mlindex = register_mlindex_component(
        storage_uri=update_acs_index.outputs.index, asset_name=asset_name
    )
    # use_automatic_compute(register_mlindex)
    register_mlindex.compute = indexer_compute
    return {
        "mlindex_asset_uri": update_acs_index.outputs.index,
        "mlindex_asset_id": register_mlindex.outputs.asset_id,
        "analyzer_output": analysis_job.outputs.output_dir,
        "final_output": chunk_caption_job.outputs.output_folder
    }




In [None]:
# Create pipeline
pipeline = document_processing_pipeline(
    # Document Analyzer params
    pdf_folder=pdf_input,
    asset_name=asset_name,
    doc_intel_connection_id=doc_intelligence_connection.id,
    acs_config=json.dumps(acs_config),
    acs_connection_id=acs_connection.id,
    confidence_threshold=0.3,
    min_length=15,
    top_margin_percent=0.05,  # as a ration of page height
    bottom_margin_percent=0.05,  # as a percentage of page height
    overlap_threshold=0.7,
    ignore_roles="pageFooter,footnote,pageHeader",
    ocr_elements='formula',
    
    # Chunk Caption Index params
    aoai_connection_id=aoai_connection_id,
    aoai_connection_emb_id=aoai_connection_emb_id,
    vision_deployment_name=vision_deploy_name,
    embeddings_model=embeddings_model,
    embeddings_container=None,
    
    # Compute settings
    analyzer_compute="gpu-cluster-a100",
    indexer_compute="cpu-cluster-low"
)

# These are added so that in progress index generations can be listed in UI, this tagging is done automatically by UI.
pipeline.properties["azureml.mlIndexAssetName"] = asset_name
pipeline.properties["azureml.mlIndexAssetKind"] = "acs"
pipeline.properties["azureml.mlIndexAssetSource"] = data_set_name
 

UserErrorException: Pipeline input expected an azure.ai.ml.Input or primitive types (str, bool, int or float), but got type <class 'list'>.

In [None]:

# Submit the pipeline
run = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name=experiment_name,
    tags={"type": "sample-document-processing"}
)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading enhanced_doc_analyzer_component (0.12 MBs): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [None]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /config.json


In [None]:
asset_name

'row_data_ds_aoai_acs_emb3_large_mlindex_v2_clean'

In [None]:
from azureml.rag.mlindex import MLIndex

# asset_name = "sample_aoai_acs_mlindex"
# asset_name = "aoai_faiss_mlindex"

question = "how many steps are in metalloporphyrins synthesis?"

retriever = MLIndex(
    ml_client.data.get(asset_name, label="latest")
)
# .as_langchain_retriever()
# retriever.get_relevant_documents(question)

Resolving access token for scope "https://storage.azure.com/.default" using identity of type "MANAGED".
Getting data access token with Assigned Identity (client_id=clientid) and endpoint type based on configuration


In [None]:
retriever.as_langchain_retriever().get_relevant_documents(question)

  return original_attr(*args, **kwargs)


HttpResponseError: (InvalidRequestParameter) Unknown field 'contentVector' in vector field list.
Code: InvalidRequestParameter
Message: Unknown field 'contentVector' in vector field list.
Exception Details:	(UnknownField) Unknown field 'contentVector' in vector field list.
	Code: UnknownField
	Message: Unknown field 'contentVector' in vector field list.

In [None]:
pip install azureml-rag[cognitive_search]==0.2.38

Collecting azure-search-documents>=11.4.0 (from azureml-rag[cognitive_search]==0.2.38)
  Downloading azure_search_documents-11.5.2-py3-none-any.whl.metadata (23 kB)
Downloading azure_search_documents-11.5.2-py3-none-any.whl (298 kB)
Installing collected packages: azure-search-documents
Successfully installed azure-search-documents-11.5.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
from azureml.rag.utils.connections import get_connection_by_id_v2

aoai_connection_id = ml_client.connections.get("aoai-sweden-505").id
aoai_connection = get_connection_by_id_v2(aoai_connection_id)

ActivityCompleted: Activity=WorkspaceConnections.Get, HowEnded=Failure, Duration=402.97 [ms], Exception=ResourceNotFoundError, ErrorCategory=UserError, ErrorMessage=(UserError) Connection aoai-sweden-505 can't be found in this workspace
Code: UserError
Message: Connection aoai-sweden-505 can't be found in this workspace


ResourceNotFoundError: (UserError) Connection aoai-sweden-505 can't be found in this workspace
Code: UserError
Message: Connection aoai-sweden-505 can't be found in this workspace

In [None]:
from langchain.chains import RetrievalQA
from azureml.rag.models import init_llm, parse_model_uri

model_config = parse_model_uri(
    "azure_open_ai://deployment/gpt-35-turbo/model/gpt-35-turbo"
)
model_config["azure_endpoint"] = aoai_connection.target
model_config["api_key"] = aoai_connection.api_key
model_config["temperature"] = 0.3
model_config["max_retries"] = 3
model_config["deployment"] = "gpt-4v"
model_config["model"] = "gpt-4"

llm=init_llm(model_config)

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever
)

qa.run(question)

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=llm
)

In [None]:
retriever_from_llm.invoke(question)

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever_from_llm
)

qa.run(question)