In [1]:
import sys

sys.path.append('../chunk_caption_component/')
sys.path.append('../enhanced_doc_analyzer_component/')

In [2]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /config.json


In [3]:

from azure.ai.ml import load_component, load_environment
from azure.ai.ml import dsl, Input
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import Input, Output
from azure.ai.ml.entities._job.pipeline._io import PipelineInput
from typing import Optional
import json
import os

ml_registry = MLClient(credential=DefaultAzureCredential(), registry_name="azureml")

# Reads input folder of files containing chunks and their metadata as batches, in parallel, and generates embeddings for each chunk. Output format is produced and loaded by `azureml.rag.embeddings.EmbeddingContainer`.
generate_embeddings_component = ml_registry.components.get(
    "llm_rag_generate_embeddings", label="latest"
)
# Reads an input folder produced by `azureml.rag.embeddings.EmbeddingsContainer.save()` and pushes all documents (chunk, metadata, embedding_vector) into an Azure Cognitive Search index. Writes an MLIndex yaml detailing the index and embeddings model information.
update_acs_index_component = ml_registry.components.get(
    "llm_rag_update_acs_index", label="latest"
)
# Takes a uri to a storage location where an MLIndex yaml is stored and registers it as an MLIndex Data asset in the AzureML Workspace.
register_mlindex_component = ml_registry.components.get(
    "llm_rag_register_mlindex_asset", label="latest"
)

# Load components and environment
analyzer_component = load_component(source="./enhanced_doc_analyzer_component/doc_analyzer_component.yaml")
chunk_caption_index = load_component(source="./chuck_caption_component/chuck_caption_component.yaml")


In [4]:

from azureml.rag.utils.deployment import infer_deployment
from azureml.rag.utils.connections import get_connection_by_id_v2

aoai_connection_name = "open_ai_connection"
acs_connection_name = "acs-connection"
data_set_name = "papers"
asset_name = "aoai_acs_mlindex"
doc_intelligence_connection_name = "doc-intelligence-connection"
vision_deploy_name = "gpt-4"
aoai_embedding_model_name = "text-embedding-3-large"

acs_config = {
    "index_name": "qknows-embedding",
}

experiment_name = "sample-acs-embedding"

aoai_connection_id = ml_client.connections.get(aoai_connection_name).id
aoai_connection = get_connection_by_id_v2(aoai_connection_id)



embeddings_model_uri = f"azure_open_ai://deployment/{aoai_embedding_model_name}/model/{aoai_embedding_model_name}"
# embeddings_model = "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
embeddings_model = embeddings_model_uri


doc_intelligence_connection = ml_client.connections.get(doc_intelligence_connection_name)
acs_connection = ml_client.connections.get(acs_connection_name)

# Get the data asset with version
raw_papers = ml_client.data.get(data_set_name, version="1")
# Create Input object for the data
pdf_input = Input(type=AssetTypes.URI_FOLDER, path=raw_papers.path)

In [5]:


def optional_pipeline_input_provided(input: Optional[PipelineInput]):
    """Checks if optional pipeline inputs are provided."""
    return input is not None and input._data is not None

def use_automatic_compute(component, instance_count=1, instance_type="Standard_E8s_v3"):
    """Configure input `component` to use automatic compute with `instance_count` and `instance_type`.

    This avoids the need to provision a compute cluster to run the component.
    """
    component.set_resources(
        instance_count=instance_count,
        instance_type=instance_type,
        properties={"compute_specification": {"automatic": True}},
    )
    return component

@dsl.pipeline(
    description="Combined document analysis and azure AI search indexing pipeline",
    default_compute="cpu-cluster-low",
)
def document_processing_pipeline(
    # Document Analyzer inputs
    
    pdf_folder,
    asset_name: str,
    doc_intel_connection_id: str,
    acs_config: str,
    acs_connection_id: str,
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    vision_deployment_name: str = "gpt-4",
    embeddings_model: str = "hugging_face://model/sentence-transformers/all-mpnet-base-v2",
    embeddings_container=None,
    aoai_connection_id: str = None,
    # Compute settings
    analyzer_compute: str = "gpu-cluster-a100",
    indexer_compute: str = "cpu-cluster-low"

):
    # Document Analyzer step
    analysis_job = analyzer_component(
        input_folder=pdf_folder,
        doc_intel_connection_id=doc_intel_connection_id,
        confidence_threshold=confidence_threshold,
        min_length=min_length,
        overlap_threshold=overlap_threshold,
        ignore_roles=ignore_roles
    )
    analysis_job.compute = analyzer_compute

    # Chunk Caption Index step
    # Using the output from document analyzer as input
    chunk_caption_job = chunk_caption_index(
        input_folder=analysis_job.outputs.output_dir,
        azure_openai_connection_id=aoai_connection_id,
        vision_deployment_name=vision_deployment_name,
    )
    chunk_caption_job.compute = indexer_compute

    generate_embeddings = generate_embeddings_component(
        chunks_source=chunk_caption_job.outputs.output_folder,
        embeddings_container=embeddings_container,
        embeddings_model=embeddings_model,
    )
    # use_automatic_compute(generate_embeddings)
    generate_embeddings.compute = indexer_compute
    if optional_pipeline_input_provided(aoai_connection_id):
        generate_embeddings.environment_variables[
            "AZUREML_WORKSPACE_CONNECTION_ID_AOAI"
        ] = aoai_connection_id
    if optional_pipeline_input_provided(embeddings_container):
        # If provided, `embeddings_container` is expected to be a URI to folder, the folder can be empty.
        # Each sub-folder is generated by a `create_embeddings_component` run and can be reused for subsequent embeddings runs.
        generate_embeddings.outputs.embeddings = Output(
            type="uri_folder", path=f"{embeddings_container.path}/{{name}}"
        )

    # `update_acs_index` takes the Embedded data produced by `generate_embeddings` and pushes it into an Azure Cognitive Search index.
    update_acs_index = update_acs_index_component(
        embeddings=generate_embeddings.outputs.embeddings, acs_config=acs_config
    )
    # use_automatic_compute(update_acs_index)
    update_acs_index.compute = indexer_compute
    if optional_pipeline_input_provided(acs_connection_id):
        update_acs_index.environment_variables[
            "AZUREML_WORKSPACE_CONNECTION_ID_ACS"
        ] = acs_connection_id

    register_mlindex = register_mlindex_component(
        storage_uri=update_acs_index.outputs.index, asset_name=asset_name
    )
    # use_automatic_compute(register_mlindex)
    register_mlindex.compute = indexer_compute
    return {
        "mlindex_asset_uri": update_acs_index.outputs.index,
        "mlindex_asset_id": register_mlindex.outputs.asset_id,
        "analyzer_output": analysis_job.outputs.output_dir,
        "final_output": chunk_caption_job.outputs.output_folder
    }




In [6]:
# Create pipeline
pipeline = document_processing_pipeline(
    # Document Analyzer params
    pdf_folder=pdf_input,
    asset_name=asset_name,
    doc_intel_connection_id=doc_intelligence_connection.id,
    acs_config=json.dumps(acs_config),
    acs_connection_id=acs_connection.id,
    confidence_threshold=0.3,
    min_length=15,
    overlap_threshold=0.7,
    ignore_roles="pageFooter,footnote,pageHeader",
    
    # Chunk Caption Index params
    aoai_connection_id=aoai_connection_id,
    vision_deployment_name=vision_deploy_name,
    embeddings_model=embeddings_model,
    embeddings_container=None,
    
    # Compute settings
    analyzer_compute="gpu-cluster-a100",
    indexer_compute="cpu-cluster-low"
)

# These are added so that in progress index generations can be listed in UI, this tagging is done automatically by UI.
pipeline.properties["azureml.mlIndexAssetName"] = asset_name
pipeline.properties["azureml.mlIndexAssetKind"] = "acs"
pipeline.properties["azureml.mlIndexAssetSource"] = "raw_papers"
 

In [7]:

# Submit the pipeline
run = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="document-processin-aml-components-pipeline",
    tags={"type": "document-processing"}
)

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFileJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored
pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


In [8]:
from azureml.rag.mlindex import MLIndex



question = "how many steps are in metalloporphyrins synthesis?"

faias_retriever = MLIndex(
    ml_client.data.get(asset_name, label="latest")
).as_langchain_retriever()
retriever.get_relevant_documents(question)

MlException: 
[37m
[30m
1) Resource was not found.
[39m[39m

Details: 

[31m(x) Asset aoai_acs_mlindex does not exist in workspace mlw-acadocser-dev-04.[39m

Resolutions: 
1) Double-check that the resource has been specified correctly and that you have access to it.
If using the CLI, you can also check the full log in debug mode for more details by adding --debug to the end of your command

Additional Resources: The easiest way to author a yaml specification file is using IntelliSense and auto-completion Azure ML VS code extension provides: [36mhttps://code.visualstudio.com/docs/datascience/azure-machine-learning.[39m To set up VS Code, visit [36mhttps://docs.microsoft.com/azure/machine-learning/how-to-setup-vs-code[39m


In [None]:
from azureml.rag.utils.connections import get_connection_by_id_v2

aoai_connection_id = ml_client.connections.get("aoai-sweden-505").id
aoai_connection = get_connection_by_id_v2(aoai_connection_id)

In [None]:
from langchain.chains import RetrievalQA
from azureml.rag.models import init_llm, parse_model_uri

model_config = parse_model_uri(
    "azure_open_ai://deployment/gpt-35-turbo/model/gpt-35-turbo"
)
model_config["azure_endpoint"] = aoai_connection.target
model_config["api_key"] = aoai_connection.api_key
model_config["temperature"] = 0.3
model_config["max_retries"] = 3
model_config["deployment"] = "gpt-4v"
model_config["model"] = "gpt-4"

llm=init_llm(model_config)

qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever
)

qa.run(question)

  llm = AzureChatOpenAI(
  qa.run(question)


"I don't know the answer to how many steps are in metalloporph"

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=retriever, llm=llm
)

In [None]:
retriever_from_llm.invoke(question)

[Document(metadata={'source_doc_id': 'Eur J Org Chem - 2020 - Breugst - ‐Hole Interactions in Catalysis.pdf', 'chunk_hash': '3116fd17b82e85c53f20791851eea6d233b0edc9d0935a70b463e76335f81c51', 'mtime': None, 'page_number': 15, 'stats': {'tiktokens': 9, 'chars': 22, 'lines': 1}, 'source': {'filename': 'Eur J Org Chem - 2020 - Breugst - ‐Hole Interactions in Catalysis.pdf', 'url': '', 'mtime': None, 'role': nan, 'image_path': nan, 'confidence': nan, 'source': 'azure_document_intelligence', 'bounding_box': '(4.20, 7.24, 5.24, 7.36)', 'page': 15, 'id': '24fdc787-653a-4770-959f-f13ff2a62693'}}, page_content='Received: May 13, 2020'),
 Document(metadata={'source_doc_id': 'ChemBioChem - 2020 - Valverde - Molecular Recognition in C‐Type Lectins  The Cases of DC‐SIGN  Langerin  MGL  and L‐Sectin.pdf', 'chunk_hash': 'ef66b46c575b5eaadb34e2e9285b1a1e5a5e2ef20559dcad04b62ecb35b997e7', 'mtime': None, 'page_number': 25, 'stats': {'tiktokens': 62, 'chars': 124, 'lines': 1}, 'source': {'filename': 'Che

In [None]:
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever_from_llm
)

qa.run(question)

"I don't know."