### Test LlamaParse library with Azure Blob Storage pdf data ingestion

In [0]:
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core.readers import SimpleDirectoryReader
import os


keyVaultName = os.environ["KEY_VAULT_NAME"]
KVUri = f"https://{keyVaultName}.vault.azure.net"

credential = DefaultAzureCredential()
client = SecretClient(vault_url=KVUri, credential=credential)
client.get_secret(name="aoai-endpoint").value
# base_url: str = 'https://api.cloud.llamaindex.ai'
parser = LlamaParse(
    result_type="markdown",
    parsing_instruction="This is an auto insurance claim document.",
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name= "openai-gpt-4o-mini",
    vendor_multimodal_api_key= client.get_secret(name="openai-apikey").value,
    api_key = client.get_secret(name="llamacloudkey").value
)

In [0]:
import nest_asyncio; nest_asyncio.apply()

file_path = "C:\\source\\github\\llamaparse\\insurance-claims-rag-data\\*.pdf"
CLAIMS_DIR = "insurance-claims-rag-data"


def get_claims_files(claims_dir=CLAIMS_DIR) -> list[str]:
    files = []
    for f in os.listdir(claims_dir):
        fname = os.path.join(claims_dir, f)
        if os.path.isfile(fname):
            files.append(fname)
    return files
files = get_claims_files()  # get all files from the claims/ directory

In [0]:
files

['insurance-claims-rag-data\\claim_1.pdf',
 'insurance-claims-rag-data\\claim_2.pdf',
 'insurance-claims-rag-data\\claim_3.pdf',
 'insurance-claims-rag-data\\claim_4.pdf',
 'insurance-claims-rag-data\\claim_5.pdf']

In [0]:
md_json_objs = parser.get_json_result(
    files
)  # extract markdown data for insurance claim document

Parsing files:   0%|          | 0/5 [00:00<?, ?it/s]

Started parsing the file under job_id 30277837-ac7f-4437-a716-8733f1cc5c8b
Started parsing the file under job_id 4ab8a70d-6a74-4540-9158-329cc1f2ddcf
Started parsing the file under job_id d2b2f6b5-05e7-4517-9f13-ec7871aeaa0a
Started parsing the file under job_id 6f60648e-c044-4feb-a402-bc11116f5d07


Parsing files:  20%|██        | 1/5 [00:08<00:34,  8.74s/it]

Started parsing the file under job_id 6c23082f-8b5c-4e1c-924a-24e682d7f824


Parsing files: 100%|██████████| 5/5 [00:11<00:00,  2.36s/it]


In [0]:
md_json_objs

[{'pages': [{'page': 1,
    'md': '# Auto Claims\n\nIn the event of an automobile accident:\n1. Report the accident to the police.\n2. Obtain information about the other people involved in the accident such as:\n   a. Names, addresses and phone numbers  \n   b. Insurance company  \n   c. Type of vehicle  \n   d. Auto and driver’s license numbers.  \n3. Have your vehicle towed to the nearest repair shop if the vehicle is not drivable. Do not authorize repairs until the claims adjuster gives you the authority to do so.  \n4. Call us to report the accident.  \n\nIn the event of a windshield, vandalism or theft loss:\n1. Report the vandalism loss or theft to the police.  \n2. Call us to report a loss.  \n\nAfter we report the claim to the insurance company, the claims adjuster will:\n1. Contact you to request details of the accident and repair estimates  \n2. Arrange for an appraiser to inspect the damages of vehicles that are not drivable or extensively damaged  \n3. Contact you for a set

In [0]:

parsed_images = parser.get_images(
    md_json_objs, download_path="data_images"
)  # extract images from PDFs and save them to ./data_images/

> Image for page 1: [{'name': 'page-0.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 2: [{'name': 'page-1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 3: [{'name': 'page-2.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 1: [{'name': 'page-0.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 2: [{'name': 'page-1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 3: [{'name': 'page-2.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 1: [{'name': 'page-0.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 2: [{'name': 'page-1.jpg', 'height': 0, 'width': 0, 'x': 0, 'y': 0, 'type': 'full_page_screenshot'}]
> Image for page 3: [{'name': 'page-2.jpg', 'height': 0,

In [0]:
mylist = []
help(mylist.extend)

Help on built-in function extend:

extend(iterable, /) method of builtins.list instance
    Extend list by appending elements from the iterable.



In [0]:
# extract list of pages for insurance claim doc
md_json_list = []
for obj in md_json_objs:
    md_json_list.extend(obj["pages"])

In [0]:
md_json_list

[{'page': 1,
  'md': '# Auto Claims\n\nIn the event of an automobile accident:\n1. Report the accident to the police.\n2. Obtain information about the other people involved in the accident such as:\n   a. Names, addresses and phone numbers  \n   b. Insurance company  \n   c. Type of vehicle  \n   d. Auto and driver’s license numbers.  \n3. Have your vehicle towed to the nearest repair shop if the vehicle is not drivable. Do not authorize repairs until the claims adjuster gives you the authority to do so.  \n4. Call us to report the accident.  \n\nIn the event of a windshield, vandalism or theft loss:\n1. Report the vandalism loss or theft to the police.  \n2. Call us to report a loss.  \n\nAfter we report the claim to the insurance company, the claims adjuster will:\n1. Contact you to request details of the accident and repair estimates  \n2. Arrange for an appraiser to inspect the damages of vehicles that are not drivable or extensively damaged  \n3. Contact you for a settlement  \n4.

#### Create helper functions to create a list of TextNodes from the markdown tables to feed into the VectorStoreIndex.

In [0]:
import re
from pathlib import Path
import typing as t
from llama_index.core.schema import TextNode, ImageNode


def get_page_number(file_name):
    """Gets page number of images using regex on file names"""
    match = re.search(r"-page-(\d+)\.jpg$", str(file_name))
    if match:
        return int(match.group(1))
    return 0


def _get_sorted_image_files(image_dir):
    """Get image files sorted by page."""
    raw_files = [f for f in list(Path(image_dir).iterdir()) if f.is_file()]
    sorted_files = sorted(raw_files, key=get_page_number)
    return sorted_files


def get_text_nodes(json_dicts, image_dir) -> t.List[TextNode]:
    """Creates nodes from json + images"""

    nodes = []

    docs = [doc["md"] for doc in json_dicts]  # extract text
    image_files = _get_sorted_image_files(image_dir)  # extract images

    for idx, doc in enumerate(docs):
        # adds both a text node and the corresponding image node (jpg of the page) for each page
        node = TextNode(
            text=doc,
            metadata={"image_path": str(image_files[idx]), "page_num": idx + 1},
        )
        image_node = ImageNode(
            image_path=str(image_files[idx]),
            metadata={"page_num": idx + 1, "text_node_id": node.id_},
        )
        nodes.extend([node, image_node])

    return nodes


text_nodes = get_text_nodes(md_json_list, "data_images")

In [0]:
text_nodes

[TextNode(id_='b08cdc24-66f0-4f00-832d-7d5cc46a7ac6', embedding=None, metadata={'image_path': 'data_images\\06fe8202-3175-47db-ae1e-fb1bb55cbc2e-page-0.jpg', 'page_num': 1}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Auto Claims\n\nIn the event of an automobile accident:\n1. Report the accident to the police.\n2. Obtain information about the other people involved in the accident such as:\n   a. Names, addresses and phone numbers  \n   b. Insurance company  \n   c. Type of vehicle  \n   d. Auto and driver’s license numbers.  \n3. Have your vehicle towed to the nearest repair shop if the vehicle is not drivable. Do not authorize repairs until the claims adjuster gives you the authority to do so.  \n4. Call us to report the accident.  \n\nIn the event of a windshield, vandalism or theft loss:\n1. Report the vandalism loss or theft to the police.  \n2. Call us to report a loss.  \n\nAfter we report the claim to the insurance company, the claim

#### Import vector and ai search packages

In [0]:
import logging
import sys
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from IPython.display import Markdown, display
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.settings import Settings

from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.vector_stores.azureaisearch import AzureAISearchVectorStore
from llama_index.vector_stores.azureaisearch import (
    IndexManagement,
    MetadataIndexFieldType,
)
from llama_index.core.vector_stores.types import VectorStoreQueryMode

In [0]:
# Create azure open ai embedding

"""
This code loads and sets the necessary variables for Azure services.
The variables are loaded from Azure Key Vault.
"""

from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

azure_openai_endpoint=client.get_secret(name="aoai-endpoint").value
azure_openai_api_key=client.get_secret(name="aoai-api-key").value
azure_openai_api_version = "2024-02-15-preview"
azure_openai_embedding_deployment = client.get_secret(name="aoai-embedding-deployment").value
search_credential = AzureKeyCredential(client.get_secret(name="aisearch-adminkey").value)
search_endpoint = client.get_secret(name="aisearch-endpoint").value


azure_openai_client = AzureOpenAI(
        model=client.get_secret(name="aoai-deploymentname").value,
        api_key=azure_openai_api_key, 
        api_version=azure_openai_api_version,
        azure_deployment=client.get_secret(name="aoai-deploymentname").value,
        azure_endpoint=azure_openai_endpoint)
    

# You need to deploy your own embedding model as well as your own chat completion model
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-ada-002",
    deployment_name=azure_openai_embedding_deployment,
    api_key=azure_openai_api_key,
    azure_endpoint=azure_openai_endpoint,
    api_version="2024-02-15-preview"
)

#### Setup Azure AI Search

In [0]:
# Index name to use
index_name = "llama-insurance-index"

# Use index client to demonstrate creating an index
index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# Use search client to demonstration using existing index
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=search_credential,
)

#### Use Existing Index

In [0]:

vector_store = AzureAISearchVectorStore(
    search_or_index_client=search_client,
    index_management=IndexManagement.VALIDATE_INDEX,
    id_field_key="id",
    chunk_field_key="chunk",
    embedding_field_key="embedding",
    embedding_dimensionality=1536,
    metadata_string_field_key="metadata",
    doc_id_field_key="doc_id",
)

In [0]:
# storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [0]:
Settings.llm = azure_openai_client

In [0]:
Settings.embed_model = embed_model

#### Upload claims data chunks to existing Azure AI Search Index

In [0]:
index = VectorStoreIndex(text_nodes, embed_model=embed_model, storage_context=storage_context)

In [0]:
# Query Data
query_engine = index.as_query_engine(similarity_top_k=3)

#### Start example queries

In [0]:
from IPython.display import display, Markdown

response = query_engine.query(
    "Who filed the insurance claim for the accident that happened on Sunset Blvd?"
)
display(Markdown(str(response)))

Michael Johnson filed the insurance claim for the accident that happened on Sunset Blvd.

In [0]:
from IPython.display import display, Markdown

response = query_engine.query(
    "Did Ms. Johnson sustain any injuries?"
)
display(Markdown(str(response)))

Yes, Ms. Johnson sustained minor injuries, including a bruised knee and some whiplash.

#### Setup Autogen AI Agents

In [0]:
from autogen import AssistantAgent, UserProxyAgent, register_function
from typing_extensions import List, Annotated
import autogen
from azure.search.documents.models import VectorizableTextQuery
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)

In [0]:
llm_config = {
    "config_list": [
        {
            "model": client.get_secret(name="aoai-deploymentname").value,
            "api_key": client.get_secret(name="aoai-api-key").value,
            "base_url": client.get_secret(name="aoai-endpoint").value,
            "api_type": "azure",
            "api_version": "2024-02-15-preview",
        },
    ]
}

gpt4_config = {
    "cache_seed": 42,
    "temperature": 0,
    "config_list": llm_config["config_list"],
    "timeout": 120
}


ai_search_agent = AssistantAgent(
    name="AISearchAssistant",
    system_message="You are a helpful AI agent."
    "You can help with Azure AI Search service."
    "Return TERMINATE when the task is done",
    llm_config=gpt4_config,
)

user_proxy = UserProxyAgent(
    name="User",
    is_termination_msg=lambda x: "terminate" in x.get("content", "").lower()
    if x.get("content", "") is not None
    else False,
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    code_execution_config=False,
)

#### Define custom function for function calling.

In [0]:
@user_proxy.register_for_execution()
@ai_search_agent.register_for_llm(
    description="A tool or function for search retrieval from Azure AI Search"
)
def search_retrieval(user_input:str) -> str:
        """
        Search and retrieve answers from Azure AI Search.
        Returns:
            str
        """
        query = user_input
        search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=search_credential)
        vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=5, fields="embedding", exhaustive=True)

        r = search_client.search(  
        search_text=query,
        vector_queries=[vector_query],
        select=["id", "chunk"],
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name='mySemanticConfig',
        query_caption=QueryCaptionType.EXTRACTIVE,
        query_answer=QueryAnswerType.EXTRACTIVE,
        top=1
    )
        #query_result = results.get_answers()[0].text
        results = [doc["chunk"].replace("\n", "").replace("\r", "") for doc in r]
        content = "\n".join(results)
        return content

#### Initiate agent chat

In [0]:
message = "Search for 'How did Ms. Patel's accident happen' in the above defined index?"

agent_response = await user_proxy.a_initiate_chat(recipient=ai_search_agent, message=message)

[33mUser[0m (to AISearchAssistant):

Search for 'How did Ms. Patel's accident happen' in the above defined index?

--------------------------------------------------------------------------------
[33mAISearchAssistant[0m (to User):

[32m***** Suggested tool call (call_I8sNTSbzJJSSWvVE7qIpqDIs): search_retrieval *****[0m
Arguments: 
{"user_input":"How did Ms. Patel's accident happen"}
[32m*********************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING ASYNC FUNCTION search_retrieval...[0m


[runtime logging] log_function_use: autogen logger is None


[33mUser[0m (to AISearchAssistant):

[33mUser[0m (to AISearchAssistant):

[32m***** Response from calling tool (call_I8sNTSbzJJSSWvVE7qIpqDIs) *****[0m
# AUTOMOBILE CLAIM## LOSS- **Date:** 03/10/2023- **Location:** Intersection of Main St and Oak St- **City:** Boise- **State:** ID- **Police Dept. Involved:** Boise Police Department- **Ticket Issued:** ## DESCRIPTION OF ACCIDENTOn March 10, 2023, at approximately 9:15 AM, I was heading north at a parking space in the Boise Towne Square Mall parking lot. As I checked my mirrors and blind spots, I did not see any vehicles approaching. However, the driver of Vehicle 2, Michael Chen, was driving too fast through the parking lot and failed to stop in time. This resulted in my vehicle sustaining damages to the rear bumper and trunk.## INSURED VEHICLE- **Year:** 2018- **Make:** Toyota- **Model:** Camry- **V.I.N.:** 4T1BF1FK8JU01234- **Plate:** GH789- **Extent of Damages:** Rear bumper and trunk damage, including scratches, dents, and bro

In [0]:
message = "Search for 'Who filed the insurance claim for the accident that happened on Sunset Blvd?' in the above defined index?"

agent_response = await user_proxy.a_initiate_chat(recipient=ai_search_agent, message=message)

[33mUser[0m (to AISearchAssistant):

Search for 'Who filed the insurance claim for the accident that happened on Sunset Blvd?' in the above defined index?

--------------------------------------------------------------------------------
[33mAISearchAssistant[0m (to User):

[32m***** Suggested tool call (call_BrPFDx8dyw1mUIgMjDrw4k1w): search_retrieval *****[0m
Arguments: 
{"user_input":"Who filed the insurance claim for the accident that happened on Sunset Blvd?"}
[32m*********************************************************************************[0m

--------------------------------------------------------------------------------
[35m
>>>>>>>> EXECUTING ASYNC FUNCTION search_retrieval...[0m


[runtime logging] log_function_use: autogen logger is None


[33mUser[0m (to AISearchAssistant):

[33mUser[0m (to AISearchAssistant):

[32m***** Response from calling tool (call_BrPFDx8dyw1mUIgMjDrw4k1w) *****[0m
# AUTOMOBILE CLAIM## LOSS- **Date:** 06/23/2023- **Location:** Intersection of Vine Street and Sunset Bl- **City:** Los Angeles- **State:** CA- **Police Dept. Involved:** LAPD- **Ticket Issued:** Traffic Violation## DESCRIPTION OF ACCIDENTOn October 15, 2023, at approximately 3:30 PM, I was driving my 2020 Honda Accord (License Plate: 7XYZ123) southbound on Vine St.## INSURED VEHICLE- **Year:** 2020- **Make:** Honda- **Model:** Accord- **V.I.N.:** 1HGCV1F30LA123456- **Plate:** 7XYZ123- **Extent of Damages:** The front passenger side of my Honda Accord sustained significant damage, including a demolished door and broken headlights. Estimated repair cost: $3,500.- **Driver:** Michael Johnson- **Date of Birth:** 01/15/1985- **License No.:** 1111111111- **State:** CA## OTHER VEHICLE- **Year:** 2018- **Make:** Ford- **Model:** Escape- 