# File Assistant - File Search Demo

##### This demo shows how the Assistants API can help you extract information from a PDF using the File Search tool, for use cases such as purchase order automation.

##### Assistant API Documentation: https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/file-search?tabs=python

### Load the required libraries

In [6]:
%pip install -U openai

Collecting openai
  Downloading openai-1.30.4-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 KB[0m [31m708.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.30.3
    Uninstalling openai-1.30.3:
      Successfully uninstalled openai-1.30.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.1.1 requires langchain-core<0.2.0,>=0.1.33, but you have langchain-core 0.2.1 which is incompatible.
langchain-fireworks 0.1.1 requires langchain-core<0.2.0,>=0.1.27, but you have langchain-core 0.2.1 which is incompatible.[0m[31m
[0mSuccessfully installed openai-1.30.4
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import dotenv_values
from openai import AzureOpenAI
from typing_extensions import override
from openai import AssistantEventHandler
import time
import os
from dotenv import load_dotenv
load_dotenv()

True

### Load the environment variables/configs

In [2]:
# Build the Azure OpenAI client from settings held in environment variables
# (load_dotenv() in the import cell reads them from a .env file when present).
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_ASSISTANTS"),
    api_key=os.getenv("AZURE_OPENAI_KEY_ASSISTANTS"),
    api_version=os.getenv("OPENAI_API_VERSION_ASSISTANTS"),
)

### Create an Assistant

In [5]:
# Create the assistant with the file_search tool enabled.
# NOTE(review): process_prompt passes its own purchase-order `instructions`
# when it creates a run, so the text below looks like a default only — confirm.
assistant = client.beta.assistants.create(
  name="Financial Analyst Assistant",
  instructions="You are an expert financial analyst. Use your knowledge base to answer questions about audited financial statements.",
  model="gpt-4o",
  tools=[{"type": "file_search"}],
)

### Upload the file

In [6]:
# Upload the purchase-order PDF. The context manager guarantees the local file
# handle is closed once the upload call returns (or raises), instead of being
# left open for the lifetime of the kernel.
with open("../data/Wine-PO.pdf", "rb") as pdf_file:
    message_file = client.files.create(file=pdf_file, purpose="assistants")

print(message_file.id)

assistant-55aUjCIWZePeWTsf3bIfJViI


### Create the Vector Store and attach it to the Assistant

In [7]:
# Create the (initially empty) vector store for the purchase-order file.
vector_store = client.beta.vector_stores.create(name="Wine-PO")

# Re-bind `assistant` to the updated object: its file_search tool now points
# at the new vector store and the sampling temperature is set to 0.5.
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    temperature=0.5,
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

print(vector_store.id, vector_store.name)

vs_RN4ips4eXNZkbxAyeCmGbRjY Wine-PO


### Add the file to the Vector Store

In [8]:
# Attach the uploaded file to the vector store.
# NOTE(review): going by its name, create_and_poll presumably waits until the
# file has been processed before returning — confirm against the SDK docs.
file_vector = client.beta.vector_stores.files.create_and_poll(
    file_id=message_file.id,
    vector_store_id=vector_store.id,
)

print(file_vector.id)

assistant-55aUjCIWZePeWTsf3bIfJViI


### Create functions to create, process and read the thread

In [9]:
class EventHandler(AssistantEventHandler):
    """Event handler that echoes assistant activity to stdout.

    Only referenced by the commented-out streaming variant inside
    process_prompt; the polling code path does not use it.
    """

    @override
    def on_text_created(self, text) -> None:
        # Print the speaker prefix without a trailing newline.
        print("\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        # Announce which kind of tool call was created.
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        """Print the message text with annotations replaced by numbered markers,
        followed by one "[n] filename" line per file citation."""
        body = message.content[0].text
        footnotes = []
        for position, note in enumerate(body.annotations):
            marker = f"[{position}]"
            # Swap the raw annotation text in the message for its marker.
            body.value = body.value.replace(note.text, marker)
            citation = getattr(note, "file_citation", None)
            if citation:
                # Look up the cited file so its filename can be shown.
                source_file = client.files.retrieve(citation.file_id)
                footnotes.append(f"{marker} {source_file.filename}")

        print(body.value)
        print("\n".join(footnotes))


def format_messages(messages) -> str:
    """Return the text of the latest assistant reply in ``messages``.

    Messages are consumed from the start of ``messages`` up to and including
    the first one whose role is ``"user"``; anything after that is ignored.
    Within that slice the text value of the last text item of the last
    assistant message (in reversed order) is returned.

    Assumes ``messages`` is ordered newest-first, so the first user message
    encountered is the most recent prompt — TODO confirm against the listing
    order used by the caller.

    Args:
        messages: Iterable of message objects exposing ``role`` and
            ``content`` (a list of items, text items exposing ``text.value``),
            or ``None``.

    Returns:
        The assistant's text, or an empty string when ``messages`` is ``None``
        or contains no assistant text (previously this raised
        ``UnboundLocalError``).
    """
    if messages is None:
        return ""

    # Collect everything up to (and including) the first user message.
    latest_turn = []
    for message in messages:
        latest_turn.append(message)
        if message.role == "user":
            break

    reply = ""
    # Walk the slice in reverse so that the entry listed first wins.
    for message in reversed(latest_turn):
        if message.role != "assistant":
            continue
        for item in message.content:
            # Content items without a `text` attribute (e.g. images) are skipped
            # rather than raising AttributeError.
            text = getattr(item, "text", None)
            if text is not None:
                reply = text.value

    return reply


def process_prompt(prompt: str, vector_store_id: str, poll_interval: float = 5.0):
    """Run ``prompt`` on a new thread and return that thread's messages.

    A thread is created with the prompt as its only user message and with the
    given vector store attached for file_search. A run is started against the
    module-level ``assistant`` and polled until it reaches a terminal status.

    Args:
        prompt: The user question to send.
        vector_store_id: ID of the vector store made available to file_search.
        poll_interval: Seconds to sleep between status checks (default 5,
            matching the previous hard-coded value).

    Returns:
        The object returned by ``client.beta.threads.messages.list`` for the
        thread. It is returned for every terminal status; when the run did not
        complete it may contain no assistant message. Previously an expired or
        cancelled run raised ``UnboundLocalError`` here, and an ``incomplete``
        run was polled forever.
    """
    thread = client.beta.threads.create(
        messages=[{"role": "user", "content": prompt}],
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
    )

    run = client.beta.threads.runs.create(
        thread_id=thread.id,
        assistant_id=assistant.id,
        instructions="You are an AI assistant that helps extract information from purchase orders. Extract all the information required by the user question. Output plain text only. Do not output markdown",
    )

    # Streaming alternative (not used here): open
    # client.beta.threads.runs.stream(..., event_handler=EventHandler())
    # as a context manager and call stream.until_done().

    # Statuses after which the run will not change any more.
    finished_statuses = ("completed", "failed", "expired", "cancelled", "incomplete")

    while True:
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        if run.status in finished_statuses:
            break
        time.sleep(poll_interval)

    messages = client.beta.threads.messages.list(thread_id=thread.id)

    if run.status == "failed":
        # Report the run's own error; the newest thread message is not
        # necessarily an assistant answer when the run failed.
        failure = getattr(run, "last_error", None)
        print(f"Failed User:\n{prompt}\nError:\n{failure}\n")
    elif run.status != "completed":
        # expired / cancelled / incomplete: dump the run for inspection.
        print(run)

    return messages

### Submit your request

In [10]:

# Extraction prompt: lists the fields to pull from the purchase order and the
# JSON structure the answer should follow. Two instruction sentences were
# reworded so they read as unambiguous directives to the model.
text_message = """Extract from the file the Purchase Order Number, PO Status, Supplier Contact Name, Company Name and Address, Customer Contact Name, Company Name and Address, the Delivery Date, Order total and all items in the table.
                  Output the information in JSON format in the following structure:
                  { po_number:, order_status:, client_id:,customer_contact_name:, customer_company_name:, customer_address:, supplier_contact_name:, supplier_company_name:, supplier_address:, delivery_date:, order_total:, subtotal:, sales_tax:, shipping_cost:, other_costs:,
                    items: [{
                    item_name:,
                    quantity:,
                    item_price:,
                    item_discount:,
                    total:
                    }]  }
                  If there is no clear order status in the file, put the value as N/A
                  Put all dates in the following format dd/MM/YYYY
                  For all numeric fields, do not include currency strings such as USD, EUR, LPS, BRL
                  For all fields related to taxes, bring the information as is
                """

# Send the prompt to the assistant, searching the vector store created above.
result_prompt = process_prompt(text_message, vector_store.id)

In [11]:
# Pull the assistant's reply text out of the messages returned by process_prompt.
result_assistant = format_messages(result_prompt)

In [12]:
# Show the text that format_messages extracted from the assistant's reply.
print(result_assistant)

{
  "po_number": "679133",
  "order_status": "N/A",
  "client_id": "12058",
  "customer_contact_name": "John Smith",
  "customer_company_name": "Cellar Selections",
  "customer_address": "686 Harrison Lane, Union City, CA, 94587",
  "supplier_contact_name": "Robert Web",
  "supplier_company_name": "RW Wines",
  "supplier_address": "7310 Somerset St., Salinas, CA, 93905",
  "delivery_date": "19/05/2024",
  "order_total": "2027.9",
  "subtotal": "1898",
  "sales_tax": "5%",
  "shipping_cost": "20",
  "other_costs": "15",
  "items": [
    {
      "item_name": "Cabernet Sauvignon",
      "quantity": "4",
      "item_price": "45",
      "item_discount": "5%",
      "total": "171"
    },
    {
      "item_name": "Rose",
      "quantity": "24",
      "item_price": "25",
      "item_discount": "15%",
      "total": "510"
    },
    {
      "item_name": "Syrah",
      "quantity": "12",
      "item_price": "34",
      "item_discount": "15%",
      "total": "346.8"
    },
    {
      "item_name":

## Cleanup

### Deleting Vector Stores & Assistants

In [13]:
# list_files = client.beta.vector_stores.files.list(vector_store_id=vector_store.id)

# for i in list_files.data:
#     print(i.id)
#     client.beta.vector_stores.files.delete(file_id=i.id, vector_store_id=vector_store.id)

assistant-55aUjCIWZePeWTsf3bIfJViI


InternalServerError: Error code: 500 - {'error': {'message': 'Failed to create file operation.', 'type': 'server_error', 'param': None, 'code': None}}

In [14]:
# WARNING: this deletes every vector store returned by the listing call, not
# only the one created in this notebook.
# NOTE(review): `.data` looks like a single page of results — confirm whether
# more stores can remain on later pages.
list_vector_stores = client.beta.vector_stores.list()

for store in list_vector_stores.data:
    print(store.id, store.name)
    client.beta.vector_stores.delete(store.id)

vs_RN4ips4eXNZkbxAyeCmGbRjY Wine-PO


In [15]:
# WARNING: this deletes every assistant returned by the listing call (up to
# `limit`, newest first), not only the one created in this notebook.
list_assistants = client.beta.assistants.list(
    order="desc",
    limit=20,  # page size is an integer, not the string "20"
)

for entry in list_assistants.data:
    print(entry.id, entry.name)
    client.beta.assistants.delete(entry.id)

asst_B3nYZWXoSyk7EKgja0COHew4 Financial Analyst Assistant
asst_JWMzir4w0tIjTwmzBH9v6Vfi Financial Analyst Assistant
asst_1gcqUG01D6PhCkizyumVw9VY Financial Analyst Assistant
