# Azure OpenAI Assistants - File Search

## Load Azure Configuration

In [1]:
from dotenv import load_dotenv
import os
import warnings
warnings.filterwarnings("ignore")

# Load environment variables from the .env file into the process environment.
# Without this call the os.getenv() lookups below only see variables that were
# already exported by the shell that started the kernel.
load_dotenv()

# Azure OpenAI connection settings; os.getenv() yields None for any unset variable.
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_deployment = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")

## Prepare Files

In [3]:
from contextlib import ExitStack

from openai import AzureOpenAI

# Create a client
client = AzureOpenAI(
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key
)

# Create a vector store
vector_store = client.vector_stores.create(name="Nasa Books")

# Specify the folder containing the files
folder_path = "../Data/nasabooks/"

# Collect the regular files in the folder. Sub-directories are skipped because
# open() cannot read them, and sorting makes the upload order reproducible.
file_paths = sorted(
    path
    for path in (os.path.join(folder_path, file_name) for file_name in os.listdir(folder_path))
    if os.path.isfile(path)
)

# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
# ExitStack closes every opened stream once the upload finishes, including when
# a later open() or the upload itself raises.
with ExitStack() as stack:
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]
    file_batch = client.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

In [4]:
# Report the outcome of the upload: batch status, per-state file counts,
# and the ID of the vector store the files were added to.
for detail in (file_batch.status, file_batch.file_counts, vector_store.id):
    print(detail)

completed
FileCounts(cancelled=0, completed=28, failed=0, in_progress=0, total=28)
vs_HNVzCmkwhPs3sgBupMB1YH6T


## Reformat citations with the proper filenames

In [5]:
def reformat_citations(content_block):
    """Replace citation markers in an assistant reply with the cited filenames.

    Args:
        content_block: A text content block exposing ``text.value`` (the reply
            text) and ``text.annotations`` (the annotations attached to it).

    Returns:
        The reply text in which every marker that has a file citation is
        replaced by ``" Source: <filename>[, <filename>...]"``. Markers whose
        annotation carries no file citation are left unchanged.
    """
    # Original response
    paragraph = content_block.text.value

    # Treat a missing annotation list the same as an empty one.
    annotations = content_block.text.annotations or []

    # Cache of file ID -> filename so each cited file is retrieved only once.
    filename_by_file_id = {}

    # Dictionary to store key-value pairs of text and filenames
    text_filename_pairs = {}

    for annotation in annotations:
        # Only annotations that carry a file citation can be mapped to a
        # filename; anything else is skipped instead of raising.
        file_citation = getattr(annotation, "file_citation", None)
        if file_citation is None:
            continue

        file_id = file_citation.file_id
        if file_id not in filename_by_file_id:
            filename_by_file_id[file_id] = client.files.retrieve(file_id).filename
        filename = filename_by_file_id[file_id]

        # Keep first-seen order, but list each filename once per marker.
        filenames = text_filename_pairs.setdefault(annotation.text, [])
        if filename not in filenames:
            filenames.append(filename)

    # Replace the citation texts with their corresponding filenames prefixed with " Source: "
    for text, filenames in text_filename_pairs.items():
        sources = " Source: " + ", ".join(filenames)
        paragraph = paragraph.replace(text, sources)

    return paragraph

## Step 1-2:
1. Create an Assistant
2. Create a Thread

In [7]:
# Step 1: Create assistant
# The system prompt is spelled out line by line so its exact whitespace is visible.
assistant_instructions = (
    "\n"
    "  You are a assistant that provides information. \n"
    "   You will answer questions based on files provided to you about information in a NASA Book. \n"
    "   You will not provide answers outside of those files.\n"
    "  "
)

# File search is the only tool, backed by the vector store created earlier.
file_search_tool = {"type": "file_search"}
file_search_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}

assistant = client.beta.assistants.create(
    name="Nasa books Assistant",
    instructions=assistant_instructions,
    model=azure_openai_deployment,
    tools=[file_search_tool],
    tool_resources=file_search_resources,
    temperature=1,
    top_p=1,
)

# Step 2: Create thread
thread = client.beta.threads.create()
print(thread)

Thread(id='thread_NItTCRsavmsJAYr5mZ36eG62', created_at=1750534046, metadata={}, object='thread', tool_resources=ToolResources(code_interpreter=None, file_search=None))


## Step 3-6: Helper Function
3. Add a message to the thread
4. Run the Assistant
5. Check the Run Status
6. Display the Assistant's Response

In [8]:
import time

def run_assistant(user_question, poll_interval=1, timeout=None):
  """Ask the assistant a question on the shared thread and print its reply.

  Adds the question to the module-level ``thread``, starts a run with the
  module-level ``assistant``, polls until the run leaves its in-flight states
  (printing the status after each poll), and then prints the outcome.

  Args:
    user_question: Text of the user message to add to the thread.
    poll_interval: Seconds to sleep between status checks.
    timeout: Maximum number of seconds to wait for the run to finish, or
      None to wait indefinitely.

  Raises:
    TimeoutError: If ``timeout`` is set and the run is still in flight once
      that many seconds have elapsed.
  """
  # Step 3: Add a message to the thread
  client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=user_question
  )

  # Step 4: Run the Assistant
  run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id
  )

  # Step 5: Check the Run Status
  # Looping until the run leaves its in-flight states
  deadline = None if timeout is None else time.monotonic() + timeout
  while run.status in ['queued', 'in_progress', 'cancelling']:
    if deadline is not None and time.monotonic() >= deadline:
      raise TimeoutError(
        f"Run {run.id} still '{run.status}' after {timeout} seconds"
      )
    time.sleep(poll_interval)
    run = client.beta.threads.runs.retrieve(
      thread_id=thread.id,
      run_id=run.id
    )

    #display run status
    print(run.status)

  # The outcome is handled after the loop so that a run which is already
  # finished when first observed is still reported.
  if run.status == 'completed':
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    # Step 6: Display the Assistant's Response
    # The first listed message is used as the reply; pick its first text block.
    latest_content = messages.data[0].content if messages.data else []
    content_block = next(
      (block for block in latest_content if getattr(block, "text", None) is not None),
      None
    )
    if content_block is None:
      print("The run completed but returned no text content.")
    elif not content_block.text.annotations:
      # No citations to rewrite; show the reply as-is.
      print(content_block.text.value)
    else:
      print(reformat_citations(content_block))

  elif run.status == 'failed':
    print(run.last_error)

  elif run.status == 'incomplete':
    print(getattr(run, "incomplete_details", None))

  else:
    # e.g. requires_action, cancelled, expired: there is no reply to show.
    print(f"Run ended with status '{run.status}'; no response to display.")


In [9]:
# Question about the Queensland floodplains, answered from the uploaded files.
user_question = "How did the wide floodplains in Queensland originate?"
run_assistant(user_question)

in_progress
in_progress
completed
The wide floodplains in Queensland, specifically in Australia's Channel Country, originated from the extreme variation in water and sediment discharges from the rivers. The region is characterized by flat land and poor drainage, which allows for the formation of semi-permanent wetlands at river convergence points. In many years, rainfall is scarce, resulting in rivers that are nearly nonexistent, while in wetter years, high discharges can inundate the entire width of the floodplain. These occasional massive flows can transform the floodplain into extensive areas of water, with only treetops visible above the surface Source: page-49.pdf.


In [10]:
# Question about the Lower Amazon River, answered from the uploaded files.
user_question = "What forms the Lower Amazon River?"
run_assistant(user_question)

in_progress
completed
The Lower Amazon River is formed at the "Meeting of the Waters," where two significant rivers converge: the coffee-colored Rio Solimões and the black-tea-colored Rio Negro. The Rio Solimões, which is rich in sediment, flows down from the Andes Mountains, while the nearly sediment-free Rio Negro flows from the Colombian hills and jungles, gaining its color from decayed leaf and plant matter. The two rivers run side by side for several kilometers before they mix and form the Lower Amazon River Source: page-61.pdf.


## Delete Assistant

In [11]:
# Delete the assistant created for this demo; the API result is kept in `response`.
response = client.beta.assistants.delete(assistant_id=assistant.id)

Delete the vector store to avoid storage costs on Azure. Vector store storage incurs charges beyond the first 1 GB, which is free.

In [12]:
# Delete the vector store; the deletion result is displayed as the cell output.
client.vector_stores.delete(vector_store_id=vector_store.id)

VectorStoreDeleted(id='vs_HNVzCmkwhPs3sgBupMB1YH6T', deleted=True, object='vector_store.deleted')