# Step by Step Guide to a Dungeons and Dragons (DnD) Retrieval Augmented Generation (RAG) Copilot

## Set-up

### Import necessary Python Libraries

In [1]:
# Import the Necessary Python Libraries

# Import standard and non-Azure libraries
import os
import json
from dotenv import load_dotenv

# Import the Azure SDK libraries
from azure.storage.blob import BlobServiceClient, BlobClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import AzureError

# Import the custom functions
import functions as fn


### Load environment variables from .env and configuration variables from config.json

In [2]:
load_dotenv() # populate environment variables from .env file

# Open the config.json file and read the configurations
with open('config.json', 'r') as f:
    config = json.load(f)

# Load environment variables from .env file
openai_api_key: str = os.environ["OPENAI_API_KEY"]
openai_api_base: str = os.environ["OPENAI_API_BASE"]
vector_store_address: str = os.environ['SEARCH_ENDPOINT']
vector_store_password: str = os.environ['SEARCH_KEY']
document_intelligence_endpoint: str = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"]
document_intelligence_key: str = os.environ["DOCUMENT_INTELLIGENCE_KEY"]
storage_account_url: str = os.environ["STORAGE_ACCOUNT_URL"]
blob_raw_sas_token: str = os.environ["BLOB_RAW_SAS_TOKEN"]
blob_processed_sas_token: str = os.environ["BLOB_PROCESSED_SAS_TOKEN"]
blob_final_sas_token: str = os.environ["BLOB_FINAL_SAS_TOKEN"]

# Load configs from config.json file
openai_api_type: str = config["AOAI_CONFIGS"]["API_TYPE"] # "azure"
openai_api_version: str = config["AOAI_CONFIGS"]["API_VERSION"] # = "2023-08-01-preview"
doc_intel_model: str = config["DOC_INTEL_CONFIGS"]["ANALYSIS_MODEL"] # = "prebuilt-layout"
raw_container_name: str = config["BLOB_STORAGE_CONFIGS"]["RAW_CONTAINER"] # = "dnd-rag-bot-raw"
processed_container_name: str = config["BLOB_STORAGE_CONFIGS"]["PROCESSED_CONTAINER"] # = "dnd-rag-bot-processed"
final_container_name: str = config["BLOB_STORAGE_CONFIGS"]["FINAL_CONTAINER"] # = "dnd-rag-bot-final"

## Read the documents from Raw, process with Document Intelligence, and write to Processed

### Create the BlobServiceClients for the storage containers

In [3]:
# See https://pypi.org/project/azure-identity/
# We are using a blob SAS url + token, but you could define at the overall blob and not container or use default credentials 
# token + url allows us to set expirations for access

# Create the BlobServiceClient object so we can connect to the blob storage
# One for raw documents, one for processed documents, and one for the final chunked data before it goes into the search index 
raw_blob_service_client = BlobServiceClient(storage_account_url, blob_raw_sas_token)
processed_blob_service_client = BlobServiceClient(storage_account_url, blob_processed_sas_token)
final_blob_service_client = BlobServiceClient(storage_account_url, blob_final_sas_token)

### List the urls for the document in the Raw Container and Extract Blob URL

In [None]:
# Get the dictionary of blob names and urls for each container
dictionary_of_raw_blobs = fn.blob_name_and_url_dict(raw_blob_service_client, raw_container_name)
print(dictionary_of_raw_blobs)

# Specify the container name, file extension, and file name
# We are selecting based on the raw container and the .pdf file extension
# as we know specifically which file we are after.
# Use the raw_container_name from configs.json
file_extension = '.pdf'
dnd_pdf_name = 'DnD 5e Players Handbook (BnW OCR).pdf'

# Access the URL
dnd_pdf_url = dictionary_of_raw_blobs[raw_container_name][file_extension][dnd_pdf_name]['blob_url']
# print(dnd_pdf_url)

### Read the pdf into the DocumentAnalysisClient

In [None]:
# Create the DocumentAnalysisClient object so we can connect to the document intelligence service and read in the document
document_analysis_client = DocumentAnalysisClient(
                endpoint=document_intelligence_endpoint, credential=AzureKeyCredential(document_intelligence_key)
            )

In [None]:
# Run the doc_intel_pdf function and choose which model to use, in this case we are using the prebuilt-layout model
# The function will return the result, the dictionary of results, or both depending on the last parameter
# We are supplying the file name and the url is being extracted from the dictionary of blobs

dnd_pdf_doc_intel_result, dnd_pdf_doc_intel_dict = fn.doc_intel_pdf(document_analysis_client, doc_intel_model, dnd_pdf_name, dnd_pdf_url, 'both')
# print(dnd_pdf_doc_intel_result)

#### Optional: Write to / Read From local storage

In [None]:
# # Use the local_file_write function to write the results to a local file
# # We are writing the results to a text file and the dictionary to a json file
# fn.local_file_write(dnd_pdf_doc_intel_result, 'text', '../data/results/raw_results', 'dnd_pdf_doc_intel_result.txt')
# fn.local_file_write(dnd_pdf_doc_intel_dict, 'json', '../data/results/dictionaries', 'dnd_pdf_doc_intel_dict.json')

# # Or read in the results from local to save time / doc intel costs
# dnd_pdf_doc_intel_result_test = fn.local_file_read('../data/results/raw_results/dnd_pdf_doc_intel_result.txt', 'text')
# dnd_pdf_doc_intel_dict_test = fn.local_file_read('../data/results/dictionaries/dnd_pdf_doc_intel_dict.json', 'json')

### Write to the Processed Blob Storage
This saves the output from Document Intelligence so we don't have to
recreate that object every time we want to iterate over the
OCR output

In [None]:
# Upload the results to the processed container
fn.write_to_blob(dnd_pdf_doc_intel_result, processed_blob_service_client, processed_container_name, 'raw_results', 'dnd_pdf_doc_intel_result.txt', False)
fn.write_to_blob(dnd_pdf_doc_intel_dict, processed_blob_service_client, processed_container_name, 'dictionaries', 'dnd_pdf_doc_intel_dict.json', True)

### Read in from Processed Blob Storage
This allows for these processes to live in separate Function Apps

In [4]:
dictionary_of_processed_blobs = fn.blob_name_and_url_dict(processed_blob_service_client, processed_container_name)
print(dictionary_of_processed_blobs)

The container dnd-rag-bot-processed is being accessed.
The file dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json located at https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D is being added to the blob list.
The file dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt located at https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D is being added to the blob list.
{'dnd-rag-bot-processed': {'.json': {'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json': {'file_name': 'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json', 'blob_url': 'https://iancogsearchstorage.blob.core.windows.net/dnd

In [12]:
def load_blob(blob_dict, file_type=None, file_name=None):
    """
    Load a single blob from a nested dictionary of blob names and URLs.

    Parameters:
    blob_dict (dict): A nested dictionary where keys are container names, file types, and blob names, and values are blob details.
    file_type (str, optional): The file type to filter blobs by. Blobs not of this file type will be ignored.
    file_name (str, optional): A specific blob name to load. If provided, only this blob will be loaded.

    Returns:
    bytes: The content of the blob.

    Example usage:
    blob_dict = {"container1": {".txt": {"blob1": {"file_name": "blob1", "blob_url": "url1"}}}}
    load_blob(blob_dict, file_type=".txt", file_name="blob1")
    """

    # Iterate over all containers in the dictionary
    for container_name, container_dict in blob_dict.items():
        # If file_type is provided and it does not exist in the current container's dictionary, skip this container
        if file_type and file_type not in container_dict:
            continue
        # If file_type is provided, iterate over all blobs in the file_type dictionary, otherwise in the container's dictionary
        for blob_name, blob_details in (container_dict[file_type] if file_type else container_dict).items():
            # If file_name is provided and blob_name is not equal to it, skip this blob
            if file_name and blob_name != file_name:
                continue

            try:
                # Create a BlobClient for the blob
                blob_client = BlobClient.from_blob_url(blob_details['blob_url'])
                # Download the blob and read all its content
                blob_content = blob_client.download_blob().readall()

                # Return the blob content
                return blob_content
            except AzureError as e:
                print(f"Failed to download blob: {e}")
                return None

    # If no blob was found that matches the criteria, return None
    return None

In [13]:
dnd_pdf_doc_intel_result = load_blob(dictionary_of_processed_blobs, file_type='.txt', file_name='dnd_pdf_doc_intel_result.txt')
dnd_pdf_doc_intel_dict = load_blob(dictionary_of_processed_blobs, file_type='.json', file_name='dnd_pdf_doc_intel_dict.json')

Checking blob: dnd-rag-bot-processed at {'.json': {'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json': {'file_name': 'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json', 'blob_url': 'https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D'}}, '.txt': {'dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt': {'file_name': 'dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt', 'blob_url': 'https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D'}}}
Checking blob: dnd-rag-bot-processed at {'.json': {'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json': {'file_name': 'dnd-rag-bot-proces

In [11]:
print(dnd_pdf_doc_intel_result)

None
