# Step by Step Guide to a Dungeons and Dragons (DnD) Retrieval Augmented Generation (RAG) Copilot

## Set-up

### Import necessary Python Libraries

In [1]:
# Import the Necessary Python Libraries

# Import standard and non-Azure libraries
import os
import json
from dotenv import load_dotenv

# Import the Azure SDK libraries
from azure.storage.blob import BlobServiceClient, BlobClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Import the custom functions
import functions as fn


### Load environment variables from .env and configuration variables from config.json

In [2]:
load_dotenv() # populate environment variables from .env file

# Open the config.json file and read the configurations
with open('config.json', 'r') as f:
    config = json.load(f)

# Load environment variables from .env file
openai_api_key: str = os.environ["OPENAI_API_KEY"]
openai_api_base: str = os.environ["OPENAI_API_BASE"]
vector_store_address: str = os.environ['SEARCH_ENDPOINT']
vector_store_password: str = os.environ['SEARCH_KEY']
document_intelligence_endpoint: str = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"]
document_intelligence_key: str = os.environ["DOCUMENT_INTELLIGENCE_KEY"]
storage_account_url: str = os.environ["STORAGE_ACCOUNT_URL"]
blob_raw_sas_token: str = os.environ["BLOB_RAW_SAS_TOKEN"]
blob_processed_sas_token: str = os.environ["BLOB_PROCESSED_SAS_TOKEN"]
blob_final_sas_token: str = os.environ["BLOB_FINAL_SAS_TOKEN"]

# Load configs from config.json file
openai_api_type: str = config["AOAI_CONFIGS"]["API_TYPE"] # "azure"
openai_api_version: str = config["AOAI_CONFIGS"]["API_VERSION"] # = "2023-08-01-preview"
doc_intel_model: str = config["DOC_INTEL_CONFIGS"]["ANALYSIS_MODEL"] # = "prebuilt-layout"
raw_container_name: str = config["BLOB_STORAGE_CONFIGS"]["RAW_CONTAINER"] # = "dnd-rag-bot-raw"
processed_container_name: str = config["BLOB_STORAGE_CONFIGS"]["PROCESSED_CONTAINER"] # = "dnd-rag-bot-processed"
final_container_name: str = config["BLOB_STORAGE_CONFIGS"]["FINAL_CONTAINER"] # = "dnd-rag-bot-final"

## Read the documents from Raw, process with Document Intelligence, and write to Processed

### Create the BlobServiceClients for the storage containers

In [3]:
# See https://pypi.org/project/azure-identity/
# We are using a blob SAS url + token, but you could define at the overall blob and not container or use default credentials 
# token + url allows us to set expirations for access

# Create the BlobServiceClient object so we can connect to the blob storage
# One for raw documents, one for processed documents, and one for the final chunked data before it goes into the search index 
raw_blob_service_client = BlobServiceClient(storage_account_url, blob_raw_sas_token)
processed_blob_service_client = BlobServiceClient(storage_account_url, blob_processed_sas_token)
final_blob_service_client = BlobServiceClient(storage_account_url, blob_final_sas_token)

### List the urls for the document in the Raw Container and Extract Blob URL

In [4]:
# Get the dictionary of blob names and urls for each container
dictionary_of_raw_blobs = fn.blob_name_and_url_dict(raw_blob_service_client, raw_container_name)
print(dictionary_of_raw_blobs)

# Specify the container name, file extension, and file name
# We are selecting based on the raw container and the .pdf file extension
# as we know specifically which file we are after.
# Use the raw_container_name from configs.json
file_extension = '.pdf'
dnd_pdf_name = 'DnD 5e Players Handbook (BnW OCR).pdf'

# Access the URL
dnd_pdf_url = dictionary_of_raw_blobs[raw_container_name][file_extension][dnd_pdf_name]['blob_url']
# print(dnd_pdf_url)

The container dnd-rag-bot-raw is being accessed.
The file DnD 5e Players Handbook (BnW OCR).pdf located at https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-raw/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=co5JS0b1IS3VuKBFIu4CdzIpsl8eZ9Xi7oDJX68swyI%3D is being added to the blob list.
{'dnd-rag-bot-raw': {'.pdf': {'DnD 5e Players Handbook (BnW OCR).pdf': {'file_name': 'DnD 5e Players Handbook (BnW OCR).pdf', 'blob_url': 'https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-raw/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=co5JS0b1IS3VuKBFIu4CdzIpsl8eZ9Xi7oDJX68swyI%3D'}}}}


### Read the pdf into the DocumentAnalysisClient

In [5]:
# Create the DocumentAnalysisClient object so we can connect to the document intelligence service and read in the document
document_analysis_client = DocumentAnalysisClient(
                endpoint=document_intelligence_endpoint, credential=AzureKeyCredential(document_intelligence_key)
            )

In [6]:
# Run the doc_intel_pdf function and choose which model to use, in this case we are using the prebuilt-layout model
# The function will return the result, the dictionary of results, or both depending on the last parameter
# We are supplying the file name and the url is being extracted from the dictionary of blobs

dnd_pdf_doc_intel_result, dnd_pdf_doc_intel_dict = fn.doc_intel_pdf(document_analysis_client, doc_intel_model, dnd_pdf_name, dnd_pdf_url, 'both')
# print(dnd_pdf_doc_intel_result)

File DnD 5e Players Handbook (BnW OCR).pdf was analyzed using the Document Intelligence service.


#### Optional: Write to / Read From local storage

In [7]:
# # Use the local_file_write function to write the results to a local file
# # We are writing the results to a text file and the dictionary to a json file
fn.local_file_write(dnd_pdf_doc_intel_result, 'text', '../data/results/raw_results', 'dnd_pdf_doc_intel_result.txt')
fn.local_file_write(dnd_pdf_doc_intel_dict, 'json', '../data/results/dictionaries', 'dnd_pdf_doc_intel_dict.json')

# # Or read in the results from local to save time / doc intel costs
# dnd_pdf_doc_intel_result_test = fn.local_file_read('../data/results/raw_results/dnd_pdf_doc_intel_result.txt', 'text')
# dnd_pdf_doc_intel_dict_test = fn.local_file_read('../data/results/dictionaries/dnd_pdf_doc_intel_dict.json', 'json')

File successfully written to ../data/results/raw_results/dnd_pdf_doc_intel_result.txt
File successfully written to ../data/results/dictionaries/dnd_pdf_doc_intel_dict.json


In [8]:
# from azure.storage.blob import BlobServiceClient
# test_container_name = 'dnd-rag-bot-processed'
# # test_sas_token_container = 'si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D'
# # test_sas_token_container = '?sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2023-11-06T13:03:00Z&st=2023-11-06T05:03:00Z&spr=https&sig=bLDGhqmtVyh%2Fr4IIMTFUUy25QIARkqjIPM%2FB6I1baa8%3D'
# test_storage_url = 'https://iancogsearchstorage.blob.core.windows.net'
# test_blob_service_client = processed_blob_service_client # BlobServiceClient(test_storage_url, test_sas_token_container)
# print(test_blob_service_client)
# test_blob_client = test_blob_service_client.get_blob_client(container=test_container_name, blob='dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt')
# print(test_blob_client)

# with open(dnd_pdf_doc_intel_result, "rb") as data:
#     test_blob_client.upload_blob(dnd_pdf_doc_intel_result, blob_type="BlockBlob")

# test_blob_client = test_blob_service_client.get_blob_client(container=test_container_name, blob='dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json')
# print(test_blob_client)
# json_dump = json.dumps(dnd_pdf_doc_intel_dict)
# test_blob_client.upload_blob(json_dump, blob_type="BlockBlob")

In [10]:
# Upload the results to the processed container
fn.write_to_blob(dnd_pdf_doc_intel_result, processed_blob_service_client, processed_container_name, 'raw_results', 'dnd_pdf_doc_intel_result.txt', False)
fn.write_to_blob(dnd_pdf_doc_intel_dict, processed_blob_service_client, processed_container_name, 'dictionaries', 'dnd_pdf_doc_intel_dict.json', True)

<azure.storage.blob._blob_client.BlobClient object at 0x00000214F8360650>
Uploaded dnd_pdf_doc_intel_result.txt to dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt
<azure.storage.blob._blob_client.BlobClient object at 0x00000214F8379ED0>
Uploaded dnd_pdf_doc_intel_dict.json to dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json


In [11]:
# container_name = processed_container_name
# virtual_directory_name = 'raw_results'
# file_name='dnd_pdf_doc_intel_result.txt'
# blob_name = str(container_name + '/' + virtual_directory_name + '/' + file_name)
# blob_client = processed_blob_service_client.get_blob_client(container=container_name, blob=blob_name)
# blob_client.upload_blob(str(dnd_pdf_doc_intel_result), blob_type="BlockBlob")