# Step by Step Guide to a Dungeons and Dragons (DnD) Retrieval Augmented Generation (RAG) Copilot

## Set-up

### Import necessary Python Libraries

In [1]:
# Import the Necessary Python Libraries

# Import standard and non-Azure libraries
import os
import json
from dotenv import load_dotenv

# Import the Azure SDK libraries
from azure.storage.blob import BlobServiceClient, BlobClient
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Import the custom functions
import functions as fn


### Load environment variables from .env and configuration variables from config.json

In [2]:
# Populate environment variables from the local .env file so the lookups below can find them.
load_dotenv()

# Open the config.json file and read the (non-secret) configurations.
# JSON files are UTF-8 by convention; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Secrets and endpoints come from the process environment (populated by load_dotenv above).
# Indexing os.environ directly raises KeyError on a missing variable, so a misconfigured
# environment fails here rather than later in the notebook.
openai_api_key: str = os.environ["OPENAI_API_KEY"]
openai_api_base: str = os.environ["OPENAI_API_BASE"]
vector_store_address: str = os.environ['SEARCH_ENDPOINT']
vector_store_password: str = os.environ['SEARCH_KEY']
document_intelligence_endpoint: str = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"]
document_intelligence_key: str = os.environ["DOCUMENT_INTELLIGENCE_KEY"]
storage_account_url: str = os.environ["STORAGE_ACCOUNT_URL"]
blob_raw_sas_token: str = os.environ["BLOB_RAW_SAS_TOKEN"]
blob_processed_sas_token: str = os.environ["BLOB_PROCESSED_SAS_TOKEN"]
blob_final_sas_token: str = os.environ["BLOB_FINAL_SAS_TOKEN"]

# Load configs from config.json file (trailing comments show example values)
openai_api_type: str = config["AOAI_CONFIGS"]["API_TYPE"] # "azure"
openai_api_version: str = config["AOAI_CONFIGS"]["API_VERSION"] # = "2023-08-01-preview"
doc_intel_model: str = config["DOC_INTEL_CONFIGS"]["ANALYSIS_MODEL"] # = "prebuilt-layout"
raw_container_name: str = config["BLOB_STORAGE_CONFIGS"]["RAW_CONTAINER"] # = "dnd-rag-bot-raw"
processed_container_name: str = config["BLOB_STORAGE_CONFIGS"]["PROCESSED_CONTAINER"] # = "dnd-rag-bot-processed"
final_container_name: str = config["BLOB_STORAGE_CONFIGS"]["FINAL_CONTAINER"] # = "dnd-rag-bot-final"

## Read the documents from Raw, process with Document Intelligence, and write to Processed

### Create the BlobServiceClients for the storage containers

In [3]:
# Authentication notes (see https://pypi.org/project/azure-identity/ for alternatives):
# each client is built from the storage account URL plus a SAS token, which lets us put
# an expiration on the access that is granted. You could instead scope the SAS at the
# overall blob/account level rather than per container, or use default credentials.

# Three BlobServiceClient objects connect us to blob storage:
#   raw       -> the original documents
#   processed -> the processed documents
#   final     -> the final chunked data before it goes into the search index
raw_blob_service_client = BlobServiceClient(
    account_url=storage_account_url,
    credential=blob_raw_sas_token,
)
processed_blob_service_client = BlobServiceClient(
    account_url=storage_account_url,
    credential=blob_processed_sas_token,
)
final_blob_service_client = BlobServiceClient(
    account_url=storage_account_url,
    credential=blob_final_sas_token,
)

#### Optional: Write to / Read From local storage

In [4]:
# # Use the local_file_write function to write the results to a local file
# # We are writing the results to a text file and the dictionary to a json file
# fn.local_file_write(dnd_pdf_doc_intel_result, 'text', '../data/results/raw_results', 'dnd_pdf_doc_intel_result.txt')
# fn.local_file_write(dnd_pdf_doc_intel_dict, 'json', '../data/results/dictionaries', 'dnd_pdf_doc_intel_dict.json')

# # Or read in the results from local to save time / doc intel costs
# dnd_pdf_doc_intel_result = fn.local_file_read('../data/results/raw_results/dnd_pdf_doc_intel_result.txt', 'text')
# dnd_pdf_doc_intel_dict = fn.local_file_read('../data/results/dictionaries/dnd_pdf_doc_intel_dict.json', 'json')

### Read in from Processed Blob Storage
This allows for these processes to live in separate Function Apps

In [5]:
# Build the lookup of blobs available in the processed container.
dictionary_of_processed_blobs = fn.blob_name_and_url_dict(processed_blob_service_client, processed_container_name)
# print(dictionary_of_processed_blobs)

# We will be working primarily with the JSON dictionary representation of the
# Document Intelligence results, so that is the blob we load from the dictionary of processed blobs.
# The raw text result can be loaded the same way if needed:
# dnd_pdf_doc_intel_result = fn.load_blob(dictionary_of_processed_blobs, container_name=processed_container_name, file_type='.txt', file_name='dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt')

# Use the configured container name (the same one used to build the listing above) instead of
# repeating the literal, so the two calls cannot drift apart when config.json changes.
# NOTE: the blob names themselves begin with a 'dnd-rag-bot-processed/' folder prefix inside
# the container, so file_name is intentionally left as the literal blob path.
dnd_pdf_doc_intel_dict = fn.load_blob(
    dictionary_of_processed_blobs,
    container_name=processed_container_name,
    file_type='.json',
    file_name='dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json',
)

# Only necessary to see keys from original dictionary
# fn.print_keys(dnd_pdf_doc_intel_dict)

The container dnd-rag-bot-processed is being accessed.
The file dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json located at https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D is being added to the blob list.
The file dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt located at https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/raw_results/dnd_pdf_doc_intel_result.txt?si=dnd-rag-bot&spr=https&sv=2022-11-02&sr=c&sig=R6J8EuDEVTvG41nmQ1QlSDexuRT2%2BqDNv0yBiJc0kvc%3D is being added to the blob list.
blob_details:{'file_name': 'dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json', 'blob_url': 'https://iancogsearchstorage.blob.core.windows.net/dnd-rag-bot-processed/dnd-rag-bot-processed/dictionaries/dnd_pdf_doc_intel_dict.json?si=dnd-r

In [6]:
# Dotted key paths to keep from the Document Intelligence dictionary.
# The path format is the one produced by fn.print_keys.
keys_to_select = [
    # Full document text
    "content",
    # Page-level data
    "pages",
    "pages.page_number",
    "pages.lines",
    "pages.lines.content",
    "pages.words",
    "pages.words.content",
    "pages.selection_marks",
    "pages.selection_marks.state",
    "pages.selection_marks.confidence",
    # Paragraph-level data
    "paragraphs",
    "paragraphs.role",
    "paragraphs.content",
    "paragraphs.bounding_regions.page_number",
    # Table-level data
    "tables",
    "tables.row_count",
    "tables.column_count",
    "tables.cells",
    "tables.cells.kind",
    "tables.cells.row_index",
    "tables.cells.column_index",
    "tables.cells.row_span",
    "tables.cells.column_span",
    "tables.cells.content",
    "tables.cells.bounding_regions.page_number",
    "tables.bounding_regions.page_number",
]

# Build a new dictionary that holds only the key paths listed in keys_to_select.
intermediate_dnd_pdf_working_dict = fn.select_keys(
    dnd_pdf_doc_intel_dict,
    keys_to_select,
)
# Uncomment to inspect the keys of the new dictionary:
# fn.print_keys(intermediate_dnd_pdf_working_dict)

In [7]:

# Key names handed to fn.process_data for removal from the intermediate dictionary.
keys_to_remove = [
    "polygon",
    "spans",
    "span",
    "styles",
]
final_dnd_pdf_working_dict = fn.process_data(
    intermediate_dnd_pdf_working_dict,
    keys_to_remove,
)
# Print the key paths of the final dictionary structure (comment out if not needed).
fn.print_keys(final_dnd_pdf_working_dict)

# Final Structure of the Dictionary
# (note: the printed listing shows page_number directly under paragraphs / tables / cells,
#  where the selected keys had it under bounding_regions)
# content
# pages
# pages.page_number
# pages.lines
# pages.lines.content
# pages.words
# pages.words.content
# pages.selection_marks
# pages.selection_marks.state
# pages.selection_marks.confidence
# paragraphs
# paragraphs.role
# paragraphs.content
# paragraphs.page_number
# tables
# tables.row_count
# tables.column_count
# tables.cells
# tables.cells.kind
# tables.cells.row_index
# tables.cells.column_index
# tables.cells.row_span
# tables.cells.column_span
# tables.cells.content
# tables.cells.page_number
# tables.page_number

content
pages
pages.page_number
pages.lines
pages.lines.content
pages.words
pages.words.content
pages.selection_marks
pages.selection_marks.state
pages.selection_marks.confidence
paragraphs
paragraphs.role
paragraphs.content
paragraphs.page_number
tables
tables.row_count
tables.column_count
tables.cells
tables.cells.kind
tables.cells.row_index
tables.cells.column_index
tables.cells.row_span
tables.cells.column_span
tables.cells.content
tables.cells.page_number
tables.page_number


In [8]:
# Quick spot-check of how the data fared after the key selection and removal steps.
# Uncomment to compare the same page across the three stages:
# print(dnd_pdf_doc_intel_dict['pages'][0])
# print(intermediate_dnd_pdf_working_dict['pages'][0])

# Show the second table of the final dictionary.
# (Earlier drill-down, kept for reference: ['bounding_regions'][0]['page_number'])
print(final_dnd_pdf_working_dict["tables"][1])


{'row_count': 44, 'column_count': 2, 'cells': [{'kind': 'columnHeader', 'row_index': 0, 'column_index': 0, 'row_span': 1, 'column_span': 1, 'content': 'PART 2', 'page_number': 2}, {'kind': 'columnHeader', 'row_index': 0, 'column_index': 1, 'row_span': 1, 'column_span': 1, 'content': '171', 'page_number': 2}, {'kind': 'content', 'row_index': 1, 'column_index': 0, 'row_span': 1, 'column_span': 1, 'content': 'CHAPTER 7: USING ABILITY SCORES', 'page_number': 2}, {'kind': 'content', 'row_index': 1, 'column_index': 1, 'row_span': 1, 'column_span': 1, 'content': '173', 'page_number': 2}, {'kind': 'content', 'row_index': 2, 'column_index': 0, 'row_span': 1, 'column_span': 1, 'content': 'Ability Scores and Modifiers.', 'page_number': 2}, {'kind': 'content', 'row_index': 2, 'column_index': 1, 'row_span': 1, 'column_span': 1, 'content': '173', 'page_number': 2}, {'kind': 'content', 'row_index': 3, 'column_index': 0, 'row_span': 1, 'column_span': 1, 'content': 'Advantage and Disadvantage ..', 'pag

In [9]:
# # Creating a dictionary to hold the content of each page
# pages_content = {}

# for page in dnd_pdf_doc_intel_dict['pages']:
#     page_number = page['page_number']
#     content = ""

#     # Concatenating the content from lines
#     for line in page['lines']:
#         content += line['content'] + " "

#     # Concatenating the content from words
#     for word in page['words']:
#         content += word['content'] + " "

#     # Storing the content in the dictionary
#     pages_content[page_number] = content

# pages_content[2]