In [1]:
# Import required libraries
import os
import json
from dotenv import load_dotenv
import pandas as pd
from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters
)


from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import json

# Load settings from a local .env file so the os.getenv() lookups in the
# configuration cell can find them.
load_dotenv()

True

In [2]:
# Configure environment variables
# Azure AI Search connection settings. Each value is None when the variable
# is missing from the environment.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
# Honour AZURE_SEARCH_INDEX_NAME when it is set; "books1" is only the fallback
# for an unset or empty variable (it used to overwrite the environment value
# unconditionally, which made the variable dead configuration).
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME") or "books1"

# Azure OpenAI settings used for the embeddings client and the index vectorizer.
AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = "2024-02-01"
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
# Length of the embedding vectors stored in the index; must match the
# embedding deployment's output size.
azure_openai_embedding_dimensions = 1536

In [3]:
# Configure OpenAI API
# Client for the Azure OpenAI resource; used below to request embeddings.
aoai_client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)

# Key-based credential shared by the Azure AI Search clients created later.
credential = AzureKeyCredential(key)

In [4]:
# Generate Document Embeddings using OpenAI Ada Model
# The same helper serves document text and query text.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def calc_embeddings(text):
    """Return the embedding vector for a single piece of text.

    The text is sent as a one-element batch to the embeddings deployment
    named by AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME, and the embedding
    of that single item is returned.

    The retry decorator re-invokes the call on failure, stopping after the
    6th attempt and waiting a randomised, exponentially growing interval
    (configured with min=1, max=20) between attempts.
    """
    # `model` takes the Azure deployment name.
    response = aoai_client.embeddings.create(
        model=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
        input=[text],
    )
    first_item = response.data[0]
    return first_item.embedding

In [5]:
# Chunking: at most 1000 characters per chunk, with a 30-character overlap
# between neighbouring chunks.
# No separators are passed, so the splitter falls back to its defaults
# (noted by the original author as ["\n\n", "\n", " ", ""]).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=30, chunk_size=1000)

# Label stored with every chunk, and the PDF it is read from.
documentName = "moby dick book"
fileName = "./data/moby dick.pdf"

# Load the PDF and split it in one step. Note that `pages` holds whatever
# load_and_split returns after applying the splitter, so the count printed
# below is the number of resulting documents.
loader = PyPDFLoader(fileName)
pages = loader.load_and_split(text_splitter=splitter)
print("Number of pages: ", len(pages))

Number of pages:  1446


In [6]:

import uuid

# Build all rows first and construct the DataFrame once. Appending with
# df.loc[len(df.index)] inside a loop re-grows the frame on every iteration,
# which gets slow as the number of chunks increases.
# Each row gets a fresh random UUID as its key; "embedding" starts as an empty
# string placeholder and is filled in by the embedding cell.
df = pd.DataFrame(
    [
        {
            "id": str(uuid.uuid4()),
            "document_name": documentName,
            "content": page.page_content,
            "embedding": "",
        }
        for page in pages
    ],
    # Passing the columns explicitly keeps the schema even when `pages` is empty.
    columns=['id', 'document_name', 'content', 'embedding'],
)
df.head()

Unnamed: 0,id,document_name,content,embedding
0,9955e645-e7bf-44af-822e-e7e1ab2d1ccf,moby dick book,The Project Gutenberg eBook of Moby-Dick; or T...,
1,7044cd4e-1f3f-43c4-9ef3-b4737e7fa608,moby dick book,CHAPTER 1. Loomings. \n \nCHAPTER 2. The Carpe...,
2,01f3b5a9-523e-48dc-b9d1-fb638b887980,moby dick book,CHAPTER 11. Nightgown. \n \nCHAPTER 12. Biogra...,
3,885c77b0-3fd4-4582-b2a5-a9ef19468aeb,moby dick book,CHAPTER 41. Moby Dick. \n \nCHAPTER 42. The Wh...,
4,76c61415-cd32-4506-9fcd-690e20251969,moby dick book,CHAPTER 68. The Blanket. \n \nCHAPTER 69. The ...,


In [7]:
# calculate the embeddings using openAI ada
# calc_embeddings is applied to each row's content, i.e. one embeddings
# request per row.
df["embedding"] = df["content"].apply(calc_embeddings)

# Keep a CSV copy of the frame (without the index column), then preview it.
df.to_csv('./data/aia_embeddings.csv', index=False)
print(df.head(2))

                                     id   document_name  \
0  9955e645-e7bf-44af-822e-e7e1ab2d1ccf  moby dick book   
1  7044cd4e-1f3f-43c4-9ef3-b4737e7fa608  moby dick book   

                                             content  \
0  The Project Gutenberg eBook of Moby-Dick; or T...   
1  CHAPTER 1. Loomings. \n \nCHAPTER 2. The Carpe...   

                                           embedding  
0  [-0.017309309914708138, -0.02644442208111286, ...  
1  [0.019777635112404823, -0.010173485614359379, ...  


In [8]:
# Output embeddings to json file
# The frame is written as a JSON array with one object per row; values the
# JSON encoder cannot handle are converted with str().
output_path = os.path.join('.', 'data', 'aia_embeddings.json')

with open(output_path, mode='w') as json_file:
    df.to_json(json_file, orient='records', default_handler=str)

In [9]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: one document per text chunk.
#   id            - document key
#   document_name - label of the source document
#   content       - the chunk text (full-text searchable)
#   embedding     - vector for `content`, searched through "myHnswProfile"
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="document_name", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # Use the configured dimension instead of a second hard-coded 1536 so
        # the index cannot drift from the configuration cell.
        vector_search_dimensions=azure_openai_embedding_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Configure the vector search configuration
# The profile ties the HNSW algorithm configuration to the Azure OpenAI
# vectorizer declared in the same object.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myVectorizer"
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myVectorizer",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=AZURE_OPENAI_ENDPOINT,
                deployment_id=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
                # NOTE(review): model_name is given the *deployment* name. This is
                # presumably only correct when the deployment is named after the
                # model itself - confirm against the service's accepted model names.
                model_name=AZURE_OPENAI_EMBEDDINGS_ADA_DEPLOYMENT_NAME,
                api_key=AZURE_OPENAI_API_KEY
            )
        )
    ]
)

# Semantic configuration that prioritises the "content" field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")]
    )
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
# create_or_update_index makes this cell safe to re-run against an existing index.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

 books1 created


In [10]:
# option 1: upload documents to the index
from azure.search.documents import SearchClient
import json

# Read back the records written by the JSON export cell.
output_path = os.path.join('.', 'data', 'aia_embeddings.json')
with open(output_path, 'r') as file:
    documents = json.load(file)

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one result per document. Inspect them instead of
# assuming every document was accepted, so the reported count is the number
# that actually succeeded.
failed_uploads = [item for item in result if not item.succeeded]
for item in failed_uploads:
    print(f"Failed to upload document {item.key}: {item.error_message}")
print(f"Uploaded {len(result) - len(failed_uploads)} documents")

Uploaded 1446 documents


In [None]:
# option 2 - If you are indexing a very large number of documents, you can use the SearchIndexingBufferedSender which is an optimized way to automatically index the docs as it will handle the batching for you
from azure.search.documents import SearchIndexingBufferedSender

# Upload some documents to the index
# Read from the same location the JSON export cell writes to ('./data').
# This cell previously looked in '../data', where that file is not created
# by this notebook.
output_path = os.path.join('.', 'data', 'aia_embeddings.json')
with open(output_path, 'r') as file:
    documents = json.load(file)

# Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing
# Leaving the `with` block closes the sender.
with SearchIndexingBufferedSender(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
) as batch_client:
    # Add upload actions for all documents
    batch_client.upload_documents(documents=documents)
print(f"Uploaded {len(documents)} documents in total")