In [None]:
import base64
import json
import os

import openai
from dotenv import load_dotenv
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch

# Load service settings from the local .env file into the process environment.
load_dotenv(".env")

# EMBEDDING_DEPLOYMENT_NAME used to be read here without ever being assigned,
# which raises NameError on a fresh kernel. Take it from the environment like
# the other settings and fail with a clear message when it is absent.
# NOTE(review): the environment variable name is an assumption — confirm it
# matches the key used in .env.
EMBEDDING_DEPLOYMENT_NAME = os.getenv("EMBEDDING_DEPLOYMENT_NAME")
if not EMBEDDING_DEPLOYMENT_NAME:
    raise RuntimeError(
        "EMBEDDING_DEPLOYMENT_NAME is not set; add it to .env before running this notebook"
    )

# Embedding client; chunk_size=450 is forwarded unchanged to OpenAIEmbeddings.
embeddings = OpenAIEmbeddings(deployment_id=EMBEDDING_DEPLOYMENT_NAME, chunk_size=450)

# Vector store wrapper around the search index. Queries are embedded with
# `embeddings.embed_query`.
# NOTE(review): the value passed as `azure_search_endpoint` comes from a
# variable named *_SERVICE_NAME — confirm that it holds what the parameter expects.
acs = AzureSearch(
    azure_search_endpoint=os.getenv('AZURE_COGNITIVE_SEARCH_SERVICE_NAME'),
    azure_search_key=os.getenv('AZURE_COGNITIVE_SEARCH_API_KEY'),
    index_name=os.getenv('AZURE_COGNITIVE_SEARCH_INDEX_NAME'),
    embedding_function=embeddings.embed_query,
)


In [3]:
# Baseline: document count reported by the search client before any upload in this notebook.
acs.client.get_document_count()

0

In [5]:
# Token-based splitter: chunk_size=1000, no overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)

# Load every file in the working directory whose name matches "*9.json" as
# plain text, letting the loader detect the file encoding.
loader = DirectoryLoader(
    "./",
    glob="*9.json",
    loader_cls=TextLoader,
    loader_kwargs={"autodetect_encoding": True},
)
documents = loader.load()

# Split the loaded files into the chunks that later cells parse and upload.
docs = text_splitter.split_documents(documents)

# The keyless upload is left disabled; a later cell uploads with explicit keys.
# doc_ids = acs.add_documents(documents=docs)


In [7]:
# Parse the first chunk's text as JSON and display the resulting object.
data = json.loads(docs[0].page_content)
data

{'Title': 'what is a+B', 'Body': 'a+b = b+a'}

In [8]:
# Document key derived from the title: lower-cased, with every space turned into a hyphen.
keys = "-".join(data["Title"].lower().split(" "))

In [9]:
# Display the derived document key.
keys

'what-is-a+b'

In [10]:
# Displays the derived document key again; redundant with the previous cell.
keys

'what-is-a+b'

In [11]:
# Upload the chunks under an explicit key and keep the ids that come back.
# Only one key is supplied, so this presumably expects `docs` to hold exactly
# one chunk — TODO confirm against add_or_modify_documents.
doc_ids = acs.add_or_modify_documents(docs, keys=[keys])

In [12]:
# Show the first returned id as bytes; the next cell base64-decodes it.
doc_ids[0].encode()

b'd2hhdC1pcy1hK2I='

In [13]:
# URL-safe base64-decode the first returned id to compare it with the key that was supplied.
base64.urlsafe_b64decode(doc_ids[0].encode())


b'what-is-a+b'

In [16]:
# Keep the search hits in their own variable. Assigning them to `docs` would
# overwrite the chunk list that the upload cell passes to
# `add_or_modify_documents`, so re-running that cell out of order would
# upload search results instead of the loaded file.
search_results = acs.similarity_search(
    query="what is a+b",
    k=3,
    search_type="similarity",
)
# Print the stored text of each hit to see which version of the document is indexed.
for doc in search_results:
    print(doc.page_content)

{
  "Title": "what is a+B",
  "Body": "a+b = b+a"
}



## Re-ingest the same file after changing its `Body` field

In [17]:
# Token-based splitter: chunk_size=1000, no overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)

# Reload every file in the working directory whose name matches "*9.json" as
# plain text, letting the loader detect the file encoding.
loader = DirectoryLoader(
    "./",
    glob="*9.json",
    loader_cls=TextLoader,
    loader_kwargs={"autodetect_encoding": True},
)
documents = loader.load()

# Split the reloaded files into the chunks that later cells parse and upload.
docs = text_splitter.split_documents(documents)

# The keyless upload is left disabled; a later cell uploads with explicit keys.
# doc_ids = acs.add_documents(documents=docs)


In [18]:
# Parse the first chunk's text as JSON and display the resulting object.
data = json.loads(docs[0].page_content)
data

{'Title': 'what is a+B', 'Body': "I don't know"}

In [19]:
# Document key derived from the title: lower-cased, with every space turned into a hyphen.
keys = "-".join(data["Title"].lower().split(" "))

In [20]:
# Display the derived document key.
keys

'what-is-a+b'

In [21]:
# Upload the chunks under an explicit key and keep the ids that come back.
# Only one key is supplied, so this presumably expects `docs` to hold exactly
# one chunk — TODO confirm against add_or_modify_documents.
doc_ids = acs.add_or_modify_documents(docs, keys=[keys])

In [22]:
# Show the first returned id as bytes; the next cell base64-decodes it.
doc_ids[0].encode()

b'd2hhdC1pcy1hK2I='

In [23]:
# URL-safe base64-decode the first returned id to compare it with the key that was supplied.
base64.urlsafe_b64decode(doc_ids[0].encode())


b'what-is-a+b'

## The document key is the same as in the first upload

In [24]:
# Keep the search hits in their own variable. Assigning them to `docs` would
# overwrite the chunk list that the upload cell passes to
# `add_or_modify_documents`, so re-running that cell out of order would
# upload search results instead of the loaded file.
search_results = acs.similarity_search(
    query="what is a+b",
    k=3,
    search_type="similarity",
)
# Print the stored text of each hit to see which version of the document is indexed.
for doc in search_results:
    print(doc.page_content)

{
  "Title": "what is a+B",
  "Body": "I don't know"
}



In [26]:
# Document count reported by the search client after the second upload under the same key.
acs.client.get_document_count()  # expected to be unchanged by the re-upload, since the key was reused

1