In [1]:
# https://python.langchain.com/docs/integrations/vectorstores/azuresearch
import openai
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch


import os
from dotenv import load_dotenv

# Load the OPENAI_* and COGNITIVE_SEARCH_* settings used below from the local
# env file; the cell output is load_dotenv's return value.
load_dotenv(dotenv_path="./credentials.env")

True

In [2]:
# Embedding client: every connection setting is read from the environment
# (populated from credentials.env in the first cell).
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    # Service connection
    openai_api_type=os.getenv("OPENAI_API_TYPE"),
    openai_api_base=os.getenv("OPENAI_API_BASE"),
    openai_api_version=os.getenv("OPENAI_API_VERSION"),
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    # Which embedding model / deployment to call
    deployment=os.getenv("OPENAI_API_EMBEDDING_DEPLOYMENT_ID"),
    model=os.getenv("OPENAI_API_EMBEDDING_MODEL"),
    chunk_size=1,
)

# Vector store on the "langchain-vector-demo" search index, using
# `embeddings.embed_query` as its embedding function.
index_name: str = "langchain-vector-demo"
vector_store: AzureSearch = AzureSearch(
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    azure_search_endpoint=os.getenv("COGNITIVE_SEARCH_API_BASE"),
    azure_search_key=os.getenv("COGNITIVE_SEARCH_API_KEY"),
)

In [3]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Splitter settings: chunk_size=1000 with no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Read the sample file as UTF-8 and split it into chunks.
loader = TextLoader(file_path="./test.txt", encoding="utf-8")
documents = loader.load()
docs = text_splitter.split_documents(documents)

# Add the chunks to the index; the cell output is the value add_documents returns.
vector_store.add_documents(documents=docs)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 9 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to t

['YmIwZmIxN2ItZjY3Mi00YzEzLWJjNjYtZDM0NmVlNGY0ZTlh',
 'ODE3MTI4NjQtZDYxMS00ZTc1LThlY2YtZmUyNDg0YzJkYjlk',
 'NTU3MDJhZjgtNmMzMi00MDY2LWIxZGQtNzk1YjE4NGJiOTRm',
 'YzRmYWI3Y2ItMzQ4Zi00ZDcxLWE5MGQtM2NkOTIxNTQzNDA4',
 'ODgyYjRhN2MtYmZlMS00OTNkLWEzMGYtYTJhZmI3ZmE3ZDU3',
 'YmU1MzVjNTQtODdiOC00MDE1LWJjNjktM2JiNWYxNTUyODUx',
 'NWJmYTlhNjAtZDUzNi00OTJmLTk2MmItZDI0OGRmNmM0M2Rl',
 'M2M5NzkwN2YtMTAyNi00ZDMwLWI4NmYtNzVlNjYyMDQwYTk4',
 'NTMxYTc0NDEtZmE3YS00MTNjLThkZDUtYzQwMWQ4ZTIyMTFl',
 'NTcxMzJkMTgtYzA4OS00YjAwLTk5NzUtZjlhZTU1MDEwN2Y1',
 'MzY5Zjc0MjUtN2QyNS00OTJlLWEyOWYtZTRiYTRmOTg5Yjc3',
 'MTc5ZDc2ZjEtZGIzNy00NTAyLWI2N2QtNDdkZTI0ODAyZGY4',
 'ODc4ZDU1ZmYtZTY0ZC00YmNjLThjMmItMjE0ZGM2ODFlYzdl',
 'MmU4ZTZhMjgtZjViMy00NGY0LThmYjctZjFmYTBiNDM5N2E3',
 'OTY2NDg4MjQtNzEyYS00MDlhLTlhYjItODM0ZDM0NmJjMmNl',
 'N2JhNzVlZmQtN2YxYy00MGVlLWExOWQtOTEzYWI1N2M3MTc5',
 'ZTY0OTRhZDctZWM4NS00ZjFmLWFiMTEtZWExMmI0MTJlZGU1',
 'M2M3ODkyNGQtYzA5ZS00MGY4LWE4MjUtYmJmZmJiZTlhNzUw',
 'NmNkYTEzOTAtZWI2Yi00Mjg2LTkxMWEtYTNmYjgyYTg0

In [4]:
# Vector similarity search: ask for the 3 best matches and print the text of
# the top-ranked one.
docs = vector_store.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    search_type="similarity",
    k=3,
)
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [5]:
# Hybrid search: same question and result count as the similarity search, but
# with search_type="hybrid"; print the text of the top-ranked match.
docs = vector_store.similarity_search(
    "What did the president say about Ketanji Brown Jackson",
    search_type="hybrid",
    k=3,
)
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [6]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    ScoringProfile,
    TextWeights,
)

# Reuse the embedding client created earlier in the notebook.
embedding_function = embeddings.embed_query

index_name: str = "langchain-vector-demo-custom"

# Custom index schema: id, content, content_vector and metadata, plus two
# extra fields (title, source) that are filled from document metadata.
fields = [
    # Document key
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        filterable=True,
        key=True,
    ),
    # Raw text of the document
    SearchableField(
        name="content",
        searchable=True,
        type=SearchFieldDataType.String,
    ),
    # Embedding of the content; the dimension is measured by embedding a
    # sample string with the configured embedding function.
    SearchField(
        name="content_vector",
        searchable=True,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_configuration="default",
        vector_search_dimensions=len(embedding_function("Text")),
    ),
    # Metadata stored as a string
    SearchableField(
        name="metadata",
        searchable=True,
        type=SearchFieldDataType.String,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        searchable=True,
        type=SearchFieldDataType.String,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        filterable=True,
        type=SearchFieldDataType.String,
    ),
]

# Point the vector store at the custom index and hand it the schema above.
vector_store: AzureSearch = AzureSearch(
    index_name=index_name,
    fields=fields,
    embedding_function=embedding_function,
    azure_search_endpoint=os.getenv("COGNITIVE_SEARCH_API_BASE"),
    azure_search_key=os.getenv("COGNITIVE_SEARCH_API_KEY"),
)

In [7]:
# Metadata keys that have a corresponding field in the index are also written
# to that field. Each metadata dict below carries a title, a source and a
# random value: title and source are defined in the fields list, so they are
# added to the index as separate fields; "random" is not defined there, so it
# is only stored inside the metadata field.
vector_store.add_texts(
    texts=["Test 1", "Test 2", "Test 3"],
    metadatas=[
        {"title": "Title 1", "source": "A", "random": "10290"},
        {"title": "Title 2", "source": "A", "random": "48392"},
        {"title": "Title 3", "source": "B", "random": "32893"},
    ],
)

['NzNhMTlmMWQtM2U5My00ZDcwLWIwNzEtYjkwODFjMTk4YWY1',
 'MDIzOTJmMWMtNjFjMS00MGY4LWEwYWUtMDg5M2M5NzVhNWFk',
 'M2U3YTM5MTktNWNjNi00ZTI0LWJjNjYtN2JmOTc3YjI3ZTQ5']

In [8]:
# Hybrid query against the custom index with no filter; the cell displays the hits.
res = vector_store.similarity_search(
    "Test 3 source1",
    search_type="hybrid",
    k=3,
)
res

[Document(page_content='Test 3', metadata={'title': 'Title 3', 'source': 'B', 'random': '32893'}),
 Document(page_content='Test 2', metadata={'title': 'Title 2', 'source': 'A', 'random': '48392'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'A', 'random': '10290'})]

In [9]:
# Same hybrid query, restricted by a filter expression on the filterable
# `source` field so only documents with source 'A' are returned.
res = vector_store.similarity_search(
    "Test 3 source1",
    filters="source eq 'A'",
    search_type="hybrid",
    k=3,
)
res

[Document(page_content='Test 2', metadata={'title': 'Title 2', 'source': 'A', 'random': '48392'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'A', 'random': '10290'})]

In [10]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    ScoringProfile,
    TextWeights,
    ScoringFunction,
    FreshnessScoringFunction,
    FreshnessScoringParameters
)

# Reuse the embedding client created earlier in the notebook.
embedding_function = embeddings.embed_query

# Index schema: same fields as the custom index (id, content, content_vector,
# metadata, title, source) plus a last_update timestamp that the scoring
# profile below uses for freshness boosting.
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Embedding of the content; the dimension is measured by embedding a
    # sample string with the configured embedding function.
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    # Additional data field for last doc update.
    # No `searchable` argument: the SimpleField helper always creates a
    # non-searchable field, and a DateTimeOffset field cannot be full-text
    # searchable, so passing searchable=True here had no effect.
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        filterable=True,
    ),
]

# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    # Weight text matches in the title field by 5.
    text_weights=TextWeights(weights={"title": 5}),
    function_aggregation="sum",
    functions=[
        # Freshness boost driven by last_update; "P2D" is an ISO 8601
        # duration of two days.
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear",
        )
    ],
)

index_name = "langchain-vector-demo-custom-scoring-profile"

# Vector store on the new index; the profile is registered and also set as
# the index's default scoring profile.
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=os.getenv("COGNITIVE_SEARCH_API_BASE"),
    azure_search_key=os.getenv("COGNITIVE_SEARCH_API_KEY"),
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields,
    scoring_profiles=[sc],
    default_scoring_profile=sc_name,
)

In [11]:
# Adding same data with different last_update to show Scoring Profile effect
from datetime import datetime, timedelta, timezone

# Layout of the timestamps written to last_update (UTC, "-00:00" offset suffix).
timestamp_format = '%Y-%m-%dT%H:%M:%S-00:00'

# Take a single timezone-aware "now" so the three timestamps are derived from
# the same instant; datetime.utcnow() is deprecated since Python 3.12.
now_utc = datetime.now(timezone.utc)

today = now_utc.strftime(timestamp_format)
yesterday = (now_utc - timedelta(days=1)).strftime(timestamp_format)
one_month_ago = (now_utc - timedelta(days=30)).strftime(timestamp_format)

# Three documents identical in text, title and source; only last_update (and
# the unindexed "random" value) differs between them.
vector_store.add_texts(
    ["Test 1", "Test 1", "Test 1"],
    [
        {"title": "Title 1", "source": "source1", "random": "10290", "last_update": today},
        {"title": "Title 1", "source": "source1", "random": "48392", "last_update": yesterday},
        {"title": "Title 1", "source": "source1", "random": "32893", "last_update": one_month_ago},
    ],
)

['ZTNkZDM3NDctMWU3MS00ZTJjLWJkY2YtYzVhYmU2Yjc5MDBj',
 'M2RmZjBiNTAtZjcyNC00MGM2LWJlY2MtZTFjZDllNGZkNGY1',
 'NDNiYmM4OWQtODlkMi00ZGM5LWI0OWYtMDc1YmMyNTM0NDMy']

In [12]:
# Query the scoring-profile index for the three identical "Test 1" documents;
# the cell displays them in the order returned.
res = vector_store.similarity_search(
    "Test 1",
    search_type="similarity",
    k=3,
)
res

[Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '10290', 'last_update': '2023-11-07T02:37:11-00:00'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '48392', 'last_update': '2023-11-06T02:37:11-00:00'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '32893', 'last_update': '2023-10-08T02:37:11-00:00'})]