In [1]:
# https://python.langchain.com/docs/integrations/vectorstores/azuresearch
import openai
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch


import getpass

OPENAI_API_TYPE = "azure"
OPENAI_API_VERSION = "2023-05-15"

# "https://<your-endpoint.openai.azure.com/"
OPENAI_API_BASE = input('OpenAI API Base:')
OPENAI_API_KEY = getpass.getpass('OpenAI API Key:')
OPENAI_API_EMBEDDING_MODEL = "text-embedding-ada-002"
OPENAI_API_EMBEDDING_DEPLOYMENT_ID = "embedding"

OpenAI API Base: https://aoai-r7a5whblfnou2.openai.azure.com/
OpenAI API Key: ········


In [2]:
vector_store_address: str = input('Cognitive Search URL:')
vector_store_password: str = getpass.getpass('Cognitive Search Admin Key:')

Cognitive Search URL: https://searchaz6ysakwgm47s.search.windows.net
Cognitive Search Admin Key: ········


In [3]:
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    model=OPENAI_API_EMBEDDING_MODEL,
    deployment=OPENAI_API_EMBEDDING_DEPLOYMENT_ID,
    openai_api_version=OPENAI_API_VERSION,
    openai_api_base=OPENAI_API_BASE,
    openai_api_type=OPENAI_API_TYPE,
    openai_api_key=OPENAI_API_KEY,
    chunk_size=1
)

index_name: str = "langchain-vector-demo"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [4]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

loader = TextLoader("./test.txt", encoding="utf-8")

documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

vector_store.add_documents(documents=docs)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 6 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to

['OTQyMTZmYzEtZWU3Yi00NjMzLWJhNjYtOTg1YzQyNWYwMTY4',
 'NzExZTg5MTctOWIyYS00NTg5LWFkZTMtZjBkODUwMzliN2Ez',
 'MDg1MTE2YzctYjQ1My00MDdkLTkwY2EtZjZhYmE4ZDYwNDM0',
 'ZTAyNjdjMDktYWJhZC00OWYzLTkyODYtMmY5MzhjMTQ3NjFl',
 'MWZmODUxZTQtMzI0Mi00MDQ3LTllNDQtN2NiNTg2MjY0N2Y3',
 'Y2Q4MGNlZjItMzE1ZS00YmFjLTk1OTMtZGQzODNkMmNkYWJi',
 'Y2NkYWE5ZmQtZjMzOS00YTI0LTgzZWMtM2FlNzVlMTk5ZGE5',
 'MGI5YmViYzQtYjI5Mi00YzAxLWIzNzYtNGYyNDVhZWY2YzAw',
 'OWM4YTA2ZTctNDhjNi00MWEwLTliNjEtYWZkMGQ2MmU1ZDcx',
 'NGM4NzM3NTEtYWM3OC00OGVkLWJkNTMtZWZiZTVlNzk4ODZi',
 'N2EzZDNhZjMtNmIxZS00NDQ2LWFkMzQtMWI5OTlmZjk1NTc1',
 'NjhlYTAwODUtZjc4NC00NzVkLTk3MDAtYTU3OGFhYWM4NmU5',
 'ZGRjODFkNjMtOTE2Ny00NzllLThlOTctMjViNDgyNDM2OTEw',
 'NjhiMzBmYzAtNzAyMC00Yzg1LWE5ZGEtMDIxODZlYmJlYWFi',
 'YTQ5ZjA3MzktOGY3NS00ZmU2LThkNTItYTNmYmNlYzhmM2Jl',
 'MDlhNTI4NjktMjdkMC00ZGEzLThmMTAtMDY2YjlkMGQyMzA2',
 'ZDhkMDdkZjgtZjMxZi00NTUxLWJhNDEtZjM2YWIzNzUwYTU4',
 'N2ZkM2VkYTEtOTc5ZC00MTM0LWE4MGMtYTQyOTI5OTA4Mjkz',
 'ZTBlZTI4N2EtZTkwMy00NzJjLTk1ZjAtMzAxYWM0ZTRl

In [5]:
# Perform a similarity search
docs = vector_store.similarity_search(
    query="What did the president say about Ketanji Brown Jackson",
    k=3,
    search_type="similarity",
)
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [6]:
# Perform a hybrid search
docs = vector_store.similarity_search(
    query="What did the president say about Ketanji Brown Jackson",
    k=3, 
    search_type="hybrid"
)
print(docs[0].page_content)

Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 

Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. 

One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 

And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.


In [7]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    ScoringProfile,
    TextWeights,
)

# embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment=model, chunk_size=1)
embedding_function = embeddings.embed_query

fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
]

index_name: str = "langchain-vector-demo-custom"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embedding_function,
    fields=fields,
)

In [8]:
# Data in the metadata dictionary with a corresponding field in the index will be added to the index
# In this example, the metadata dictionary contains a title, a source and a random field
# The title and the source will be added to the index as separate fields, but the random won't. (as it is not defined in the fields list)
# The random field will be only stored in the metadata field
vector_store.add_texts(
    ["Test 1", "Test 2", "Test 3"],
    [
        {"title": "Title 1", "source": "A", "random": "10290"},
        {"title": "Title 2", "source": "A", "random": "48392"},
        {"title": "Title 3", "source": "B", "random": "32893"},
    ],
)

['MjgwN2FkY2ItOGY5ZS00MmI0LWJkNjItMjI0ODk5ZWU2ZjU3',
 'NzNlN2JlNGItMmU0ZC00MTg3LThhYzUtZjc0MzAwNGQ4YTFi',
 'OTUxOTc4Y2EtOTEyZC00N2RjLWFiNzYtZTUyOTZhODc3ZDVi']

In [9]:
res = vector_store.similarity_search(query="Test 3 source1", k=3, search_type="hybrid")
res

[Document(page_content='Test 3', metadata={'title': 'Title 3', 'source': 'B', 'random': '32893'}),
 Document(page_content='Test 3', metadata={'title': 'Title 3', 'source': 'B', 'random': '32893'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'A', 'random': '10290'})]

In [10]:
res = vector_store.similarity_search(query="Test 3 source1", k=3, search_type="hybrid", filters="source eq 'A'")
res

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 8 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Get a vector representation of a given input that can be easily consumed by machine learning models and algorithms. Operation under Azure OpenAI API version 2023-05-15 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 4 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to 

[Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'A', 'random': '10290'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'A', 'random': '10290'}),
 Document(page_content='Test 2', metadata={'title': 'Title 2', 'source': 'A', 'random': '48392'})]

In [11]:
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    ScoringProfile,
    TextWeights,
    ScoringFunction,
    FreshnessScoringFunction,
    FreshnessScoringParameters
)

# embeddings: OpenAIEmbeddings = OpenAIEmbeddings(deployment=model, chunk_size=1)
embedding_function = embeddings.embed_query

fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=len(embedding_function("Text")),
        vector_search_configuration="default",
    ),
    SearchableField(
        name="metadata",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field to store the title
    SearchableField(
        name="title",
        type=SearchFieldDataType.String,
        searchable=True,
    ),
    # Additional field for filtering on document source
    SimpleField(
        name="source",
        type=SearchFieldDataType.String,
        filterable=True,
    ),
    # Additional data field for last doc update
    SimpleField(
        name="last_update",
        type=SearchFieldDataType.DateTimeOffset,
        searchable=True,
        filterable=True
    )
]
# Adding a custom scoring profile with a freshness function
sc_name = "scoring_profile"
sc = ScoringProfile(
    name=sc_name,
    text_weights=TextWeights(weights={"title": 5}),
    function_aggregation="sum",
    functions=[
        FreshnessScoringFunction(
            field_name="last_update",
            boost=100,
            parameters=FreshnessScoringParameters(boosting_duration="P2D"),
            interpolation="linear"
        )
    ]
)

index_name = "langchain-vector-demo-custom-scoring-profile"

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    fields=fields,
    scoring_profiles = [sc],
    default_scoring_profile = sc_name
)

In [12]:
# Adding same data with different last_update to show Scoring Profile effect
from datetime import datetime, timedelta

today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S-00:00')
yesterday = (datetime.utcnow() - timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%S-00:00')
one_month_ago = (datetime.utcnow() - timedelta(days=30)).strftime('%Y-%m-%dT%H:%M:%S-00:00')

vector_store.add_texts(
    ["Test 1", "Test 1", "Test 1"],
    [
        {"title": "Title 1", "source": "source1", "random": "10290", "last_update": today},
        {"title": "Title 1", "source": "source1", "random": "48392", "last_update": yesterday},
        {"title": "Title 1", "source": "source1", "random": "32893", "last_update": one_month_ago},
    ],
)

['Yzk2OGYzN2MtMzQ1Ny00OGU3LTk2ZTItODQ2NTcyMjkwNDA2',
 'ZmJlZGI0ZjItZWE0Ny00ZDUzLTgzOTMtN2RjMjdlNjBmNDJl',
 'NTIzNTA2ODktNWVlNS00ZDZlLWJkZGQtNWM4ZWYwNzVmNjcw']

In [13]:
res = vector_store.similarity_search(query="Test 1", k=3, search_type="similarity")
res

[Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '10290', 'last_update': '2023-09-07T08:01:31-00:00'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '48392', 'last_update': '2023-09-06T08:01:31-00:00'}),
 Document(page_content='Test 1', metadata={'title': 'Title 1', 'source': 'source1', 'random': '32893', 'last_update': '2023-08-08T08:01:31-00:00'})]