# Azure Cognitive Search LangChain Vector Code Sample
This code demonstrates how to use Azure Cognitive Search with OpenAI and the Azure Cognitive Search LangChain vector store.
To run the code, install the following packages. Please use the latest pre-release version of the Azure Search SDK: `pip install azure-search-documents --pre`.

In [None]:
# Install the packages this notebook imports. The --pre flag is given only on
# the azure-search-documents line, so only that package is taken as a pre-release.
# NOTE(review): pandas is imported further down but is not installed here;
# presumably it is already available in the environment - confirm.
! pip install azure-search-documents --pre 
! pip install openai
! pip install python-dotenv
! pip install langchain

## Import required libraries and environment variables

In [1]:
# Import required libraries  
import openai
import os  
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from azure.search.documents.indexes.models import (
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField
)
import pandas as pd

# Load environment variables from a .env file so the os.getenv() calls in the
# following cells can read the OpenAI and Azure Search settings.
load_dotenv()


True

## Configure OpenAI Settings

In [2]:
# Configure the OpenAI client from environment variables (loaded from .env).
# NOTE: api_type has to be assigned with "="; writing "openai.api_type: <expr>"
# is only an annotation and leaves the attribute unchanged. The library default
# is kept when OPENAI_API_TYPE is not set.
openai.api_type = os.getenv("OPENAI_API_TYPE", openai.api_type)
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")

# Names of the completion and embedding models, read from the environment.
completion_model: str = os.getenv("OPENAI_MODEL")
embedding_model: str = os.getenv("OPENAI_MODEL_EMBED")

## Configure Vector Store Settings

In [3]:
# Azure Cognitive Search connection settings, read from the environment:
# target index name, admin key and service endpoint.
index_name: str = os.environ.get("AZURE_SEARCH_INDEX")
vector_store_key: str = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
vector_store_address: str = os.environ.get("AZURE_SEARCH_ENDPOINT")

## Create index
Create the index in Azure Cognitive Search service:

In [None]:
# Embedding client; the same name is passed as both deployment and model.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single input - confirm against the OpenAIEmbeddings documentation.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment=embedding_model,
    model=embedding_model,
    chunk_size=1,
    openai_api_key=openai.api_key,
    openai_api_base=openai.api_base,
    openai_api_type="azure",
)

# One semantic configuration, referenced by name both as the default
# configuration and as the vector store's semantic configuration.
semantic_config_name = 'centriqe-config'

# The 'content' field serves as title and content; 'metadata' supplies keywords.
semantic_fields = PrioritizedFields(
    title_field=SemanticField(field_name='content'),
    prioritized_content_fields=[SemanticField(field_name='content')],
    prioritized_keywords_fields=[SemanticField(field_name='metadata')],
)
semantic_settings = SemanticSettings(
    default_configuration=semantic_config_name,
    configurations=[
        SemanticConfiguration(
            name=semantic_config_name,
            prioritized_fields=semantic_fields,
        ),
    ],
)

# Vector store bound to the configured search endpoint and index; queries are
# embedded with embeddings.embed_query.
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_key,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
    semantic_configuration_name=semantic_config_name,
    semantic_settings=semantic_settings,
)

In [None]:
# Load the product catalog into a DataFrame and display it.
# Catalog columns: ProductID, ProductName, ProductBrand, Gender, Price (INR),
# Description, PrimaryColor, ImageURL
catalog_path = '../data/products_catalog.csv'
df = pd.read_csv(catalog_path)
print(df)

## Insert text and embeddings into vector store
Convert each catalog row into a document and load the documents into the index.

In [None]:
from langchain.schema import Document

# Mis-encoded character sequences and decorative symbols removed from product
# descriptions, applied in this order.
_UNWANTED_FRAGMENTS = ('â€', 'Â', '©', '*', '•', '“', '”')


def _clean_description(raw):
    """Return *raw* with every unwanted fragment removed.

    Returns None when *raw* is not a string (for example a missing CSV cell,
    which pandas reads as NaN), so the caller can skip that row instead of
    failing on ``.replace``.
    """
    if not isinstance(raw, str):
        return None
    for fragment in _UNWANTED_FRAGMENTS:
        raw = raw.replace(fragment, '')
    return raw


# Build one Document per catalog row: the cleaned description is the page
# content and the remaining product columns become metadata.
docs = []
for _, row in df.iterrows():
    description = _clean_description(row['Description'])
    if description is None:
        # No description text to embed for this product; skip it.
        continue
    docs.append(
        Document(
            page_content=description,
            metadata={
                "id": row['ProductID'],
                "name": row['ProductName'],
                "brand": row['ProductBrand'],
                "gender": row['Gender'],
                "price": row['Price (INR)'],
                "color": row['PrimaryColor'],
                "image": row['ImageURL'],
            },
        )
    )

# Add all documents to the vector store in a single call.
vector_store.add_documents(documents=docs)

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Descriptions of the metadata attributes attached to each product document,
# for use by a self-query retriever. Attribute names must match the metadata
# keys written when the documents are created ("name", "brand", "gender",
# "price", "color", "image").
metadata_field_info = [
    AttributeInfo(
        name="name",
        description="The name of the product",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="brand",
        description="The brand of the product",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="gender",
        description="The gender the product is for",
        type="string or list[string]",
    ),
    AttributeInfo(
        name="price",
        description="The price of the product",
        type="integer",
    ),
    AttributeInfo(
        name="color",
        description="The color of the product",
        type="string or list[string]",
    ),
    AttributeInfo(
        # Named "image" (not "image_url") to match the stored metadata key.
        name="image",
        description="The url of an image of the product",
        type="string",
    ),
]

# Short description of what each document's page content holds.
document_content_description = "Brief product description of clothing"

In [5]:
from langchain.chat_models import AzureChatOpenAI
from langchain.retrievers import AzureCognitiveSearchRetriever

# Setup LLM and QA chain
#llm = AzureChatOpenAI(
#    deployment_name=completion_model,
#    temperature=0,
#    openai_api_base=openai.api_base,
#    openai_api_version=openai.api_version,
#)

#retriever = SelfQueryRetriever.from_llm(
#    llm,
#    vector_store,
#    document_content_description,
#    metadata_field_info,
#    verbose=True
#)

from urllib.parse import urlparse

# Define the retriever; k / top_k set how many documents are requested.
# Alternative that retrieves through the LangChain vector store instead:
#retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5, "fetch_k": 4})

# Derive the search service name from the configured endpoint so the retriever
# targets the same service as the vector store (the first host label of a URL
# such as https://<service>.search.windows.net). The previously hard-coded
# name is kept as a fallback when no endpoint is configured.
search_service_name = "gtlopenaidemisearch"
if vector_store_address:
    _search_host = urlparse(vector_store_address).hostname or vector_store_address
    search_service_name = _search_host.split(".")[0]

retriever = AzureCognitiveSearchRetriever(
    service_name=search_service_name,
    api_key=vector_store_key,
    index_name=index_name,
    content_key="content",
    top_k=10,
)

In [7]:
# Ask the retriever for documents relevant to a plain-text query; nothing
# other than the query string is supplied.
product_query = "What are some pepe jeans"
retriever.get_relevant_documents(product_query)

[Document(page_content='Yellow printed T-shirt, has a round neck, and short sleeves Design Details-PEPE JEANS MENS ROUND NECK TSHIRT ,PLACEMENT PRINTED,IN SLIM FIT.SHORT SLEEVES', metadata={'@search.score': 17.661282, 'id': '10265171', 'content_vector': [-0.029193599, -0.017465167, 0.0062084207, -0.010989032, -0.013156243, 0.0058674035, -0.011868664, 0.01115476, -0.0030819008, -0.020524757, -0.002165617, 0.025764307, -0.002791877, 0.008573229, -0.024005042, -0.010504597, 0.022398757, -0.011849542, 0.02125141, -0.0022054554, -0.0067629716, 0.009612216, 0.008955679, -0.011843167, 0.005411652, 0.0077127195, 0.02927009, -0.010205012, 0.027306851, 0.006820339, 0.026975395, -0.0033018088, -0.008394754, -0.042400833, -0.012792916, 0.008898311, -0.010377114, 0.009790692, 0.01608835, -0.011142012, 0.023699084, -0.0015162505, 0.015361697, -0.007011563, -0.00020178163, -0.009223392, 0.014533059, 0.006833087, 0.0039615333, 0.02738334, 0.0037671216, 0.016904242, -0.0012588943, -0.018293805, 0.00439

## Perform a vector similarity search

In [None]:
# Vector similarity search: request k=3 documents for the query text.
docs = vector_store.similarity_search(query="red pants", k=3, search_type="similarity")

# Print the text of each hit, one per line.
results = [hit.page_content for hit in docs]
print(*results, sep="\n")

## Perform a hybrid search

In [None]:
# Hybrid search: request k=3 documents for the query text.
hybrid_query = "What is challenger sales model"
docs = vector_store.hybrid_search(query=hybrid_query, k=3, search_type="hybrid")

# Collect the text of each hit and print the hits separated by newlines.
results = []
for hit in docs:
    results.append(hit.page_content)
print("\n".join(results))

## Perform a Hybrid Search with Semantic re-ranking (powered by Bing)

In [None]:
# Hybrid search with semantic re-ranking; the call yields (document, score) pairs.
docs_and_scores = vector_store.semantic_hybrid_search_with_score(
    query="What is challenger sales model",
    k=3,
)

# Report each result: optional semantic answer, content, score, then caption.
separator = "-" * 80
for doc, score in docs_and_scores:
    print(separator)

    # Semantic answer, when present: the highlighted form is preferred and the
    # plain text is used otherwise.
    answers = doc.metadata['answers']
    if answers:
        answer_text = answers.get('highlights') or answers['text']
        print(f"Semantic Answer: {answer_text}")
        print(f"Semantic Answer Score: {score}")

    print("Content:", doc.page_content)
    captions = doc.metadata['captions']
    print(f"Score: {score}")

    # Caption follows the same highlight-over-text preference.
    if not captions:
        print("Caption not available")
        continue
    caption_text = captions.get('highlights') or captions['text']
    print(f"Caption: {caption_text}")
