#### Import required libraries

In [1]:
import openai
import json  
import openai
import wget
import pandas as pd
import zipfile
import os
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector 
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,   
)

In [13]:
!pip show openai

Name: openai
Version: 0.27.9
Summary: Python client library for the OpenAI API
Home-page: https://github.com/openai/openai-python
Author: OpenAI
Author-email: support@openai.com
License: 
Location: c:\users\hzmarrou\onedrive\azopenai\azopenai-training\vtraining\lib\site-packages
Requires: aiohttp, requests, tqdm
Required-by: semantic-kernel


#### Configure OpenAI settings
Configure your OpenAI or Azure OpenAI settings. For this example, we use Azure OpenAI.

In [2]:
# Config for Azure OpenAI.
# Load environment variables
load_dotenv()
OPENAI_API_TYPE = "azure"
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
OPENAI_API_VERSION = "2023-07-01-preview"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
model: str = "text-embedding-ada-002"
print(OPENAI_API_TYPE)
print(OPENAI_API_BASE)

azure
https://trefoil.openai.azure.com/


##### Configure Azure Cognitive Search Vector Store settings
You can find this in the Azure Portal or using the Search Management SDK

In [3]:
# Config for Azure Search.
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
print(AZURE_SEARCH_ENDPOINT)
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
AZURE_SEARCH_INDEX_NAME =  "azure-cognitive-search-vector-demo"
credential = AzureKeyCredential(AZURE_SEARCH_KEY )

https://bea-azsearch.search.windows.net


#### Load data

In [5]:
embeddings_url = "https://cdn.openai.com/API/examples/data/vector_database_wikipedia_articles_embedded.zip"

# The file is ~700 MB so this will take some time
wget.download(embeddings_url)

'vector_database_wikipedia_articles_embedded (2).zip'

In [6]:
with zipfile.ZipFile("vector_database_wikipedia_articles_embedded.zip","r") as zip_ref:
    zip_ref.extractall("../datavdb")

In [4]:
article_df = pd.read_csv('../datavdb/vector_database_wikipedia_articles_embedded.csv')  
  
#Applies the json.loads function to the 'title_vector' and 'content_vector' columns of the DataFrame. This is done to convert these string values back into lists.
article_df["title_vector"] = article_df.title_vector.apply(json.loads)  
article_df["content_vector"] = article_df.content_vector.apply(json.loads)  
article_df['vector_id'] = article_df['vector_id'].apply(str)  
article_df.head()  

Unnamed: 0,id,url,title,text,title_vector,content_vector,vector_id
0,1,https://simple.wikipedia.org/wiki/April,April,April is the fourth month of the year in the J...,"[0.001009464613161981, -0.020700545981526375, ...","[-0.011253940872848034, -0.013491976074874401,...",0
1,2,https://simple.wikipedia.org/wiki/August,August,August (Aug.) is the eighth month of the year ...,"[0.0009286514250561595, 0.000820168002974242, ...","[0.0003609954728744924, 0.007262262050062418, ...",1
2,6,https://simple.wikipedia.org/wiki/Art,Art,Art is a creative activity that expresses imag...,"[0.003393713850528002, 0.0061537534929811954, ...","[-0.004959689453244209, 0.015772193670272827, ...",2
3,8,https://simple.wikipedia.org/wiki/A,A,A or a is the first letter of the English alph...,"[0.0153952119871974, -0.013759135268628597, 0....","[0.024894846603274345, -0.022186409682035446, ...",3
4,9,https://simple.wikipedia.org/wiki/Air,Air,Air refers to the Earth's atmosphere. Air is a...,"[0.02224554680287838, -0.02044147066771984, -0...","[0.021524671465158463, 0.018522677943110466, -...",4


#### Create an index

Use Azure's search client to create or update a search index. Here's a breakdown: It starts by initializing a `SearchIndexClient` with the specified Azure endpoint and credentials.
It then defines the fields of the search index. Each field has its name, type, and other properties. Some fields are marked as 'searchable', indicating they can be used for search queries.
The vector search configuration is set up using the VectorSearch class. It uses Hierarchical Navigable Small World (HNSW) method for vector search with specified parameters and cosine metric.
The semantic configuration is defined, indicating which fields should be prioritized during semantic search (title, url, text).
It creates a SearchIndex with the defined fields, vector search configuration, and semantic settings.
Finally, it uses the `create_or_update_index` method of the `SearchIndexClient` to create or update the search index in Azure. If the operation is successful, it prints a message with the name of the created or updated index.

This code is generally used to set up a search index for a dataset, enabling efficient search and retrieval of data based on specific search queries. The use of vector search and semantic search settings makes it possible to handle complex search queries and improve the relevance of the search results.

In [5]:
# Configure a search index
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String),
    SimpleField(name="vector_id", type=SearchFieldDataType.String, key=True),
    SimpleField(name="url", type=SearchFieldDataType.String),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="text", type=SearchFieldDataType.String),
    SearchField(name="title_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Optional: configure semantic reranking by passing your title, keywords, and content fields
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_keywords_fields=[SemanticField(field_name="url")],
        prioritized_content_fields=[SemanticField(field_name="text")]
    )
)
# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the index 
index = SearchIndex(name=AZURE_SEARCH_INDEX_NAME , fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

azure-cognitive-search-vector-demo created


#### Insert text and embeddings into vector store
In this notebook, the wikipedia articles dataset provided by OpenAI, the embeddings are pre-computed. The code below takes the data frame and converts it into a dictionary list to upload to your Azure Search index.

In [6]:
# Convert the 'id' and 'vector_id' columns to string so one of them can serve as our key field  
article_df['id'] = article_df['id'].astype(str)  
article_df['vector_id'] = article_df['vector_id'].astype(str)  
  
# Convert the DataFrame to a list of dictionaries  
documents = article_df.to_dict(orient='records')  
  
# Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing 
with SearchIndexingBufferedSender(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)) as batch_client:  
    # Add upload actions for all documents  
    batch_client.upload_documents(documents=documents)  
  
print(f"Uploaded {len(documents)} documents in total")  

Uploaded 25000 documents in total


In [8]:
print(documents[0:2])

[{'id': '1', 'url': 'https://simple.wikipedia.org/wiki/April', 'title': 'April', 'text': 'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn comm

If your dataset didn't already contain pre-computed embeddings, you can create embeddings by using the below function using the `openai` python library. You'll also notice the same function and model are being used to generate query embeddings for performing vector searches.

In [24]:
# Example function to generate document embedding  
openai.api_version = '2023-05-15'
openai.api_base = "https://trefoil.openai.azure.com/"
openai.api_type = 'azure'
openai.api_key = "deec54ba420e44bbac7fbc640b724fd4"
deployment_id = "text-embedding-ada-002"

def generate_document_embeddings(text):  
    response = openai.Embedding.create(  
                deployment_id=deployment_id,
                input = text
                )  
    embeddings = response['data'][0]['embedding']  
    return embeddings  

In [25]:

# Sampling the first document content as an example 
first_document_content = documents[0]['text']  
print(f"Content: {first_document_content[:100]}")    
    

Content: April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March


In [26]:
# Generate the content vector using the `generate_document_embeddings` function    
content_vector = generate_document_embeddings(first_document_content)    
print(f"Content vector generated")    

Content vector generated


#### Perform a vector similarity search

In [27]:
# Function to generate query embedding
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002")
    embeddings = response['data'][0]['embedding']
    return embeddings

# Pure Vector Search
query = "modern art in Europe"
  
search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY))  
vector = Vector(value=generate_embeddings(query), k=3, fields="content_vector")  
  
results = search_client.search(  
    search_text=None,  
    vectors=[vector],  
    select=["title", "text", "url"] 
)
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}") 
    print(f"Content: {result['text']}")   
    print(f"URL: {result['url']}\n")  

Title: April
Score: 0.77926207
Content: April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.

April always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.

April's flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.

The Month 

April comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.

April begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.

In common years, April starts on the same day of the week as October of

In [28]:
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002")
    embeddings = response['data'][0]['embedding']
    return embeddings

# Pure Vector Search
query = "modern art in Europe"

#### Perform a Hybrid Search

In [29]:
# Hybrid Search
query = "Famous battles in Scottish history"  
  
search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)) 
vector = Vector(value=generate_embeddings(query), k=3, fields="content_vector")  
  
results = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["title", "text", "url"],
    top=3
)  
  
for result in results:  
    print(f"Title: {result['title']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"URL: {result['url']}\n")  

Title: April
Score: 0.03333333507180214
URL: https://simple.wikipedia.org/wiki/April



#### Perform a Hybrid Search with Reranking (powered by Bing)
Semantic search allows you to leverage deep neural networks from Microsoft Bing to further increase your search accuracy. Additionally, you can get captions, answers, and highlights.

In [31]:
# Semantic Hybrid Search
query = "Famous battles in Scottish history" 

search_client = SearchClient(AZURE_SEARCH_ENDPOINT, AZURE_SEARCH_INDEX_NAME, AzureKeyCredential(AZURE_SEARCH_KEY)) 
vector = Vector(value=generate_embeddings(query), k=3, fields="content_vector")  

results = search_client.search(  
    search_text=query,  
    vectors=[vector], 
    select=["title", "text", "url"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='my-semantic-config', query_caption="extractive", query_answer="extractive",
    top=3
)

semantic_answers = results.get_answers()
for answer in semantic_answers:
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"Title: {result['title']}")
    print(f"URL: {result['url']}")
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")

HttpResponseError: (FeatureNotSupportedInService) Semantic search is not enabled for this service.
Parameter name: queryType
Code: FeatureNotSupportedInService
Message: Semantic search is not enabled for this service.
Parameter name: queryType
Exception Details:	(SemanticQueriesNotAvailable) Semantic search is not enabled for this service.
	Code: SemanticQueriesNotAvailable
	Message: Semantic search is not enabled for this service.

In [44]:
from tenacity import retry, wait_random_exponential, stop_after_attempt  
with open('./data/json-sample/text-sample.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)


In [45]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002")
    embeddings = response['data'][0]['embedding']
    return embeddings


In [46]:

# Generate embeddings for title and content fields
for item in input_data:
    title = item['title']
    content = item['content']
    title_embeddings = generate_embeddings(title)
    content_embeddings = generate_embeddings(content)
    item['titleVector'] = title_embeddings
    item['contentVector'] = content_embeddings

In [48]:
# Output embeddings to docVectors.json file
with open("./output/docVectors.json", "w") as f:
    json.dump(input_data, f)

In [None]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_ENDPOINT, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String,
                    filterable=True),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile="myHnswProfile"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile="myHnswProfile"),
]