## Import required libraries and environment variables

In [None]:
pip install -r requirements.txt

In [None]:
import datetime
import io
import json
import glob
import math
import os
import requests
import sys
import time
import http.client, urllib.parse

from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SemanticPrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SemanticConfiguration,
    SemanticField,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    IndexProjectionMode,
    SearchIndexer,
    FieldMapping,
    SplitSkill
)

from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Azure AI Search connection settings.
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
api_version = os.environ.get("AZURE_SEARCH_API_VERSION")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

# Azure Blob Storage connection settings.
blob_connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
blob_base_url = os.environ.get("AZURE_STORAGE_BASE_URL")

# Header and query-string values shared by the raw REST calls made with `requests`.
headers = {
    "Content-Type": "application/json",
    "api-key": key,
}
params = {"api-version": api_version}

In [None]:
def index_status(index_name):
    """Print the definition of a search index as formatted JSON.

    Sends a GET request to ``<service_endpoint>/indexes/<index_name>`` using the
    module-level ``headers`` and ``params`` and prints the JSON body of the reply.

    Args:
        index_name: Name of the index to look up.

    Returns:
        None. The result is only printed.
    """
    print("Azure Cognitive Search Index:", index_name, "\n")

    # Named `response` so the local does not shadow this function's own name.
    response = requests.get(
        service_endpoint + "/indexes/" + index_name, headers=headers, params=params
    )
    try:
        body = response.json()
    except ValueError:
        # Response.json() raises a ValueError subclass when the body is not
        # valid JSON; anything else (e.g. KeyboardInterrupt) should propagate.
        print("Request failed")
    else:
        print(json.dumps(body, indent=5))

def index_stats(index_name):
    """Print and return the statistics of a search index.

    Sends a GET request to ``<service_endpoint>/indexes/<index_name>/stats``
    with the module-level ``headers`` and ``api_version``.

    Args:
        index_name: Name of the index whose statistics are requested.

    Returns:
        A ``(document_count, storage_size)`` tuple read from the
        ``documentCount`` and ``storageSize`` fields of the response.
        Both values are ``None`` when the service does not answer with
        HTTP 200.
    """
    url = (
        service_endpoint
        + "/indexes/"
        + index_name
        + "/stats?api-version="
        + api_version
    )

    # Defaults for the failure path, so the final return never references
    # unassigned names.
    document_count = None
    storage_size = None

    response = requests.get(url, headers=headers)
    print("Azure AI Search index status for:", index_name, "\n")

    if response.status_code == 200:
        res = response.json()
        print(json.dumps(res, indent=2))
        document_count = res["documentCount"]
        storage_size = res["storageSize"]
    else:
        print("Request failed with status code:", response.status_code)

    return document_count, storage_size


## Connect to Blob Storage

Upload the source document to Blob Storage (if it is not already there) so that Azure AI Search can retrieve it.

In [None]:
# Local folder and the single file from it that should be uploaded.
data_folder = "data"
filename = "rao-barista.pdf"

container_name = 'blob-rao-barista-file'

# Connect to blob storage and make sure the target container exists.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
if not container_client.exists():
    container_client.create_container()

# Upload the file to blob storage.
# The directory comes from `data_folder` rather than a second hard-coded "data".
documents_directory = data_folder
if filename in os.listdir(documents_directory):
    name = os.path.basename(filename)
    # Check the blob first so the local file is only opened when an upload
    # is actually going to happen.
    if not container_client.get_blob_client(name).exists():
        print(f'Uploading {name} to blob storage...')
        with open(os.path.join(documents_directory, filename), "rb") as data:
            container_client.upload_blob(name=name, data=data)

## Connect your Blob Storage to a data source in Azure AI Search

In [None]:
index_name = 'ksegawa-index-w-textsplitchunking'

# Client for the indexer-side resources of the search service
# (data source connections, skillsets, indexers).
indexer_client = SearchIndexerClient(endpoint=service_endpoint, credential=credential)

# Describe the blob container the data source should read from.
container = SearchIndexerDataContainer(name=container_name)

# Data source connection definition: an Azure Blob data source named after the index.
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-connection",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
)

# Create the data source connection on the service, or update it if it exists.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

## Create an index

In [None]:
# Client for managing indexes on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema: one search document per chunk, keyed by chunk_id, with
# parent_id and title carried alongside the chunk text.
fields = [
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(
        name="title",
        type=SearchFieldDataType.String,
    ),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
]

# No custom scoring profiles are defined for this index.
scoring_profiles = []

index = SearchIndex(
    name=index_name,
    fields=fields,
    scoring_profiles=scoring_profiles,
)

# Create the index, or update it if it already exists; the trailing bare
# expression displays the returned index in the notebook output.
result = index_client.create_or_update_index(index)
result

## Create a skillset and an indexer

In [None]:
# Required to use the preview SDK
from azure.search.documents.indexes._generated.models import (
    SearchIndexerSkillset,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

# Names of the indexer and skillset, both derived from the index name.
indexer_name = f"{index_name}-indexer"
skillset_name = f"{index_name}-chunk-skillset"

indexer_client = SearchIndexerClient(service_endpoint, credential)

# Index projections: every item under /document/pages/* is projected into the
# target index, with the page text mapped to "chunk" and the blob's storage
# name mapped to "title". Parent documents themselves are not indexed.
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# Skillset with a single split skill: /document/content is split in "pages"
# mode (maximum page length 500, no overlap) and the resulting text items
# are written to /document/pages.
skillset = SearchIndexerSkillset(
    name=skillset_name,
    skills=[
        SplitSkill(
            name="Text Splitter",
            default_language_code="en",
            text_split_mode='pages',
            maximum_page_length=500,
            page_overlap_length=0,
            context="/document",
            inputs=[
                InputFieldMappingEntry(name="text", source="/document/content"),
            ],
            outputs=[
                OutputFieldMappingEntry(name="textItems", target_name="pages"),
            ],
        ),
    ],
    index_projections=index_projections,
)

# Create the skillset on the service, or update it if it already exists.
indexer_client.create_or_update_skillset(skillset)

In [None]:
# Define the indexer that ties the data source, skillset, and target index together.
# It is built outside the try block so that the retry path below can always
# reference `indexer`; a failure while constructing it surfaces directly
# instead of turning into a NameError inside the handler.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents",
    target_index_name=index_name,
    data_source_name=data_source.name,
    skillset_name=skillset_name
)

try:
    indexer_result = indexer_client.create_or_update_indexer(indexer)

    # Run the indexer
    indexer_client.run_indexer(indexer_name)
    print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')
except Exception as ex:
    # If creating or running the indexer fails, report the error, then delete
    # the indexer and create and run it once more. Errors raised by this
    # retry are not caught.
    print(ex)
    indexer_client.delete_indexer(indexer_name)
    indexer_result = indexer_client.create_or_update_indexer(indexer)
    indexer_client.run_indexer(indexer_name)
    print(f' {indexer_name} is re-created and running. If queries return no results, please wait a bit and try again.')

In [None]:
# Print the statistics (document count and storage size) reported for the index.
index_stats(index_name)

In [None]:
query = "grinder coffee"

# Client for querying the index created above.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=credential,
)

# Full-text search; only the top-ranked result is requested.
results = search_client.search(
    search_text=query,
    select=["chunk_id", "chunk", "title"],
    top=1,
)

# Print the score and the selected fields of each returned result.
for result in results:
    print(f"Score: {result['@search.score']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"title: {result['title']}")
    print(f"content: {result['chunk']}")
    print('===================================================')

In [None]:
query = "What are the key steps a barista should follow to prepare an espresso with uniform resistance to water?"

# Same search call with a natural-language question; parent_id is selected
# as well, and only the top-ranked result is requested.
results = search_client.search(
    search_text=query,
    select=["parent_id", "chunk_id", "chunk", "title"],
    top=1,
)

# Print the score and the selected fields of each returned result.
for result in results:
    print(f"Score: {result['@search.score']}")
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"title: {result['title']}")
    print(f"Chunk: {result['chunk']}")
    print('===================================================')

In [None]:
# Delete the index, indexer, and skillset.
# Indexers and skillsets are managed through SearchIndexerClient
# (indexer_client); SearchIndexClient (index_client) has no
# delete_indexer / delete_skillset methods.

index_client.delete_index(index_name)
indexer_client.delete_indexer(indexer_name)
indexer_client.delete_skillset(skillset_name)