## Import require libraries and envrioment variables

In [34]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [17]:
import datetime
import io
import json
import glob
import math
import os
import requests
import sys
import time
import http.client, urllib.parse

from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    SearchIndexer,
    FieldMapping,
    SplitSkill
)
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv

load_dotenv()  

service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
api_version = os.getenv("AZURE_SEARCH_API_VERSION")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY") 
credential = AzureKeyCredential(key)

blob_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
blob_base_url = os.getenv("AZURE_STORAGE_BASE_URL")
container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME")

# Setup the Payloads header for cog search
headers = {'Content-Type': 'application/json','api-key': key}
params = {'api-version': api_version}

In [18]:
sys.version

'3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]'

In [19]:
print("Today is", datetime.datetime.today())

Today is 2024-03-16 05:20:33.175647


In [20]:
def index_status(index_name):
    print("Azure Cognitive Search Index:", index_name, "\n")

    index_status = requests.get(
        service_endpoint + "/indexes/" + index_name, headers=headers, params=params
    )
    try:
        print(json.dumps((index_status.json()), indent=5))
    except:
        print("Request failed")

def index_stats(index_name):
    url = (
        service_endpoint
        + "/indexes/"
        + index_name
        + "/stats?api-version="
        + api_version
    )
    
    response = requests.get(url, headers=headers)
    print("Azure AI Search index status for:", index_name, "\n")

    if response.status_code == 200:
        res = response.json()
        print(json.dumps(res, indent=2))
        document_count = res["documentCount"]
        storage_size = res["storageSize"]

    else:
        print("Request failed with status code:", response.status_code)

    return document_count, storage_size


## Connect to Blob Storage

Retrieve documents from blob storage. 

In [21]:
# connect to blob storage
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs()

# print the url of each blob
for blob in blobs:
    blobs_url = container_client.get_blob_client(blob).url
    print(blobs_url)

https://ksegawastorage.blob.core.windows.net/kenichi-aisearch-doc-custom/EspressoEssentials_150720.pdf
https://ksegawastorage.blob.core.windows.net/kenichi-aisearch-doc-custom/rao-barista.pdf


## Connect your Blob storage to a data source in Azure AI search

In [22]:
index_name = 'ksegawa-index-w-textsplitchunking'

# create a data source
# connect to the search service indexer
indexer_client = SearchIndexerClient(service_endpoint, credential)

# create a data source
container = SearchIndexerDataContainer(name=container_name)

# create a data source connection
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container
)

# create or update the data source
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

# print the data source
print(f"Data source '{data_source.name}' created or updated")

Data source 'ksegawa-index-w-textsplitchunking-blob' created or updated


## Create an index

In [23]:
# Create a search index
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# define the fields for the index
fields = [
    SearchField(name="parent_id", type=SearchFieldDataType.String,filterable=True, sortable=True,searchable=True),
    SearchField(name="content", type=SearchFieldDataType.String, searchable=True, filterable=True, sortable=True),
    SearchField(name="title", type=SearchFieldDataType.String, searchable=True),
    SearchField(name="chunk_id",type=SearchFieldDataType.String,key=True,filterable=True,sortable=True,searchable=True,analyzer_name="keyword"),
    SearchField(name="chunk",type=SearchFieldDataType.String,searchable=True),
]

scoring_profiles = []

index = SearchIndex(
    name=index_name,
    fields=fields,
    scoring_profiles=scoring_profiles)

result = index_client.create_or_update_index(index)
result

<azure.search.documents.indexes.models._index.SearchIndex at 0x7f619a41f070>

In [24]:
index_status(index_name)

Azure Cognitive Search Index: ksegawa-index-w-textsplitchunking 

{
     "@odata.context": "https://cogsearchbasiceastus.search.windows.net/$metadata#indexes/$entity",
     "@odata.etag": "\"0x8DC4575C85C4D4B\"",
     "name": "ksegawa-index-w-textsplitchunking",
     "defaultScoringProfile": null,
     "fields": [
          {
               "name": "parent_id",
               "type": "Edm.String",
               "searchable": true,
               "filterable": true,
               "retrievable": true,
               "sortable": true,
               "facetable": true,
               "key": false,
               "indexAnalyzer": null,
               "searchAnalyzer": null,
               "analyzer": null,
               "normalizer": null,
               "dimensions": null,
               "vectorSearchProfile": null,
               "synonymMaps": []
          },
          {
               "name": "content",
               "type": "Edm.String",
               "searchable": true,
         

# Create an indexer

In [27]:
# Required to use the preview SDK
from azure.search.documents.indexes._generated.models import (
    SearchIndexerSkillset,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

# Create an indexer
indexer_name = f"{index_name}-indexer"
skillset_name = f"{index_name}-chunk-skillset"

indexer_client = SearchIndexerClient(service_endpoint, credential)

skillset = SearchIndexerSkillset(
        name=skillset_name,
        skills=[
            SplitSkill(
                name="Text Splitter",
                default_language_code="en",
                text_split_mode='pages',
                maximum_page_length=2000,
                page_overlap_length=500,
                context="/document",
                inputs=[
                    InputFieldMappingEntry(
                        name="text",
                        source="/document/content"
                    )
                ],
                outputs=[
                    OutputFieldMappingEntry(
                        name="textItems",
                        target_name="pages"
                    )
                ]
            )
        ]
)
            
indexer_client.create_or_update_skillset(skillset)
            
indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index documents",  
    target_index_name=index_name,  
    data_source_name=data_source.name,  
    # Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results  
    field_mappings=[FieldMapping(source_field_name="id", target_field_name="title")]  
)

indexer_result = indexer_client.create_or_update_indexer(indexer)

# Run the indexer  
indexer_client.run_indexer(indexer_name) 
print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')

 ksegawa-index-w-textsplitchunking-indexer is created and running. If queries return no results, please wait a bit and try again.


In [28]:
index_stats(index_name)

Azure AI Search index status for: ksegawa-index-w-textsplitchunking 

{
  "@odata.context": "https://cogsearchbasiceastus.search.windows.net/$metadata#Microsoft.Azure.Search.V2023_10_01_Preview.IndexStatistics",
  "documentCount": 0,
  "storageSize": 0,
  "vectorIndexSize": 0
}


(0, 0)

In [48]:
query = "grinder coffee"
  
search_client = SearchClient(service_endpoint, index_name, credential=credential)
results = search_client.search(  
    search_text=query,  
    select=["id", "content", "title"],
    top=1
)  
  
for result in results:  
    print(f"id: {result['id']}")  
    print(f"content: {result['content']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"title: {result['title']}") 

id: aHR0cHM6Ly9rc2VnYXdhc3RvcmFnZS5ibG9iLmNvcmUud2luZG93cy5uZXQva2VuaWNoaS1haXNlYXJjaC1kb2MtY3VzdG9tL0VzcHJlc3NvRXNzZW50aWFsc18xNTA3MjAucGRm0
content: 
This tutorial 
looks at espresso 
in detail.

You will learn:
What an espresso is.

The meaning of extraction 
and strength in terms of 
espresso.

The meaning of an Espresso 
recipe and the components 
of that recipe.

What is a Brew Ratio.

How to dial in an Espresso 
Recipe.

WHAT IS ESPRESSO?

Simply put Espresso is a way of brewing coffee. There are many different 
ways to brew coffee such as AeroPress, French Press and Pour Over; this 
particular brew method produces a complex, concentrated drink, also 
known as Espresso. 

Espresso is brewed by forcing a small amount of pressurised hot water 
through finely-ground coffee, using an Espresso Machine. It is made up of:

- Soluble Solids including complex sugars, acids and caffeine that 
  contribute to the taste.
- Soluble Gases including Co2 that form the aroma of the coffee.
- Ins

In [None]:
results = search_client.search(  
    search_text=query,  
    select=["parent_id", "chunk_id", "chunk"],
    top=1
)  
  
for result in results:  
    print(f"parent_id: {result['parent_id']}")  
    print(f"chunk_id: {result['chunk_id']}")  
    print(f"Score: {result['@search.score']}")  
    print(f"Content: {result['chunk']}") 

In [None]:
index_client.delete_index(index_name)
index_client.delete_indexer(indexer_name)
index_client.delete_skillset(skillset_name)