In [52]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
from azure.search.documents.indexes import SearchIndexClient
from typing import List, Optional, Union
from langchain.docstore.document import Document


from azure.core.credentials_async import AsyncTokenCredential
from azure.search.documents.aio import SearchClient
from azure.search.documents.indexes.aio import SearchIndexClient, SearchIndexerClient


from azure.search.documents.indexes.models import (
    HnswAlgorithmConfiguration,
    HnswParameters,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SimpleField,
    VectorSearch,
    VectorSearchProfile,
    VectorSearchVectorizer,
)




In [8]:
# Name of the search index. `index_name` is the lowercase alias used by the
# index-setup cell further down.
index_name = "gptkbindex"
AZURE_SEARCH_FULL_INDEX = index_name

# Search service name and the endpoint URL derived from it.
AZURE_SEARCH_SERVICE = "temple-search"
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"


In [21]:

class SearchInfo:
    """
    Connection settings for a search service: endpoint, credential and index name.
    Each ``create_*`` method builds a new client from these settings.
    To learn more, please visit https://learn.microsoft.com/azure/search/search-what-is-azure-search
    """

    def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], index_name: str):
        """Store the service endpoint, the credential and the target index name."""
        self.index_name = index_name
        self.credential = credential
        self.endpoint = endpoint

    def _service_kwargs(self) -> dict:
        """Return the keyword arguments every client shares: endpoint and credential."""
        return {"endpoint": self.endpoint, "credential": self.credential}

    def create_search_client(self) -> SearchClient:
        """Build a SearchClient for ``index_name`` on this service."""
        return SearchClient(index_name=self.index_name, **self._service_kwargs())

    def create_search_index_client(self) -> SearchIndexClient:
        """Build a SearchIndexClient for this service."""
        return SearchIndexClient(**self._service_kwargs())

    def create_search_indexer_client(self) -> SearchIndexerClient:
        """Build a SearchIndexerClient for this service."""
        return SearchIndexerClient(**self._service_kwargs())


In [2]:
# Field definitions for the search index, one entry per field.
fields = [
    # Document key.
    SimpleField(name="id", type="Edm.String", key=True),
    # Searchable text, analyzed with the "en.microsoft" analyzer.
    SearchableField(name="content", type="Edm.String", analyzer_name="en.microsoft"),
    # Vector field: 1536 single-precision values, tied to the "embedding_config" profile.
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        hidden=False,
        searchable=True,
        filterable=False,
        sortable=False,
        facetable=False,
        vector_search_dimensions=1536,
        vector_search_profile_name="embedding_config",
    ),
    # Metadata fields that can be used in filters and facets.
    SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
    SimpleField(name="sourcepage", type="Edm.String", filterable=True, facetable=True),
    SimpleField(name="sourcefile", type="Edm.String", filterable=True, facetable=True),
    # Storage URL: filterable, but not facetable.
    SimpleField(name="storageUrl", type="Edm.String", filterable=True, facetable=False),
]


In [6]:
# Semantic settings: one configuration named "default" that prioritizes the "content" field.
semantic_search_settings = SemanticSearch(
    configurations=[
        SemanticConfiguration(
            name="default",
            prioritized_fields=SemanticPrioritizedFields(
                title_field=None,
                content_fields=[SemanticField(field_name="content")],
            ),
        )
    ]
)

# Vector settings: the "embedding_config" profile (referenced by the "embedding" field)
# points at the "hnsw_config" HNSW algorithm with the cosine metric and no vectorizer.
vector_search_settings = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="hnsw_config",
            parameters=HnswParameters(metric="cosine"),
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="embedding_config",
            algorithm_configuration_name="hnsw_config",
            vectorizer=None,
        ),
    ],
)

# Full index definition, assembled from the field list and the two settings objects.
index = SearchIndex(
    name=AZURE_SEARCH_FULL_INDEX,
    fields=fields,
    semantic_search=semantic_search_settings,
    vector_search=vector_search_settings,
)

In [3]:
# First, create the search service in the Azure portal and turn on API access control.

!azd auth login

Logged in to Azure.ing subscriptions...


In [13]:
# The import cell binds `SearchIndexClient` twice: first to the synchronous client and then
# to the asyncio one, so after a fresh run the plain name refers to the asyncio client.
# The index-setup cell iterates `list_index_names()` without `await`, which needs the
# synchronous client, so import that one explicitly under its own name.
from azure.search.documents.indexes import SearchIndexClient as SyncSearchIndexClient

search_index_client = SyncSearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=DefaultAzureCredential())

In [16]:
# Create the index when it is missing; otherwise make sure it has a storageUrl field.
existing_index_names = list(search_index_client.list_index_names())

if index_name in existing_index_names:
    print("Search index already exists", index_name)
    index_definition = search_index_client.get_index(index_name)
    defined_field_names = {field.name for field in index_definition.fields}
    if "storageUrl" not in defined_field_names:
        print("Adding storageUrl field to index:",index_name)
        storage_url_field = SimpleField(
            name="storageUrl",
            type="Edm.String",
            filterable=True,
            facetable=False,
        )
        index_definition.fields.append(storage_url_field)
        search_index_client.create_or_update_index(index_definition)
else:
    print("Creating search index", index_name)
    search_index_client.create_index(index)


Search index %s already exists gptkbindex


## Data Ingestion

In [26]:
# Connection settings (endpoint, credential, index name) used by the ingestion cells.
search_info = SearchInfo(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=DefaultAzureCredential(),
    index_name=AZURE_SEARCH_FULL_INDEX,
)

In [98]:
from openai import AsyncOpenAI
from blobmanager import BlobManager
from listfilestrategy import File
# Read the key from the environment instead of embedding it in the notebook; the
# previous empty string could never authenticate. A missing OPENAI_API_KEY raises
# KeyError here rather than failing later on the first request.
openai_client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [33]:
async def create_embedding(lis, client, model="text-embedding-ada-002"):
    """
    Compute one embedding per input text.

    Args:
        lis: The texts to embed.
        client: Object exposing an awaitable ``embeddings.create(model=..., input=...)``
            whose response has a ``data`` sequence of items with an ``embedding``
            attribute (the notebook passes its ``AsyncOpenAI`` client).
        model: Name of the embedding model. Defaults to the value that used to be
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        The ``embedding`` of every item in the response's ``data``, in response order.
        An empty ``lis`` returns ``[]`` without calling the service.
    """
    # Nothing to embed: skip the request instead of sending an empty input.
    if not lis:
        return []
    emb_response = await client.embeddings.create(model=model, input=lis)
    return [data.embedding for data in emb_response.data]

In [47]:
# await create_embedding(["hello","world"],openai_client)
import os

from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
# Option 1: Use OpenAIEmbeddings with OpenAI account
# The key is read from the environment instead of being embedded in the notebook; the
# previous empty string could never authenticate. A missing OPENAI_API_KEY raises KeyError.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    openai_api_key=os.environ["OPENAI_API_KEY"],  model="text-embedding-ada-002"
)

# Option 2: Use AzureOpenAIEmbeddings with an Azure account
# embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
#     azure_deployment=azure_deployment,
#     openai_api_version=azure_openai_api_version,
#     azure_endpoint=azure_endpoint,
#     api_key=azure_openai_api_key,
# )

In [105]:
# Embed the sample sentence; em[0] becomes the "embedding" of the sample document below.
em = embeddings.embed_documents(
    ["The blue fox is really thirsty. He is 99.8 years old"]
)

In [100]:
# Scratch check: os.path.basename of a bare file name returns the name unchanged.
os.path.basename("role.pdf")

'role.pdf'

In [106]:
# Hand-built sample document; its keys are the field names defined for the index.
Documents = [
    {
        "id": "doc3",
        "content": "The blue fox is really thirsty. He is 99.8 years old",
        "category": "animals",
        "sourcepage": BlobManager.sourcepage_from_file_page(
            filename="Intro-to-Data-and Data-Science-Course-Notes-365-Data-Science.pdf",
            page=2,
        ),
        "sourcefile": "Intro-to-Data-and Data-Science-Course-Notes-365-Data-Science.pdf",
        "storageUrl": "https://st2szpvrpnkzjuk.blob.core.windows.net/content/role_library.pdf",
        "embedding": em[0],
    },
    # Second sample document, currently disabled:
    # {
    #     "id": "doc5",
    #     "content": "Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
    #     "category": "lorem",
    #     "sourcepage": "page2",
    #     "sourcefile": "role_library.pdf",
    #     "storageUrl": "https://st2szpvrpnkzjuk.blob.core.windows.net/content/role_library.pdf",
    #     "embedding":e[1]
    # }
]


In [73]:
# NOTE(review): `document1` is not assigned in any cell visible here; the repr below
# presumably comes from an earlier kernel state. Unless it is defined elsewhere, this
# cell raises NameError on a fresh Restart & Run All.
document1

Document(page_content='The quick brown fox jumps over the lazy dog.', metadata={'id': 'doc1', 'content': 'The quick brown fox jumps over the lazy dog.', 'category': 'animals', 'sourcepage': 'page1', 'sourcefile': 'animals.pdf', 'storageUrl': 'https://st2szpvrpnkzjuk.blob.core.windows.net/content/role_library.pdf'})

In [79]:
# NOTE(review): `vector_store`, `document1` and `document2` are not assigned in any cell
# visible here. The recorded run of this call failed with HttpResponseError because the
# request contained a 'metadata' property, and the index fields defined in this notebook
# do not include one.
vector_store.add_documents(documents=[document1,document2])

HttpResponseError: () The request is invalid. Details: The property 'metadata' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.
Code: 
Message: The request is invalid. Details: The property 'metadata' does not exist on type 'search.documentFields'. Make sure to only use property names that are defined by the type.

In [None]:
async def update_content(
    sections: List["Section"], url: Optional[str] = None
):
    """
    Embed the given sections and upload them to the search index in batches.

    Every section becomes one document with the keys ``id``, ``content``,
    ``category``, ``sourcepage``, ``sourcefile`` and ``embedding``, plus
    ``storageUrl`` when ``url`` is given. Documents are uploaded in batches of at
    most 1000 sections, one ``upload_documents`` call per batch.

    Relies on the notebook globals ``search_info`` (opens the search client),
    ``create_embedding`` and ``openai_client`` (compute the vectors) and
    ``BlobManager`` (formats the source page).

    Args:
        sections: Sections to index. Each one is read through ``section.content``
            (``filename()``, ``filename_to_id()``), ``section.split_page``
            (``text``, ``page_num``) and ``section.category``. The annotation is a
            string because no visible cell imports ``Section``; a bare name would
            raise NameError when this function is defined.
        url: Optional value stored as ``storageUrl`` on every document.
    """
    MAX_BATCH_SIZE = 1000

    # Nothing to index: avoid opening a search client for no work.
    if not sections:
        return

    section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)]

    async with search_info.create_search_client() as search_client:
        for batch_index, batch in enumerate(section_batches):
            documents = [
                {
                    # The offset keeps ids unique across batches.
                    "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
                    "content": section.split_page.text,
                    "category": section.category,
                    # The index declares "sourcepage" as Edm.String, so the integer
                    # placeholder 1 used here before did not match the field type.
                    # NOTE(review): assumes split_page exposes `page_num`, as the
                    # previously commented-out code did - confirm.
                    "sourcepage": BlobManager.sourcepage_from_file_page(
                        filename=section.content.filename(),
                        page=section.split_page.page_num,
                    ),
                    "sourcefile": section.content.filename(),
                }
                for section_index, section in enumerate(batch)
            ]
            if url:
                for document in documents:
                    document["storageUrl"] = url

            # Named `batch_embeddings` so it does not shadow the notebook-level `embeddings`.
            batch_embeddings = await create_embedding(
                [section.split_page.text for section in batch], openai_client
            )
            for i, document in enumerate(documents):
                document["embedding"] = batch_embeddings[i]

            await search_client.upload_documents(documents)

In [85]:
# Stand-alone search client used by the manual upload cell below.
# NOTE(review): unlike update_content, which opens its client with `async with`,
# this client is never closed in the visible cells.
c=search_info.create_search_client()

In [89]:
e=await create_embedding(["The quick brown fox jumps over the lazy dog.","Lorem ipsum dolor sit amet, consectetur adipiscing elit."],openai_client)

In [107]:
# Upload the hand-built sample documents through the search client `c`; the cell
# displays the list returned by the upload call.
await c.upload_documents(Documents)

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x133597410>]

### Don't forget to turn on semantic ranking in the Azure portal