In [14]:
import yaml
import os
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile
)
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential


In [15]:
# Import config file
with open("config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

In [16]:
# Azure OpenAI configuration
AZURE_OPENAI_ENDPOINT = config['azure_openai_endpoint']
AZURE_OPENAI_API_KEY = config['azure_openai_key']
AZURE_OPENAI_API_VERSION = config['azure_openai_api_version']

# Azure AI search
AISEARCH_ENDPOINT = config['aisearch_endpoint']
AISEARCH_CREDENTIAL = AzureKeyCredential(config['aisearch_credential'])
AISEARCH_INDEX_NAME = config['aisearch_index_name']

# Azure embedding
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = config['azure_openai_embedding_deployment']
AZURE_OPENAI_EMBEDDING_MODEL_NAME = config['embedding_model_name']

In [17]:
client = AzureOpenAI(
    api_key= AZURE_OPENAI_API_KEY,  
    api_version= AZURE_OPENAI_API_VERSION,
    azure_endpoint= AZURE_OPENAI_ENDPOINT,
)

In [5]:
#client.embeddings.create(input=["test, embedding"],model=AZURE_OPENAI_EMBEDDING_MODEL_NAME)

In [6]:
#client.chat.completions.create(messages=[{"role": "user", "content": "hello, test gpt"}],model=config["azure_openai_model_deployment"])

In [18]:
# Lazy Connect to AI Search
index_client = SearchIndexClient(endpoint=AISEARCH_ENDPOINT, credential=AISEARCH_CREDENTIAL)
search_client = SearchClient(endpoint=AISEARCH_ENDPOINT, index_name=AISEARCH_INDEX_NAME, credential=AISEARCH_CREDENTIAL)

In [19]:
# Función para obtener embeddings
def get_embedding(text, model=AZURE_OPENAI_EMBEDDING_MODEL_NAME):
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [20]:
# Configurar índice en Azure Search
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
]

# Configure the vector search configuration  
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")]
    )
)


# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=AISEARCH_INDEX_NAME, fields=fields,
                    vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

sbm_index created


In [21]:
# Cargar chunks
chunks = []
directory="chunks/"
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    if os.path.isfile(filepath):
        with open(filepath, "r", encoding="utf-8") as file:
            chunks.append({"id": filename.replace(".txt",""), "text": file.read()})
   

In [22]:
chunks

[{'id': 'page_1',
  'text': '--- Page1 ---\nPROPRIETARY DOCUMENT - OFFICIAL COPY - ES44331 - PECEELPF447001A1 - PDF Generated on 29-Apr-2022\n11:51AM\nCLIENT: SBM OFFSHORE NV\nPROJECT NAME: GROUP ENGINEERING STANDARD\nPROJECT NO: ES44331\nDocument No. : PECEELPF447001\nDocument Title: CABLE ROUTING DESIGN\nDocument Type : STANDARD\nDocument Status : A 1\nDocument Status : A – Approved\nDocument Revision : 1\nDate of Issue : 29-Apr-2022\nDocument Description :\nDiscipline Group : ENGINEERING\nDiscipline: ELECTRICAL\nSystem: CABLES AND CABLE TRAYS\nArea : FPSO\nDocument Purpose:\nThis document captures the internal requirements to deliver the Cable Routing Design and its\ndeliverables.\nDocument Use:\nConfidentiality Rating : 2 - Restricted\nCopyright © 2022 SBM Offshore N.V. and/or one or more of its subsidiaries and/or affiliates, as the case may be. This document is the property of SBM Offshore N.V. and/or one or\nmore of its subsidiaries or affiliates. This document or any part there

In [23]:
# Indexar documentos
documents = []
for chunk in chunks:
    embedding = get_embedding(chunk['text'])
    document = {
        "id": chunk['id'],
        "content": chunk['text'],
        "content_vector": embedding
    }
    documents.append(document)

# Subir los documentos en lotes
search_client.upload_documents(documents=documents)
print(f"Uploaded {len(documents)} documents") 

Uploaded 14 documents
