Install Dependencies


In [None]:
!pip install azure-identity==1.15.0
!pip install tiktoken
!pip install fitz
!pip install langchain==0.1.6
!pip install python-docx
!pip install azure-core==1.30.0
!pip install azure-search-documents==11.4.0
!pip install pymupdf
!pip install pymupdf4llm
!pip install python-pptx
!pip install openai

In [None]:
dbutils.library.restartPython()

Instantiate index, read in documents, convert them to text, chunk them, convert them to JSON

In [None]:
import os
import json
import fitz
import uuid
import docx
import pptx
import csv
import pandas as pd
import tiktoken
import pymupdf4llm
import pymupdf
import openai
from openai import AzureOpenAI
from langchain.schema import HumanMessage
from langchain.chat_models import AzureChatOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, SearchFieldDataType, VectorSearch,
    VectorSearchProfile, HnswAlgorithmConfiguration, HnswParameters,
    VectorSearchAlgorithmMetric, SemanticConfiguration, SemanticPrioritizedFields,
    SemanticField, SemanticSearch
)
from concurrent.futures import ThreadPoolExecutor, as_completed
import uuid
import os

# Define credentials, endpoints, and index name.
# The quoted values are placeholders; the two API keys are read at run time
# through dbutils.secrets.get using the scope and key names given here.
scope_name = "your_scope"
azure_search_key_id = "your_search_key"
azure_openai_key_id = "your_openai_key"
azure_search_key_value = dbutils.secrets.get(scope=scope_name, key=azure_search_key_id)
azure_openai_key_value = dbutils.secrets.get(scope=scope_name, key=azure_openai_key_id)
index_name= "your_index_name"

# Azure OpenAI settings (endpoint placeholder, embedding deployment, API version)
# and the search service endpoint placeholder.
azure_openai_endpoint = "openai_endpoint"
azure_openai_embedding_deployment = "text-embedding-ada-002"
azure_openai_api_version = "2024-02-15-preview"
endpoint = "endpoint"
key_credential_azure_search = azure_search_key_value

# Initialize clients: index management, document search/upload, and embeddings.
index_client = SearchIndexClient(endpoint=endpoint, credential=AzureKeyCredential(key_credential_azure_search))
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=AzureKeyCredential(key_credential_azure_search))
embedding_client = AzureOpenAI(
    azure_deployment=azure_openai_embedding_deployment,
    api_version=azure_openai_api_version,
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key_value
)
# NOTE(review): GPT_DEPLOYMENT_NAME is not referenced by any code visible in this
# notebook; the chat model below is created with azure_deployment='gpt-4o'.
# Confirm which deployment is intended.
GPT_DEPLOYMENT_NAME='gpt-4-32k'
# No api key is passed to AzureChatOpenAI explicitly; presumably it is picked up
# from these environment variables -- verify against the langchain docs.
os.environ["AZURE_OPENAI_API_KEY"] = azure_openai_key_value
os.environ["AZURE_OPENAI_ENDPOINT"] = azure_openai_endpoint
model = AzureChatOpenAI(
    azure_endpoint=azure_openai_endpoint,
    openai_api_version=azure_openai_api_version,
    azure_deployment='gpt-4o'
)

# Define the index schema.
# One index entry is stored per text chunk: "chunk_id" is the key field, "id"
# carries the identifier of the source document the chunk was cut from, and the
# remaining string fields hold the chunk text, its summary and file metadata.
# The two vector fields are 1536-dimensional and both use the "myHnswProfile"
# vector search profile.
fields = [
    SearchField(name="filepath", type=SearchFieldDataType.String, filterable=True, sortable=True),
    SearchField(name="summary", type=SearchFieldDataType.String),
    SearchField(name="chunk_id", type=SearchFieldDataType.String,key=True),
    SearchField(name="content", type=SearchFieldDataType.String, searchable=True),
    SearchField(name="id", type=SearchFieldDataType.String),
    SearchField(name="file_name", type=SearchFieldDataType.String),
    SearchField(name="sap_number", type=SearchFieldDataType.String),
    SearchField(name="year", type=SearchFieldDataType.String),
    SearchField(name="status", type=SearchFieldDataType.String),
    SearchField(name="description", type=SearchFieldDataType.String),
    SearchField(name="Publication_Type", type=SearchFieldDataType.String),
    SearchField(name="Model_Tag", type=SearchFieldDataType.String),
    SearchField(name="meta_content", type=SearchFieldDataType.String),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
    SearchField(name="meta_content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
]

# Configure vector search.
# A single HNSW algorithm configuration ("myHnsw", cosine metric) is exposed
# through the "myHnswProfile" profile that the vector fields refer to by name.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=100,
                ef_search=100,
                metric=VectorSearchAlgorithmMetric.COSINE
            )
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw"
        )
    ]
)

# Configure semantic search.
# "my-semantic-config" is the name the query cell passes as
# semantic_configuration_name. In azure-search-documents 11.4.0 the keyword
# fields are given through the list-valued `keywords_fields` parameter; the
# previous `keyword_field=` keyword is not a parameter of
# SemanticPrioritizedFields, so the keyword field never reached the index.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[
            SemanticField(field_name="content"),
            SemanticField(field_name="filepath"),
            SemanticField(field_name="summary")
        ],
        keywords_fields=[SemanticField(field_name="content")]
    )
)

# Assemble the index definition from the schema, vector search and semantic
# search settings built above.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=SemanticSearch(configurations=[semantic_config])
)

# Create or update the search index (runs against the live service as soon as
# this cell is executed).
index_client.create_or_update_index(index)
print(f'{index_name} index created or updated')

# Define prompt template for summarization.
# The template is filled with str.format(context=...) by generate_summary, so
# "{context}" is the only placeholder; the text itself is sent to the model
# verbatim.
prompt_template = """Your task is to summarize the given content in less than 50 words. You need to capture important information such as what
product is being spoken about and which year is the document published.
You dont have to go through every line. Just go through all topics and headings and come up with a summary
Context is enclosed in <context>{context}</context>
"""

def load_metadata(csv_filepath):
    """Read a metadata CSV and index its rows by file name.

    Args:
        csv_filepath: Path of a CSV file that has a ``file_name`` column.

    Returns:
        A dict mapping each ``file_name`` value to a dict of the remaining
        columns of that row. When a file name occurs more than once, the last
        row wins.

    Raises:
        KeyError: If the CSV has no ``file_name`` column (and at least one row).
    """
    by_file = {}
    for row in pd.read_csv(csv_filepath).to_dict(orient='records'):
        # Pull the key column out of the row; what is left is the metadata.
        name = row.pop('file_name')
        by_file[name] = row
    return by_file

def generate_summary(context):
    """Ask the chat model for a short summary of *context*.

    The module-level ``prompt_template`` is filled with *context* and sent to
    ``model`` as a single human message.

    NOTE: a later cell of this notebook defines another ``generate_summary``
    taking ``(query, context)``; running that cell rebinds this name.

    Args:
        context: Text to summarise.

    Returns:
        The text of the model's reply as a string.
    """
    prompt = prompt_template.format(context=context)
    response = model([HumanMessage(content=prompt)])
    # str(response) yields the message repr ("content='...' ..."), which is what
    # used to end up in the index; return the reply text itself instead.
    return str(getattr(response, "content", response))

def convert_to_text(filepath):
    """Extract the text of a PDF, DOCX, PPTX or CSV file.

    The file type is taken from the text after the last ``.`` in *filepath*
    (case-insensitive). PDFs are converted with ``pymupdf4llm.to_markdown``;
    if that raises, the plain text of every page is read with ``pymupdf``.

    Args:
        filepath: Path of the file to convert.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the extension is not one of pdf, docx, pptx, csv.
    """
    ext = filepath.split('.')[-1].lower()
    if ext == 'pdf':
        try:
            text = pymupdf4llm.to_markdown(filepath)
        except Exception as e:
            print(f"Error using pymupdf4llm: {e}. Falling back to pymupdf.")
            # Close the document once its pages have been read.
            with pymupdf.open(filepath) as doc:
                text = "".join(page.get_text() for page in doc)
    elif ext == 'docx':
        doc = docx.Document(filepath)
        text = "\n".join(para.text for para in doc.paragraphs)
    elif ext == 'pptx':
        prs = pptx.Presentation(filepath)
        # Only shapes exposing a ``text`` attribute contribute.
        text = "\n".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    elif ext == 'csv':
        with open(filepath, newline='', encoding='utf-8') as csvfile:
            text = "\n".join(','.join(row) for row in csv.reader(csvfile))
    else:
        raise ValueError("Unsupported file type")
    return text

def chunk_text(content, token_limit=4000):
    """Split *content* into consecutive pieces of at most *token_limit* tokens.

    Tokenisation uses the tiktoken encoding for "gpt-4-32k". The text is
    encoded eagerly; the chunks themselves are decoded lazily by the returned
    generator. Progress is reported with debug prints.

    Args:
        content: Text to split.
        token_limit: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A generator yielding the chunk strings in order.

    Raises:
        ValueError: If *token_limit* is smaller than 1 (the chunk loop would
            otherwise never advance).
    """
    if token_limit < 1:
        raise ValueError("token_limit must be a positive integer")

    enc = tiktoken.encoding_for_model("gpt-4-32k")
    tokens = enc.encode(content)
    total = len(tokens)
    print(f"Total tokens: {total}")  # Debug print

    def generate_chunks():
        for start in range(0, total, token_limit):
            end = min(start + token_limit, total)
            chunk = enc.decode(tokens[start:end])
            print(f"Start: {start}, End: {end}, Chunk Length: {len(chunk)}")  # Debug print
            yield chunk
        print("Reached the end of tokens.")

    return generate_chunks()

def _metadata_text(value):
    """Return a metadata cell as a string; missing values (None/NaN) become ''."""
    if value is None:
        return ''
    try:
        if pd.isna(value):
            return ''
    except (TypeError, ValueError):
        # Not a scalar pandas can test for missingness; fall through to str().
        pass
    return str(value)


def _metadata_for_file(metadata_source, filename):
    """Return the metadata row for *filename* ({} when there is none)."""
    if not metadata_source:
        return {}
    if isinstance(metadata_source, dict):
        table = metadata_source
    else:
        table = load_metadata(metadata_source)
    return table.get(filename) or {}


def _summarize_chunks(executor, chunks):
    """Summarise *chunks* concurrently; return (chunk, summary) pairs in input order."""
    pending = [(chunk, executor.submit(generate_summary, chunk)) for chunk in chunks]
    summarized = []
    for chunk, future in pending:
        try:
            summarized.append((chunk, future.result()))
        except Exception as e:
            # Best effort: a chunk whose summary fails is skipped.
            print(f"Error generating summary for chunk: {e}")
    return summarized


def _build_chunk_document(filepath, filename, document_id, chunk_number, chunk, summary, meta):
    """Build the index entry for one summarised chunk."""
    sap_number = _metadata_text(meta.get('sap_number'))
    year = _metadata_text(meta.get('print_date'))
    status = _metadata_text(meta.get('status'))
    description = _metadata_text(meta.get('description'))
    publication_type = _metadata_text(meta.get('Publication_Type'))
    model_tag = _metadata_text(meta.get('Model_Tag'))
    meta_content = (
        f"File Name: {filename}, "
        f"Sap Number: {sap_number}, "
        f"Year: {year}, "
        f"Status: {status}, "
        f"Description: {description}, "
        f"Publication Type: {publication_type}, "
        f"Model Tag: {model_tag}"
    )
    return {
        "filepath": filepath,
        "summary": summary,
        "chunk_id": f"{document_id}_{chunk_number}",
        "content": chunk,
        "id": document_id,
        "meta_content": meta_content,
        "file_name": filename,
        "sap_number": sap_number,
        "year": year,
        "status": status,
        "description": description,
        "Publication_Type": publication_type,
        "Model_Tag": model_tag,
        # Vectors are left empty here and filled in by the embedding step.
        "contentVector": [],
        "meta_content_vector": []
    }


def process_document(filepath, metadata_dict, document_id, batch_size):
    """Convert one file into a list of index entries, one per text chunk.

    The file is converted to text, split with ``chunk_text`` and each chunk is
    summarised with ``generate_summary`` on a thread pool, ``batch_size``
    chunks at a time. Every batch is collected before the next one is
    submitted, and each chunk is emitted exactly once. Chunk ids are
    ``"<document_id>_<n>"`` with ``n`` counting the successfully summarised
    chunks in document order.

    Args:
        filepath: Path of the file to process.
        metadata_dict: Path of the metadata CSV read by ``load_metadata``, or
            an already loaded ``{file_name: row}`` dict. A falsy value means
            "no metadata". The row matching the file's base name supplies the
            metadata fields; missing values become empty strings.
        document_id: Identifier stored in the ``id`` field of every chunk.
        batch_size: Number of chunks summarised concurrently per round.

    Returns:
        A list of dicts shaped like the index schema, with empty vector fields.
    """
    text = convert_to_text(filepath)
    filename = os.path.basename(filepath)
    meta = _metadata_for_file(metadata_dict, filename)
    documents = []

    def collect(executor, chunks):
        for chunk, summary in _summarize_chunks(executor, chunks):
            documents.append(
                _build_chunk_document(
                    filepath, filename, document_id,
                    len(documents) + 1, chunk, summary, meta,
                )
            )

    with ThreadPoolExecutor(max_workers=20) as executor:
        chunk_batch = []
        for chunk in chunk_text(text):
            chunk_batch.append(chunk)
            if len(chunk_batch) >= batch_size:
                collect(executor, chunk_batch)
                chunk_batch = []

        # Chunks left over after the last full batch.
        if chunk_batch:
            collect(executor, chunk_batch)

    return documents

# Upload the given documents to the search index in batches
def index_documents(documents, batch_size):
    """Upload *documents* through ``search_client`` in slices of *batch_size*.

    A batch that raises is reported and skipped so the remaining batches are
    still attempted. Documents the service rejects inside an otherwise
    accepted batch are reported as well, and the closing message states
    whether every document went through.

    Args:
        documents: List of index entries (dicts) to upload.
        batch_size: Maximum number of documents per upload request.
    """
    total_docs = len(documents)
    failed = 0
    for start in range(0, total_docs, batch_size):
        doc_batch = documents[start:start + batch_size]
        end = start + len(doc_batch)
        try:
            results = search_client.upload_documents(documents=doc_batch)
        except Exception as e:
            failed += len(doc_batch)
            print(f"Error uploading batch {start + 1} to {end}: {e}")
            continue

        # An accepted request can still contain per-document failures.
        rejected = [item for item in results if not item.succeeded]
        failed += len(rejected)
        for item in rejected:
            print(f"Document {item.key} was rejected: {item.error_message}")
        print(f"Uploaded documents {start + 1} to {end} (total: {total_docs})")

    if failed:
        print(f"{failed} of {total_docs} documents failed to upload")
    else:
        print("All documents uploaded successfully")

def process_and_index_folder(folder_path, metadata_dict, batch_size):
    """Process every regular file in *folder_path* and upload the results.

    Each directory entry's path is printed. Regular files are handed to
    ``process_document`` under a fresh UUID; whenever at least *batch_size*
    entries have accumulated they are uploaded with ``index_documents`` in
    slices of *batch_size*. Whatever remains after the last file is uploaded
    at the end. Errors from processing or uploading are printed and the loop
    carries on.

    Args:
        folder_path: Directory whose direct entries are processed.
        metadata_dict: Forwarded unchanged to ``process_document``.
        batch_size: Batch size used for both summarisation and upload.
    """
    pending = []

    for entry in os.listdir(folder_path):
        filepath = os.path.join(folder_path, entry)
        print(filepath)

        if os.path.isfile(filepath):
            try:
                new_id = str(uuid.uuid4())
                pending.extend(
                    process_document(filepath, metadata_dict, new_id, batch_size=batch_size)
                )
            except Exception as e:
                print(f"Error processing file {entry}: {e}")

            # Drain full batches as soon as enough entries are queued.
            while len(pending) >= batch_size:
                head, pending = pending[:batch_size], pending[batch_size:]
                try:
                    index_documents(head, batch_size)
                except Exception as e:
                    print(f"Error uploading batch: {e}")

    # Flush the tail that never filled a whole batch.
    if pending:
        try:
            index_documents(pending, batch_size)
        except Exception as e:
            print(f"Error uploading final batch: {e}")

# Usage example.
# Both paths are placeholders. metadata_csv is forwarded to
# process_and_index_folder as its metadata_dict argument; batch_size is used
# for both summarisation and upload batches.
folder_path = "your_data"
metadata_csv = "your_meta_data" #optional
process_and_index_folder(folder_path, metadata_csv, batch_size=10)

Set permissions and dependencies for Azure AI Search, the embedding client, and GPT

Create embeddings for content, store in index

In [None]:
import numpy as np
from azure.core.exceptions import HttpResponseError

def _embed_texts(texts):
    """Return one embedding per entry of *texts*, in order."""
    # NOTE(review): the OpenAI API documents `dimensions` for text-embedding-3
    # and later models only -- confirm the configured deployment accepts it.
    response = embedding_client.embeddings.create(
        input=texts, model=azure_openai_embedding_deployment, dimensions=1536
    )
    return [item.embedding for item in response.data]


def generate_embeddings(input_data):
    """Fill in the vector fields of every document in *input_data*.

    The ``content`` and ``meta_content`` values of all documents are embedded
    in two requests, and the results are written to ``contentVector`` and
    ``meta_content_vector`` of the same dicts (the input is modified in place).

    Args:
        input_data: List of dicts that each have ``content`` and
            ``meta_content`` keys.

    Returns:
        *input_data* itself, with both vector fields set. An empty list is
        returned unchanged without calling the embedding service.
    """
    if not input_data:
        return input_data

    content_embeddings = _embed_texts([item['content'] for item in input_data])
    meta_content_embeddings = _embed_texts([item['meta_content'] for item in input_data])

    # Assign embeddings back to the documents they were computed from.
    for item, content_vector, meta_vector in zip(
        input_data, content_embeddings, meta_content_embeddings
    ):
        item['contentVector'] = content_vector
        item['meta_content_vector'] = meta_vector

    return input_data

def filter_documents_with_empty_vectors(documents):
    """Return the documents that lack at least one of the two vectors.

    A vector counts as missing when the key is absent, or its value is None or
    has length zero. The number of matches is printed.

    Args:
        documents: Iterable of dict-like documents.

    Returns:
        A list with the matching documents, in their original order.
    """
    def is_blank(vector):
        return vector is None or len(vector) == 0

    incomplete = [
        record
        for record in documents
        if is_blank(record.get('contentVector', []))
        or is_blank(record.get('meta_content_vector', []))
    ]
    print(f"Total documents with empty vectors: {len(incomplete)}")
    return incomplete

def get_documents():
    """Fetch the indexed documents that still lack a vector, grouped by ``id``.

    All documents matching ``*`` are read through ``search_client``. A document
    is kept when ``contentVector`` or ``meta_content_vector`` is absent, None
    or empty.

    Returns:
        A dict mapping each ``id`` value to the list of its documents that
        need embeddings; ``{}`` when the search request fails with
        ``HttpResponseError``.
    """
    def is_blank(vector):
        return vector is None or len(vector) == 0

    try:
        # Retrieve all documents from the index
        documents = list(search_client.search(search_text="*", filter=None))
        print(f"Total documents retrieved: {len(documents)}")

        grouped_documents = {}
        missing_count = 0
        for doc in documents:
            if is_blank(doc.get("contentVector", [])) or is_blank(doc.get("meta_content_vector", [])):
                grouped_documents.setdefault(doc.get("id"), []).append(doc)
                missing_count += 1

        # Report documents, not groups: one source id usually covers many chunks.
        print(
            f"Total documents with empty vectors: {missing_count} "
            f"(across {len(grouped_documents)} source ids)"
        )
        return grouped_documents

    except HttpResponseError as e:
        print(f"Error fetching documents: {e}")
        return {}

def update_content_vectors(grouped_documents, batch_size=10):
    """Embed and re-upload the documents produced by ``get_documents``.

    For every group the documents are processed in slices of *batch_size*:
    embeddings are generated with ``generate_embeddings`` and the completed
    documents are uploaded through ``search_client``. A batch whose upload
    fails with ``HttpResponseError`` is reported together with a short
    per-document diagnostic, and processing continues with the next batch.

    Args:
        grouped_documents: Dict mapping a source id to a list of documents.
        batch_size: Number of documents embedded and uploaded per request.
    """
    for doc_id, docs in grouped_documents.items():
        for i in range(0, len(docs), batch_size):
            batch_number = i // batch_size + 1
            # Search results carry "@search.*" annotations (score, captions, ...)
            # next to the index fields; they are not part of the document and
            # must not be sent back on upload.
            batch = [
                {key: value for key, value in doc.items() if not key.startswith("@search.")}
                for doc in docs[i:i + batch_size]
            ]
            try:
                updated_documents = generate_embeddings(batch)
                result = search_client.upload_documents(documents=updated_documents)
                print(f"Indexed batch {batch_number} for doc_id {doc_id}")
                print(f"Upload result: {result}")
            except HttpResponseError as e:
                print(f"Failed to upload batch {batch_number} for doc_id {doc_id}: {e}")
                for doc in batch:
                    print("Failed Document:")
                    print("Type:", type(doc))
                    print("Keys:", list(doc.keys()))
                    print("contentVector length:", len(doc.get("contentVector") or []))
                    print("meta_content_vector length:", len(doc.get("meta_content_vector") or []))
                    for key, value in doc.items():
                        print(f"Field: {key}, Type: {type(value)}, Length: {len(str(value))}")

# Process documents: collect the indexed documents that still lack a vector,
# then generate embeddings for them and upload the completed documents.
grouped_documents = get_documents()
update_content_vectors(grouped_documents)

Run Test Queries

In [None]:
from langchain_core.utils import get_from_env
import os
# Field-name overrides exposed as environment variables.
# NOTE(review): presumably these are read by langchain's AzureSearch vector
# store, which is imported below but not used in the visible code -- confirm
# they are still needed.
os.environ["AZURESEARCH_FIELDS_CONTENT_VECTOR"] = "contentVector"
os.environ["AZURESEARCH_FIELDS_TAG"] = "meta_content_vector"
# Reads AZURESEARCH_FIELDS_ID with "meta_content_vector" as the default.
# NOTE(review): FIELDS_ID is not referenced by the code visible in this
# notebook, and the default names a vector field rather than an id field.
FIELDS_ID = get_from_env(
    key="AZURESEARCH_FIELDS_ID", env_key="AZURESEARCH_FIELDS_ID", default="meta_content_vector"
)
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import QueryType, QueryCaptionType, QueryAnswerType
from azure.search.documents.models import VectorFilterMode
from azure.search.documents import SearchClient
from langchain_community.vectorstores import AzureSearch

query = "explain 4shp17le"


def inspect_index_documents():
    """Answer the module-level ``query`` from a plain keyword search.

    The top 5 hits for ``query`` are fetched through ``search_client``, their
    ``content`` values are concatenated without a separator, and the result is
    passed with the query to ``generate_summary``; the reply is printed.
    """
    # Simple (keyword-only) counterpart of the advanced query below.
    hits = search_client.search(search_text=query, filter=None, top=5)
    context = "".join(hit.get("content") for hit in hits)
    print(generate_summary(query, context))

# Fields returned by the advanced query, and the names of the vector fields.
# (Module-level `global` statements have no effect, so plain assignments
# suffice.)
select_fields = ["filepath", "content", "chunk_id"]
vectorizable_fields = "contentVector", "meta_content_vector"

def generate_summary(query, context):
    """Ask the chat model to answer *query* using *context*.

    NOTE: this definition rebinds the one-argument ``generate_summary(context)``
    from the ingestion cell, which ``process_document`` calls with a single
    argument; re-run that cell before ingesting again.

    Args:
        query: The user question.
        context: Retrieved text the answer should be based on.

    Returns:
        The text of the model's reply as a string.
    """
    # The query is wrapped in a properly closed <query>...</query> pair.
    answer_template = """Query: <query>{query}</query> Please use the following context to answer the query provided: <context>{context}</context>
"""
    prompt = answer_template.format(query=query, context=context)
    response = model([HumanMessage(content=prompt)])
    # str(response) would give the message repr ("content='...' ..."); return
    # the reply text itself.
    return str(getattr(response, "content", response))

# Embed the module-level query once (this calls the embedding service when the
# cell runs) and build the vector query reused by the advanced search below.
embedding = embedding_client.embeddings.create(
        input=query, model=azure_openai_embedding_deployment
    ).data[0].embedding

# The query targets both vector fields and requests the 2 nearest neighbours
# with exhaustive search enabled.
vector_query = VectorizedQuery(
        vector=embedding,
        k_nearest_neighbors=2,
        fields="contentVector, meta_content_vector",
        exhaustive=True,
    )
# Example usage
def inspect_index_documentsAdvanced():
    """Answer the module-level ``query`` from a hybrid semantic search.

    The search combines the query text with ``vector_query``, uses the
    "my-semantic-config" semantic configuration with extractive captions and
    answers, and returns the top 3 hits restricted to ``select_fields``. The
    hits' ``content`` values are concatenated without a separator and passed
    with the query to ``generate_summary``; the reply is printed.
    """
    print("Now trying advanced vector based query")
    search_options = {
        "search_text": query,
        "vector_queries": [vector_query],
        "select": select_fields,
        "query_type": QueryType.SEMANTIC,
        "semantic_configuration_name": 'my-semantic-config',
        "query_caption": QueryCaptionType.EXTRACTIVE,
        "query_answer": QueryAnswerType.EXTRACTIVE,
        "top": 3,
        "vector_filter_mode": VectorFilterMode.PRE_FILTER,
    }
    hits = search_client.search(**search_options)

    context = "".join(hit.get("content") for hit in hits)
    print(generate_summary(query, context))
# Run the keyword-only query first, then the hybrid vector + semantic query.
inspect_index_documents()
inspect_index_documentsAdvanced()

content='The "4SHP17LE" refers to a specific model of a split system heat pump. Here are the key details and specifications for the 4SHP17LE model:\n\n### Key Features:\n\n1. **Compressor**:\n   - High-efficiency scroll compressor\n   - Uses R410A refrigerant\n   - Grommet mounted compressor for quieter operation\n   - Heavy-duty compressor sound blanket for quieter operation\n   - Internally protected against high temperature motor overload conditions\n\n2. **Cabinet**:\n   - Full metal louvered panel with 2 screws for ease of coil cleaning and service\n   - Corner-mounted controls for easy service\n   - Rounded corners for safety and an attractive appearance\n   - Baked polyester paint finished over galvanized steel for maximum durability\n   - Removable PVC coated wire fan discharge grill\n   - 45-degree offset gauge ports for easy service\n   - Removable service panel for internal access\n\n3. **Coils**:\n   - Omniguard® total corrosion protection technology\n   - Enhanced tube-and

Inspect index dimensions and document count

In [None]:
# query_empty_content_vector = "*"
# filter_empty_content_vector = "contentVector/any(c: c eq null)"

def get_all_documents(search_client):
    """Return every result of a ``*`` search on *search_client* as a list.

    Args:
        search_client: Object whose ``search(search_text=...)`` returns an
            iterable of documents.

    Returns:
        A list holding all items produced by the search.
    """
    # Exhaust the result iterable into a list.
    return [*search_client.search(search_text="*")]

def count_empty_vectors(documents):
    """Count documents whose vector fields are missing or empty.

    Args:
        documents: Iterable of dict-like documents.

    Returns:
        A ``(content, meta)`` tuple: the number of documents whose
        ``contentVector`` is absent, None or empty, and the same count for
        ``meta_content_vector``.
    """
    def is_blank(vector):
        return vector is None or len(vector) == 0

    missing_content = 0
    missing_meta = 0
    for record in documents:
        # bool adds as 0/1, so each check bumps its counter directly.
        missing_content += is_blank(record.get('contentVector', None))
        missing_meta += is_blank(record.get('meta_content_vector', None))

    return missing_content, missing_meta

# Fetch all documents (issues a "*" search against the live index when the
# cell runs).
documents = get_all_documents(search_client)

# Count empty vectors separately for each of the two vector fields.
empty_content_vector_count, empty_meta_content_vector_count = count_empty_vectors(documents)

print(f"Number of documents with empty 'contentVector': {empty_content_vector_count}")
print(f"Number of documents with empty 'meta_content_vector': {empty_meta_content_vector_count}")

Number of documents with empty 'contentVector': 0
Number of documents with empty 'meta_content_vector': 0


In [None]:
!pip install openpyxl
dbutils.library.restartPython()

Automate Log Testing

Clean up newline (NL) characters from the answers

In [None]:
import pandas as pd
import re

def clean_text(text):
    """Normalise one answer cell to a single line of plain text.

    The value is converted to a string, a leading ``content=`` prefix is
    dropped, the Markdown markers ``**``, ``*`` and backtick are removed, and
    every run of whitespace (including CR/LF) collapses to one space.

    Args:
        text: Cell value of any type.

    Returns:
        The cleaned string; ``""`` for missing cells (None or NaN), which would
        otherwise be written back as the literal text "None"/"nan".
    """
    # Missing cell: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)

    # Remove 'content=' prefix if it exists
    if text.startswith("content="):
        text = text[len("content="):]

    # Remove Markdown bold, italic, and code characters
    text = re.sub(r'(\*\*|\*|`)', '', text)

    # One pass covers \r, \n, tabs and repeated spaces alike.
    return re.sub(r'\s+', ' ', text).strip()

def clean_answers(file_path):
    """Clean the 'New Answer' column of an Excel file in place.

    The workbook is read with pandas, ``clean_text`` is applied to every value
    of the 'New Answer' column, and the resulting frame is written back to the
    same path without the index.

    Args:
        file_path: Path of the Excel file to read and overwrite.

    Raises:
        ValueError: If the file has no 'New Answer' column.
    """
    answer_column = 'New Answer'
    frame = pd.read_excel(file_path)

    # Nothing to clean without the answer column.
    if answer_column not in frame.columns:
        raise ValueError("The Excel file must contain a 'New Answer' column.")

    frame[answer_column] = frame[answer_column].apply(clean_text)

    # Overwrite the input file with the cleaned data.
    frame.to_excel(file_path, index=False)
    print(f"Excel file updated with cleaned answers: {file_path}")

# Main execution
# Placeholder path; the string literal was previously left unterminated, which
# made the whole cell a SyntaxError.
file_path = "your_excel_file"  # Path to the input Excel file
clean_answers(file_path)


In [None]:
import pandas as pd

def clean_text(text):
    """Flatten one answer cell to a single line without slashes or '#'.

    Literal backslash-n sequences and real newlines become spaces, remaining
    backslashes, '#' and '/' characters are replaced by spaces, and runs of
    whitespace collapse to one space.

    Args:
        text: Cell value of any type.

    Returns:
        The cleaned string; ``""`` for missing cells (None or NaN), which would
        otherwise be written back as the literal text "None"/"nan".
    """
    # Missing cell: NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)

    # Turn literal "\n" sequences into newlines FIRST; once the backslashes
    # are stripped the sequence can no longer be recognised and a stray "n"
    # would be left behind.
    text = text.replace("\\n", "\n")

    # Replace backslashes, '#', newlines and forward slashes with spaces.
    for unwanted in ("\\", "#", "\n", "/"):
        text = text.replace(unwanted, " ")

    # Normalize multiple spaces
    return " ".join(text.split())

def clean_answers(file_path):
    """Rewrite an Excel file with its 'New Answer' column cleaned.

    Reads the workbook with pandas, maps ``clean_text`` over the 'New Answer'
    column and saves the frame back to *file_path* without the index.

    Args:
        file_path: Path of the Excel file to read and overwrite.

    Raises:
        ValueError: If the file has no 'New Answer' column.
    """
    table = pd.read_excel(file_path)

    has_answers = 'New Answer' in table.columns
    if not has_answers:
        raise ValueError("The Excel file must contain a 'New Answer' column.")

    cleaned = table['New Answer'].map(clean_text)
    table['New Answer'] = cleaned

    # The input file is replaced by the cleaned version.
    table.to_excel(file_path, index=False)
    print(f"Excel file updated with cleaned answers: {file_path}")

# Main execution
# Placeholder path; the string literal was previously left unterminated, which
# made the whole cell a SyntaxError.
file_path = "new_excel_file"  # Path to the input Excel file
clean_answers(file_path)
