In [32]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
import os
from dotenv import load_dotenv
import uuid  # Import the uuid module

# Load environment variables
load_dotenv()
service_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
key = os.environ["AZURE_SEARCH_API_KEY"]

# Create a SearchClient
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

In [None]:
import uuid

def upload_bulk_documents(documents):
    """
    Uploads a batch of documents to the Azure Cognitive Search index.
    Each document in the batch will have a base UUID, and each paragraph will have a unique ID derived from it.
    
    :param documents: A list of dictionaries, where each dictionary represents a document paragraph.
                      Each dictionary must contain the keys: "base_id", "document_name", "document_type",
                      "document_link", "issuer", "resource_name", and "content".
    """
    # Prepare the documents for upload
    upload_documents = []
    paragraph_counts = {}  # Track paragraph numbering for each base_id

    for doc in documents:
        base_id = doc["base_id"]

        # Increment paragraph count for this document
        if base_id not in paragraph_counts:
            paragraph_counts[base_id] = 1
        else:
            paragraph_counts[base_id] += 1

        # Create a unique document ID by appending a paragraph number
        unique_doc_id = f"{base_id}_{paragraph_counts[base_id]}"

        # Create the document dictionary
        document = {
            "id": unique_doc_id,  # Unique ID per paragraph
            "document_name": doc["document_name"],
            "document_type": doc["document_type"],
            "document_link": doc["document_link"],
            "issuer": doc["issuer"],
            "resource_name": doc["resource_name"],
            "content": doc["content"]
        }

        # Add the document to the upload list
        upload_documents.append(document)

    # Print the prepared documents before uploading
    for document in upload_documents:
        print(document)

    # Upload the batch of documents
    search_client.upload_documents(documents=upload_documents)
    print(f"Prepared {len(upload_documents)} documents for upload.")

# Example usage



In [39]:
import json
import os
import uuid

def load_documents_from_json(directory):
    documents_to_upload = []
    document_uuid_map = {}  # Store base UUIDs for each document

    # Get all JSON files from the directory
    json_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".json")]

    for file in json_files:
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        documents = data.get("documents", [])

        for doc in documents:
            document_name = doc.get("document_name", "Unknown Document")

            # Generate or retrieve a unique base UUID for this document
            if document_name not in document_uuid_map:
                document_uuid_map[document_name] = str(uuid.uuid4())

            base_id = document_uuid_map[document_name]
            document_type = doc.get("document_type", "Unknown Type")
            document_link = doc.get("document_link", "")
            issuer = doc.get("issuer", "Unknown Issuer")
            resource_name = doc.get("resource_name", "Unknown Resource")
            content_dict = doc.get("content", {})

            if isinstance(content_dict, dict):
                for key, paragraph in content_dict.items():
                    if paragraph.strip():  # Ignore empty paragraphs
                        documents_to_upload.append({
                            "base_id": base_id,  # Store base ID instead of doc_id
                            "document_name": document_name,
                            "document_type": document_type,
                            "document_link": document_link,
                            "issuer": issuer,
                            "resource_name": resource_name,
                            "content": paragraph.strip()
                        })

    return documents_to_upload

# Automatically load all JSON files in the 'revlon/' folder
json_directory = "revlon"  # Folder containing JSON files
parsed_documents = load_documents_from_json(json_directory)

# Output result
if len(parsed_documents) == 0:
    print("No documents found.")
else:
    for doc in parsed_documents:
        print(json.dumps(doc, indent=2))  # Pretty print the results


{
  "base_id": "db50f76a-f79b-4942-8022-d31805bcb078",
  "document_name": "expert analysis 1",
  "document_type": "Expert Analysis",
  "document_link": "https://drive.google.com/file/d/1Mcq0KGzJYXIUXtci7OJiRyxrDz_llr0T/view?usp=drive_link",
  "issuer": "Revlon",
  "resource_name": "LOEB&LOEB LLP",
}
{
  "base_id": "db50f76a-f79b-4942-8022-d31805bcb078",
  "document_name": "expert analysis 1",
  "document_type": "Expert Analysis",
  "document_link": "https://drive.google.com/file/d/1Mcq0KGzJYXIUXtci7OJiRyxrDz_llr0T/view?usp=drive_link",
  "issuer": "Revlon",
  "resource_name": "LOEB&LOEB LLP",
  "content": "How It Happened\nDischarge-for-Value Rule\nConstructive Notice\nReasonable Inquiry\nBanque Worms Ruling\nAdditional Comments\nHow It Happened\nWhile transmitting accrued interest to the lenders\u2019 loan managers on Aug. 11, 2020, Citibank made an error\nthat caused the accidental wire transfer of $894 million\u2014the full amount of Revlon\u2019s outstanding principal\nbalance\u201

In [38]:
# Example usage


# Call the bulk upload function
upload_bulk_documents(parsed_documents)

{'id': 'e844d1c4-bec4-4a19-8ca9-62a1dc1e8127_2', 'document_name': 'expert analysis 1', 'document_type': 'Expert Analysis', 'document_link': 'https://drive.google.com/file/d/1Mcq0KGzJYXIUXtci7OJiRyxrDz_llr0T/view?usp=drive_link', 'issuer': 'Revlon', 'resource_name': 'LOEB&LOEB LLP', 'content': 'How It Happened\nDischarge-for-Value Rule\nConstructive Notice\nReasonable Inquiry\nBanque Worms Ruling\nAdditional Comments\nHow It Happened\nWhile transmitting accrued interest to the lenders’ loan managers on Aug. 11, 2020, Citibank made an error\nthat caused the accidental wire transfer of $894 million—the full amount of Revlon’s outstanding principal\nbalance—three years before Revlon’s loan repayment was due.\xa0\nDespite three people having reviewed and approved the transaction before it was executed, the\ntransmission was sent without certain specific settings that would have prevented the principal balance\nfrom being wired. The transaction occurred at a time when, because Revlon was ins