### Ingest Content from all Folders in SharePoint Site to an Azure AI Search Index
Includes Vectorizaztion of Content

In [1]:
import os
import json
import copy
import requests
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from tenacity import retry, wait_random_exponential, stop_after_attempt 
from dotenv import load_dotenv
import openai
from openai import AzureOpenAI
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

# Load environment variables from .env file
load_dotenv()

# Define the target directory (change yours)
target_directory = (
    os.getcwd()
)

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to /Users/marcjimz/Documents/Development/sharepoint-indexing-azure-cognitive-search


In [2]:
# Set the service endpoint and API key from the environment
# Create an SDK client
endpoint = os.environ["SEARCH_SERVICE_ENDPOINT"]
search_client = SearchClient(
    endpoint=endpoint,
    index_name=os.environ["SEARCH_INDEX_NAME"],
    credential=AzureKeyCredential(os.environ["SEARCH_ADMIN_API_KEY"]),
)

admin_client = SearchIndexClient(
    endpoint=endpoint,
    index_name=os.environ["SEARCH_INDEX_NAME"],
    credential=AzureKeyCredential(os.environ["SEARCH_ADMIN_API_KEY"]),
)

openai.api_key = os.environ["OPEN_API_KEY"]
openai.api_base = os.environ["OPEN_API_BASE"]
openai.api_type = "azure"  
openai.api_version = "2023-05-15"

model = os.environ["OPEN_API_MODEL"]

client = AzureOpenAI(
        api_version=openai.api_version,
        azure_endpoint=openai.api_base,
        api_key=openai.api_key
    )

# This is in characters and there is an avg of 4 chars / token
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024*4,
    chunk_overlap  = 102*4
)

## Initialize the `client_extractor` client <a id='init-client'></a>

In [3]:
from gbb_ai.sharepoint_data_extractor import SharePointDataExtractor

# Instantiate the SharePointDataExtractor client
# The client handles the complexities of interacting with SharePoint's REST API, providing an easy-to-use interface for data extraction.
client_scrapping = SharePointDataExtractor()

> 💡 **Note**
> The `get_site_id` and `get_drive_id` methods are optional. They are automatically called by the `retrieve_sharepoint_files_content` function. However, they are available for use if further analysis is required.

In [4]:
# Load environment variables from the .env file
client_scrapping.load_environment_variables_from_env_file()

# Authenticate with Microsoft Graph API
client_scrapping.msgraph_auth()

# Get the Site ID for the specified SharePoint site
site_id = client_scrapping.get_site_id(
    site_domain=os.environ["SITE_DOMAIN"], site_name=os.environ["SITE_NAME"]
)

# Get the Drive ID associated with the Site ID
drive_id = client_scrapping.get_drive_id(site_id)

2024-09-03 22:11:03,595 - micro - MainProcess - INFO     Successfully loaded environment variables: TENANT_ID, CLIENT_ID, CLIENT_SECRET (sharepoint_data_extractor.py:load_environment_variables_from_env_file:86)
2024-09-03 22:11:03,981 - micro - MainProcess - INFO     New access token retrieved. (sharepoint_data_extractor.py:msgraph_auth:118)
2024-09-03 22:11:03,982 - micro - MainProcess - INFO     Getting the Site ID... (sharepoint_data_extractor.py:get_site_id:187)
2024-09-03 22:11:04,318 - micro - MainProcess - INFO     Site ID retrieved: 30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b (sharepoint_data_extractor.py:get_site_id:191)
2024-09-03 22:11:04,738 - micro - MainProcess - INFO     Successfully retrieved drive ID: b!DpMDQ8RQekasbiEo1081VFsIKGiIOCpDuqYiVHXzW2vabjOBwPZiQ4_E_CuTBjAI (sharepoint_data_extractor.py:get_drive_id:208)


In [5]:
def get_folders(url, folder_list = ['/']):
    headers = {'Authorization': 'Bearer ' + client_scrapping.access_token}
    response = requests.get(url, headers=headers)
    items = response.json()

    # Check if the 'value' key is in the response
    if 'value' not in items:
        return folder_list

    for item in items['value']:
        if 'folder' in item:
            # print(item['name'] + '/')
            # If the item is a folder, get its subfolders
            subfolder_url = url + '/' + item['name'] + '/children'
            folder_val = subfolder_url[subfolder_url.index('/drive/root')+11:].replace('/children','') + '/'
            print (folder_val)
            folder_list.append(folder_val)
            get_folders(subfolder_url, folder_list)
    return folder_list

# Split up a list into chunks - this is used to ensure a limited number of items sent to Azure AI Search
def divide_chunks(l, n):  
    # looping till length l  
    for i in range(0, len(l), n):   
        yield l[i:i + n]  

# Function to generate embeddings for title and content fields, also used for query embeddings
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return json.loads(response.model_dump_json())["data"][0]['embedding']

In [6]:
#generate_embeddings('test')

In [7]:
# Use the access token to get the folders 
print ('Getting all folders in SharePoint site...')
root_url = f'https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root/children'  
folder_list = get_folders(root_url)  

Getting all folders in SharePoint site...


In [8]:
root_url

'https://graph.microsoft.com/v1.0/sites/30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b/drive/root/children'

In [9]:
# Download and process files from a set of folders within a SharePoint site.

n = 100  # max batch size (number of docs) to upload at a time
total_docs_uploaded = 0

for folder in folder_list:
    print (f"Processing folder {folder}...")

    if folder == '/':
        selected_files_content = client_scrapping.retrieve_sharepoint_files_content(
            site_domain=os.environ["SITE_DOMAIN"],
            site_name=os.environ["SITE_NAME"],
            file_formats=["docx", "pdf"],
        )
    else:
        selected_files_content = client_scrapping.retrieve_sharepoint_files_content(
            site_domain=os.environ["SITE_DOMAIN"],
            site_name=os.environ["SITE_NAME"],
            folder_path=folder,
            file_formats=["docx", "pdf"],
        )

    if selected_files_content == None:
        print ("No documents found in this folder")
    else:
        chunked_content_docs = []
        sfc_counter = 0
        for sfc_counter in range(len(selected_files_content)):
            # print (json_data_base['content'])
            chunked_content = text_splitter.split_text(selected_files_content[sfc_counter]['content'])
            chunk_counter = 0
            for cc in chunked_content:
                json_data = copy.deepcopy(selected_files_content[sfc_counter]) 
                json_data['content'] = chunked_content[chunk_counter]
                json_data['contentVector'] = generate_embeddings(json_data['content'])
                json_data['doc_id'] = json_data['id']
                json_data['id'] = json_data['id'] + "-" + str(chunk_counter)
                json_data['chunk_id'] = chunk_counter
                chunk_counter+=1
                chunked_content_docs.append(json_data)
            sfc_counter+=1
            
        total_docs = len(chunked_content_docs)
        total_docs_uploaded += total_docs
        print (f"Total Documents to Upload: {total_docs}")

        for documents_chunk in divide_chunks(chunked_content_docs, n):  
            # Multiple Documents Upload
            try:
                # 'search_client.upload_documents' can ingest multiple documents at once
                # 'selected_files_content' is a list of documents
                print (f"Uploading batch of {len(documents_chunk)} documents...")
                result = search_client.upload_documents(documents=documents_chunk)
                # Print the result for each document
                for res in result:
                    print("Upload of new document succeeded: {}".format(res.succeeded))
            except Exception as ex:
                print("Error in multiple documents upload: ", ex)
    # print (selected_files_content)

print (f"Upload of {total_docs_uploaded} documents complete.")

2024-09-03 22:11:05,217 - micro - MainProcess - INFO     Getting the Site ID... (sharepoint_data_extractor.py:get_site_id:187)


Processing folder /...


2024-09-03 22:11:05,527 - micro - MainProcess - INFO     Site ID retrieved: 30z44s.sharepoint.com,4303930e-50c4-467a-ac6e-2128d74f3554,6828085b-3888-432a-baa6-225475f35b6b (sharepoint_data_extractor.py:get_site_id:191)
2024-09-03 22:11:05,922 - micro - MainProcess - INFO     Successfully retrieved drive ID: b!DpMDQ8RQekasbiEo1081VFsIKGiIOCpDuqYiVHXzW2vabjOBwPZiQ4_E_CuTBjAI (sharepoint_data_extractor.py:get_drive_id:208)
2024-09-03 22:11:05,923 - micro - MainProcess - INFO     Making request to Microsoft Graph API (sharepoint_data_extractor.py:get_files_in_site:247)
2024-09-03 22:11:06,136 - micro - MainProcess - INFO     Received response from Microsoft Graph API (sharepoint_data_extractor.py:get_files_in_site:250)
2024-09-03 22:11:06,137 - micro - MainProcess - ERROR    No files found in the site's drive (sharepoint_data_extractor.py:retrieve_sharepoint_files_content:536)


No documents found in this folder
Upload of 0 documents complete.


In [45]:
from typing import Any, Dict, List
from bs4 import BeautifulSoup

def extract_text_from_webparts(webparts: List[Dict[str, Any]]) -> str:
    """
    Extracts and concatenates text from a list of webparts, stripping all HTML tags.
    
    :param webparts: List of webparts that contain HTML content.
    :return: A concatenated string of text with HTML tags removed.
    """
    text_content = ""
    for webpart in webparts:
        if "innerHtml" in webpart:
            html_content = webpart.get("innerHtml", "")
            # Use BeautifulSoup to strip HTML tags
            soup = BeautifulSoup(html_content, 'html.parser')
            text_content += soup.get_text(separator=' ', strip=True) + " "
    
    return text_content.strip()

def extract_text_from_canvas_layout(canvas_layout: Dict[str, Any]) -> str:
    """
    Extracts all text from the canvasLayout by iterating through horizontalSections, columns, and webparts.
    
    :param canvas_layout: The canvasLayout object from a SharePoint page.
    :return: A concatenated string of all text extracted from the layout, with HTML tags removed.
    """
    text_content = ""

    # Iterate over horizontal sections
    horizontal_sections = canvas_layout.get("horizontalSections", [])
    for section in horizontal_sections:
        section_id = section.get("id")
        # print(f"Processing section {section_id} with layout {section.get('layout')}...")

        # Iterate over columns in each horizontal section
        columns = section.get("columns", [])
        for column in columns:
            column_id = column.get("id")
            # print(f"Processing column {column_id} in section {section_id}...")

            # Extract text from each column's webParts
            webparts = column.get("webparts", [])
            text_content += extract_text_from_webparts(webparts) + " "

    return text_content.strip()


In [49]:
total_docs_uploaded = 0

# Retrieve and process site pages
processed_pages_content = client_scrapping.retrieve_and_process_site_pages(site_id)


In [54]:
total_docs_uploaded = 0

# Retrieve and process site pages
processed_pages_content = client_scrapping.retrieve_and_process_site_pages(site_id)

if not processed_pages_content:
    print("No pages found in the site.")
else:
    chunked_content_docs = []
    
    print(f"Processing {len(processed_pages_content)} pages from the site...")

    # Iterate through the processed pages content
    for page_num, page in enumerate(processed_pages_content, start=1):
        page_id = page.get("page_id")
        canvas_layout = page.get("content", {}).get("canvasLayout", {})

        if not canvas_layout:
            print(f"[Page {page_num}/{len(processed_pages_content)}] Page ID {page_id}: No content found.")
            continue
        
        print(f"[Page {page_num}/{len(processed_pages_content)}] Processing Page ID {page_id}...")

        # Extract all text from the canvasLayout, removing HTML
        page_text_content = extract_text_from_canvas_layout(canvas_layout)

        if not page_text_content:
            print(f"[Page {page_num}/{len(processed_pages_content)}] Page ID {page_id}: No text extracted from canvas layout.")
            continue
        
        print(f"[Page {page_num}/{len(processed_pages_content)}] Extracted text length: {len(page_text_content)} characters.")
        
        # Now chunk the text content - you can bring your own text splitter here and chunk accordingly!
        chunked_content = text_splitter.split_text(page_text_content)
        print(f"[Page {page_num}/{len(processed_pages_content)}] Split into {len(chunked_content)} chunks.")
        
        chunk_counter = 0

        # Iterate through the chunks and create the chunked content docs
        for chunk in chunked_content:
            print(f"[Page {page_num}/{len(processed_pages_content)}][Chunk {chunk_counter + 1}/{len(chunked_content)}] Processing chunk of size {len(chunk)} characters.")

            json_data = {
                "id": page_id + "-" + str(chunk_counter),  # Create a unique chunk ID
                "content": chunk,  # Chunked content
                "contentVector": generate_embeddings(chunk),  # Embeddings for the chunk
                "doc_id": page_id,  # Original page ID
                "chunk_id": chunk_counter  # Chunk counter
            }
            chunked_content_docs.append(json_data)
            chunk_counter += 1

    # Calculate total documents to upload
    total_docs = len(chunked_content_docs)
    total_docs_uploaded += total_docs
    print(f"Total Documents ready for upload: {total_docs}")

    # Upload the documents in chunks
    for documents_chunk in divide_chunks(chunked_content_docs, n):
        try:
            print(f"Uploading batch of {len(documents_chunk)} documents...")
            result = search_client.upload_documents(documents=documents_chunk)
            # Print the result for each document
            for res in result:
                print(f"Upload of document {res['key']} succeeded: {res['succeeded']}")
        except Exception as ex:
            print(f"Error during multiple documents upload: {ex}")

print(f"Total Documents Uploaded: {total_docs_uploaded}.")

Processing 4 pages from the site...
[Page 1/4] Processing Page ID 41a9a211-512f-4841-a66b-744e685ac95c...
[Page 1/4] Extracted text length: 1957 characters.
[Page 1/4] Split into 1 chunks.
[Page 1/4][Chunk 1/1] Processing chunk of size 1957 characters.
[Page 2/4] Processing Page ID 9883ffad-80a7-4038-899a-542aa6f77e39...
[Page 2/4] Page ID 9883ffad-80a7-4038-899a-542aa6f77e39: No text extracted from canvas layout.
[Page 3/4] Processing Page ID 6a8b2d03-d26c-44fc-972a-c52cb449aeb1...
[Page 3/4] Extracted text length: 4050 characters.
[Page 3/4] Split into 1 chunks.
[Page 3/4][Chunk 1/1] Processing chunk of size 4050 characters.
[Page 4/4] Processing Page ID 2955533b-8c34-44db-b23b-e90fda2f80e5...
[Page 4/4] Page ID 2955533b-8c34-44db-b23b-e90fda2f80e5: No text extracted from canvas layout.
Total Documents ready for upload: 2
Uploading batch of 2 documents...
Error during multiple documents upload: 'IndexingResult' object is not subscriptable
Total Documents Uploaded: 2.
