### Ingest Content from all Folders in SharePoint Site to an Azure AI Search Index
Includes Vectorization of Content

In [None]:
import os
import json
import copy
import requests
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from tenacity import retry, wait_random_exponential, stop_after_attempt 
from dotenv import load_dotenv
import openai
from openai import AzureOpenAI
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter

# Read settings from the .env file into the process environment
load_dotenv()

# Directory to run the notebook from (adjust for your machine)
target_directory = (
    r"C:\temp\docker\sharepoint-indexer\sharepoint-indexing-azure-ai-search"
)

# Switch into the target directory when it is present; otherwise only report it
if not os.path.exists(target_directory):
    print(f"Directory {target_directory} does not exist.")
else:
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")

In [None]:
# Azure AI Search: read the connection settings once, then build both SDK clients
endpoint = os.environ["SEARCH_SERVICE_ENDPOINT"]
search_index_name = os.environ["SEARCH_INDEX_NAME"]
search_admin_key = os.environ["SEARCH_ADMIN_API_KEY"]

# Client used to upload documents into the index
search_client = SearchClient(
    endpoint=endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(search_admin_key),
)

# Index-management client (each client gets its own credential object)
admin_client = SearchIndexClient(
    endpoint=endpoint,
    index_name=search_index_name,
    credential=AzureKeyCredential(search_admin_key),
)

# Azure OpenAI: the settings are stored on the openai module and reused below
openai.api_type = "azure"
openai.api_version = "2023-05-15"
openai.api_key = os.environ["OPEN_API_KEY"]
openai.api_base = os.environ["OPEN_API_BASE"]

# Name of the embedding deployment passed as `model` to the embeddings API
model = os.environ["OPEN_API_DEPLOYMENT_NAME"]

client = AzureOpenAI(
    api_key=openai.api_key,
    azure_endpoint=openai.api_base,
    api_version=openai.api_version,
)

# Sizes are in characters; at an average of 4 chars / token this is
# roughly 1024 tokens per chunk with a 102-token overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024 * 4,
    chunk_overlap=102 * 4,
)

## Initialize the `client_scrapping` client <a id='init-client'></a>

In [None]:
from gbb_ai.sharepoint_data_extractor import SharePointDataExtractor

# Instantiate the SharePointDataExtractor client.
# The cells below use it to authenticate (msgraph_auth), resolve the site and
# drive IDs, and retrieve file content (retrieve_sharepoint_files_content).
client_scrapping = SharePointDataExtractor()

> 💡 **Note**
> The `get_site_id` and `get_drive_id` methods are optional. They are automatically called by the `retrieve_sharepoint_files_content` function. However, they are available for use if further analysis is required.

In [None]:
# Let the extractor read its settings from the .env file
client_scrapping.load_environment_variables_from_env_file()

# Authenticate with Microsoft Graph API
client_scrapping.msgraph_auth()

# Resolve the Site ID from the configured hostname and site name
site_hostname = os.environ["SITE_HOSTNAME"]
site_name = os.environ["SITE_NAME"]
site_id = client_scrapping.get_site_id(
    site_hostname=site_hostname,
    site_name=site_name,
)
print(site_id)

# Look up the Drive ID that belongs to that Site ID
drive_id = client_scrapping.get_drive_id(site_id)

In [None]:
def get_folders(url, folder_list=None):
    """Recursively collect the folder paths found under a Graph `children` URL.

    Args:
        url: A Microsoft Graph URL ending in ``/children`` that lists the items
            of a folder (the walk starts at ``.../drive/root/children``).
        folder_list: List to append discovered paths to. When omitted, a fresh
            list seeded with the root path ``'/'`` is created for this call, so
            repeated calls do not accumulate results from earlier runs.

    Returns:
        ``folder_list`` with one entry per discovered folder, each formatted as
        ``'/<folder>/<subfolder>/'``. If a response carries no ``'value'`` key
        (for example an error payload), the folders gathered so far are
        returned unchanged.
    """
    if folder_list is None:
        folder_list = ['/']

    headers = {'Authorization': 'Bearer ' + client_scrapping.access_token}

    # Graph returns large listings in pages; follow '@odata.nextLink' so that
    # folders beyond the first page are not silently skipped.
    page_url = url
    while page_url:
        response = requests.get(page_url, headers=headers)
        items = response.json()

        # Check if the 'value' key is in the response
        if 'value' not in items:
            break

        for item in items['value']:
            if 'folder' not in item:
                continue
            # Child listings are always built from the original listing URL,
            # never from a paging link.
            subfolder_url = url + '/' + item['name'] + '/children'
            # Keep what follows '/drive/root' (11 characters) and drop the
            # '/children' segments to get a '/a/b/' style path.
            # NOTE(review): a folder whose name starts with 'children' would
            # also be affected by this replace -- confirm whether that matters.
            folder_val = subfolder_url[subfolder_url.index('/drive/root')+11:].replace('/children','') + '/'
            print (folder_val)
            folder_list.append(folder_val)
            get_folders(subfolder_url, folder_list)

        page_url = items.get('@odata.nextLink')

    return folder_list

# Batch helper: cuts a list into consecutive slices so that only a limited
# number of items is sent to Azure AI Search per request
def divide_chunks(l, n):
    """Yield successive slices of *l*, each holding at most *n* items."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

# Generate the embedding vector for a piece of text (used here for document
# chunks; the same helper can embed query text)
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding vector for *text*.

    The request goes to the embeddings endpoint of the module-level Azure
    OpenAI ``client`` using the deployment named by ``model``. Failed calls are
    retried with randomized exponential backoff, for up to 6 attempts.

    Args:
        text: The text to embed.

    Returns:
        The embedding of the first (and only) input, as a list of floats.
    """
    response = client.embeddings.create(
        input=text,
        model=model
    )
    # Read the vector directly from the typed response rather than serialising
    # the whole response to JSON and parsing it back.
    return response.data[0].embedding

In [None]:
# generate_embeddings('test')

In [None]:
# Walk the site's drive from its root and collect every folder path
print('Getting all folders in SharePoint site...')
root_url = f'https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root/children'
folder_list = get_folders(url=root_url)

In [None]:
# Download and process files from a set of folders within a SharePoint site.
#
# For every folder: retrieve the docx/pdf files, split each file's content into
# chunks, embed each chunk, and upload the resulting chunk documents to the
# Azure AI Search index in batches of at most `n`.

n = 100  # max batch size (number of docs) to upload at a time
total_docs_uploaded = 0  # counts only documents the service reported as succeeded

for folder in folder_list:
    print(f"Processing folder {folder}...")

    # The root folder is requested by leaving out folder_path entirely
    retrieve_kwargs = {
        "site_hostname": os.environ["SITE_HOSTNAME"],
        "site_name": os.environ["SITE_NAME"],
        "file_formats": ["docx", "pdf"],
    }
    if folder != '/':
        retrieve_kwargs["folder_path"] = folder
    selected_files_content = client_scrapping.retrieve_sharepoint_files_content(
        **retrieve_kwargs
    )

    if selected_files_content is None:
        print("No documents found in this folder")
        continue

    # Build one search document per chunk, carrying over the file's metadata
    chunked_content_docs = []
    for file_doc in selected_files_content:
        chunks = text_splitter.split_text(file_doc['content'])
        for chunk_id, chunk_text in enumerate(chunks):
            json_data = copy.deepcopy(file_doc)
            json_data['content'] = chunk_text
            json_data['contentVector'] = generate_embeddings(chunk_text)
            # Keep the source file's id in doc_id; the chunk gets a unique id
            json_data['doc_id'] = file_doc['id']
            json_data['id'] = file_doc['id'] + "-" + str(chunk_id)
            json_data['chunk_id'] = chunk_id
            chunked_content_docs.append(json_data)

    total_docs = len(chunked_content_docs)
    print(f"Total Documents to Upload: {total_docs}")

    for documents_chunk in divide_chunks(chunked_content_docs, n):
        # 'search_client.upload_documents' can ingest multiple documents at once
        print(f"Uploading batch of {len(documents_chunk)} documents...")
        try:
            result = search_client.upload_documents(documents=documents_chunk)
        except Exception as ex:
            # Best effort: report the failed batch and carry on with the next one
            print("Error in multiple documents upload: ", ex)
            continue

        # Print the result for each document and tally the real successes, so
        # the final total does not include failed batches or failed documents
        for res in result:
            print("Upload of new document succeeded: {}".format(res.succeeded))
            if res.succeeded:
                total_docs_uploaded += 1

print(f"Upload of {total_docs_uploaded} documents complete.")