Before running this notebook, please ensure you have followed the steps to configure your SharePoint site to allow API access and have updated the config.json file with the appropriate config parameters

This notebook walks through the following steps
1) Create an Azure Cognitive Search Index to hold the content from a SharePoint Online hosted document
2) Connect to a SharePoint site through the Microsoft Graph REST API
3) Retrieve the Site ID for the site
4) Download every file found that is of type .docx
5) Extract the text from each of these files and upload it to Cognitive Search

Some useful resources for more details:
https://learn.microsoft.com/en-us/azure/active-directory/develop/console-app-quickstart?pivots=devlang-python
Create a Demo SharePoint Online Env: https://cdx.transform.microsoft.com/ -- to use this you need to either be a Microsoft Employee or part of the Microsoft Partner Program: https://partner.microsoft.com/dashboard/account/v3/enrollment/introduction/partnership

In [None]:
There needs to be a config.json file that looks like this:
    
{
    "authority": "https://login.microsoftonline.com/XXX",
    "client_id": "XXX",
    "scope": [ "https://graph.microsoft.com/.default" ],
    "secret": "XXX",
    "site_domain": "XXX.sharepoint.com",
    "site_name": "Mark8ProjectTeam",
    "include_auth_info": true,
    "search_service_name": "XXX",
    "search_index_name": "sharepoint-index-1",
    "search_admin_api_key": "XXX"
}

In [None]:
# Uncomment the lines below to install the packages this notebook depends on.
# `%pip` is used instead of `!pip` so the install targets the running kernel's environment.
# %pip install azure-search-documents
# %pip install msal
# %pip install python-docx
# %pip install langchain


In [None]:
import io
import json
import requests
import msal
from docx import Document  
import pprint

from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    ComplexField,
    CorsOptions,
    SearchIndex,
    ScoringProfile,
    SearchFieldDataType,
    SimpleField,
    SearchableField
)

# Load the configuration details for the SharePoint Online site as well as your Cognitive Search Service.
# The context manager closes the file handle as soon as the JSON has been parsed.
with open("config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Print out the current config with the credentials masked, so that secrets do not
# end up stored in the notebook's saved output
SECRET_CONFIG_KEYS = {"secret", "search_admin_api_key"}
pprint.pprint({key: ("***" if key in SECRET_CONFIG_KEYS else value)
               for key, value in config.items()})

# Build the service endpoint and the API key credential from the config file
endpoint = "https://{}.search.windows.net/".format(config["search_service_name"])
search_credential = AzureKeyCredential(config["search_admin_api_key"])

# SDK client for index management (create / delete). It works at the service level,
# so it takes no index name; the index is named on each call instead.
admin_client = SearchIndexClient(endpoint=endpoint,
                      credential=search_credential)

# SDK client for uploading and querying documents in the configured index
search_client = SearchClient(endpoint=endpoint,
                      index_name=config["search_index_name"],
                      credential=search_credential)


In [None]:
# Remove the index if it is already present on the search service.
# Any failure is printed and the notebook carries on.
try:
    result = admin_client.delete_index(config["search_index_name"])
except Exception as ex:
    print (ex)
else:
    # Only reached when the delete call returned without raising
    print ('Index', config["search_index_name"], 'Deleted')


In [None]:
# Create the index: one search document per SharePoint file
fields = [
    # Document key
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        filterable=True,
        sortable=True,
    ),
    # File name: full-text searchable, and also the source of the suggester below
    SearchableField(
        name="name",
        type=SearchFieldDataType.String,
        filterable=True,
        sortable=True,
    ),
    # Timestamps: usable for facets, filters and sorting
    SimpleField(
        name="createdDateTime",
        type=SearchFieldDataType.DateTimeOffset,
        facetable=True,
        filterable=True,
        sortable=True,
    ),
    SimpleField(
        name="lastModifiedDateTime",
        type=SearchFieldDataType.DateTimeOffset,
        facetable=True,
        filterable=True,
        sortable=True,
    ),
    # Link back to the file; stored only
    SimpleField(
        name="webUrl",
        type=SearchFieldDataType.String,
    ),
    # Extracted document text, analyzed with the "en.lucene" analyzer
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        analyzer_name="en.lucene",
    ),
]
cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
scoring_profiles = []
suggester = [{'name': 'sg', 'source_fields': ['name']}]

index = SearchIndex(
    name=config["search_index_name"],
    fields=fields,
    scoring_profiles=scoring_profiles,
    suggesters=suggester,
    cors_options=cors_options,
)

# A failure (for example, the index already exists) is printed rather than raised
try:
    result = admin_client.create_index(index)
except Exception as ex:
    print (ex)
else:
    print ('Index', result.name, 'created')

In [None]:
# Functions to authenticate to SharePoint Online site and to perform a Graph API request
def authenticate():
    """Acquire an app-only access token using the client-credentials flow.

    Reads ``client_id``, ``authority``, ``secret`` and ``scope`` from the
    module-level ``config`` dictionary.

    Returns:
        dict: The MSAL token result. It always contains ``access_token``;
        a result without one is turned into an exception instead.

    Raises:
        RuntimeError: If no access token was issued. The message carries the
            ``error`` and ``error_description`` reported by MSAL, so the cause
            (e.g. a wrong secret) is visible here rather than as a ``KeyError``
            in a later cell.
    """
    # The token cache lives in memory on this app instance only.
    # See https://msal-python.rtfd.io/en/latest/#msal.SerializableTokenCache
    # for how to persist it across instances.
    app = msal.ConfidentialClientApplication(
        config["client_id"], authority=config["authority"],
        client_credential=config["secret"],
        )

    # Firstly, look up a token from the cache.
    # Since we are looking for a token for the current app, NOT for an end user,
    # the account parameter is None.
    result = app.acquire_token_silent(config["scope"], account=None)

    if not result:
        print("No suitable token exists in cache. Getting a new one from AAD...")
        result = app.acquire_token_for_client(scopes=config["scope"])

    # MSAL reports failures through "error" keys in the result rather than by raising
    if not result or "access_token" not in result:
        failure = result or {}
        raise RuntimeError(
            "Could not acquire an access token: {} - {}".format(
                failure.get("error"), failure.get("error_description")))

    return result


def exec_graph_request(endpoint, access_token, timeout=30):
    """Send an authenticated GET request to a Microsoft Graph endpoint.

    Args:
        endpoint: Full URL to request.
        access_token: Bearer token placed in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can wait indefinitely on a stalled connection.

    Returns:
        The ``requests`` response object. The HTTP status is NOT checked here;
        callers decide how to handle error responses.
    """
    return requests.get(  # Use token to call downstream service
        endpoint,
        headers={'Authorization': 'Bearer ' + access_token},
        timeout=timeout,
    )


In [None]:
def get_drive_id(access_token, site_id):
    """Return the ``id`` of the drive exposed at ``/sites/{site_id}/drive``.

    Args:
        access_token: Bearer token for Microsoft Graph.
        site_id: ID of the SharePoint site.

    Returns:
        The ``id`` field of the JSON response.

    Raises:
        requests.HTTPError: If Graph answers with an error status.
    """
    drive_response = requests.get(
        f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive",
        headers={"Authorization": f"Bearer {access_token}"},
    )
    drive_response.raise_for_status()
    drive = drive_response.json()
    return drive["id"]


def get_files_in_site(access_token, site_id, drive_id):
    """List all items directly under the root folder of a drive.

    Graph returns large folders in pages; each page but the last carries an
    ``@odata.nextLink`` URL. All pages are followed so the result is complete,
    not just the first page.

    Args:
        access_token: Bearer token for Microsoft Graph.
        site_id: ID of the SharePoint site.
        drive_id: ID of the drive within that site.

    Returns:
        list: The item dictionaries from every page, in the order returned.

    Raises:
        requests.HTTPError: If any page request answers with an error status.
    """
    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/root/children"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    items = []
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        page = response.json()
        items.extend(page.get("value", []))
        # Absent on the last page, which ends the loop
        url = page.get("@odata.nextLink")
    return items

def get_file_permissions(access_token, site_id, item_id):
    """Return the sharing permissions of a drive item in a site's drive.

    The item is addressed through the site's drive
    (``/sites/{site_id}/drive/items/{item_id}/permissions``), which is the
    documented Graph form for listing permissions on a driveItem.

    Args:
        access_token: Bearer token for Microsoft Graph.
        site_id: ID of the SharePoint site.
        item_id: ID of the drive item (file or folder).

    Returns:
        list: The permission dictionaries from the ``value`` field of the response.

    Raises:
        requests.HTTPError: If Graph answers with an error status.
    """
    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/items/{item_id}/permissions"
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.json()["value"]

In [None]:
# Authenticate to SharePoint Online and print out the token details.
# The access token itself is left out: it is a bearer credential and would
# otherwise be stored in the notebook's saved output.
auth = authenticate()
print ({key: value for key, value in auth.items() if key != 'access_token'})

In [None]:
# Get Site ID
print ('Getting the Site ID...')
endpoint = 'https://graph.microsoft.com/v1.0/sites/' + config["site_domain"] + ':/sites/' + config["site_name"] + ':/'
result = exec_graph_request(endpoint, auth['access_token'])
# Surface HTTP failures (bad token, unknown site, ...) here instead of as a KeyError on "id" below
result.raise_for_status()
# The ID is used exactly as Graph returns it; later cells pass it back in /sites/{site_id}/... URLs
site_id = result.json()["id"]
print ('Site ID:', site_id)

In [None]:
# Resolve the drive of the site; later cells list files through drive_id
print ('Getting the Drive ID...')
drive_id = get_drive_id(access_token=auth['access_token'], site_id=site_id)

In [None]:
# Take the first item in the drive root as a sample for the following cells.
# `file` is set to None when the drive root is empty, so that it is always defined.
file = next(iter(get_files_in_site(auth['access_token'], site_id, drive_id)), None)
if file is None:
    print ('No files found in the drive root')
else:
    print (file)

In [None]:
# Pretty-print `file` (assigned by an earlier cell) so its fields are easier to read
pprint.pprint(file)

In [None]:
# Show the permissions of the sample item held in `file`. Using its ID instead of a
# hard-coded one keeps this cell working on any SharePoint site, not just one tenant.
get_file_permissions(auth['access_token'], site_id, file['id'])

In [None]:
# Fetch the items in the drive root, then show the name and item ID of each one
files = get_files_in_site(
    access_token=auth['access_token'],
    site_id=site_id,
    drive_id=drive_id,
)

for file in files:
    print(f"File name: {file['name']} - Item ID: {file['id']}")

When a delta query is executed Microsoft Graph sends a response containing the requested resource and a state token.

a. If a @odata.nextLink URL is returned, there may be additional pages of data to be retrieved in the session. The application continues making requests using the @odata.nextLink URL to retrieve all pages of data until a @odata.deltaLink URL is returned in the response.

b. If a @odata.deltaLink URL is returned, there is no more data about the existing state of the resource to be returned. For future requests, the application uses the @odata.deltaLink URL to learn about changes to the resource.

For this first demo, we assume that all files are returned in a single response. We will handle changes in a subsequent notebook.

In [None]:
# Iterate over the files in SharePoint Online and, for every Word document found,
# extract its text content so that it can be uploaded to the search index

DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'

print ('Looking through files...')
# Documents to load to the index
documents = []
# Delta link from the final page; a later run can use it to ask only for changes
last_modified_q = None

endpoint = 'https://graph.microsoft.com/v1.0/sites/' + site_id + '/drive/root/delta'
while endpoint:
    response = exec_graph_request(endpoint, auth['access_token'])
    # Fail with the HTTP error instead of a KeyError on the missing JSON fields
    response.raise_for_status()
    json_result = response.json()

    for v in json_result.get('value', []):
        # Skip folders, deleted items and anything that is not a Word document.
        # .get() is used because not every item has a 'file' facet with a mimeType.
        if 'deleted' in v or (v.get('file') or {}).get('mimeType') != DOCX_MIME_TYPE:
            continue

        json_doc = {"@search.action": "upload"}
        print (v["createdDateTime"])
        print (v["id"])
        print (v["lastModifiedDateTime"])
        print (v["name"])
        print (v["webUrl"])

        json_doc["id"] = v["id"]
        json_doc["name"] = v["name"]
        json_doc["webUrl"] = v["webUrl"]
        json_doc["createdDateTime"] = v["createdDateTime"]
        json_doc["lastModifiedDateTime"] = v["lastModifiedDateTime"]

        # Download by item ID rather than by 'root:/<name>:'. The delta feed also lists
        # files inside subfolders, and a root-relative name does not resolve for those
        # (nor for names containing characters such as '#' or '?').
        content_endpoint = 'https://graph.microsoft.com/v1.0/sites/' + site_id + '/drive/items/' + v["id"] + '/content'

        try:
            file_response = exec_graph_request(content_endpoint, auth['access_token'])
            # Report a failed download as such, instead of as a confusing docx parse error
            file_response.raise_for_status()

            # Get text from a docx file
            document = Document(io.BytesIO(file_response.content))
            content = "\n".join([paragraph.text for paragraph in document.paragraphs])

            print ("\n--------------------------")

            json_doc["content"] = content

            documents.append(json_doc)
        except Exception as ex:
            # One unreadable file must not stop the remaining files from being processed
            print ('Error:', ex)

    # Intermediate pages carry @odata.nextLink; only the final page carries @odata.deltaLink
    last_modified_q = json_result.get('@odata.deltaLink', last_modified_q)
    endpoint = json_result.get('@odata.nextLink')
            


In [None]:
# Upload the extracted documents to the index and report the outcome for each one
if not documents:
    # Nothing was extracted, so there is nothing to send and no result to inspect
    print("No documents to upload")
else:
    try:
        result = search_client.upload_documents(documents=documents)
        for indexing_result in result:
            print("Upload of document {} succeeded: {}".format(indexing_result.key, indexing_result.succeeded))
    except Exception as ex:
        # Not every exception type has a .message attribute, so print the exception itself
        print (ex)

In [None]:
# Query the index through the Azure Cognitive Search SDK and show,
# for each hit, its id and name followed by its content
results = search_client.search(search_text="marketing")

for result in results:
    print(
        "{}: {}".format(result["id"], result["name"]),
        "{}".format(result["content"]),
        sep="\n",
    )