# 2. Sharepoint File Search

> Using Office365 API to find XMLs on SharePoint

In [1]:
#| code-fold: true
from datetime import datetime; datetime.now().strftime("created: %F %H:%M")

'created: 2023-04-27 08:48'

## SharePoint

Our organization uses SharePoint as the main file repository for customer data.  
In addition to handling local files, it will be useful to accommodate the following use cases:
- scan a given SharePoint folder for relevant XML files and record **unique** file GUIDs  
- scan and download unmodified XMLs to local storage  
- scan, parse, and download processed data as JSON files  

In [2]:
#| hide
%load_ext autoreload
%autoreload 2

In [3]:
#| default_exp io

In [4]:
#| hide
from fastcore.test import *
from nbdev.showdoc import *

## Connect to SharePoint using Office365 API with secure `.env` Credentials
To use sensitive data such as passwords securely, it is common to place the secrets into environment variable (`.env`) file, which is excluded from code repositories. Read more: [python-dotenv](https://github.com/theskumar/python-dotenv)

In [5]:
from dotenv import dotenv_values
from office365.runtime.auth.user_credential import UserCredential
from office365.runtime.client_request_exception import ClientRequestException
from office365.sharepoint.client_context import ClientContext
import io
import pandas as pd

In [6]:
config = dotenv_values("../.env")
sharepoint_url = config["SHAREPOINT_URL"]
credentials = UserCredential(config["USERNAME"], config["PASSWORD"])
ctx = ClientContext(sharepoint_url).with_credentials(credentials)

### Traverse files and folders and find XML files 

In [7]:
%%time
xml_files = {}
def enum_folder(parent_folder, folder_action, file_limit=None):
    """Recursively traverse files and folders."""
    parent_folder.expand(["Files", "Folders"]).get().execute_query()
    folder_action(parent_folder)
    
    for file in parent_folder.files:
          if file.name.endswith(".xml"):
              xml_files[file.serverRelativeUrl] = file.unique_id
    if file_limit:
        if len(xml_files) >= file_limit:
            return
        else:
            for folder in parent_folder.folders:
                enum_folder(folder, folder_action, file_limit=file_limit)


def folder_action(folder):
    """Record some data while iterating on folders."""
    # print(len(xml_files), folder.serverRelativeUrl, end='\r')
    return


root_folder = ctx.web.get_folder_by_server_relative_url(config["STARTING_FOLDER"]).get().execute_query()
enum_folder(root_folder, folder_action, file_limit=1000)

CPU times: user 3.01 s, sys: 46.7 ms, total: 3.06 s
Wall time: 19.4 s


### Inferring the files are relevant from file name

In [8]:
prefixes = []
for k in xml_files:
  prefix = k.split('/')[-1].split('-')[0]
  prefixes.append(prefix)
pd.Series(prefixes).value_counts()

Cartridge_PAT                     899
Cartridge_WET                     135
CalibrationData.xml                 4
PersistedStatus.xml                 3
ConfigurationSettings.xml           3
Users.xml                           3
Patients.xml                        3
RegistrySettings.xml                2
NormalRangesLibrary.xml             2
CartridgeLibrary.xml                1
MicrofluidicsScriptLibrary.xml      1
dtype: int64

## Get file content by SharePoint GUID

In [9]:
sharepoint_guid = xml_files[list(xml_files.keys())[3]]
sharepoint_guid

'66e6a98a-3308-4a83-bc38-00818e953985'

In [10]:
with io.BytesIO() as bytestream:
    ctx.web.get_file_by_id(sharepoint_guid).download(bytestream).execute_query()
    print(f"read {bytestream.tell():,d} bytes from file {sharepoint_guid}\n")
    bytestream.seek(0)
    print(bytestream.read(420))

read 1,342,694 bytes from file 66e6a98a-3308-4a83-bc38-00818e953985

b'<!--\r\nFileCount=3\r\nFileOffset1=0000000120\r\nFileOffset2=0000010692\r\nFileOffset3=0001340966\r\nFileOffset4=0001342694\r\n-->\r\n<?xml version="1.0" encoding="utf-8"?>\r\n<Cartridge xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" SchemaVersion="1" AllowTemperatureChange="false" BarcodeExpirationDate="2024-08-20T00:00:00" CalculatedExpirationDate="2024-08-31T00:00:00" CartridgeN'


## Sandbox

In [11]:
#| hide
import nbdev; nbdev.nbdev_export()