In [1]:
# imports
import asyncio
import textwrap

# 'pandas' is used to take what we have obtained from DocAI
# and save it in a format that we want in a spreadsheet
import pandas as pd

# 'time' is used to wait a couple seconds between our checks
# to DocAI to see if the request has completed
import time

# 'os' is used to get the files that exist in our file uploads
# folder. We will use this to join directory paths to file names
# and also determine if the path provided is a file (or not).
import os

# 'zdai' is the Zuva DocAI Python SDK, which provides functions
# which make it easier to use DocAI via Python.
# We are importing ZDAISDK (Zuva DocAI Software Development Kit)
# which is the entry point to DocAI's services.
from zdai import ZDAISDK, DocumentClassificationRequest, \
   LanguageClassificationRequest, FieldExtractionRequest


In [15]:
from zdai import ZDAISDK

# Specify the API URL
api_url = 'https://us.app.zuva.ai/api/v2'

# Specify the authentication token
auth_token = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJvcmdfaWQiOiJvcmdfZmlhUUpXTGlqaUpKdUhTZyIsInRvayI6ImNuNTNqcTRjY3B2YzczYTJtbGlnIiwidSI6ImF1dGgwfDY1Y2EzNWNjNTUxYzEzMjU4YmM4Y2Y3NCJ9.Gsre_fMRQKLPpRcETWSkBALphNtTO1j6XOVRjsxRrq0'

# Initialize the SDK with the provided URL and token
sdk = ZDAISDK(url=api_url, token=auth_token)

# Fetch the fields using the SDK
fields, _ = sdk.fields.get()
              
print(f'Found {len(fields)} fields on region {sdk.url}')


Found 1351 fields on region https://us.app.zuva.ai/api/v2


In [16]:

# Upload the file directory from the absolute path - please change to the folder you are reviewing ensure
#there aren't sub-folders. 
upload_files_directory = '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents'

#defining the list of the file in the directory
docs = [os.path.join(upload_files_directory, f) for f
       in os.listdir(upload_files_directory)
       if os.path.isfile(os.path.join(upload_files_directory, f))
       and not f.startswith('.')]

# All of the documents in the list
print([d for d in docs])

docai_files = []

for doc in docs:
    with open(doc, 'rb') as f:
        file, _ = sdk.file.create(content=f.read())
        file.name = os.path.basename(doc)
        docai_files.append(file)
        print(f'Submitted "{file.name}" to DocAI. '
              f'DocAI sees this file as "{file.id}')


['/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/04 Abstract - Series Seed - Voting Agreement.docx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/01 Abstract - Series Seed - Restated Certificate of Incorporation.docx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/03 Abstract - Series Seed - Disclosure Schedule.docx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/05 Abstract - Series Seed - Investors Rights Agreement.docx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/07 Abstract - Series Seed - Pro Forma (02.02.2024).xlsx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/06 Abstract - Series Seed - Right of First Refusal & Co-Sale Agreement.docx', '/Users/joshuaschoen/Downloads/Abstract - Series Seed Financing Documents/02 Abstract - Series Seed - Preferred Stock Purchase Agreement.docx']
Submitted "04 Abstract - Series S

In [4]:

#listing all of the fields covered in the Zuva API
fields, _ = sdk.fields.get()

for field in fields:
    print(f'{field.id}: {field.name}')


00339a5d-0d92-42f5-b4d6-2488d1768bc8: Commercial General Liability Insurance
003c3e8f-222c-4983-8ef7-5ddca5e02e58: Devotion of Time — LPA/LLC
005801fc-495f-4599-9a84-d02360c442d9: Additional Insured Coverage
00a2715b-70f4-4be8-9942-ce9707a43548: Pricing
00abb5ca-f661-47c5-ba41-960934e53128: Independent Legal Advice
00b7bec7-da3b-4829-b28b-eef56fe1a477: Tax Representations
010e3724-9179-468e-ba7a-ed155399ec9d: Supplemental Indentures/Amendments without Holder Consent — Bond Indenture
01cc2460-bd20-472e-ae97-e388cbe90d5e: “Excess Availability” Definition
0213c822-ee09-4203-814e-5624922693f4: “Disqualified Stock” Definition
025e1173-8d5c-4b49-ad66-e762735efd12: Financial Statements and Information Reporting Covenant
0265c96d-b437-49ab-a271-898366f73c73: Subcontracting
0280c6c2-e1f6-4261-8681-72af13487cb6: Additional Conditions for Transfer
0292d814-6386-46ae-b881-27aee52be81f: Counterparty Specified Entity
0297ecf6-8358-4e13-ae3f-09fc96b77808: Non-Refundable Amounts
02ddb539-5f15-4bbd-9d0

In [5]:
 
languages, _ = sdk.language.create(file_ids = [d.id for d in docai_files])


classifications, _ = sdk.classification.create(file_ids = [d.id for d in docai_files])






In [6]:
requests = []

#picking the relevant fields to track 
field_names = ['Title', 'Parties', 'Date', 'Size/Purchase Price',
              'Governing Law', 'Vesting Schedule', "Number of Option Shares",
              'Termination for Cause or Breach', 'Termination for Insolvency',
              'Termination for Convenience', '“Confidential Information” Definition','Right of First Offer/Right of First Refusal', 'Arbitration', 'Number of Directors',"Quorum at Directors' Meetings", "Directors' Meetings (Broad)","Board/Manager Selection", "Shareholder Meeting — Notice","Quorum at Shareholders' Meetings", "Transfer of Shares", "Exclusivity/Non-Compete", "Tag Along Rights", "Drag Along Rights", "Ownership of Intellectual Property '('Broad')', 'Information Rights'"]

field_ids = [f.id for f in fields if f.name in field_names]

In [7]:

extractions, _ = sdk.extraction.create(file_ids = [d.id for d in docai_files], field_ids = field_ids)


In [8]:
requests.extend(classifications + languages + extractions)


In [9]:

# reviewing and conducting extractions for the table 
results = {}

while len(requests) > 0:
    for request in requests:
        print(request.type, request.id, request.status)
        request.update()
        if request.is_finished():
            requests.remove(request)
            if not request.is_successful():
                print(f'{request.id} failed.')
                continue

            # Creates the data structure for the file_id if it doesn't already exist
            if request.file_id not in results:
                results[request.file_id] = {}
                results[request.file_id]['name'] = [d.name for d in docai_files
                                                    if d.id == request.file_id][0]

            if request.is_type(DocumentClassificationRequest):
                results[request.file_id]['type'] = request.classification
                results[request.file_id]['is_contract'] = 'Yes' if request.is_contract else 'No'

            elif request.is_type(LanguageClassificationRequest):
                results[request.file_id]['language'] = request.language

            elif request.is_type(FieldExtractionRequest):
                results[request.file_id]['extractions'] = []

                for result in request.get_results():
                    field_name = [f.name for f in fields if f.id == result.field_id][0]

                    for span in result.spans:
                        results[request.file_id]['extractions'].append({
                            'field_name': field_name,
                            'page_start': span.page_start,
                            'page_end': span.page_end,
                            'text': result.text
                        })
    time.sleep(2)


DocumentClassificationRequest cn53ka2bqkqc73aviku0 queued
DocumentClassificationRequest cn53ka2bqkqc73avikug queued
DocumentClassificationRequest cn53ka2bqkqc73avikv0 queued
DocumentClassificationRequest cn53ka2bqkqc73avikvg queued
DocumentClassificationRequest cn53ka2bqkqc73avil00 queued
DocumentClassificationRequest cn53ka2bqkqc73avil0g queued
DocumentClassificationRequest cn53ka2bqkqc73avil10 queued
LanguageClassificationRequest cn53ka2bqkqc73avikpg queued
LanguageClassificationRequest cn53ka2bqkqc73avikq0 queued
LanguageClassificationRequest cn53ka2bqkqc73avikqg queued
LanguageClassificationRequest cn53ka2bqkqc73avikr0 queued
LanguageClassificationRequest cn53ka2bqkqc73avikrg queued
LanguageClassificationRequest cn53ka2bqkqc73aviks0 queued
LanguageClassificationRequest cn53ka2bqkqc73aviksg queued
FieldExtractionRequest cn53ka2bqkqc73btgbf0 queued
FieldExtractionRequest cn53ka2bqkqc73btgbfg queued
FieldExtractionRequest cn53ka2bqkqc73btgbg0 queued
FieldExtractionRequest cn53ka2bqkqc

FieldExtractionRequest cn53ka2bqkqc73btgbi0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbf0 processing
FieldExtractionRequest cn53ka2bqkqc73btgbfg processing
FieldExtractionRequest cn53ka2bqkqc73btgbgg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
FieldExtractionRequest cn53ka2bqkqc73btgbhg processing
FieldExtractionRequest cn53ka2bqkqc73btgbi0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbf0 processing
FieldExtractionRequest cn53ka2bqkqc73btgbfg processing
FieldExtractionRequest cn53ka2bqkqc73btgbgg processing
FieldExtractionRequest cn53ka2bqkqc73btgbhg processing
FieldExtractionRequest cn53ka2bqkqc73btgbi0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest 

FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00

DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc7

DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc7

DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc7

DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc7

DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc73avikrg processing
FieldExtractionRequest cn53ka2bqkqc73btgbh0 processing
DocumentClassificationRequest cn53ka2bqkqc73avil00 processing
LanguageClassificationRequest cn53ka2bqkqc7

In [10]:

df_columns = [
    'Filename',
    'Language',
    'Document Type',
    'Contract?',
    'Field Name',
    'Page',
    'Text'
]


In [11]:
data = []

for file_id, metadata in results.items():
    filename = metadata.get('name')
    language = metadata.get('language')
    document_type = metadata.get('type')
    is_contract = metadata.get('is_contract')
    for extraction in metadata.get('extractions'):
        data.append([filename, language, document_type, is_contract,
                     extraction.get('field_name'),
                     extraction.get('page'),
                     extraction.get('text')])


In [12]:

df = pd.DataFrame(data, columns=df_columns)

print(df.head(2))

                                            Filename Language  \
0  03 Abstract - Series Seed - Disclosure Schedul...  English   

            Document Type Contract?               Field Name  Page  \
0  Employment-Related Agt        No  Number of Option Shares  None   

               Text  
0  Number of Shares  


In [None]:
#export to excel
df.to_excel('output_Legal.xlsx')
