In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to project source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Setup for rms_api imports
import sys
from pathlib import Path

# Locate the enclosing 'docretrieval' directory and put it on sys.path so that
# setup_utils can be imported. The working directory is checked first, then
# each ancestor from nearest to farthest.
notebook_dir = Path().absolute()
rmsbql_dir = next(
    (candidate for candidate in (notebook_dir, *notebook_dir.parents)
     if candidate.name == 'docretrieval'),
    None,
)
if rmsbql_dir is None:
    # Fail loudly instead of walking upward forever: the parent of the
    # filesystem root is the root itself, so an unbounded walk never ends
    # when the notebook is started outside a 'docretrieval' tree.
    raise FileNotFoundError(
        f"No 'docretrieval' directory found at or above {notebook_dir}; "
        "start the notebook from inside the docretrieval folder."
    )
if str(rmsbql_dir) not in sys.path:
    sys.path.append(str(rmsbql_dir))

from setup_utils import setup_rms_api_imports, get_relative_notebook_path

# Set up imports and print diagnostic information
project_root = setup_rms_api_imports()
print(f"Project root: {project_root}")
print(f"Current notebook location: {get_relative_notebook_path()}")

Added /Users/lucasescobar/Desktop/ebg to Python path
Project root: /Users/lucasescobar/Desktop/ebg/rms_api
Current notebook location: docretrieval


In [3]:
# Import and initialize the document-retrieval client.
# environment='bbg' selects which configuration is loaded; the call prints
# "Loaded bbg configuration" and returns a DocumentRetriever instance.
from rms_api import initialize_docretrieval_client
doc_retriever = initialize_docretrieval_client(environment='bbg')

Loaded bbg configuration


In [4]:
# Display the client to confirm it was created (a DocumentRetriever instance).
doc_retriever

<rms_api.DocumentRetriever.DocumentRetriever at 0x7fdf45fa0cd0>

### Document Retrieval

In [5]:
# Specify Documents to Retrieve (NOTE IDs)
# `documents` is a list of NOTE IDs; the same IDs are used by the
# multi-document examples later in the notebook.
documents = [
    '6802245183462703105',
    '6808554980048633856',
    '6809856290712977408',
    '6811977394673418240',
]

# Single NOTE ID used by the step-by-step cells that follow.
# Swap in one of the commented alternatives to exercise a different case.
doc = '7424012126903599105' # has attachments
#doc = '7419329650482479105' # has html attachment
#doc = '7381120680816017409' # does not have attachments
#doc = '7434557272345804801'

#### Retrieve NOTE Meta Data

In [6]:
# Fetch the metadata for the selected NOTE. The result is indexed by field
# name (for example 'tags' and 'attachments').
metaDataSearchResults = doc_retriever.getDocMetaData(doc)

In [7]:
# List the metadata fields that were returned for the note.
metaDataSearchResults.keys()

dict_keys(['title', 'creator', 'lastUpdater', 'activityDate', 'eventDate', 'tags', 'attachments', 'sharing'])

In [8]:
# Inspect the note's tags: a list with one dict per tag (displayName, type,
# isPrimary, figi, ticker, ...).
metaDataSearchResults['tags']

[{'isDerived': False,
  'displayName': '45200Q US Equity',
  'type': 1,
  'isPrimary': False,
  'tagListName': None,
  'figi': 'BBG000HZ4DM6',
  'ticker': '45200Q',
  'secNumDes': '45200Q'},
 {'isDerived': False,
  'displayName': 'AEP US Equity',
  'type': 1,
  'isPrimary': False,
  'tagListName': None,
  'figi': 'BBG000BB9KF2',
  'ticker': 'AEP',
  'secNumDes': 'AEP'},
 {'isDerived': False,
  'displayName': 'AEP 6.95 12/15/54 Corp',
  'type': 1,
  'isPrimary': True,
  'tagListName': None,
  'figi': 'BBG01NC3JTR5',
  'ticker': 'AEP',
  'secNumDes': 'AEP V6.95 12/15/54'},
 {'isDerived': False,
  'displayName': 'AEP 4 ½ 08/01/32 Corp',
  'type': 1,
  'isPrimary': False,
  'tagListName': None,
  'figi': 'BBG01908FZS7',
  'ticker': 'AEP',
  'secNumDes': 'AEP 4.5 08/01/32 BB'},
 {'isDerived': False,
  'displayName': 'Meeting Note',
  'type': 80,
  'isPrimary': False,
  'tagListName': 'BUYSIDE CREDIT - Note Type',
  'figi': None,
  'ticker': None,
  'secNumDes': None},
 {'isDerived': False,


#### Retrieve NOTE Attachments

In [59]:
# Derive the attachment listing from the metadata fetched earlier. Each entry
# carries 'name', 'fileId' and 'extension'.
attachments = doc_retriever.getDocAttachments(metaDataSearchResults)

In [60]:
# Display the attachment listing to pick the file to download.
attachments

[{'name': 'BUYSIDE CREDIT - Structured Word Template.pdf',
  'fileId': '23508056_BUYSIDE CREDIT - Structured Word Template_670763B000016FA72C9E0002.pdf',
  'extension': '.pdf'},
 {'name': 'Note Summary', 'fileId': None, 'extension': '.html'},
 {'name': 'BUYSIDE CREDIT - Structured Word Template.docx',
  'fileId': '23508056_BUYSIDE CREDIT - Structured Word Template_670763B000016FA72C9E0001.docx',
  'extension': '.docx'}]

#### Retrieve a Specified Attachment Using File ID

In [41]:
# Download the first listed attachment by its file ID.
# NOTE: an entry's 'fileId' can be None (see the 'Note Summary' entry in the
# listing), so pick an index whose fileId is set.
fileId = attachments[0]['fileId']
noteAttachmentSearchResults = doc_retriever.getDocument(doc, fileId)

In [42]:
# Display the response object; its repr shows the status code of the request.
noteAttachmentSearchResults

<Response [200]>

#### Save Retrieved File to Specified Path

In [31]:
# Save the downloaded attachment under documents/. The file ID doubles as the
# file name (it should match the attachmentId in the request URL).
path = 'documents/' + fileId
doc_retriever.saveFile(noteAttachmentSearchResults, path)

#### Combine Each Step in the Document Attachment Retrieval & Storage Process into a Single Method

In [32]:
# One-call version of the steps above for a single NOTE: look up its
# attachments, keep only those whose extension is listed in file_types, then
# retrieve and save each one (presumably under base_path — confirm in
# DocumentRetriever.transplantDocAttachments).
doc = '7424012126903599105'
base_path = 'documents/'
saved_attachments = doc_retriever.transplantDocAttachments(doc, base_path, file_types=['.pdf'])

3 attachments were retrieved from document 7424012126903599105
1 attachments met retrieval criteria

Retrieving 23508056_BUYSIDE CREDIT - Structured Word Template_670763B000016FA72C9E0002.pdf...
Saving 23508056_BUYSIDE CREDIT - Structured Word Template_670763B000016FA72C9E0002.pdf...

1 document attachment saved


#### Retrieve Attachments for Multiple Documents

In [85]:
# Specify Documents to Retrieve (NOTE IDs)
# Inputs for the multi-document examples below.
documents = [
    '6802245183462703105',
    '6808554980048633856',
    '6809856290712977408',
    '6811977394673418240',
]
# Path passed to the batch call as base_path.
base_path = 'documents/'

##### Using Threading (Synchronous)

In [92]:
# Process every NOTE in `documents`, keeping only '.pdf' attachments.
# The result maps each document ID to a DocumentProcessResult; progress lines
# from different documents may interleave in the log output.
doc_retrieval_results = doc_retriever.batch_process_documents(
    docs=documents,
    base_path=base_path,
    file_types=['.pdf'],
    max_workers=4  # Adjust based on your needs
)

Starting batch processing of 4 documents...
3 attachments were retrieved from document 6809856290712977408
1 attachments met retrieval criteria

Retrieving 13806294_ERM Scorecard - ERS CDEMapped_5E8178410001444E16DC0001.pdf...
3 attachments were retrieved from document 6808554980048633856
1 attachments met retrieval criteria

Retrieving 13806294_T.Rowe Scorecard_228_CDEMAP_5E8147550001444E16DC0002.pdf...
3 attachments were retrieved from document 6811977394673418240
1 attachments met retrieval criteria

Retrieving 13806294_Dummy Upload_5E8C98950001941F16DA0002.pdf...
3 attachments were retrieved from document 6802245183462703105
1 attachments met retrieval criteria

Retrieving 12382079_ERM Scorecard - TRS_5E666E010001FE4B16DB0002.pdf...
Saving 13806294_T.Rowe Scorecard_228_CDEMAP_5E8147550001444E16DC0002.pdf...

1 document attachment saved
Progress: 1/4 documents processed
Successfully processed document 6808554980048633856 (1 attachments saved)
Saving 13806294_Dummy Upload_5E8C9895000

In [87]:
# Generate and print a report
# The report is text: summary counts followed by the per-document results.
report = doc_retriever.generate_batch_report(doc_retrieval_results)
print(report)

Batch Processing Report

Summary Statistics:
- Total documents processed: 4
- Successfully processed: 4
- Failed: 0

Detailed Results:
---------------

Successfully Processed Documents:
- 6802245183462703105 (1 attachments)
- 6811977394673418240 (1 attachments)
- 6809856290712977408 (1 attachments)
- 6808554980048633856 (1 attachments)


In [89]:
# Per-document outcome: report failures first, otherwise the number of
# attachments that were saved.
for doc_id, result in doc_retrieval_results.items():
    if not result.success:
        print(f"Document {doc_id} failed: {result.error_message}")
        continue
    print(f"Document {doc_id} saved {len(result.saved_attachments)} attachments")

Document 6802245183462703105 saved 1 attachments
Document 6811977394673418240 saved 1 attachments
Document 6809856290712977408 saved 1 attachments
Document 6808554980048633856 saved 1 attachments


In [90]:
# Display the raw mapping of document ID -> DocumentProcessResult
# (doc_id, success, saved_attachments, error_message).
doc_retrieval_results

{'6802245183462703105': DocumentProcessResult(doc_id='6802245183462703105', success=True, saved_attachments=['12382079_ERM Scorecard - TRS_5E666E010001FE4B16DB0002.pdf'], error_message=None),
 '6811977394673418240': DocumentProcessResult(doc_id='6811977394673418240', success=True, saved_attachments=['13806294_Dummy Upload_5E8C98950001941F16DA0002.pdf'], error_message=None),
 '6809856290712977408': DocumentProcessResult(doc_id='6809856290712977408', success=True, saved_attachments=['13806294_ERM Scorecard - ERS CDEMapped_5E8178410001444E16DC0001.pdf'], error_message=None),
 '6808554980048633856': DocumentProcessResult(doc_id='6808554980048633856', success=True, saved_attachments=['13806294_T.Rowe Scorecard_228_CDEMAP_5E8147550001444E16DC0002.pdf'], error_message=None)}

##### Using async/await (Asynchronous)

In [99]:
import asyncio
import nest_asyncio

In [100]:
# Jupyter executes cells inside an event loop that is already running.
# nest_asyncio patches asyncio so that loop can be re-entered, which lets
# code that calls asyncio.run() / loop.run_until_complete() work from a cell.
nest_asyncio.apply()  # This allows for nested event loops

In [101]:
# async def main():
#     retriever = DocumentRetriever.DocumentRetriever('config', environment='bbg')
#     document_ids = [
#         '6802245183462703105',
#         '6808554980048633856',
#         '6809856290712977408',
#         '6811977394673418240',
#     ]

#     results = await retriever.async_batch_process_documents(
#         docs=document_ids,
#         base_path='documents/',
#         file_types=['.pdf'],
#         max_concurrent=4
#     )
    
#     # Generate report
#     report = retriever.generate_batch_report(results)
#     print(report)

# # Run the async code
# asyncio.run(main())

In [7]:
async def main(document_ids=None, base_path='documents/', file_types=None, max_concurrent=4):
    """Retrieve attachments for several NOTEs concurrently and print a report.

    Builds a fresh DocumentRetriever for the 'bbg' environment, awaits its
    async batch routine, then prints the text produced by
    generate_batch_report().

    Args:
        document_ids: NOTE IDs to process. Defaults to the four sample IDs
            used throughout this notebook.
        base_path: Forwarded to async_batch_process_documents as base_path.
        file_types: Attachment extensions to keep. Defaults to ['.pdf'].
        max_concurrent: Forwarded to async_batch_process_documents as
            max_concurrent.

    Returns:
        None. The report is printed rather than returned.
    """
    # Imported here so the cell also works on a fresh kernel: no earlier cell
    # binds the name `DocumentRetriever`.
    from rms_api.DocumentRetriever import DocumentRetriever as DocumentRetrieverClass

    if document_ids is None:
        document_ids = [
            '6802245183462703105',
            '6808554980048633856',
            '6809856290712977408',
            '6811977394673418240',
        ]
    if file_types is None:
        file_types = ['.pdf']

    retriever = DocumentRetrieverClass('config', environment='bbg')

    results = await retriever.async_batch_process_documents(
        docs=document_ids,
        base_path=base_path,
        file_types=file_types,
        max_concurrent=max_concurrent
    )

    # Generate report
    report = retriever.generate_batch_report(results)
    print(report)

In [8]:
# Jupyter supports top-level `await`, so the coroutine is awaited on the
# notebook's own event loop.
await main()  # Note: we use 'await' directly instead of asyncio.run()

Loaded bbg configuration
Starting async batch processing of 4 documents...
Progress: 1/4 documents processed
Successfully processed document 6802245183462703105
Progress: 2/4 documents processed
Successfully processed document 6811977394673418240
Progress: 3/4 documents processed
Successfully processed document 6808554980048633856
Progress: 4/4 documents processed
Successfully processed document 6809856290712977408

Async batch processing complete:
- Total documents: 4
- Successfully processed: 4
- Failed: 0
Batch Processing Report

Summary Statistics:
- Total documents processed: 4
- Successfully processed: 4
- Failed: 0

Detailed Results:
---------------

Successfully Processed Documents:
- 6802245183462703105 (1 attachments)
- 6811977394673418240 (1 attachments)
- 6808554980048633856 (1 attachments)
- 6809856290712977408 (1 attachments)
