In [1]:
# script to download documents from Nexis Uni API in bulk
# questions? Jeremy Darrington, jdarring@princeton.edu
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from requests.auth import HTTPBasicAuth
import os
import time

In [2]:
# Credentials come from the environment so they are never hardcoded in the notebook:
# export CLIENT_ID and CLIENT_SECRET in your shell before starting Jupyter.
client_id = os.environ.get('CLIENT_ID')
client_secret = os.environ.get('CLIENT_SECRET')

# Fixed values used when requesting the OAuth2 token.
scope = 'http://oauth.lexisnexis.com/all'
grant_type = 'client_credentials'
content_type = 'application/x-www-form-urlencoded'

In [3]:
# Authenticate with the OAuth2 client-credentials ("backend application") flow.
# LexisNexis docs: https://www.lexisnexis.com/lextalk/developers/ln-webservice-api/p/apidocs.aspx#introduction, section "Service to Service ID"
# requests-oauthlib docs: https://requests-oauthlib.readthedocs.io/en/latest/oauth2_workflow.html#backend-application-flow
client = BackendApplicationClient(
    client_id=client_id,
    client_secret=client_secret,
    scope=scope,
    grant_type=grant_type,
    content_type=content_type,
    token_type='Bearer',
)
oauth = OAuth2Session(client=client)

# the client id and secret are passed as HTTP Basic credentials on the token request
auth = HTTPBasicAuth(client_id, client_secret)
token = oauth.fetch_token(
    token_url='https://auth-api.lexisnexis.com/oauth/v2/token',
    auth=auth,
)

In [4]:
# inspect the token response; the bearer credential is masked so it is not stored in the saved notebook output
{**token, 'access_token': '<redacted>'}

{'access_token': '<redacted>',
 'token_type': 'Bearer',
 'expires_in': 86400,
 'expires_at': 1723218042.117908}

In [24]:
# set up a request to the BatchNews endpoint; should only be used when returning > 500 documents
base_url = 'https://services-api.lexisnexis.com/v1/'
search_endpoint = "BatchNews"
# Per the documentation, these search fields can be used in the search string: agg-copyright, body, byline, cite, company, headline, hlead, length, publication, publication-type, pub-copyright, section, show, term, ticker, title, jurisinfo, load-date, xce-date, subject, industry, geography, document-type, country
search_string = "biden w/5 (poll* OR survey) and publication('New York Times')"
# named date_filter (not filter) so the Python built-in filter() stays usable in later cells
date_filter = "Date ge 2024-01-01"  # comparison operators: gt, ge, lt, le
results_per_page = 50  # max is 50 for BatchNews
url_params = {'$search': search_string, '$filter': date_filter, '$top': results_per_page}

In [25]:
# calling 'get' on the oauth session passes the token in the request headers for you
r = oauth.get(base_url + search_endpoint, params=url_params, timeout=10)
# fail here on an HTTP error status instead of parsing an error body as search results in later cells
r.raise_for_status()

In [26]:
# see the parsed JSON results: the 'value' list holds one entry per hit on this page (each with a 'ResultId'
# and a 'Source'); '@odata.count' is presumably the total number of matching documents — confirm in the API docs
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Source)',
 '@odata.count': 761,
 'value': [{'ResultId': 'urn:contentItem:6CGT-XPF1-JBG3-6033-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CHY-TR91-JBG3-60D6-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CG0-BWY1-JBG3-6021-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C17-2CY1-DXY4-X07D-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CGN-B9B1-JBG3-60SJ-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CG5-YTV1-JBG3-60P9-00000-00',
   'Source': {'Id': '', 'Name': 'The New York Times', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CCP-B0M1-D

In [28]:
# collect the 'ResultId' of every hit on the first page of results
doc_ids = list(map(lambda hit: hit['ResultId'], r.json()['value']))

In [29]:
# Follow the '@odata.nextLink' links to page through the results, adding each page's doc_ids to the list.
# Because of rate limits, you may need to pause 2-3 seconds between requests, depending on the number of results.
pages = 5  # number of additional pages to fetch
pause_seconds = 0  # raise to 2-3 if you run into rate limits
counter = 0  # additional pages fetched so far

page = r.json()  # parse each response once rather than on every access
while '@odata.nextLink' in page and counter < pages:
    # `r` is reassigned so that it always holds the most recent response
    r = oauth.get(page['@odata.nextLink'], timeout=10)  # same timeout as the initial search request
    r.raise_for_status()  # stop on an HTTP error instead of failing on a missing 'value' key
    page = r.json()
    doc_ids.extend(doc['ResultId'] for doc in page['value'])
    counter += 1
    if pause_seconds:
        time.sleep(pause_seconds)

In [30]:
# Iterate over the doc_ids, fetch each document and save it as an XML file in the current working directory.
# This will take a while, depending on the number of documents.
max_docs = 100  # cap on the number of documents downloaded in one run
for counter, doc_id in enumerate(doc_ids[:max_docs], start=1):
    print(f"Fetching document {counter}: {doc_id}")
    docs_url = f"{base_url}Documents(DocumentId='{doc_id}',DocumentIdType='PGuid')/$value"
    # a timeout keeps one stalled request from hanging the whole loop
    doc = oauth.get(docs_url, timeout=30)
    # stop on an HTTP error so an error response is never saved as if it were a document
    doc.raise_for_status()
    # doc_ids look like 'urn:contentItem:...'; ':' is not allowed in Windows file names
    file_name = doc_id.replace(':', '_') + '.xml'
    # write the response bytes as-is rather than re-encoding the text with the platform's default encoding
    with open(file_name, 'wb') as f:
        f.write(doc.content)


Fetching document 1: urn:contentItem:6C65-FKM1-DXY4-X003-00000-00
Fetching document 2: urn:contentItem:6B7M-WSG1-JBG3-60M4-00000-00
Fetching document 3: urn:contentItem:6C2G-3HR1-DXY4-X0DB-00000-00
Fetching document 4: urn:contentItem:6CBV-CNS1-JC85-N00R-00000-00
Fetching document 5: urn:contentItem:6C1K-P3S1-DXY4-X0P1-00000-00
Fetching document 6: urn:contentItem:6BDS-HFJ1-JBG3-61K5-00000-00
Fetching document 7: urn:contentItem:6CJY-YGF1-JBG3-616G-00000-00
Fetching document 8: urn:contentItem:6BDV-P421-DXY4-X076-00000-00
Fetching document 9: urn:contentItem:6C3H-SXX1-JBG3-60KH-00000-00
Fetching document 10: urn:contentItem:6CD4-1BC1-DXY4-X01K-00000-00
Fetching document 11: urn:contentItem:6CJB-1WY1-DXY4-X476-00000-00
Fetching document 12: urn:contentItem:6CD8-72F1-JBG3-6041-00000-00
Fetching document 13: urn:contentItem:6B83-NYR1-DXY4-X043-00000-00
Fetching document 14: urn:contentItem:6BYY-W831-DYR7-C00G-00000-00
Fetching document 15: urn:contentItem:6C6R-0F51-DXY4-X1TS-00000-00


KeyboardInterrupt: 

In [31]:
# get the headers of the most recent response stored in `r` to check on your rate limits.
# X-RateLimit-Limit is for 10 second/1 minute/1 hour/1 day thresholds; X-RateLimit-Remaining and X-RateLimit-Reset
# are slash-separated as well (presumably in the same order — confirm against the API docs)
r.headers

{'Date': 'Thu, 08 Aug 2024 15:57:24 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Content-Length': '8383', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '54/118/1487/11987', 'X-RateLimit-Reset': '1723132648/1723132683/1723135253/1723218053', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1723131652968472', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}