In [1]:
# script to download documents from Nexis Uni API in bulk
# questions? Jeremy Darrington, jdarring@princeton.edu
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from requests.auth import HTTPBasicAuth
import os
import time

In [3]:
# be sure to set up the client_id and client_secret as environment variables in your shell
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
scope = 'http://oauth.lexisnexis.com/all'
grant_type = 'client_credentials'
content_type = 'application/x-www-form-urlencoded'

In [4]:
# for details on authentication, see the docs at https://www.lexisnexis.com/lextalk/developers/ln-webservice-api/p/apidocs.aspx#introduction, section "Service to Service ID"
# also see https://requests-oauthlib.readthedocs.io/en/latest/oauth2_workflow.html#backend-application-flow 
auth = HTTPBasicAuth(client_id, client_secret)

client = BackendApplicationClient(client_id=client_id, scope=scope, grant_type=grant_type, token_type='Bearer', client_secret=client_secret, content_type=content_type)

oauth = OAuth2Session(client=client)

token = oauth.fetch_token(token_url='https://auth-api.lexisnexis.com/oauth/v2/token', auth=auth)

In [5]:
# get the token
token

{'access_token': 'ZjEwNGE4ODQtNzExYi00NjRlLWFmZWItMDEwYmU1ZDM4YjMw',
 'token_type': 'Bearer',
 'expires_in': 86400,
 'expires_at': 1718826294.492864}

In [6]:
# set up a request to the BatchNews endpoint; should only be used when returning > 500 documents 
base_url = 'https://services-api.lexisnexis.com/v1/'
search_endpoint = 'BatchNews'
search_string = 'biden w/5 (poll* OR survey)'
filter = 'Date ge 2024-01-01' # comparison operators: gt, ge, lt, le
results_per_page = 50 # max is 50 for BatchNews
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page}

In [7]:
# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url+search_endpoint, params=url_params, timeout=10)

In [8]:
# see the results
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Source)',
 '@odata.count': 19487,
 'value': [{'ResultId': 'urn:contentItem:6C8V-YBT1-DYRS-T3TN-00000-00',
   'Source': {'Id': '', 'Name': 'Pittsburgh Post-Gazette', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6BTX-0RJ1-JBR6-9181-00000-00',
   'Source': {'Id': '', 'Name': 'Newsweek.com', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C70-FVD1-F03R-N377-00000-00',
   'Source': {'Id': '', 'Name': 'Axios', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C6J-27S1-DY68-12HW-00000-00',
   'Source': {'Id': '', 'Name': 'Newsweek.com', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C6J-60W1-F03R-N1MS-00000-00',
   'Source': {'Id': '', 'Name': 'Mediaite', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C8P-D511-F03F-K4JF-00000-00',
   'Source': {'Id': '', 'Name': 'TheHill.com', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6C5W-G7F1-DY68-10KV-00000-00',
   'Source': {'

In [9]:
# get the document IDs
doc_ids = [doc['ResultId'] for doc in r.json()['value']]

In [10]:
# iterate through the pages of results and add the doc_ids to the list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results
# 
counter = 0
pages = 5  # number of pages to fetch

while '@odata.nextLink' in r.json().keys() and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    doc_ids.extend([doc['ResultId'] for doc in r.json()['value']])
    counter += 1  # Increment the counter while '@odata.nextLink' in r.json().keys():
    #time.sleep(2)

In [12]:
# iterate over the doc_ids and fetch the documents and save them to a file. This will take a while, depending on the number of documents
counter = 1
for doc_id in doc_ids[:100]:
    docs_url = f"{base_url}Documents(DocumentId='{doc_id}',DocumentIdType='PGuid')/$value"
    doc = oauth.get(docs_url)
    print(f"Fetching document {counter}: {doc_id}")
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc.text)
    counter += 1


In [13]:
# get the headers to check on your rate limits
r.headers

{'Cache-Control': 'no-cache', 'Content-Type': 'application/json; odata.metadata=minimal', 'Date': 'Tue, 18 Jun 2024 19:46:29 GMT', 'Expires': '-1', 'OData-Version': '4.0', 'Pragma': 'no-cache', 'Server': 'auth_api.lexisnexis.com  3000', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'X-AspNet-Version': '4.0.30319', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'DENY', 'X-Powered-By': 'ASP.NET', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '55/120/1494/11994', 'X-RateLimit-Reset': '1718739996/1718740046/1718743514/1718826314', 'X-XSS-Protection': '1; mode=block', 'Content-Length': '8036', 'Connection': 'keep-alive', 'X-RE-Ref': '1 1718739914788919', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}