In [1]:
# script to download documents from Nexis Uni API in bulk
# questions? Jeremy Darrington, jdarring@princeton.edu
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from requests.auth import HTTPBasicAuth
import os
import time

In [2]:
# be sure to set up the client_id and client_secret as environment variables in your shell
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
scope = 'http://oauth.lexisnexis.com/all'
grant_type = 'client_credentials'
content_type = 'application/x-www-form-urlencoded'

In [3]:
# for details on authentication, see the docs at https://www.lexisnexis.com/lextalk/developers/ln-webservice-api/p/apidocs.aspx#introduction, section "Service to Service ID"
# also see https://requests-oauthlib.readthedocs.io/en/latest/oauth2_workflow.html#backend-application-flow 
auth = HTTPBasicAuth(client_id, client_secret)

client = BackendApplicationClient(client_id=client_id, scope=scope, grant_type=grant_type, token_type='Bearer', client_secret=client_secret, content_type=content_type)

oauth = OAuth2Session(client=client)

token = oauth.fetch_token(token_url='https://auth-api.lexisnexis.com/oauth/v2/token', auth=auth)

In [4]:
# get the token
token

{'access_token': 'NjVkZjk0NTktMDJhOS00NTAyLWJjYTAtOTI1ZDg2MGE1ZWRl',
 'token_type': 'Bearer',
 'expires_in': 86400,
 'expires_at': 1725469525.705436}

In [28]:
# set up a request to the BatchNews endpoint; should only be used when returning > 500 documents 
base_url = 'https://services-api.lexisnexis.com/v1/'
search_endpoint = "BatchNews"
# Documentation says you can add these search fields: agg-copyright, body, byline, cite, company, headline, hlead, length, publication, publication-type, pub-copyright, section, show, term, ticker, title, jurisinfo, load-date, xce-date, subject, industry, geography, document-type, country
search_string = "biden w/5 (poll* OR survey) and publication('The Associated Press')"
filter = "Date ge 2024-01-01" # comparison operators: gt, ge, lt, le
results_per_page = 50 # max is 50 for BatchNews
select = "ResultId,Title,Date,Source"
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page, '$select': select}

In [29]:
# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url+search_endpoint, params=url_params, timeout=10)

In [30]:
# see the results
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Title,Date,Source)',
 '@odata.count': 146,
 'value': [{'ResultId': 'urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00',
   'Title': "What polling shows about Americans' views of Robert F. Kennedy Jr.",
   'Date': '2024-08-23T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00',
   'Title': 'Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out',
   'Date': '2024-07-17T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00',
   'Title': "What polling shows about Americans' views of Robert F. Kennedy Jr.",
   'Date': '2024-08-22T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CKD-9N21-JC5B-G1NH-00000-00',
   'Title': 

In [31]:
# get the document IDs
doc_ids = [doc['ResultId'] for doc in r.json()['value']]

In [32]:
# iterate through the pages of results and add the doc_ids to the list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results
# 
counter = 0
pages = 5  # number of pages to fetch

while '@odata.nextLink' in r.json().keys() and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    doc_ids.extend([doc['ResultId'] for doc in r.json()['value']])
    counter += 1  # Increment the counter while '@odata.nextLink' in r.json().keys():
    #time.sleep(2)

In [33]:
# iterate over the doc_ids and fetch the documents and save them to a file. This will take a while, depending on the number of documents
counter = 1
for doc_id in doc_ids[:10]:
    docs_url = f"{base_url}Documents(DocumentId='{doc_id}',DocumentIdType='PGuid')/$value"
    doc = oauth.get(docs_url)
    print(f"Fetching document {counter}: {doc_id}")
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc.text)
    counter += 1


Fetching document 1: urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00
Fetching document 2: urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00
Fetching document 3: urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00
Fetching document 4: urn:contentItem:6CKD-9N21-JC5B-G1NH-00000-00
Fetching document 5: urn:contentItem:6C50-W0V1-JC5B-G3MG-00000-00
Fetching document 6: urn:contentItem:6CH3-6RR1-JC5B-G3JW-00000-00
Fetching document 7: urn:contentItem:6CM1-0SF1-DYMD-64K7-00000-00
Fetching document 8: urn:contentItem:6CD3-W6R1-JC5B-G0DX-00000-00
Fetching document 9: urn:contentItem:6CD7-5PH1-JC5B-G180-00000-00
Fetching document 10: urn:contentItem:6C7D-8681-DYMD-61FS-00000-00


In [34]:
# get the headers to check on your rate limits. X-RateLimit-Limit is for 10 second/1 minute/1 hour/1 day thresholds
r.headers

{'Date': 'Tue, 03 Sep 2024 17:19:24 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Content-Length': '12595', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '58/123/1490/11990', 'X-RateLimit-Reset': '1725383974/1725384024/1725386820/1725469620', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1725383316767301', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}