In [1]:
# script to download documents from Nexis Uni using the Web Services Kit API for bulk downloads
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from requests.auth import HTTPBasicAuth
import os
import time

In [2]:
# be sure to set up the client_id and client_secret as environment variables in your shell
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
scope = 'http://oauth.lexisnexis.com/all'
grant_type = 'client_credentials'
content_type = 'application/x-www-form-urlencoded'

In [3]:
# for details on authentication, see the docs at https://www.lexisnexis.com/lextalk/developers/ln-webservice-api/p/apidocs.aspx#introduction, section "Service to Service ID"
# also see https://requests-oauthlib.readthedocs.io/en/latest/oauth2_workflow.html#backend-application-flow 
auth = HTTPBasicAuth(client_id, client_secret)

client = BackendApplicationClient(client_id=client_id, scope=scope, grant_type=grant_type, token_type='Bearer', client_secret=client_secret, content_type=content_type)

oauth = OAuth2Session(client=client)

token = oauth.fetch_token(token_url='https://auth-api.lexisnexis.com/oauth/v2/token', auth=auth)

In [4]:
# get the token
token

{'access_token': 'NzYzZjk3MDQtMjM3OS00ZmMzLTk1MjktZmMxOWZhMTI4Yjcx',
 'token_type': 'Bearer',
 'expires_in': 86400,
 'expires_at': 1726345775.546706}

## Searching for documents

In [5]:
# set up a request to the BatchNews endpoint; should only be used anytime you are returning > 450 documents. For smaller queries, use the 'News' endpoint
base_url = 'https://services-api.lexisnexis.com/v1/'
search_endpoint = "BatchNews"
# Documentation says you can add these search fields: agg-copyright, body, byline, cite, company, headline, hlead, length, publication, publication-type, pub-copyright, section, show, term, ticker, title, jurisinfo, load-date, xce-date, subject, industry, geography, document-type, country
search_string = "biden w/10 (poll* OR survey) and publication('The Associated Press')"
filter = "Date ge 2024-01-01" # comparison operators: gt, ge, lt, le
results_per_page = 50 # max is 50 for BatchNews
select = "ResultId,Title,Date,Source"

# Rate limits and downloading full text
Rate limits for API calls are expressed as calls per 10s/1m/1h/1d. For getting the full text, you have two options. If using the `$expand=Document` parameter, your search results will return the full text of the document. You can make 1,000 of these calls a day (2/5/200/1000) and using the max of 50 documents per search, you can get 50k documents a day. The other method is to search and retrieve just metadata. You can do 12,000 of these searches a day (60/125/1500/12000). From these searches you can get the `ResultId` and individually download 24k documents a day (25/50/1500/24000). All three of these throttling buckets are separate. 

## Getting the full text of the documents with your search results

In [6]:
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page, '$select': select, '$expand': 'Document'}

In [7]:
# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url+search_endpoint, params=url_params, timeout=100)

In [34]:
# see the results. in 'r.json()' the 'value' key contains the search results. For 'value' in r.json(), you will see the 'ResultId', 'Title', 'Date', 'Source', 'Document' fields. The 'Document' field contains the full text of the document. For this notebook, I'm limiting the output of the 'Document' field to the first 1000 characters
for key, value in r.json().items():
    if key == 'value':
        for doc in value[:10]:
            print(f"ResultId: {doc['ResultId']}, Title: {doc['Title']}, Date: {doc['Date']}, Source: {doc['Source']}, 'Content': {doc['Document']['Content'][:1000]}")
    else:
        print(f"Key: {key}, Value: {value}" )


Key: @odata.context, Value: https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Title,Date,Source,Document)
Key: @odata.count, Value: 317
ResultId: urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00, Title: Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out, Date: 2024-07-17T00:00:00Z, Source: {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}, 'Content': <entry xmlns="http://www.w3.org/2005/Atom"><id>urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00</id><title>Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out</title><published>2024-07-17T00:00:00Z</published><updated>2024-09-13T20:30:07Z</updated><author><name>LexisNexis</name></author><content type="application/xml"><!--Transformation version 1.25--><articleDoc xmlns="" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.lexisnexis.com/xmlschemas/content/public/articledoc/1/" schemaVersion="1.8" xml:lang="en"><arti

In [16]:
# save the results to a new list
results = r.json()['value']

In [17]:
# iterate through the pages of results and add the results to the 'results' list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results

while '@odata.nextLink' in r.json().keys(): #and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    results.extend(r.json()['value'])
    #time.sleep(2)

In [18]:
# get the headers to check on your rate limits. X-RateLimit-Limit is for 10 second/1 minute/1 hour/1 day thresholds
r.headers

{'Date': 'Fri, 13 Sep 2024 18:14:52 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '2/5/200/1000', 'X-RateLimit-Remaining': '1/0/193/993', 'X-RateLimit-Reset': '1726251299/1726251316/1726254856/1726337656', 'X-RateLimit-Limit-Time-In-Queue': '3.0227233', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726250852643951', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}

#### Save the full text of the documents

In [91]:
# iterate through the results, get the 'Content' field, and save as an XML file with the document ID as the filename
for result in results:
    doc_id = result['ResultId']
    doc = result['Document']['Content']
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc)

# Downloading just metadata and/or individual documents

In [6]:
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page, '$select': select}

# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url + search_endpoint, params=url_params, timeout=20)

In [7]:
# see the results
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Title,Date,Source)',
 '@odata.count': 317,
 'value': [{'ResultId': 'urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00',
   'Title': 'Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out',
   'Date': '2024-07-17T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00',
   'Title': "What polling shows about Americans' views of Robert F. Kennedy Jr.",
   'Date': '2024-08-23T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CH3-6RR1-JC5B-G3JW-00000-00',
   'Title': 'Nearly two-thirds of Democrats want Biden to withdraw, new AP-NORC poll finds',
   'Date': '2024-07-17T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00',
 

In [8]:
# get the document IDs
doc_ids = [doc['ResultId'] for doc in r.json()['value']]

In [9]:
# iterate through the pages of results and add the doc_ids to the list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results

#counter = 0
#pages = 5  # number of pages to fetch

while '@odata.nextLink' in r.json().keys(): # and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    doc_ids.extend([doc['ResultId'] for doc in r.json()['value']])
    #counter += 1  # Increment the counter while '@odata.nextLink' in r.json().keys():
    #time.sleep(2)

In [10]:
r.headers

{'Date': 'Fri, 13 Sep 2024 18:09:57 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Content-Length': '4944', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '54/119/1490/11988', 'X-RateLimit-Reset': '1726251006/1726251056/1726253017/1726331906', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726250852643951', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}

#### Download and save the full text of the documents

In [11]:
# iterate over the doc_ids and fetch the documents and save them to a file. This will take a while, depending on the number of documents
counter = 1
for doc_id in doc_ids[:10]: # limit to 10 documents for testing purposes
    docs_url = f"{base_url}Documents(DocumentId='/shared/document/news/{doc_id}',DocumentIdType='DocFullPath')/$value"
    doc = oauth.get(docs_url)
    print(f"Fetching document {counter}: {doc_id}")
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc.text)
    counter += 1


Fetching document 1: urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00
Fetching document 2: urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00
Fetching document 3: urn:contentItem:6CH3-6RR1-JC5B-G3JW-00000-00
Fetching document 4: urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00
Fetching document 5: urn:contentItem:6CKD-9N21-JC5B-G1NH-00000-00
Fetching document 6: urn:contentItem:6C7D-8681-DYMD-61FS-00000-00
Fetching document 7: urn:contentItem:6C50-W0V1-JC5B-G3MG-00000-00
Fetching document 8: urn:contentItem:6CD3-W6R1-JC5B-G0DX-00000-00
Fetching document 9: urn:contentItem:6B8R-K1R1-JC5B-G0TC-00000-00
Fetching document 10: urn:contentItem:6CD7-5PH1-JC5B-G180-00000-00


In [12]:
doc.headers

{'Date': 'Fri, 13 Sep 2024 18:10:49 GMT', 'Content-Type': 'application/atom+xml;charset=utf-8', 'Content-Length': '18173', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '25/50/1500/24000', 'X-RateLimit-Remaining': '15/40/1490/23987', 'X-RateLimit-Reset': '1726251053/1726251103/1726254643/1726251527', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726250852643951', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}