In [1]:
# script to download documents from Nexis Uni using the Web Services Kit API for bulk downloads
from oauthlib.oauth2 import BackendApplicationClient
from requests_oauthlib import OAuth2Session
from requests.auth import HTTPBasicAuth
import os
import time

In [2]:
# be sure to set up the client_id and client_secret as environment variables in your shell
client_id = os.getenv('CLIENT_ID')
client_secret = os.getenv('CLIENT_SECRET')
scope = 'http://oauth.lexisnexis.com/all'
grant_type = 'client_credentials'
content_type = 'application/x-www-form-urlencoded'

In [3]:
# for details on authentication, see the docs at https://www.lexisnexis.com/lextalk/developers/ln-webservice-api/p/apidocs.aspx#introduction, section "Service to Service ID"
# also see https://requests-oauthlib.readthedocs.io/en/latest/oauth2_workflow.html#backend-application-flow 
auth = HTTPBasicAuth(client_id, client_secret)

client = BackendApplicationClient(client_id=client_id, scope=scope, grant_type=grant_type, token_type='Bearer', client_secret=client_secret, content_type=content_type)

oauth = OAuth2Session(client=client)

token = oauth.fetch_token(token_url='https://auth-api.lexisnexis.com/oauth/v2/token', auth=auth)

In [44]:
# get the token
token

{'access_token': 'ODQ5ZTA2MDktNmFiMi00ZWM4LWIyNmQtYWIzNmNjYjVkZjRk',
 'token_type': 'Bearer',
 'expires_in': 86400,
 'expires_at': 1726607392.788926}

In [43]:
# get the base URL for the API
base_url = 'https://services-api.lexisnexis.com/v1/'

# Rate limits and downloading full text
Rate limits for API calls are expressed as calls per 10s/1m/1h/1d. For getting the full text, you have two options. If using the `$expand=Document` parameter, your search results will return the full text of the document. You can make 1,000 of these calls a day (2/5/200/1000) and using the max of 50 documents per search, you can get 50k documents a day. The other method is to search and retrieve just metadata. You can do 12,000 of these searches a day (60/125/1500/12000). From these searches you can get the `ResultId` and individually download 24k documents a day (25/50/1500/24000). All three of these throttling buckets are separate. 

## Searching for sources

In [91]:
# set up a request to the Sources endpoint
sources_endpoint = "Sources"
# define your search string
search_string = "Associated Press"
# the documentation indicates that there are filterable fields, but seems like all news sources say Geography is "US" even when they're international. Jurisdiction is a better filter. If you don't need a filter, you can leave this blank
filter = "Jurisdiction eq 'U.S. Federal'"

In [92]:
url_params = {'$search': search_string, '$filter': filter}

In [93]:
r = oauth.get(base_url+sources_endpoint, params=url_params, timeout=100)

In [94]:
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#Sources',
 '@odata.count': 11,
 'value': [{'Id': 'MTA2MjYxNg',
   'Geography': 'US',
   'Name': 'Associated Press News Briefs',
   'ContentType': 'news',
   'Jurisdiction': 'U.S. Federal; International',
   'Publisher': 'The Associated Press',
   'PublishFrequency': 'Daily/Monday - Sunday',
   'Coverage': 'Most recent two weeks',
   'UpdateSchedule': 'Same day',
   'Description': 'Compiled by the Associated Press and extracted from the AP ONLINE service, the Associated Press News Briefs highlight the top stories of the moment as determined by the Associated Press.  The Associated Press staff compiles a summary of the top stories approximately once each hour.  Each document contains 10 to 15 summaries, each of which runs approximately 50 to 100 words in length.  As breaking news occurs, the summaries will change.  A total of 14 days of material is maintained online.  The AP News Briefs cover the entire spectrum of the 

In [109]:
r.headers

{'Date': 'Tue, 17 Sep 2024 15:07:36 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Content-Length': '14628', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '59/123/1494/11994', 'X-RateLimit-Reset': '1726585663/1726585678/1726587740/1726670540', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726585618296818', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}

## Define your search parameters

In [106]:
# set up a request to the BatchNews endpoint; should only be used anytime you are returning > 450 documents. For smaller queries, use the 'News' endpoint
base_url = 'https://services-api.lexisnexis.com/v1/'
search_endpoint = "BatchNews"
'''You can add these search fields: agg-copyright, body, byline, cite, company, headline, hlead, length, publication, publication-type, pub-copyright, section, show, term, ticker, title, jurisinfo, load-date, xce-date, subject, industry, geography, document-type, country. 

You can use boolean operators like 'AND', 'OR', 'NOT' and proximity operators like 'w/' and 'pre/' to search within a certain number of words or paragraphs. You can also use wildcards like '*' and '?'. You can also use parentheses for grouping. 

There are help documents in the main platform interface: https://libguides.princeton.edu/az/nexis-uni-former-lexis-nexis-academic. In particular, see search commands: https://p.widencdn.net/dykvnj/Nexis_Commands_AtAGlance and news searching tips: https://p.widencdn.net/x1tozf/Nexis_News-Searching-Tips. You can use the source menu in the platform or this list of current content sources: https://supportcenter.lexisnexis.com/app/answers/answer_view/a_id/1098614 (Excel file at bottom of page).
'''
# you can search within specific publications either by including it in your search string, e.g. "publication('The Associated Press')", or by filtering on the 'Source/Id' field. 
search_string = "biden w/10 (poll* OR survey)" 
# You can separate multiple sources with an 'or'. You can find the ID by searching for the source in the 'Sources' endpoint
filter = "Date ge 2024-01-01 and Source/Id eq 'MTA1MzI3Mw'" # comparison operators: 'gt', 'ge', 'lt', 'le' (greater than, greater than or equal to, less than, less than or equal to) 
results_per_page = 50 # max is 50 for BatchNews
select = "ResultId,Title,Date,Source"

## Getting the full text of the documents with your search results

In [99]:
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page, '$select': select, '$expand': 'Document'}

In [100]:
# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url+search_endpoint, params=url_params, timeout=100)

In [101]:
# see the results. Can use just 'r.json'. Here, I'm truncating for the public notebook. In 'r.json()' the 'value' key contains the search results. For 'value' in r.json(), you will see the 'ResultId', 'Title', 'Date', 'Source', 'Document' fields. The 'Document' field contains the full text of the document. For this notebook, I'm limiting the output of the 'Document' field to the first 1000 characters
for key, value in r.json().items():
    if key == 'value':
        for doc in value[:10]:
            print(f"ResultId: {doc['ResultId']}, Title: {doc['Title']}, Date: {doc['Date']}, Source: {doc['Source']}, 'Content': {doc['Document']['Content'][:1000]}")
    else:
        print(f"Key: {key}, Value: {value}" )


Key: @odata.context, Value: https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Title,Date,Source,Document)
Key: @odata.count, Value: 319
ResultId: urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00, Title: Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out, Date: 2024-07-17T00:00:00Z, Source: {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}, 'Content': <entry xmlns="http://www.w3.org/2005/Atom"><id>urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00</id><title>Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out</title><published>2024-07-17T00:00:00Z</published><updated>2024-09-17T14:57:39Z</updated><author><name>LexisNexis</name></author><content type="application/xml"><!--Transformation version 1.25--><articleDoc xmlns="" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://www.lexisnexis.com/xmlschemas/content/public/articledoc/1/" schemaVersion="1.8" xml:lang="en"><arti

In [16]:
# save the results to a new list
results = r.json()['value']

In [17]:
# iterate through the pages of results and add the results to the 'results' list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results

while '@odata.nextLink' in r.json().keys(): #and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    results.extend(r.json()['value'])
    #time.sleep(2)

In [102]:
# get the headers to check on your rate limits. X-RateLimit-Limit is for 10 second/1 minute/1 hour/1 day thresholds
r.headers

{'Date': 'Tue, 17 Sep 2024 14:57:49 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '2/5/200/1000', 'X-RateLimit-Remaining': '1/4/196/996', 'X-RateLimit-Reset': '1726585068/1726585118/1726587694/1726670494', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726584123482385', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}

#### Save the full text of the documents

In [91]:
# iterate through the results, get the 'Content' field, and save as an XML file with the document ID as the filename
for result in results:
    doc_id = result['ResultId']
    doc = result['Document']['Content']
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc)

# Downloading just metadata and/or individual documents

In [107]:
url_params = {'$search': search_string, '$filter': filter, '$top': results_per_page, '$select': select}

# you need to pass the token in the headers by calling 'get' on the oauth object
r = oauth.get(base_url + search_endpoint, params=url_params, timeout=20)

In [108]:
# see the results
r.json()

{'@odata.context': 'https://services-api.lexisnexis.com/v1/$metadata#BatchNews(ResultId,Title,Date,Source)',
 '@odata.count': 319,
 'value': [{'ResultId': 'urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00',
   'Title': 'Takeaways from AP-NORC poll showing majority of Democrats want Biden to drop out',
   'Date': '2024-07-17T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00',
   'Title': "What polling shows about Americans' views of Robert F. Kennedy Jr.",
   'Date': '2024-08-23T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CH3-6RR1-JC5B-G3JW-00000-00',
   'Title': 'Nearly two-thirds of Democrats want Biden to withdraw, new AP-NORC poll finds',
   'Date': '2024-07-17T00:00:00Z',
   'Source': {'Id': '', 'Name': 'The Associated Press', 'ContentType': ''}},
  {'ResultId': 'urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00',
 

In [110]:
# get the document IDs
doc_ids = [doc['ResultId'] for doc in r.json()['value']]

In [9]:
# iterate through the pages of results and add the doc_ids to the list. Because of rate limits, you may need to add a sleep statement of 2-3 seconds, depending on the number of results

#counter = 0
#pages = 5  # number of pages to fetch

while '@odata.nextLink' in r.json().keys(): # and counter < pages:
    r = oauth.get(r.json()['@odata.nextLink'])
    doc_ids.extend([doc['ResultId'] for doc in r.json()['value']])
    #counter += 1  # Increment the counter while '@odata.nextLink' in r.json().keys():
    #time.sleep(2)

In [105]:
r.headers

{'Date': 'Tue, 17 Sep 2024 15:06:59 GMT', 'Content-Type': 'application/json; odata.metadata=minimal', 'Content-Length': '14618', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers, Accept-Encoding', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '60/125/1500/12000', 'X-RateLimit-Remaining': '59/124/1495/11995', 'X-RateLimit-Reset': '1726585628/1726585678/1726587740/1726670540', 'OData-Version': '4.0', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726585618296818', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}

#### Download and save the full text of the documents

In [111]:
# iterate over the doc_ids and fetch the documents and save them to a file. This will take a while, depending on the number of documents
counter = 1
for doc_id in doc_ids[:5]: # limit to 5 documents for testing purposes
    docs_url = f"{base_url}Documents(DocumentId='/shared/document/news/{doc_id}',DocumentIdType='DocFullPath')/$value"
    doc = oauth.get(docs_url)
    print(f"Fetching document {counter}: {doc_id}")
    with open(f'{doc_id}.xml', 'w') as f:
        f.write(doc.text)
    counter += 1


Fetching document 1: urn:contentItem:6CH3-PJR1-JC5B-G3Y0-00000-00
Fetching document 2: urn:contentItem:6CT6-CTR1-JC5B-G0G4-00000-00
Fetching document 3: urn:contentItem:6CH3-6RR1-JC5B-G3JW-00000-00
Fetching document 4: urn:contentItem:6CSS-PB31-JC5B-G2V7-00000-00
Fetching document 5: urn:contentItem:6CKD-9N21-JC5B-G1NH-00000-00


In [112]:
doc.headers

{'Date': 'Tue, 17 Sep 2024 15:17:09 GMT', 'Content-Type': 'application/atom+xml;charset=utf-8', 'Content-Length': '18262', 'Connection': 'keep-alive', 'Vary': 'Origin, Access-Control-Request-Method, Access-Control-Request-Headers', 'Cache-Control': 'no-cache', 'Pragma': 'no-cache', 'Expires': '-1', 'Server': 'auth_api.lexisnexis.com  3000', 'X-RateLimit-Limit': '25/50/1500/24000', 'X-RateLimit-Remaining': '20/45/1495/23995', 'X-RateLimit-Reset': '1726586231/1726586281/1726589821/1726672621', 'X-AspNet-Version': '4.0.30319', 'X-Powered-By': 'ASP.NET', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block', 'X-Frame-Options': 'DENY', 'X-RE-Ref': '1 1726586221591172', 'P3P': 'CP="IDC DSP LAW ADM DEV TAI PSA PSD IVA IVD CON HIS TEL OUR DEL SAM OTR IND OTC"'}