In [129]:
import json
import os
import requests
import time
import urllib

### Helper functions

In [130]:
def get_nb_hits(json_response):
    """
    Count the hits contained in one decoded search response.

    :param json_response: Decoded JSON body; the hits are read from
        ``json_response['hits']['hits']``.
    :return: Number of entries found under that key.
    """
    page_hits = json_response['hits']['hits']
    return len(page_hits)

def get_next_link(json_response):
    """
    Look up the link to the following result page in a decoded response.

    :param json_response: Decoded JSON body; must contain a ``'links'`` mapping.
    :return: The value stored under ``json_response['links']['next']``,
        or ``None`` when there is no ``'next'`` entry.
    """
    links = json_response['links']
    return links.get('next')

def reget(url, params=None, **kwargs):
    r"""
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests).

    The URL and the query parameters are printed before the first attempt.
    Any ``access_token`` parameter is left out of that printout; the
    ``params`` object passed in is never modified and is sent unchanged.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        # Build a separate, token-free view for logging. Deleting the key
        # from `params` itself would strip the token from the request that
        # is about to be sent and from the caller's own dictionary.
        if isinstance(params, dict):
            redacted_params = {
                key: value for key, value in params.items()
                if key != 'access_token'
            }
        elif isinstance(params, (list, tuple)):
            redacted_params = [
                (key, value) for key, value in params
                if key != 'access_token'
            ]
        else:
            # Raw bytes/str query strings cannot be redacted reliably,
            # so their content is not printed at all.
            redacted_params = '<not shown>'
        print(json.dumps(redacted_params, indent = 4))
    delay = 0.0
    # Retries for as long as the server keeps answering 429; every other
    # status code (success or error) is handed back to the caller.
    while True:
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429:
            return response
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

### Query and display hit count on page 1

The query string uses the [Elasticsearch query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [139]:
# Search Zenodo publications for `query` and report how many hits the first
# page holds. The API token is read from the ZENODO_API_TOKEN environment
# variable (a KeyError is raised here if it is not set).
query = "stromboli"
params = {
    # Pass the raw query text: requests URL-encodes parameter values itself,
    # so quoting it here first would double-encode spaces, colons, quotes, etc.
    'q': query,
    'type': 'publication',
    'size': 2,
    'page': 1,
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200
j = response.json()
print(f"Hits on page 1: {get_nb_hits(j)}")

URL: https://zenodo.org/api/records, params:
{
    "q": "stromboli",
    "type": "publication",
    "size": 2,
    "page": 1
}
Hits on page 1: 2


### Pretty print JSON result

In [140]:
# Dump the decoded first-page response `j` as indented JSON for inspection.
print(json.dumps(j, indent=2))

{
  "aggregations": {
    "access_right": {
      "buckets": [
        {
          "doc_count": 12,
          "key": "open"
        },
        {
          "doc_count": 2,
          "key": "closed"
        }
      ],
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0
    },
    "file_type": {
      "buckets": [
        {
          "doc_count": 8,
          "key": "pdf"
        },
        {
          "doc_count": 2,
          "key": "html"
        },
        {
          "doc_count": 2,
          "key": "zip"
        }
      ],
      "doc_count_error_upper_bound": 0,
      "sum_other_doc_count": 0
    },
    "keywords": {
      "buckets": [
        {
          "doc_count": 4,
          "key": "Biodiversity"
        },
        {
          "doc_count": 4,
          "key": "Taxonomy"
        },
        {
          "doc_count": 3,
          "key": "Animalia"
        },
        {
          "doc_count": 3,
          "key": "Arthropoda"
        },
        {
          "doc_cou

### Determine total number of hits

In [141]:
# Follow the "next" links starting from the first-page response `j` and add up
# the hits of every page. `j` itself is left untouched, so this cell can be
# re-run (and the cells that display `j` stay accurate) without re-querying.
page = j
total_hits = get_nb_hits(page)
next_link = get_next_link(page)
while next_link:
    next_response = reget(next_link)
    assert next_response.status_code == 200
    page = next_response.json()
    total_hits += get_nb_hits(page)
    print(f"Cumulative hits: {total_hits}")
    next_link = get_next_link(page)

print(f"Total number of hits: {total_hits}")

URL: https://zenodo.org/api/records/?sort=bestmatch&q=stromboli&type=publication&page=2&size=2
Cumulative hits: 4
URL: https://zenodo.org/api/records/?sort=bestmatch&q=stromboli&type=publication&page=3&size=2
Cumulative hits: 6
URL: https://zenodo.org/api/records/?sort=bestmatch&q=stromboli&type=publication&page=4&size=2
Cumulative hits: 8
URL: https://zenodo.org/api/records/?sort=bestmatch&q=stromboli&type=publication&page=5&size=2
Cumulative hits: 10
URL: https://zenodo.org/api/records/?sort=bestmatch&q=stromboli&type=publication&page=6&size=2
Cumulative hits: 12
Total number of hits: 12


In [134]:
# Show the signature and docstring of the reget helper.
help(reget)

Help on function reget in module __main__:

reget(url, params=None, **kwargs)
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests).
    
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response

