In [1]:
import json
import os
import requests
import time
import urllib
import yaml

### Helper functions

In [2]:
def get_nb_hits(json_response):
    """
    Count the hits contained in one page of a search response.

    :param json_response: Decoded JSON search response; its
        ``['hits']['hits']`` entry is the sequence that gets counted.
    :return: Number of entries in ``json_response['hits']['hits']``.
    """
    page_hits = json_response['hits']['hits']
    return len(page_hits)

def get_next_link(json_response):
    """
    Look up the link to the next result page of a search response.

    :param json_response: Decoded JSON search response with a ``'links'``
        dictionary.
    :return: The value stored under ``'next'`` in the links dictionary,
        or ``None`` when there is no such entry.
    """
    links = json_response['links']
    return links.get('next')

def reget(url, params=None, **kwargs):
    """
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests) is received.

    The URL and the query parameters are printed before the request is
    sent. An ``'access_token'`` parameter is left out of the printout so
    the token does not leak into the notebook output; the ``params``
    object passed in by the caller is never modified and is sent to the
    server unchanged.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param **kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        # Build a separate, redacted structure for printing only. Deleting
        # the key from `params` itself would also strip the token from the
        # request below and from the caller's dictionary.
        if isinstance(params, dict):
            redacted_params = {
                key: value for key, value in params.items()
                if key != 'access_token'
            }
        elif isinstance(params, (bytes, str)):
            # A raw query string cannot be redacted reliably: do not print it.
            redacted_params = "<raw query string not shown>"
        else:
            redacted_params = [
                list(pair) for pair in params if pair[0] != 'access_token'
            ]
        print(json.dumps(redacted_params, indent = 4))
    delay = 0.0
    while True:
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429: # not too many requests
            return response
        # Rate limited: wait 2s longer than the previous time, then retry.
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

### Query with one page per hit

The query string uses the [Elasticsearch query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [3]:
# Search query; an empty string matches every record of the community.
query = ""
params = {
    # Pass the raw query: requests URL-encodes parameter values itself, so
    # quoting it here as well would double-encode any non-empty query.
    'q': query,
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': 1, # hits per page
    'page': 1,
    # The token is read from the environment so it never appears in the notebook.
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200 # success
j = response.json()
assert get_nb_hits(j) == 1 # yup, one hit on the first page

URL: https://zenodo.org/api/records, params:
{
    "q": "",
    "type": "dataset",
    "communities": "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    "size": 1,
    "page": 1
}


### Show hit (present on this page, presumably)

Since the page size is 1, we can zero-index the single hit on this page.

In [4]:
# Dump the first hit of the first page as YAML (the page size is 1, so it is
# the only hit on this page).
print(yaml.dump(j['hits']['hits'][0]))

conceptdoi: 10.5281/zenodo.5541326
conceptrecid: '5541326'
created: '2021-09-30T10:27:41.917833+00:00'
doi: 10.5281/zenodo.5541327
files:
- bucket: caa37be2-8cb3-4fa5-be8a-b4270581b143
  checksum: md5:374d793ec78012c868d8a27dc73edaa2
  key: COACCH_MetaData_WP2_4_Trade_and_Supply_Chain_Shocks.csv
  links:
    self: https://zenodo.org/api/files/caa37be2-8cb3-4fa5-be8a-b4270581b143/COACCH_MetaData_WP2_4_Trade_and_Supply_Chain_Shocks.csv
  size: 1175
  type: csv
- bucket: caa37be2-8cb3-4fa5-be8a-b4270581b143
  checksum: md5:5f2fa447c0d765d82273c02d0b852ffa
  key: COACCH_WP2_4_Trade_and_Supply_Chain_Shocks.csv
  links:
    self: https://zenodo.org/api/files/caa37be2-8cb3-4fa5-be8a-b4270581b143/COACCH_WP2_4_Trade_and_Supply_Chain_Shocks.csv
  size: 20359322
  type: csv
id: 5541327
links:
  badge: https://zenodo.org/badge/doi/10.5281/zenodo.5541327.svg
  bucket: https://zenodo.org/api/files/caa37be2-8cb3-4fa5-be8a-b4270581b143
  conceptbadge: https://zenodo.org/badge/doi/10.5281/zenodo.554132

### Define a function to custom-process each hit

In [9]:
def process_hit(hit, record_type=None, docs_dir="../docs"):
    """
    Process a query hit and generate a ReST page.

    The page is written to ``<docs_dir>/<record_type>s/<hit id>.rst``; the
    target directory is created when it does not exist yet.

    :param hit: One hit of a search response. The ``'id'`` entry, the
        ``'html'``, ``'badge'`` and ``'doi'`` links and the ``'title'``,
        ``'description'``, ``'creators'`` and (optional) ``'keywords'``
        metadata entries are read.
    :param record_type: (optional) Record type used to name the output
        sub-directory. Defaults to the ``'type'`` entry of the global
        query ``params``.
    :param docs_dir: (optional) Root directory of the documentation
        sources the page is written into.
    :return: Path of the generated page relative to ``docs_dir``, without
        the ``.rst`` extension, as needed for a toctree entry.
    """
    if record_type is None:
        # Fall back to the type that was requested in the query.
        record_type = params['type']
    print(f"Title: {hit['metadata']['title']}")
    # Define a templated ReST page for the hit. The title underline has to
    # span the whole hyperlink: title + URL + the 6 markup characters.
    page = f"""
`{hit['metadata']['title']} <{hit['links']['html']}>`_
{'=' * (len(hit['metadata']['title']) + len(hit['links']['html']) + 6)}

.. image:: {hit['links']['badge']}
   :target: {hit['links']['doi']}

Description:
------------

{hit['metadata']['description']}

Authors:
--------
{'; '.join([creator['name'] for creator in hit['metadata']['creators']])}

.. meta::
   :keywords: {', '.join(hit['metadata'].get('keywords', []))}
    """
    # Write ReST page, basing the filename on the Zenodo ID
    index_path = f"{record_type}s/{hit['id']}"
    rst_path = os.path.join(docs_dir, f"{index_path}.rst")
    # Make sure the per-type directory exists before writing into it.
    os.makedirs(os.path.dirname(rst_path), exist_ok=True)
    with open(rst_path, "w", encoding = 'utf-8') as rst:
        rst.write(page)
    return index_path

### Process each hit, getting the next page/hit every iteration

In [40]:
# The first page was already fetched above: start the list with its hit
index_path = process_hit(j['hits']['hits'][0])
index_paths = [index_path]
# Follow the 'next' links, one page (= one hit) at a time, until none is left
jn = j
next_url = get_next_link(jn)
while next_url:
    next_response = reget(next_url)
    assert next_response.status_code == 200 # success
    jn = next_response.json()
    index_path = process_hit(jn['hits']['hits'][0])
    index_paths.append(index_path)
    next_url = get_next_link(jn)
# Every hit reported by the first response must have been processed
assert len(index_paths) == j['hits']['total']
index_paths

Title: Supply Chain Shocks due to extreme weather events
URL: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=2&size=1
Title: Riverine Flood Insurance assessment indicators under climate and socio-economic change
URL: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=3&size=1
Title: Time of emergence of climate change impacts
URL: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=4&size=1
Title: Climate induced economic shocks using CLIMRISK
URL: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=5&size=1
Title: River flooding impacts using CLIMRISK-RIVER
URL: https://zenodo.o

['datasets/5541327',
 'datasets/5534190',
 'datasets/5530237',
 'datasets/5530146',
 'datasets/5529888',
 'datasets/5529732',
 'datasets/5513871',
 'datasets/4733499']

### Add index paths to index

In [45]:
# read the index file
with open("../docs/index.rst", "r", encoding = 'utf-8') as index:
    lines = index.readlines()
# find the toctree directive
i = 0
while i < len(lines) and lines[i].find(".. toctree::") < 0:
    i += 1
assert i < len(lines), "no '.. toctree::' directive found in index.rst"
# find the start of the index: the entries follow the first blank line
# after the directive and its options
while i < len(lines) and lines[i] != '\n':
    i += 1
assert i < len(lines), "no blank line after the '.. toctree::' options"
i += 1
# `start` has to be assigned before it is checked; asserting first raises a
# NameError on a fresh kernel (or silently checks a stale value)
start = i
assert start > 2
# remove the existing index entries (up to the next blank line or end of file)
while i < len(lines) and lines[i] != '\n':
    lines.pop(i)
# add new index paths, in order, at the start of the index
lines[start:start] = [f"   {path}\n" for path in index_paths]
# write the index file
with open("../docs/index.rst", "w", encoding = 'utf-8') as index:
    index.writelines(lines)

### List root-level keys and value types of the JSON result dictionary

In [173]:
# One line per root-level entry of the response: the key and its value type
for key, value in j.items():
    print(f"'{key}' {type(value)}")

'aggregations' <class 'dict'>
'hits' <class 'dict'>
'links' <class 'dict'>


### Show aggregations (aggregate data over all query hits, presumably)

In [174]:
# Dump the 'aggregations' entry of the first response as YAML.
print(yaml.dump(j['aggregations']))

access_right:
  buckets:
  - doc_count: 25
    key: open
  - doc_count: 1
    key: restricted
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
file_type:
  buckets:
  - doc_count: 19
    key: pdf
  - doc_count: 5
    key: csv
  - doc_count: 2
    key: xlsx
  - doc_count: 1
    key: docx
  - doc_count: 1
    key: zip
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
keywords:
  buckets:
  - doc_count: 5
    key: COACCH
  - doc_count: 2
    key: agriculture
  - doc_count: 1
    key: CLIMRISK
  - doc_count: 1
    key: CLIMRISK-RIVER
  - doc_count: 1
    key: Climate change mitigation; Electricity; Europe; Risk; Capital costs
  - doc_count: 1
    key: Climate-change impacts, Geology
  - doc_count: 1
    key: Coastal cities
  - doc_count: 1
    key: Damage risk
  - doc_count: 1
    key: ERA-5
  - doc_count: 1
    key: GLOFRIS
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 27
type:
  buckets:
  - doc_count: 19
    key: publication
    subtype:
      buckets:
      -

### Show links (navigation links on this page, presumably)

In [175]:
# Dump the 'links' entry of the first response as YAML.
print(yaml.dump(j['links']))

next: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=2&size=1
self: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=1&size=1



### List root-level keys and value types of the 'hits' sub dictionary

Presumably, this is a list of hits returned on the current page as well as a total number of hits, which could have been classified as an 'aggregation' entry.

In [176]:
# One line per entry of the 'hits' sub dictionary: the key and its value type
for key, value in j['hits'].items():
    print(f"'{key}' {type(value)}")

'hits' <class 'list'>
'total' <class 'int'>


In [162]:
# Take the first hit of the first page and list the keys of its metadata.
hit = j['hits']['hits'][0]
hit['metadata'].keys()

dict_keys(['access_right', 'access_right_category', 'communities', 'creators', 'description', 'doi', 'grants', 'keywords', 'license', 'publication_date', 'related_identifiers', 'relations', 'resource_type', 'title', 'version'])

In [172]:
# Show the record type requested in the query parameters.
params['type']

'dataset'