In [10]:
import chardet
import unicodecsv as csv
import io
import json
import os
import pandas as pd
import requests
import time
import urllib
import yaml

from pprint import pprint

### Helper functions

In [11]:
def get_nb_hits(json_response):
    """Return how many hits the given result page contains."""
    page_hits = json_response['hits']['hits']
    return len(page_hits)

def get_next_link(json_response):
    """Return the URL of the next result page, or None on the last page."""
    links = json_response['links']
    return links.get('next')

def reget(url, params=None, **kwargs):
    """
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests) is received.

    The URL and the query parameters are printed before the request is
    sent. An ``access_token`` entry is left out of the printout so the
    token does not leak into the notebook output; ``params`` itself is
    never modified and is sent to the server unchanged.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param **kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        if isinstance(params, dict):
            # Redact on a filtered copy: deleting the key from `params`
            # itself would also strip the token from the actual request
            # (and from the caller's dict).
            redacted_params = {
                key: value for key, value in params.items()
                if key != 'access_token'
            }
            print(json.dumps(redacted_params, indent = 4))
        else:
            # Lists of tuples / bytes cannot be redacted reliably, so they
            # are not echoed at all.
            print("<non-dict params not shown>")
    delay = 0.0
    # Retries indefinitely; the wait grows by 2 seconds per 429 response.
    while True:
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429: # not too many requests
            return response
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

def guess_encoding(file, n_lines=20):
    '''
    Guess a file's encoding using chardet.

    :param file: either an open binary buffered stream (an
        ``io.BufferedIOBase`` instance) or something ``open()`` accepts as
        a path. A stream is rewound to its start before sampling and
        again afterwards.
    :param n_lines: number of leading lines to feed to chardet.
    :return: the ``'encoding'`` entry of chardet's detection result.
    '''
    if isinstance(file, io.BufferedIOBase):
        # Sample from the start of the stream and leave it rewound for
        # the caller.
        file.seek(0)
        rawdata = b''.join(file.readline() for _ in range(n_lines))
        file.seek(0)
    else:
        # assume we were handed a file path
        with open(file, 'rb') as f:
            # Join binary lines for specified number of lines
            rawdata = b''.join(f.readline() for _ in range(n_lines))

    return chardet.detect(rawdata)['encoding']

### Query and collect hits from result pages

The query string uses the [Elasticsearch query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [39]:
query = ""
# Perform query and collect initial response
params = {
    # Pass the raw query string: requests URL-encodes parameter values
    # itself, so quoting it here as well would double-encode it.
    'q': query,
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': 10, # hits per page
    'page': 1,
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200, f"query failed with HTTP status {response.status_code}"
j = response.json()
hits = j['hits']['hits'] # initial set of hits, to be appended to for next pages
print(f"--> {len(hits)}")
hits_total = j['hits']['total']
aggregations = j['aggregations']

# Process further hits/pages until they exhaust
# (the 'next' link returned by the API already carries the full query string)
while get_next_link(j):
    next_response = reget(get_next_link(j))
    assert next_response.status_code == 200, f"paging failed with HTTP status {next_response.status_code}"
    j = next_response.json()
    hits.extend(j['hits']['hits'])
# Sanity check: every hit announced by the first page has been collected
assert len(hits) == hits_total

# Report total hits and ids
print(f"Query resulted in a total of {hits_total} hits with Zenodo IDs:")
ids = [hit['id'] for hit in hits]
pprint(ids)

URL: https://zenodo.org/api/records, params:
{
    "q": "",
    "type": "dataset",
    "communities": "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    "size": 10,
    "page": 1
}
--> 10
URL: https://zenodo.org/api/records/?sort=mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=2&size=10
Query resulted in a total of 13 hits with Zenodo IDs:
[5549953,
 5546264,
 5546248,
 5541337,
 5541894,
 5541327,
 5534190,
 5530237,
 5530146,
 5529888,
 5529732,
 5513871,
 4733499]


### Define a function to custom-process each hit

In [40]:
def process_hit(hit):
    """
    Render one query hit as a ReST page and save it to disk.

    The page is written to ``../docs/<type>s/<id>.rst``, where ``<type>``
    is read from the notebook-level ``params['type']`` and ``<id>`` is the
    Zenodo ID of the hit. The hit's title is printed as progress output.

    :param hit: one entry of the ``hits`` list of a records response.
    :return: the Zenodo ID of the hit.
    """
    metadata = hit['metadata']
    title = metadata['title']
    print(f"Title: {title}")

    # Collect the template fields up front
    links = hit['links']
    record_url = links['html']
    # The underline spans the whole heading: title + URL + 6 markup characters
    underline = '=' * (len(title) + len(record_url) + 6)
    badge_url = links['badge']
    doi_url = links['doi']
    description = metadata['description']
    authors = '; '.join(creator['name'] for creator in metadata['creators'])
    keywords = ', '.join(metadata['keywords']) if 'keywords' in metadata else ''

    # Fill in the templated ReST page for the hit
    # ------ BEGIN TEMPLATE ----------
    page = f"""
.. This file is automaticaly generted. Do not edit.

`{title} <{record_url}>`_
{underline}

.. image:: {badge_url}
   :target: {doi_url}

Description:
------------

{description}

Authors:
--------
{authors}

.. meta::
   :keywords: {keywords}
    """
    # ------ END TEMPLATE ------------

    # The output filename is based on the Zenodo ID
    rst_path = f"../docs/{params['type']}s/{hit['id']}.rst"
    with open(rst_path, "w", encoding = 'utf-8', newline = '\n') as rst_file:
        rst_file.write(page)
    return hit['id']

### Process each hit collected from the result pages

In [41]:
# Generate one ReST page per collected hit; process_hit prints each title
# as it goes.
for hit in hits:
    process_hit(hit)

Title: WP3-CT2 Alpine Glaciers Disappearance Tipping Point
Title: Reduced-form Climate Change Damage Functions
Title: Macroeconomic assessment of Climate Change Impacts
Title: Valuation of heat related mortality risk and tick-borne diseases
Title: Climate change impacts on energy demand
Title: Supply Chain Shocks due to extreme weather events
Title: Riverine Flood Insurance assessment indicators under climate and socio-economic change
Title: Time of emergence of climate change impacts
Title: Climate induced economic shocks using CLIMRISK
Title: River flooding impacts using CLIMRISK-RIVER
Title: SETP Food
Title: ImpactChains_GLOBIOM
Title: Additional dataset to "Comparing urban coastal flood risk in 136 cities under two alternative sea-level projections: RCP 8.5 and an expert opinion-based high-end scenario"


### Add datasets to their respective class pages

In [42]:
# classes.py is expected to define `classes`, which is used below as a
# mapping from class name to the Zenodo IDs belonging to that class.
# NOTE(review): exec() runs whatever classes.py contains -- only use a
# trusted file here.
with open('classes.py', encoding = 'utf-8') as classes_py:
    exec(classes_py.read())
written = {}
for c in classes:
    class_path = f"../docs/classes/{c}.rst"
    # read the class file
    with open(class_path, "r", encoding = 'utf-8') as class_rst:
        lines = class_rst.readlines()
    # find the toctree
    i = 0
    while i < len(lines) and ".. toctree::" not in lines[i]:
        i += 1
    if i == len(lines):
        raise ValueError(f"{class_path}: no '.. toctree::' directive found")
    # find the start of the ToC entries: the first line after the blank
    # line that follows the directive and its options
    while i < len(lines) and lines[i] != '\n':
        i += 1
    if i == len(lines):
        raise ValueError(f"{class_path}: no blank line after the '.. toctree::' directive")
    i += 1
    assert i > 2
    start = i
    # remove ToC entries (everything up to the next blank line or EOF)
    end = start
    while end < len(lines) and lines[end] != '\n':
        end += 1
    del lines[start:end]
    # add new ToC entries; inserting in reverse at a fixed position keeps
    # the entries in the same order as `ids`
    for zenodo_id in reversed(ids):
        # A dataset is listed in every class that names it. The 'other'
        # class additionally receives every dataset not written so far --
        # presumably classes.py lists 'other' last so that these are
        # exactly the unclassified ones (TODO confirm).
        if zenodo_id in classes[c] or (c == 'other' and zenodo_id not in written):
            written[zenodo_id] = True
            lines.insert(start, f"   ../datasets/{zenodo_id}\n")
    # write the index file
    with open(class_path, "w", encoding = 'utf-8', newline = '\n') as class_rst:
        class_rst.writelines(lines)

### Show aggregations (aggregate data over all query hits, presumably)

In [43]:
# Dump the 'aggregations' section of the first response page as YAML.
print(yaml.dump(aggregations))

access_right:
  buckets:
  - doc_count: 32
    key: open
  - doc_count: 1
    key: restricted
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
file_type:
  buckets:
  - doc_count: 19
    key: pdf
  - doc_count: 12
    key: csv
  - doc_count: 5
    key: xlsx
  - doc_count: 1
    key: 7z
  - doc_count: 1
    key: docx
  - doc_count: 1
    key: nc
  - doc_count: 1
    key: zip
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
keywords:
  buckets:
  - doc_count: 11
    key: COACCH
  - doc_count: 2
    key: Climate Change impacts
  - doc_count: 2
    key: Macroeconomic assessment
  - doc_count: 2
    key: agriculture
  - doc_count: 1
    key: CLIMRISK
  - doc_count: 1
    key: CLIMRISK-RIVER
  - doc_count: 1
    key: Climate Change
  - doc_count: 1
    key: Climate change
  - doc_count: 1
    key: Climate change mitigation; Electricity; Europe; Risk; Capital costs
  - doc_count: 1
    key: Climate-change impacts, Geology
  doc_count_error_upper_bound: 0
  sum_other_doc_coun

### Dump a hit as YAML

In [44]:
# Show the complete record structure of the first collected hit as YAML.
print(yaml.dump(hits[0]))

conceptdoi: 10.5281/zenodo.5549952
conceptrecid: '5549952'
created: '2021-10-05T12:22:18.934163+00:00'
doi: 10.5281/zenodo.5549953
files:
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:a7d40815c45ff1efa12e7c83517765c4
  key: COACCH_MetaData_T3.2_CT2.csv
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/COACCH_MetaData_T3.2_CT2.csv
  size: 16369
  type: csv
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:0c2003afe9eb8becc8f9ec130e6c1e1d
  key: L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc
  size: 327017524
  type: nc
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:aac5bc7de8970720e2cdff7ec49156cb
  key: L10000-glacier_CNRM-CM_SMHI-RCA4_rcp85_1985-2100_mon.nc
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CN