# Query COACCCH data sets and generate Data Repository ReST

In [52]:
import chardet
import unicodecsv as csv
import io
import json
import os
import re
import requests
import time
import urllib
import yaml

from pprint import pprint

### Define helper functions

In [53]:
def get_nb_hits(json_response):
    """Return the number of hits contained in one decoded result page."""
    page_hits = json_response['hits']['hits']
    return len(page_hits)

def get_next_link(json_response):
    """Return the URL of the next result page, or None on the last page."""
    links = json_response['links']
    return links.get('next')

def reget(url, params=None, **kwargs):
    """
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests) is received.

    The URL and the query parameters are printed before the request is
    sent. The ``access_token`` parameter is left out of the printed copy
    only; the caller's ``params`` object is never modified, so the token
    is still sent with the request.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param **kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        if isinstance(params, dict):
            # Build a filtered copy: deleting the key from `params` itself
            # would also drop the token from the request and from the
            # caller's dictionary.
            redacted_params = {
                key: value
                for key, value in params.items()
                if key != 'access_token'  # don't want to leak the token
            }
            print(json.dumps(redacted_params, indent=4))
        else:
            # Lists of tuples or bytes may embed the token; print nothing.
            print("    <non-dictionary parameters not shown>")
    delay = 0.0
    while True:
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429:  # not too many requests
            return response
        # Back off a little longer after every rate-limited attempt
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

def guess_encoding(file, n_lines=20):
    '''Guess a file's encoding using chardet.

    :param file: either an open binary buffer (an ``io.BufferedIOBase``
        instance such as ``io.BytesIO``) or a path that ``open`` accepts.
        A buffer is rewound to its start before and after sampling.
    :param n_lines: number of leading lines handed to chardet.
    :return: the value of ``chardet.detect(...)['encoding']``.
    '''
    import chardet

    if isinstance(file, io.BufferedIOBase):
        # Sample from the start and leave the buffer rewound for the caller
        file.seek(0)
        rawdata = b''.join(file.readline() for _ in range(n_lines))
        file.seek(0)
    else:
        # assume we were handed a file path
        with open(file, 'rb') as f:
            # Join binary lines for specified number of lines
            rawdata = b''.join(f.readline() for _ in range(n_lines))

    return chardet.detect(rawdata)['encoding']

def strip_html_markup(html):
    """Remove HTML tags and ``&nbsp;`` entities from a string.

    Opening, closing and self-closing tags are removed, including tags
    that carry attributes (``<a href="...">``) or digits in their name
    (``<h2>``). The text between tags is kept. ``&nbsp;`` entities are
    deleted; other entities are left untouched.

    :param html: string that may contain HTML markup.
    :return: the string with tags and ``&nbsp;`` removed.
    """
    # '<' must be directly followed by an optional '/' and a letter, so a
    # bare comparison such as "1 < 2" is not mistaken for a tag.
    tagpat = re.compile(r'</?[a-zA-Z][a-zA-Z0-9]*(?:\s[^<>]*)?/?>')
    html = tagpat.sub('', html)
    html = re.sub(r'&nbsp;', '', html)
    return html

### Define a function and template to process a hit to reStructuredText

In [54]:
def rest_hit(hit, output_dir=None):
    """
    Process a query hit and generate a ReST page.

    The page holds a linked title, the badge image, the description, the
    authors and a keywords meta directive. When the hit carries at least
    one converted COACCH metadata row, a "COACCH-Specific Metadata"
    section built from the first row is inserted after the description.
    A hit without COACCH metadata, or with an empty row list, gets the
    page without that section.

    :param hit: record dictionary. Reads ``hit['id']``, ``hit['metadata']``
        (``title``, ``description``, ``creators``, optional ``keywords``),
        ``hit['links']`` (``html``, ``badge``, ``doi``) and, if present,
        ``hit['coacch']['metadata_rows']``.
    :param output_dir: (optional) directory the page is written to.
        Defaults to ``../docs/<type>s``, where ``<type>`` is taken from the
        global query parameters ``params['type']``.
    :return: the Zenodo ID of the hit (``hit['id']``).
    """
    metadata = hit['metadata']
    links = hit['links']

    heading = f"`{metadata['title']} <{links['html']}>`_"
    authors = '; '.join(creator['name'] for creator in metadata['creators'])
    keywords = ', '.join(metadata.get('keywords', []))

    # Only the first metadata row is used; an empty list counts as absent.
    rows = hit.get('coacch', {}).get('metadata_rows')
    coacch_lines = []
    if rows:
        cm = rows[0]
        coacch_lines = [
            "COACCH-Specific Metadata:",
            "-------------------------",
            "",
            f"- **Sector**: {cm['Sector']}",
            f"- **Partner**: {cm['Partner']}",
            f"- **SSP**: {cm['SSP']}",
            f"- **RCP**: {cm['RCP']}",
            f"- **Spatial resolution Europe**: {cm['Spatial resolution unit Europe']}",
            f"- **Keywords**: {cm['Keywords']}",
            "",
        ]

    # ------ BEGIN TEMPLATE ----------
    page = '\n'.join([
        "",
        ".. This file is automaticaly generated. Do not edit.",
        "",
        heading,
        '=' * len(heading),  # the title underline spans the whole heading
        "",
        f".. image:: {links['badge']}",
        f"   :target: {links['doi']}",
        "",
        "Description:",
        "------------",
        "",
        f"{metadata['description']}",
        "",
        *coacch_lines,
        "Authors:",
        "--------",
        authors,
        "",
        ".. meta::",
        f"   :keywords: {keywords}",
        "    ",
    ])
    # ------ END TEMPLATE ------------

    # Write ReST page, basing the filename on the Zenodo ID
    if output_dir is None:
        output_dir = f"../docs/{params['type']}s"
    with open(f"{output_dir}/{hit['id']}.rst", "w", encoding='utf-8', newline='\n') as rst:
        rst.write(page)
    return hit['id']

### Query and collect hits from result pages

The query string uses the [Elasticsearch query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [55]:
query = ""
# Perform query and collect initial response
params = {
    # Pass the raw query: requests URL-encodes parameter values itself, so
    # quoting it here first would encode a non-empty query twice.
    'q': query,
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': 10, # hits per page
    'page': 1,
    'sort': '-mostrecent',
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200 # success
j = response.json()
hits = j['hits']['hits'] # initial set of hits, to be appended to for next pages
print(f"--> {len(hits)}")
hits_total = j['hits']['total']
aggregations = j['aggregations']

# Process further hits/pages until they exhaust: each response links to
# the next page, and the last page carries no such link
while get_next_link(j):
    next_response = reget(get_next_link(j))
    assert next_response.status_code == 200 # success
    j = next_response.json()
    hits.extend(j['hits']['hits'])
# Every hit counted by the first response must have been collected
assert len(hits) == hits_total

# Report total hits and ids
print(f"Query resulted in a total of {hits_total} hits with Zenodo IDs:")
pprint([hit['id'] for hit in hits])

URL: https://zenodo.org/api/records, params:
{
    "q": "",
    "type": "dataset",
    "communities": "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    "size": 10,
    "page": 1,
    "sort": "-mostrecent"
}
--> 10
URL: https://zenodo.org/api/records/?sort=-mostrecent&q=&communities=coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project&type=dataset&page=2&size=10
Query resulted in a total of 13 hits with Zenodo IDs:
[4733499,
 5513871,
 5529732,
 5529888,
 5530146,
 5530237,
 5534190,
 5541327,
 5541894,
 5541337,
 5546248,
 5546264,
 5549953]


### Strip HTML markup from description metadata

In [56]:
# Replace each hit's description with its markup-free form, in place
for hit in hits:
    hit_metadata = hit['metadata']
    hit_metadata['description'] = strip_html_markup(hit_metadata['description'])

### Try to determine the URL of the COACCH metadata file for each hit

In [57]:
meta_urls = []
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    files = hit['files']
    meta_url = None
    for f in files:
        link = f['links']['self']
        lowered = link.lower()
        # A file counts as metadata when its link mentions "metadata",
        # or "meta data" with a URL-encoded space
        if "metadata" not in lowered and "meta%20data" not in lowered:
            print(f"{link}")
            continue
        assert meta_url is None # data set should have only one meta data file
        meta_url = link
        print(f"{link} <-- metadata?")
    if meta_url is not None:
        # start a fresh dict to hold COACCH-specifics
        hit['coacch'] = {'meta_url': meta_url}
    else:
        print(f"WARNING: data set {hit['id']} includes no obvious metadata file!")

-------- ID: 4733499
https://zenodo.org/api/files/cca08a1a-ad35-489a-8049-bfe951fda371/Abadie%20et%20al_2020_Additional%20scenarios.xlsx
-------- ID: 5513871
https://zenodo.org/api/files/4406e796-3221-4985-925e-ea43cd89ac2f/IIASA_ImpactChains_GLOBIOM_Data.csv
https://zenodo.org/api/files/4406e796-3221-4985-925e-ea43cd89ac2f/ImpactChains_GLOBIOM_MetaData.csv <-- metadata?
-------- ID: 5529732
https://zenodo.org/api/files/0beb8030-0676-45fe-9134-3406d6116a73/SETP_Food.csv
https://zenodo.org/api/files/0beb8030-0676-45fe-9134-3406d6116a73/T3.4_SETP_Food_MetaData.csv <-- metadata?
-------- ID: 5529888
https://zenodo.org/api/files/3399b37e-2171-44d9-8ba5-088474b03d69/BaselineHeightAdaptation.csv
https://zenodo.org/api/files/3399b37e-2171-44d9-8ba5-088474b03d69/COACCH_MetaData.csv <-- metadata?
https://zenodo.org/api/files/3399b37e-2171-44d9-8ba5-088474b03d69/OptimalAdaptation.csv
-------- ID: 5530146
https://zenodo.org/api/files/22c9648d-617e-4cd7-8a3b-fd85bb567dac/absolute_risk.xlsx
https:/

### Retrieve COACCH metadata files for the hits

Each metadata file is stored in memory as a binary file-like object (`io.BytesIO`).

In [58]:
# Download the metadata file of every hit for which a URL was found
for hit in hits:
    if 'coacch' in hit and 'meta_url' in hit['coacch']:
        r = reget(hit['coacch']['meta_url'])
        if r.status_code != 200:
            # Do not store an error body as if it were the metadata CSV
            print(f"WARNING: could not retrieve metadata of data set {hit['id']} (HTTP status {r.status_code})")
            continue
        # Keep the raw bytes; the encoding is determined in a later step
        hit['coacch']['metadata'] = io.BytesIO(r.content)
print("Done retrieving COACCH metadata for hits.")

URL: https://zenodo.org/api/files/4406e796-3221-4985-925e-ea43cd89ac2f/ImpactChains_GLOBIOM_MetaData.csv
URL: https://zenodo.org/api/files/0beb8030-0676-45fe-9134-3406d6116a73/T3.4_SETP_Food_MetaData.csv
URL: https://zenodo.org/api/files/3399b37e-2171-44d9-8ba5-088474b03d69/COACCH_MetaData.csv
URL: https://zenodo.org/api/files/22c9648d-617e-4cd7-8a3b-fd85bb567dac/COACCH_MetaData.csv
URL: https://zenodo.org/api/files/d69d64c8-affa-4166-9635-d14660bdcd60/metadata.csv
URL: https://zenodo.org/api/files/bbbc0e7e-dff8-4737-bd93-67da222b1478/metadata.csv
URL: https://zenodo.org/api/files/caa37be2-8cb3-4fa5-be8a-b4270581b143/COACCH_MetaData_WP2_4_Trade_and_Supply_Chain_Shocks.csv
URL: https://zenodo.org/api/files/47e4e7fc-084d-4e22-83ef-59337a2f86a9/COACCH_MetaData_energy_demand.csv
URL: https://zenodo.org/api/files/ace61379-f532-4208-b241-091201337012/COACCH_MetaData_T2.6_CUNI.csv
URL: https://zenodo.org/api/files/c5de69df-aafb-46ca-8e57-a8a2aeafa8ba/COACCH_ICES_MetaData-CMCC.csv
URL: https:/

### Special handling: Load COACCH metadata for dataset 5541894 from a local file

In [59]:
# Dataset 5541894 takes its COACCH metadata from a local CSV file, which
# replaces whatever was collected for it in the previous steps.
hit5541894 = next((hit for hit in hits if hit['id'] == 5541894), None)
if hit5541894 is None:
    # E.g. when the query was run for another record type
    print("WARNING: data set 5541894 was not returned by the query; local metadata not applied.")
else:
    with open("COACCH_MetaData_BC3_WP4.csv", "rb") as f:
        contents = f.read()
    hit5541894['coacch'] = {}
    hit5541894['coacch']['metadata'] = io.BytesIO(contents)

### For each hit, check and convert the metadata CSV to dictionaries for each row

In [48]:
# Decode each hit's metadata CSV and store its rows as dictionaries under
# hit['coacch']['metadata_rows'].
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    if 'coacch' not in hit or 'metadata' not in hit['coacch']:
        print("WARNING: metadata absent")
        continue
    metadata = hit['coacch']['metadata']
    # chardet reports None when it cannot make a guess
    encoding = guess_encoding(metadata) or 'utf-8'
    # Decode the whole file rather than a byte-limited chunk: a cut in the
    # middle of a multibyte character would wrongly reject the encoding,
    # and the reader below has to decode every byte anyway.
    metadata.seek(0)
    raw = metadata.read()
    metadata.seek(0)
    text = None
    # NOTE(review): Windows-1252 is tried before UTF-8, as before; it
    # accepts almost any byte sequence, so the UTF-8 fallback is rarely
    # reached. Confirm this order is intended.
    for candidate in (encoding, 'Windows-1252'):
        try:
            text = str(raw, candidate)
        except UnicodeDecodeError:
            continue
        encoding = candidate
        break
    if text is None:
        # Last resort; a failure here is deliberately not caught
        encoding = 'utf-8'
        text = str(raw, encoding)
    # The sniffer only needs a sample of the file
    metadata_chunk = text[:8000]
    if csv.Sniffer().has_header(metadata_chunk):
        dialect = csv.Sniffer().sniff(metadata_chunk)
        reader = csv.DictReader(metadata, dialect=dialect, encoding=encoding)
        rows = list(reader)
        # Special handling for dataset 5541894: want the 2nd of 2 rows
        if hit['id'] == 5541894:
            rows.reverse()
        hit['coacch']['metadata_rows'] = rows
        print(f"Converted {len(rows)} row{'s' if len(rows) != 1 else ''} of metadata")
    else:
        # Metadata loaded from a local file has no 'meta_url' entry
        meta_url = hit['coacch'].get('meta_url', 'unknown (no metadata URL recorded)')
        print(f"WARNING: metadata of dataset https://zenodo.org/record/{hit['id']} has no CSV header. The metadata URL is {meta_url}")

-------- ID: 4733499
-------- ID: 5513871
Converted 1 row of metadata
-------- ID: 5529732
Converted 1 row of metadata
-------- ID: 5529888
Converted 1 row of metadata
-------- ID: 5530146
Converted 1 row of metadata
-------- ID: 5530237
Converted 1 row of metadata
-------- ID: 5534190
Converted 1 row of metadata
-------- ID: 5541327
Converted 1 row of metadata
-------- ID: 5541894
Converted 2 rows of metadata
-------- ID: 5541337
Converted 2 rows of metadata
-------- ID: 5546248
Converted 1 row of metadata
-------- ID: 5546264
Converted 1 row of metadata
-------- ID: 5549953
Converted 24 rows of metadata


### Check metadata column headers

In [49]:
# Column headers a COACCH metadata file is expected to have
template_headers = ['Name', 'Entry date', 'Dataset version', 'Author/Contact person', 'Short description', 'Partner', 'Model type/method', 'Model', 'Model version', 'Documentation', 'Sector', 'Keywords', 'SSP', 'RCP', 'GCM', 'Variables and units', 'Time start', 'Time end', 'Time resolution', 'Spatial coverage', 'Spatial resolution unit Europe', 'Spatial resolution Rest of World', 'Spatial projection', 'Data type', 'File format', 'Recommended citation', 'Other comments']

# Compare the headers of each hit's first metadata row with the template
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    rows = hit.get('coacch', {}).get('metadata_rows')
    # An empty row list has no first row to take the headers from
    if rows:
        headers = rows[0].keys()
        print(headers)
        for t in template_headers:
            if t not in headers:
                print(f"WARNING: required header '{t}' is absent.")
        for h in headers:
            if h not in template_headers:
                print(f"WARNING: header '{h}' is present but not required.")
    else:
        print("No metadata rows were converted")

-------- ID: 4733499
No metadata rows were converted
-------- ID: 5513871
odict_keys(['Name', 'Entry date', 'Dataset version', 'Author/Contact person', 'Short description', 'Partner', 'Model type/method', 'Model', 'Model version', 'Documentation', 'Sector', 'Keywords', 'SSP', 'RCP', 'GCM', 'Variables and units', 'Time start', 'Time end', 'Time resolution', 'Spatial coverage', 'Spatial resolution unit Europe', 'Spatial resolution Rest of World', 'Spatial projection', 'File format', 'Recommended citation', 'Other comments'])
-------- ID: 5529732
odict_keys(['Name', 'Entry date', 'Dataset version', 'Author/Contact person', 'Short description', 'Partner', 'Model type/method', 'Model', 'Model version', 'Documentation', 'Sector', 'Keywords', 'SSP', 'RCP', 'GCM', 'Variables and units', 'Time start', 'Time end', 'Time resolution', 'Spatial coverage', 'Spatial resolution unit Europe', 'Spatial resolution Rest of World', 'Spatial projection', 'File format', 'Recommended citation', 'Other comment

### Produce a reStructuredText page for each hit

In [50]:
# Announce each hit, then write its page
for hit in hits:
    hit_id = hit['id']
    hit_title = hit['metadata']['title']
    print(f"Producing reStructuredText for hit {hit_id}: {hit_title}")
    rest_hit(hit)

Producing reStructuredText for hit 4733499: Additional dataset to "Comparing urban coastal flood risk in 136 cities under two alternative sea-level projections: RCP 8.5 and an expert opinion-based high-end scenario"
Producing reStructuredText for hit 5513871: ImpactChains_GLOBIOM
Producing reStructuredText for hit 5529732: SETP Food
Producing reStructuredText for hit 5529888: River flooding impacts using CLIMRISK-RIVER
Producing reStructuredText for hit 5530146: Climate induced economic shocks using CLIMRISK
Producing reStructuredText for hit 5530237: Time of emergence of climate change impacts
Producing reStructuredText for hit 5534190: Riverine Flood Insurance assessment indicators under climate and socio-economic change
Producing reStructuredText for hit 5541327: Supply Chain Shocks due to extreme weather events
Producing reStructuredText for hit 5541894: Climate change impacts on energy demand
Producing reStructuredText for hit 5541337: Valuation of heat related mortality risk and 

### Add datasets to their respective class pages

In [51]:
# read the class file
exec(open('classes.py').read())
# collect the IDs of the hits
ids = [hit['id'] for hit in hits]
# set up a dictionary to keep track of which IDs were written
written = {}

for c in classes:
    page_path = f"../docs/classes/{c}.rst"
    # check that the IDs in the class were returned by the query
    for id in classes[c]:
        if id not in ids:
            print(f"WARNING: no hit with ID {id} in class '{c}' was returned by the Zenodo query!")
    # read the reStructuredText page of the class
    with open(page_path, "r", encoding = 'utf-8') as class_rst:
        lines = class_rst.readlines()
    # find the toctree
    i = 0
    while lines[i].find(".. toctree::") < 0:
        i += 1
    # find the start of the ToC entries
    while lines[i] != '\n':
        i += 1
    i += 1
    assert i > 2
    start = i
    # remove ToC entries
    while i < len(lines) and lines[i] != '\n':
        lines.pop(i)
    # add new ToC entries
    for id in reversed([hit['id'] for hit in hits]):
        if id in classes[c] or c =='other' and id not in written:
            written[id] = True
            lines.insert(start, f"   ../datasets/{id}\n")
    # write the reStructuredText page of the class
    with open(page_path, "w", encoding = 'utf-8', newline = '\n') as class_rst:
        class_rst.writelines(lines)

### Show aggregations (aggregate data over all query hits, presumably)

In [26]:
# Serialise the aggregations of the first query response to YAML and show them
aggregations_yaml = yaml.dump(aggregations)
print(aggregations_yaml)

access_right:
  buckets:
  - doc_count: 32
    key: open
  - doc_count: 1
    key: restricted
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
file_type:
  buckets:
  - doc_count: 19
    key: pdf
  - doc_count: 12
    key: csv
  - doc_count: 5
    key: xlsx
  - doc_count: 1
    key: 7z
  - doc_count: 1
    key: docx
  - doc_count: 1
    key: nc
  - doc_count: 1
    key: zip
  doc_count_error_upper_bound: 0
  sum_other_doc_count: 0
keywords:
  buckets:
  - doc_count: 11
    key: COACCH
  - doc_count: 2
    key: Climate Change impacts
  - doc_count: 2
    key: Macroeconomic assessment
  - doc_count: 2
    key: agriculture
  - doc_count: 1
    key: CLIMRISK
  - doc_count: 1
    key: CLIMRISK-RIVER
  - doc_count: 1
    key: Climate Change
  - doc_count: 1
    key: Climate change
  - doc_count: 1
    key: Climate change mitigation; Electricity; Europe; Risk; Capital costs
  - doc_count: 1
    key: Climate-change impacts, Geology
  doc_count_error_upper_bound: 0
  sum_other_doc_coun

### Dump a hit as YAML

In [8]:
# Serialise the first collected hit to YAML to inspect its structure
first_hit_yaml = yaml.dump(hits[0])
print(first_hit_yaml)

conceptdoi: 10.5281/zenodo.5549952
conceptrecid: '5549952'
created: '2021-10-05T12:22:18.934163+00:00'
doi: 10.5281/zenodo.5549953
files:
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:a7d40815c45ff1efa12e7c83517765c4
  key: COACCH_MetaData_T3.2_CT2.csv
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/COACCH_MetaData_T3.2_CT2.csv
  size: 16369
  type: csv
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:0c2003afe9eb8becc8f9ec130e6c1e1d
  key: L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc
  size: 327017524
  type: nc
- bucket: b049c94c-88b1-4c04-9ed6-6c46566eb1f5
  checksum: md5:aac5bc7de8970720e2cdff7ec49156cb
  key: L10000-glacier_CNRM-CM_SMHI-RCA4_rcp85_1985-2100_mon.nc
  links:
    self: https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CN