In [117]:
import csv
import json
import os
import tempfile
import time
import urllib
import urllib.parse
from pprint import pprint

import requests

### Helper functions

In [3]:
def get_nb_hits(json_response):
    """Return the number of hits contained in a decoded search response."""
    hits_on_page = json_response['hits']['hits']
    return len(hits_on_page)

def get_next_link(json_response):
    """Return the 'next' entry of the response's links, or None if there is none."""
    links = json_response['links']
    return links.get('next')

def reget(url, params=None, **kwargs):
    """
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests) is received.

    The URL and the query parameters are printed before the first attempt.
    An ``access_token`` parameter is left out of that printout, but ``params``
    itself is never modified, so the token is still sent with the request.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param **kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        if isinstance(params, (bytes, str)):
            # A pre-encoded query string cannot be redacted reliably, so it
            # is not echoed at all.
            print("<raw query string not shown>")
        else:
            # Build a redacted *copy*: deleting the key from `params` itself
            # would also strip the token from the request sent below.
            if hasattr(params, 'items'):
                redacted_params = {
                    key: value for key, value in params.items()
                    if key != 'access_token' # don't want to leak the token
                }
            else:
                redacted_params = [
                    list(pair) for pair in params
                    if pair[0] != 'access_token' # don't want to leak the token
                ]
            print(json.dumps(redacted_params, indent = 4))
    delay = 0.0
    while True:
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429: # not too many requests
            return response
        # Back off a little longer after every rate-limited attempt.
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

### Query and check that we got all hits on the first page

The query string uses the [Elasticsearch query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [142]:
# Search the records of the COACCH community on Zenodo and check that the
# whole result fits on the first page.
query = ""
hits_per_page = 100
params = {
    # Pass the raw query: requests URL-encodes parameter values itself, so
    # quoting it here as well would double-encode any special character.
    'q': query,
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': hits_per_page,
    'page': 1,
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200 # success
j = response.json()
nb_hits = get_nb_hits(j)
# A completely filled page could mean that further hits sit on later pages.
assert nb_hits < hits_per_page
print(f"{nb_hits} hits on the page.")

URL: https://zenodo.org/api/records, params:
{
    "q": "",
    "type": "dataset",
    "communities": "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    "size": 100,
    "page": 1
}
12 hits on the page.


### Pretty print first hit

In [143]:
# Pretty-print the complete JSON record of the first hit.
first_hit = j['hits']['hits'][0]
print(json.dumps(first_hit, indent = 2))

{
  "conceptdoi": "10.5281/zenodo.5546263",
  "conceptrecid": "5546263",
  "created": "2021-10-03T10:39:09.696683+00:00",
  "doi": "10.5281/zenodo.5546264",
  "files": [
    {
      "bucket": "cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f",
      "checksum": "md5:53d000634c2fdaae19ad4f5e2bd4280f",
      "key": "damage_coefficients-COACCH-WP4.xlsx",
      "links": {
        "self": "https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/damage_coefficients-COACCH-WP4.xlsx"
      },
      "size": 66047,
      "type": "xlsx"
    },
    {
      "bucket": "cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f",
      "checksum": "md5:16c58731d29607003a73d82461da15a5",
      "key": "damage_coefficients-ICES-COACCH-WP4.xlsx",
      "links": {
        "self": "https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/damage_coefficients-ICES-COACCH-WP4.xlsx"
      },
      "size": 144927,
      "type": "xlsx"
    },
    {
      "bucket": "cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f",
      "checksum": "md5:

### List the file URLs of each hit's dataset and guess which file holds the metadata

In [154]:
# Collect, per hit, the link of the one file that looks like a metadata file
# (None when no file of the hit matches).
meta_urls = []
for hit in j['hits']['hits']:
    print(f"-------- ID: {hit['id']}")
    meta_url = None
    for file_entry in hit['files']:
        link = file_entry['links']['self']
        lowered_link = link.lower()
        # "meta%20data" catches a URL-encoded space between "meta" and "data".
        looks_like_metadata = "metadata" in lowered_link or "meta%20data" in lowered_link
        if not looks_like_metadata:
            print(f"{link}")
            continue
        # At most one metadata candidate is expected per hit.
        assert meta_url is None
        meta_url = link
        print(f"{link} <-- metadata?")
    if meta_url is None:
        print(f"WARNING: data set {hit['id']} includes no obvious metadata file!")
    meta_urls.append(meta_url)

-------- ID: 5546264
https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/damage_coefficients-COACCH-WP4.xlsx
https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/damage_coefficients-ICES-COACCH-WP4.xlsx
https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/DFs_All_impacts.xlsx
https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/Metadata%20damage%20functions%20T4.3.csv <-- metadata?
-------- ID: 5546248
https://zenodo.org/api/files/c5de69df-aafb-46ca-8e57-a8a2aeafa8ba/COACCH_ICES-CMCC_data.7z
https://zenodo.org/api/files/c5de69df-aafb-46ca-8e57-a8a2aeafa8ba/COACCH_ICES_MetaData-CMCC.csv <-- metadata?
-------- ID: 5541337
https://zenodo.org/api/files/ace61379-f532-4208-b241-091201337012/COACCH_MetaData_T2.6_CUNI.csv <-- metadata?
https://zenodo.org/api/files/ace61379-f532-4208-b241-091201337012/COACCH_T2.6_Valuation%20of%20heat%20related%20mortality%20risk_CUNI.xlsx
https://zenodo.org/api/files/ace61379-f532-4208-b241-091201337012/COAC

### Download the metadata file of one of the hits as a binary file

In [164]:
# Download the metadata file of one hit and store it, byte for byte, under
# its original file name in the current working directory.
meta_url = meta_urls[4]
# meta_urls holds None for hits without an obvious metadata file.
assert meta_url is not None, "the selected hit has no metadata file URL"
# basename() drops any directory part that an encoded separator (%2F) in the
# last URL segment would otherwise smuggle into the local path.
meta_file_name = os.path.basename(urllib.parse.unquote(meta_url.split("/")[-1]))
r = requests.get(meta_url)
# Fail here on an HTTP error instead of saving the error page as the CSV.
r.raise_for_status()
with open(meta_file_name, 'wb') as f:
    f.write(r.content)
r.content

b'Name,Entry date,Dataset version,Author/Contact person,Short description,Partner,Model type/method,Model,Model version,Documentation,Sector,Keywords,SSP,RCP,GCM,Variables and units,Time start,Time end,Time resolution,Spatial coverage,Spatial resolution unit Europe,Spatial resolution Rest of World,Spatial projection,Data type,File format,Recommended citation,Other comments\r\nSupply Chain Shocks due to extreme weather events,24/08/2021,v1,Stefan Borsky,Projected supply chain shocks due to extrem weather events measured in annual percentage change in a country-sector\'s export activity compared to the baseline period ,UNI GRAZ,Econometrics,,,,Industry energy services and trade,"supply chain shocks, extreme weather events, climate change",SSP2,"RCP2.6, RCP4.5","HadGEM2-ES, IPSL-CM5A-LR, GFDL-ESM2M, NorESM1-M, MIROC-ESM-CHEM",SupplyChainShock (annual percentage change in export activity to the baseline period (1990\x962015)),1/1/2015,31/12/2099,Annual,Global,Country,Country,,,CSV,"Schleyp

### Import as CSV with Pandas

In [165]:
import pandas as pd

# Decoding as 'utf-8' fails on this file with a UnicodeDecodeError (byte 0x96),
# so the file is read as Windows-1252 instead.
encoding = 'Windows-1252'

print(meta_file_name)
mf = pd.read_csv(filepath_or_buffer=meta_file_name, encoding=encoding)
mf

COACCH_MetaData_WP2_4_Trade_and_Supply_Chain_Shocks.csv


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 920: invalid start byte

### Sniff and read the CSV with the csv module

In [118]:
# Detect the CSV dialect from the start of the file, then read the file as
# dictionaries keyed by the header row. Exactly one data row is expected.
# The file is decoded with the same `encoding` as the pandas import, because
# the platform default may not be able to decode it.
with open(meta_file_name, newline='', encoding=encoding) as csvfile:
    # Read the sample once and reuse it for both sniffer checks.
    sample = csvfile.read(1024)
    csvfile.seek(0)
    sniffer = csv.Sniffer()
    assert sniffer.has_header(sample) # must have a header
    dialect = sniffer.sniff(sample)
    reader = csv.DictReader(csvfile, dialect=dialect)
    first_row = None
    for row in reader:
        assert first_row is None # expect only one row
        first_row = row
    pprint(first_row)

OrderedDict([('Name', 'Reduced-form Climate Change Damage Functions '),
             ('Entry date', '29092021'),
             ('Dataset version', '1'),
             ('Author/Contact person',
              'Francesco Bosello, Ramiro Parrado, Kaj-Ivar Van der Wijst, '
              'Gabriele Standardi'),
             ('Short description',
              'Reduced-form Climate Change Damage Functions of impacts on: '
              'Agriculture, Fishery, Forestry, Sea level rise, Riverine '
              'floods, Transport, Energy supply, Energy demand, Labour '
              'productivity.'),
             ('Partner', 'CMCC/PBL'),
             ('Model type/method', 'Reduced-form Damage Function'),
             ('Model', 'NA'),
             ('Model version', 'NA'),
             ('Documentation',
              'https://www.coacch.eu/wp-content/uploads/2018/03/COACCH-Deliverable-4.3-to-upload.pdf'),
             ('Sector', 'Macro-economic'),
             ('Keywords',
              'Macroeconomi

### Test

In [157]:
# Display the collected metadata URLs, one entry per hit.
meta_urls

['https://zenodo.org/api/files/cd6c851e-1d5a-4cce-93b6-fdb73d3f9e4f/Metadata%20damage%20functions%20T4.3.csv',
 'https://zenodo.org/api/files/c5de69df-aafb-46ca-8e57-a8a2aeafa8ba/COACCH_ICES_MetaData-CMCC.csv',
 'https://zenodo.org/api/files/ace61379-f532-4208-b241-091201337012/COACCH_MetaData_T2.6_CUNI.csv',
 'https://zenodo.org/api/files/47e4e7fc-084d-4e22-83ef-59337a2f86a9/COACCH_MetaData_energy_demand.csv',
 'https://zenodo.org/api/files/caa37be2-8cb3-4fa5-be8a-b4270581b143/COACCH_MetaData_WP2_4_Trade_and_Supply_Chain_Shocks.csv',
 'https://zenodo.org/api/files/bbbc0e7e-dff8-4737-bd93-67da222b1478/metadata.csv',
 'https://zenodo.org/api/files/d69d64c8-affa-4166-9635-d14660bdcd60/metadata.csv',
 'https://zenodo.org/api/files/22c9648d-617e-4cd7-8a3b-fd85bb567dac/COACCH_MetaData.csv',
 'https://zenodo.org/api/files/3399b37e-2171-44d9-8ba5-088474b03d69/COACCH_MetaData.csv',
 'https://zenodo.org/api/files/0beb8030-0676-45fe-9134-3406d6116a73/T3.4_SETP_Food_MetaData.csv',
 'https://zenod