# Trial Zenodo API and processing of COACCH metadata

In [1]:
import chardet
import unicodecsv as csv
import io
import json
import os
import pandas as pd
import requests
import time
import urllib

from pprint import pprint

### Helper functions

In [2]:
def get_nb_hits(json_response):
    return len(json_response['hits']['hits'])

def get_next_link(json_response):
    return json_response['links'].get('next', None)

def reget(url, params=None, **kwargs):
    """
    Sends a GET request and resends it with increasing delays
    when status code 429 (too many requests) is received.

    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary, list of tuples or bytes to send
        in the query string for the :class:`Request`.
    :param **kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response
    """
    if params is None:
        print(f"URL: {url}")
    else:
        print(f"URL: {url}, params:")
        redacted_params = params
        del redacted_params['access_token'] # don't want to leak the token
        print(json.dumps(redacted_params, indent = 4))
    delay = 0.0
    while True:  
        response = requests.get(url, params=params, **kwargs)
        if response.status_code != 429: # not too many requests
            return response
        delay += 2
        print(f"delay: {delay}s to circumvent rate limiting...")
        time.sleep(delay)

def guess_encoding(file, n_lines=20):
    '''Guess a file's encoding using chardet'''
    import chardet

    # Open the file as binary data
    if issubclass(type(file), io.BufferedIOBase):
        file.seek(0)
        rawdata = b''.join([file.readline() for _ in range(n_lines)])
        file.seek(0)
    else:
        # assume we were handed a file path
        with open(file_path, 'rb') as f:
            # Join binary lines for specified number of lines
            rawdata = b''.join([f.readline() for _ in range(n_lines)])

    return chardet.detect(rawdata)['encoding']

### Query and check that we got all hits on the first page. List the IDs.

The query string uses [elastic search syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [3]:
query = "Climate change impacts on energy demand"
hits_per_page = 100
params = {
    'q': urllib.parse.quote(query),
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': hits_per_page,
    'page': 1,
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200 # success
j = response.json()
hits = j['hits']['hits']
assert len(hits) < hits_per_page
print(f"{len(hits)} hits on the page.")
pprint([hit['id'] for hit in hits])

URL: https://zenodo.org/api/records, params:
{
    "q": "Climate%20change%20impacts%20on%20energy%20demand",
    "type": "dataset",
    "communities": "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    "size": 100,
    "page": 1
}
13 hits on the page.
[4733499,
 5530146,
 5541894,
 5546264,
 5546248,
 5530237,
 5513871,
 5541337,
 5529732,
 5534190,
 5549953,
 5541327,
 5529888]


### Pretty print first hit

In [4]:
print(json.dumps(j['hits']['hits'][0], indent = 2))

{
  "conceptdoi": "10.5281/zenodo.5549952",
  "conceptrecid": "5549952",
  "created": "2021-10-05T12:22:18.934163+00:00",
  "doi": "10.5281/zenodo.5549953",
  "files": [
    {
      "bucket": "b049c94c-88b1-4c04-9ed6-6c46566eb1f5",
      "checksum": "md5:a7d40815c45ff1efa12e7c83517765c4",
      "key": "COACCH_MetaData_T3.2_CT2.csv",
      "links": {
        "self": "https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/COACCH_MetaData_T3.2_CT2.csv"
      },
      "size": 16369,
      "type": "csv"
    },
    {
      "bucket": "b049c94c-88b1-4c04-9ed6-6c46566eb1f5",
      "checksum": "md5:0c2003afe9eb8becc8f9ec130e6c1e1d",
      "key": "L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc",
      "links": {
        "self": "https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc"
      },
      "size": 327017524,
      "type": "nc"
    },
    {
      "bucket": "b049c94c-88b1-4c04-9ed6-6c46566eb1f5",
      "

### List URLs of files in the data-set of a hit, and guess the COACCH-specific metadata file

In [5]:
meta_urls = []
for hit in j['hits']['hits']:
    print(f"-------- ID: {hit['id']}")
    files = hit['files']
    meta_url = None
    for f in files:
        link = f['links']['self']
        if link.lower().find("metadata") >= 0 or link.lower().find("meta%20data") >= 0:
            assert meta_url is None
            meta_url = link
            print(f"{link} <-- metadata?")
        else:
            print(f"{link}")
    if meta_url is None:
        print(f"WARNING: data set {hit['id']} includes no obvious metadata file!")
    meta_urls.append(meta_url)

-------- ID: 5549953
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/COACCH_MetaData_T3.2_CT2.csv <-- metadata?
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CNRM-CM_SMHI-RCA4_rcp45_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_CNRM-CM_SMHI-RCA4_rcp85_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_IPSL_WRF331F_rcp45_1985-2100_mon.nc
https://zenodo.org/api/files/b049c94c-88b1-4c04-9ed6-6c46566eb1f5/L10000-glacier_IPSL_WRF331F_rcp85_1985-2100_mon.nc
https://zen

### Download meta data as an in-memory binary-file-like object

In [6]:
meta_url = meta_urls[0]
r = requests.get(meta_url)
meta_data = io.BytesIO(r.content)
r.content

b'Name,Entry date,Dataset version,Author/Contact person,Short description,Partner,Model type/method,Model,Model version,Documentation,Sector,Keywords,SSP,RCP,GCM,Variables and units,Time start,Time end,Time resolution,Spatial coverage,Spatial resolution unit Europe,Spatial resolution Rest of World,Spatial projection,Data type,File format,Recommended citation,Other comments\r\nL1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc,9202021,v1,Daniele Peano,Glacier Length change starting from small glaciers,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2016) A minimal model approach for glacier length modeling in the western Italian Alps, Geogr. Fis. Dinam. Quat., 39(1), 69\x9682, doi:10.4461/GFDQ.2016.39.7 ",Industry energy services and trade,"Glacier, climate change, tipping point",,RCP2.6,RACMO22E-EC-EARTH,"L (m), sbm (mwe/yr)",1985,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine Glacier Disappeara

### Import CSV as dataframe with Pandas

In [7]:
mf = pd.read_csv(meta_data, encoding=guess_encoding(meta_data))
mf

Unnamed: 0,Name,Entry date,Dataset version,Author/Contact person,Short description,Partner,Model type/method,Model,Model version,Documentation,...,Time end,Time resolution,Spatial coverage,Spatial resolution unit Europe,Spatial resolution Rest of World,Spatial projection,Data type,File format,Recommended citation,Other comments
0,L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_198...,9202021,v1,Daniele Peano,Glacier Length change starting from small glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
1,L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_198...,9202021,v1,Daniele Peano,Glacier Length change starting from medium gla...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
2,L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_19...,9202021,v1,Daniele Peano,Glacier Length change starting from large glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
3,L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_198...,9202021,v1,Daniele Peano,Glacier Length change starting from small glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
4,L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_198...,9202021,v1,Daniele Peano,Glacier Length change starting from medium gla...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
5,L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_19...,9202021,v1,Daniele Peano,Glacier Length change starting from large glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
6,L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_198...,9202021,v1,Daniele Peano,Glacier Length change starting from small glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
7,L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_198...,9202021,v1,Daniele Peano,Glacier Length change starting from medium gla...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
8,L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_19...,9202021,v1,Daniele Peano,Glacier Length change starting from large glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",
9,L1000-glacier_MPI-M_REMO2009_rcp26_1985-2100_m...,9202021,v1,Daniele Peano,Glacier Length change starting from small glac...,CMCC,Minimal Glacier Model,Minimal Glacier Model,,"Peano D., Chiarle M., and Von Hardenberg J. (2...",...,2100,yearly,Europe,~11km,,,,netcdf,"Peano D., and Scoccimarro E. WP3 - CT2 Alpine ...",


### Sniff and read the CSV with the unicodecsv module

In [8]:
encoding = guess_encoding(meta_data)
meta_data_chunk = str(meta_data.read(8000), encoding)
meta_data.seek(0)
assert csv.Sniffer().has_header(meta_data_chunk) # must have a header
dialect = csv.Sniffer().sniff(meta_data_chunk)
reader = csv.DictReader(meta_data, dialect=dialect, encoding=encoding)
meta = []
for row in reader:
    print("-----")
    pprint(row)
    meta.append(row)

-----
OrderedDict([('Name',
              'L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc'),
             ('Entry date', '9202021'),
             ('Dataset version', 'v1'),
             ('Author/Contact person', 'Daniele Peano'),
             ('Short description',
              'Glacier Length change starting from small glaciers'),
             ('Partner', 'CMCC'),
             ('Model type/method', 'Minimal Glacier Model'),
             ('Model', 'Minimal Glacier Model'),
             ('Model version', ''),
             ('Documentation',
              'Peano D., Chiarle M., and Von Hardenberg J. (2016) A minimal '
              'model approach for glacier length modeling in the western '
              'Italian Alps, Geogr. Fis. Dinam. Quat., 39(1), 69–82, '
              'doi:10.4461/GFDQ.2016.39.7 '),
             ('Sector', 'Industry energy services and trade'),
             ('Keywords', 'Glacier, climate change, tipping point'),
             ('SSP', ''),
             (

### Extract and check column headers

In [9]:
expected_headers = ['Name', 'Entry date', 'Dataset version', 'Author/Contact person', 'Short description', 'Partner', 'Model type/method', 'Model', 'Model version', 'Documentation', 'Sector', 'Keywords', 'SSP', 'RCP', 'GCM', 'Variables and units', 'Time start', 'Time end', 'Time resolution', 'Spatial coverage', 'Spatial resolution unit Europe', 'Spatial resolution Rest of World', 'Spatial projection', 'Data type', 'File format', 'Recommended citation', 'Other comments']
assert len(meta[0].keys()) == len(expected_headers) # expected number of headers?
for header,expected in zip(meta[0].keys(), expected_headers):
    if header != expected:
        print(f"WaRNING: header '{header}' differs from expected header '{expected}'!")

### List row names

In [10]:
for m in meta:
    print(m['Name'])

L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc
L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc
L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp26_1985-2100_mon.nc
L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_1985-2100_mon.nc
L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_1985-2100_mon.nc
L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp45_1985-2100_mon.nc
L1000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_1985-2100_mon.nc
L4000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_1985-2100_mon.nc
L10000-glacier_EC-EARTH_KNMI-RACMO22E_rcp85_1985-2100_mon.nc
L1000-glacier_MPI-M_REMO2009_rcp26_1985-2100_mon.nc
L4000-glacier_MPI-M_REMO2009_rcp26_1985-2100_mon.nc
L10000-glacier_MPI-M_REMO2009_rcp26_1985-2100_mon.nc
L1000-glacier_IPSL_WRF331F_rcp45_1985-2100_mon.nc
L4000-glacier_IPSL_WRF331F_rcp45_1985-2100_mon.nc
L10000-glacier_IPSL_WRF331F_rcp45_1985-2100_mon.nc
L1000-glacier_IPSL_WRF331F_rcp85_1985-2100_mon.nc
L4000-glacier_IPSL_WRF331F_rcp85_1985-2100_mon.nc
L10000-glacier_IPSL_WRF331F_rcp85_1985-2100_mon.n

In [20]:
j.keys()

dict_keys(['aggregations', 'hits', 'links'])