# Query COACCH data sets and generate Data Repository [ReST](https://en.wikipedia.org/wiki/ReStructuredText)

In [None]:
import codecs
import io
import os
import urllib
import urllib.parse
from pprint import pprint

import unicodecsv as csv
import yaml

# Bespoke modules
from classes import *
from zenodo_helpers import *
from rest_helpers import *

### Query and collect hits from result pages

The query string uses [elastic search syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax).

In [None]:
# Search expression in Elasticsearch query-string syntax; left empty here.
query = ""
# Perform query and collect initial response
# NOTE(review): the query is percent-encoded before it is placed in `params`.
# If reget() hands `params` to an HTTP client that encodes parameters itself,
# a non-empty query would be encoded twice -- confirm against reget().
params = {
    'q': urllib.parse.quote(query),
    #'type': 'publication',
    'type': 'dataset',
    'communities': "coacch-co-designing-the-assessment-of-climate-change-costs-h2020-project",
    'size': 10, # hits per page
    'page': 1,
    'sort': '-mostrecent',
    'access_token': os.environ['ZENODO_API_TOKEN']
}
response = reget("https://zenodo.org/api/records", params = params)
assert response.status_code == 200, \
    f"Zenodo query failed with HTTP status {response.status_code}"
j = response.json()
hits = j['hits']['hits'] # initial set of hits, to be appended to for next pages
print(f"--> {len(hits)}")
hits_total = j['hits']['total']
# Aggregations are taken from the first result page only
aggregations = j['aggregations']

# Process further hits/pages until they exhaust.
# The link to the next page is looked up once per page and reused for both
# the loop test and the request.
next_link = get_next_link(j)
while next_link:
    next_response = reget(next_link)
    assert next_response.status_code == 200, \
        f"Fetching {next_link} failed with HTTP status {next_response.status_code}"
    j = next_response.json()
    hits.extend(j['hits']['hits'])
    next_link = get_next_link(j)
# Every hit reported by the first page must have been collected
assert len(hits) == hits_total, \
    f"Collected {len(hits)} hits but the query reported {hits_total}"

# Report total hits and ids
print(f"Query resulted in a total of {hits_total} hits with Zenodo IDs:")
pprint([hit['id'] for hit in hits])

### Strip HTML markup from description metadata

In [None]:
# Replace each hit's description with its markup-free version, in place
for hit in hits:
    record_metadata = hit['metadata']
    plain_description = strip_html_markup(record_metadata['description'])
    record_metadata['description'] = plain_description

### Download the Zenodo DOI badges as static content

This way the pages do not need a separate connection to Zenodo to display the badges,
and the badges can also be included in the generated PDF.

In [None]:
# Directory in which the badge images are kept as static content
badge_dir = "../docs/_static/badges"
if not os.path.exists(badge_dir):
    os.makedirs(badge_dir)

# Every badge already on disk counts as stale until a hit claims it
stale_badges = os.listdir(badge_dir)

# Fetch the badge of each hit and store it under the last segment of its URL
for hit in hits:
    badge_url = hit['links']['badge']
    badge_name = badge_url.rsplit('/', 1)[-1]
    badge_path = f"{badge_dir}/{badge_name}"
    badge_response = reget(badge_url)
    with open(badge_path, "wb") as badge_file:
        badge_file.write(badge_response.content)
    try:
        # The badge is in use, so it must survive the clean-up below
        stale_badges.remove(badge_name)
    except ValueError:
        # The badge was not on disk before this run; nothing to unmark
        pass

# Delete the badges that no hit refers to any more
for leftover_badge in stale_badges:
    os.remove(f"{badge_dir}/{leftover_badge}")

### Try to determine the URL of the COACCH metadata file for each hit

In [None]:
meta_urls = []
# For every hit, list its file links and pick the one that looks like a
# COACCH metadata file; the choice is stored as hit['coacch']['meta_url'].
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    meta_url = None
    for entry in hit['files']:
        link = entry['links']['self']
        lowered = link.lower()
        # Links without "metadata" / "meta%20data" are listed and skipped
        if "metadata" not in lowered and "meta%20data" not in lowered:
            print(f"{link}")
            continue
        if "datasetwide" in lowered:
            # A dataset wide metadata file wins outright; stop looking
            meta_url = link
            print(f"{link} <-- preferred metadata")
            break
        # Apart from a dataset wide file, only one metadata file is expected
        assert meta_url is None
        meta_url = link
        print(f"{link} <-- metadata?")
    if meta_url is not None:
        # Container for the COACCH-specific additions to the hit
        hit['coacch'] = {'meta_url': meta_url}
    else:
        print(f"WARNING: data set {hit['id']} includes no obvious metadata file!")

### Retrieve COACCH metadata files for the hits

Store as an in-memory binary-file-like object

In [None]:
# Download the metadata file of every hit for which a URL was determined and
# keep the raw bytes as an in-memory binary stream in hit['coacch']['metadata']
for hit in hits:
    if 'coacch' not in hit or 'meta_url' not in hit['coacch']:
        continue
    coacch = hit['coacch']
    metadata_response = reget(coacch['meta_url'])
    coacch['metadata'] = io.BytesIO(metadata_response.content)
print("Done retrieving COACCH metadata for hits.")

### Special handling: Retrieve COACCH metadata for 4733499

In [None]:
# Look up the hit for record 4733499. When the query did not return it, the
# name is bound to None instead of staying unbound (or stale from an earlier
# run of this cell), so the wrong object can never be patched.
hit4733499 = next((hit for hit in hits if hit['id'] == 4733499), None)
if hit4733499 is None:
    print("WARNING: no hit with ID 4733499 was returned by the Zenodo query; special handling skipped.")
else:
    # Use the locally stored metadata file for this record
    with open("../bad_metadata/COACCH_MetaData_BC3_WP4.csv", "rb") as f:
        contents = f.read()
    # The 'coacch' entry is replaced wholesale, which also discards any
    # 'meta_url' recorded for this hit earlier on.
    hit4733499['coacch'] = {}
    hit4733499['coacch']['metadata'] = io.BytesIO(contents)

### Special handling: Retrieve COACCH metadata for 5541894

In [None]:
# Look up the hit for record 5541894. When the query did not return it, the
# name is bound to None instead of staying unbound (or stale from an earlier
# run of this cell), so the wrong object can never be patched.
hit5541894 = next((hit for hit in hits if hit['id'] == 5541894), None)
if hit5541894 is None:
    print("WARNING: no hit with ID 5541894 was returned by the Zenodo query; special handling skipped.")
else:
    # Use the locally stored metadata file for this record
    with open("../bad_metadata/COACCH_MetaData_energy_demand_cleaned_up.csv", "rb") as f:
        contents = f.read()
    # The 'coacch' entry is replaced wholesale, which also discards any
    # 'meta_url' recorded for this hit earlier on.
    hit5541894['coacch'] = {}
    hit5541894['coacch']['metadata'] = io.BytesIO(contents)

### For each hit, check and convert the metadata CSV to dictionaries for each row

In [None]:
def _decode_sample(stream, encodings, size=8000):
    """Decode the leading bytes of a binary stream with the first fitting encoding.

    Args:
        stream: Seekable binary file-like object; rewound before every attempt.
        encodings: Encoding names to try, in order of preference.
        size: Maximum number of bytes read for the sample.

    Returns:
        A ``(text, encoding)`` tuple: the decoded sample and the encoding that
        decoded it.

    Raises:
        UnicodeDecodeError: If none of the encodings can decode the sample.
    """
    last_error = None
    for candidate in encodings:
        stream.seek(0)
        raw = stream.read(size)
        # An incremental decoder accepts a multi-byte character that the
        # sample boundary cuts in half; a plain decode would reject it and
        # wrongly disqualify the encoding. Only a sample that reaches the end
        # of the stream is decoded as final.
        decoder = codecs.getincrementaldecoder(candidate)()
        try:
            return decoder.decode(raw, final=len(raw) < size), candidate
        except UnicodeDecodeError as error:
            last_error = error
    raise last_error


# Parse the metadata CSV of every hit into one dictionary per data row and
# store the list as hit['coacch']['metadata_rows'].
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    if 'coacch' not in hit or 'metadata' not in hit['coacch']:
        print("WARNING: metadata absent")
        continue
    metadata = hit['coacch']['metadata']
    # Try the guessed encoding first, then Windows-1252, then UTF-8
    metadata_chunk, encoding = _decode_sample(
        metadata, (guess_encoding(metadata), 'Windows-1252', 'utf-8'))
    # Rewind so that the CSV reader sees the stream from its first byte
    metadata.seek(0)
    sniffer = csv.Sniffer()
    if not sniffer.has_header(metadata_chunk):
        # Hits whose metadata was loaded by the special handling cells carry
        # no 'meta_url', so the lookup must not assume the key exists.
        meta_url = hit['coacch'].get('meta_url', "unknown (no metadata URL recorded for this hit)")
        print(f"WARNING: metadata of dataset https://zenodo.org/record/{hit['id']} has no CSV header. The metadata URL is {meta_url}")
        continue
    dialect = sniffer.sniff(metadata_chunk)
    reader = csv.DictReader(metadata, dialect=dialect, encoding=encoding)
    rows = list(reader)
    # Special handling for dataset 5541894: want the 2nd of 2 rows, so
    # reverse the list to place it at index 0
    if hit['id'] == 5541894:
        rows.reverse()
    hit['coacch']['metadata_rows'] = rows
    print(f"Converted {len(rows)} row{'' if len(rows) == 1 else 's'} of metadata")

### Check metadata column headers

In [None]:
# Column headers that the COACCH metadata template requires
template_headers = [
    'Name', 'Entry date', 'Dataset version', 'Author/Contact person',
    'Short description', 'Partner', 'Model type/method', 'Model',
    'Model version', 'Documentation', 'Sector', 'Keywords', 'SSP', 'RCP',
    'GCM', 'Variables and units', 'Time start', 'Time end',
    'Time resolution', 'Spatial coverage', 'Spatial resolution unit Europe',
    'Spatial resolution Rest of World', 'Spatial projection', 'Data type',
    'File format', 'Recommended citation', 'Other comments',
]

# Compare the headers of each hit's first metadata row with the template and
# warn about both missing and unexpected columns.
for hit in hits:
    print(f"-------- ID: {hit['id']}")
    rows = hit.get('coacch', {}).get('metadata_rows')
    # A missing or empty row list (CSV with a header but no data rows) has no
    # first row to inspect.
    if not rows:
        print("No metadata rows were converted")
        continue
    headers = rows[0].keys()
    print(headers)
    for t in template_headers:
        if t not in headers:
            print(f"WARNING: required header '{t}' is absent.")
    for h in headers:
        if h not in template_headers:
            print(f"WARNING: header '{h}' is present but not required.")

### Remove spurious line feeds from COACCH metadata

In [None]:
# Remove line feeds from the 'RCP' value of each hit's first metadata row.
for hit in hits:
    rows = hit.get('coacch', {}).get('metadata_rows')
    # Nothing to clean when no rows were converted or the CSV had no data rows
    if not rows:
        continue
    cm = rows[0]
    rcp = cm.get('RCP')
    # The header check only warns about a missing 'RCP' column, and the value
    # may be None when it is absent from the row; skip both cases.
    if rcp is not None:
        cm['RCP'] = rcp.replace('\n', '')

### Produce a reStructuredText page for each hit

In [None]:
# The page directory is named after the queried record type, in plural
page_dir = f"../docs/{params['type']}s"
if not os.path.exists(page_dir):
    os.makedirs(page_dir)

# Every page already on disk counts as stale until a hit regenerates it
stale_pages = os.listdir(page_dir)

# Write one reStructuredText page per hit into the page directory
for hit in hits:
    hit_title = hit['metadata']['title']
    print(f"Producing reStructuredText for hit {hit['id']}: {hit_title}")
    page_name = rest_hit(hit, page_dir)
    try:
        # The page was just regenerated, so it must survive the clean-up below
        stale_pages.remove(page_name)
    except ValueError:
        # The page did not exist before this run; nothing to unmark
        pass

# Delete the pages that no hit produced any more
for leftover_page in stale_pages:
    os.remove(f"{page_dir}/{leftover_page}")



### Add datasets to their respective class pages

In [None]:
# collect the IDs of the hits
ids = [hit['id'] for hit in hits]
# set up a dictionary to keep track of which IDs were written
written = {}

# Rewrite the toctree entries of every class page so that they list the
# datasets of that class, in the order of the hits.
for c in classes:
    page_path = f"../docs/classes/{c}.rst"
    # check that the IDs in the class were returned by the query
    for class_id in classes[c]:
        if class_id not in ids:
            print(f"WARNING: no hit with ID {class_id} in class '{c}' was returned by the Zenodo query!")
    # read the reStructuredText page of the class
    with open(page_path, "r", encoding = 'utf-8') as class_rst:
        lines = class_rst.readlines()
    # find the toctree
    toctree_index = next(
        (n for n, line in enumerate(lines) if ".. toctree::" in line), None)
    if toctree_index is None:
        raise ValueError(f"{page_path} contains no '.. toctree::' directive")
    # find the start of the ToC entries: the line after the first blank line
    # that follows the directive
    try:
        start = lines.index('\n', toctree_index) + 1
    except ValueError:
        raise ValueError(
            f"{page_path} has no blank line after the '.. toctree::' directive") from None
    # sanity check carried over from the original implementation
    assert start > 2
    # the existing ToC entries run up to the next blank line or the end of file
    end = start
    while end < len(lines) and lines[end] != '\n':
        end += 1
    # collect the new ToC entries: a dataset goes to every class that lists
    # it; the class 'other' additionally takes each dataset that has not been
    # written to any page so far
    entries = []
    for hit_id in ids:
        if hit_id in classes[c] or (c == 'other' and hit_id not in written):
            written[hit_id] = True
            entries.append(f"   ../datasets/{hit_id}\n")
    # swap the old ToC entries for the new ones in a single step
    lines[start:end] = entries
    # write the reStructuredText page of the class
    with open(page_path, "w", encoding = 'utf-8', newline = '\n') as class_rst:
        class_rst.writelines(lines)

### Show aggregations (aggregate data over all query hits, presumably)

In [None]:
# Print the 'aggregations' object of the first result page, serialised as YAML
print(yaml.dump(aggregations))

### Dump a hit as YAML

In [None]:
# Print the first collected hit, serialised as YAML, to inspect its structure.
# NOTE(review): hits[0] raises IndexError when the query returned no hits.
print(yaml.dump(hits[0]))