In [1]:
from lxml import etree
import requests
import shutil
import os

In [2]:
# OAI-PMH endpoint of the "whos-plata" view. The access token embedded in the
# URL is taken from the WHOS_TOKEN environment variable so that a real token
# does not have to be saved in the notebook; the literal "my_token" is kept as
# the fallback when the variable is not set.
endpoint = "https://whos.geodab.eu/gs-service/services/essi/token/%s/view/whos-plata/oaipmh" % os.environ.get("WHOS_TOKEN", "my_token")
# Metadata format requested from the service in the cells below.
metadata_prefix = "WIGOS-1.0RC6"

### list metadata formats

In [3]:
def getFormats(endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",output_file="metadataFormats.xml"):
    """Return the metadata prefixes advertised by an OAI-PMH endpoint.

    Issues a ``ListMetadataFormats`` request, saves the raw response to
    ``output_file`` (overwriting it) and parses that file.

    Args:
        endpoint: URL of the OAI-PMH service.
        output_file: Path the raw XML response is written to.

    Returns:
        List with the text of each ``metadataPrefix`` element, in document
        order.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        ValueError: The response has no ``ListMetadataFormats`` element.
    """
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    response = requests.get(endpoint,params={"verb":"ListMetadataFormats"})
    response.raise_for_status()
    # Store the undecoded bytes: the parser then honours the encoding declared
    # by the XML document instead of whatever the local default encoding is.
    with open(output_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(output_file).getroot()
    list_metadata_formats = root.find(oai_ns + "ListMetadataFormats")
    if list_metadata_formats is None:
        raise ValueError("no ListMetadataFormats element in the OAI-PMH response saved to %s" % output_file)
    prefixes = []
    for metadata_format in list_metadata_formats.iter(oai_ns + "metadataFormat"):
        prefix = metadata_format.find(oai_ns + "metadataPrefix")
        prefixes.append(prefix.text)
    return prefixes

In [4]:
# Pass the endpoint configured at the top of the notebook explicitly, so that
# editing that cell actually changes which service is queried.
prefixes = getFormats(endpoint=endpoint)

### list identifiers

In [5]:
def getIdentifiers(endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",metadata_prefix="WIGOS-1.0RC6",output_file="identifiers.xml"):
    """Fetch the first page of record headers with ``ListIdentifiers``.

    The raw response is saved to ``output_file`` (overwriting it) and parsed
    from there.

    Args:
        endpoint: URL of the OAI-PMH service.
        metadata_prefix: Value sent as the ``metadataPrefix`` argument.
        output_file: Path the raw XML response is written to.

    Returns:
        Tuple ``(identifiers, resumption_token)``. ``identifiers`` is a list of
        dicts with keys ``identifier``, ``datestamp`` and ``setSpec``; a value
        is ``None`` when the header lacks that element. ``resumption_token`` is
        ``None`` when the response has no ``resumptionToken`` element or the
        element is empty.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        ValueError: The response has no ``ListIdentifiers`` element.
    """
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"

    def child_text(parent, tag):
        # Text of the first matching child, or None when the child is absent.
        child = parent.find(oai_ns + tag)
        return None if child is None else child.text

    response = requests.get(endpoint,params={"verb":"ListIdentifiers","metadataPrefix":metadata_prefix})
    response.raise_for_status()
    # Store the undecoded bytes so the parser applies the encoding declared in
    # the XML document itself.
    with open(output_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(output_file).getroot()
    list_identifiers = root.find(oai_ns + "ListIdentifiers")
    if list_identifiers is None:
        raise ValueError("no ListIdentifiers element in the OAI-PMH response saved to %s" % output_file)
    identifiers = []
    for header in list_identifiers.iter(oai_ns + "header"):
        identifiers.append({
            "identifier": child_text(header, "identifier"),
            "datestamp" : child_text(header, "datestamp"),
            # setSpec is not guaranteed to be present on every header.
            "setSpec" : child_text(header, "setSpec")
        })
    # No resumptionToken element means the list was complete in one response.
    resumption_token = child_text(list_identifiers, "resumptionToken")
    return identifiers, resumption_token


In [6]:
# Use the endpoint and metadata prefix configured at the top of the notebook
# instead of relying on the duplicated defaults in the function signature.
identifiers, resumption_token = getIdentifiers(endpoint=endpoint, metadata_prefix=metadata_prefix)

### resume identifiers download

In [7]:
def resumeGetIdentifiers(resumption_token,endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",output_file="identifiers.xml"):
    """Fetch the next page of a ``ListIdentifiers`` result.

    The raw response is saved to ``output_file`` (overwriting it) and parsed
    from there.

    Args:
        resumption_token: Token returned with the previous page.
        endpoint: URL of the OAI-PMH service.
        output_file: Path the raw XML response is written to.

    Returns:
        Tuple ``(identifiers, resumption_token)``. ``identifiers`` is a list of
        dicts with keys ``identifier``, ``datestamp`` and ``setSpec``; a value
        is ``None`` when the header lacks that element. The returned token is
        ``None`` when the response has no ``resumptionToken`` element or the
        element is empty.

    Raises:
        ValueError: ``resumption_token`` is empty, or the response has no
            ``ListIdentifiers`` element.
        requests.HTTPError: The server answered with an HTTP error status.
    """
    if not resumption_token:
        # Without a token the request would be sent with no resumptionToken
        # argument at all; fail before touching the network.
        raise ValueError("resumption_token is empty: there is no further page to request")
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"

    def child_text(parent, tag):
        # Text of the first matching child, or None when the child is absent.
        child = parent.find(oai_ns + tag)
        return None if child is None else child.text

    response = requests.get(endpoint,params={"verb":"ListIdentifiers","resumptionToken":resumption_token})
    response.raise_for_status()
    # Store the undecoded bytes so the parser applies the encoding declared in
    # the XML document itself.
    with open(output_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(output_file).getroot()
    list_identifiers = root.find(oai_ns + "ListIdentifiers")
    if list_identifiers is None:
        raise ValueError("no ListIdentifiers element in the OAI-PMH response saved to %s" % output_file)
    identifiers = []
    for header in list_identifiers.iter(oai_ns + "header"):
        identifiers.append({
            "identifier": child_text(header, "identifier"),
            "datestamp" : child_text(header, "datestamp"),
            # setSpec is not guaranteed to be present on every header.
            "setSpec" : child_text(header, "setSpec")
        })
    next_token = child_text(list_identifiers, "resumptionToken")
    return identifiers, next_token

In [8]:
# resumeGetIdentifiers returns (identifiers, resumption_token); unpack both so
# that more_identifiers is the list of headers rather than the whole tuple.
more_identifiers, identifiers_resumption_token = resumeGetIdentifiers(resumption_token, endpoint=endpoint)

### list records (optionally filtered by set_spec)

In [9]:
def getRecords(endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",metadata_prefix="WIGOS-1.0RC6",output_file="records.xml",set_spec=None):
    """Fetch the first page of records with ``ListRecords``.

    The raw response is saved to ``output_file`` (overwriting it) and parsed
    from there.

    Args:
        endpoint: URL of the OAI-PMH service.
        metadata_prefix: Value sent as the ``metadataPrefix`` argument.
        output_file: Path the raw XML response is written to.
        set_spec: When not ``None``, sent as the ``set`` argument.

    Returns:
        Tuple ``(records, resumption_token, completeListSize, cursor)``.
        ``records`` is a list of dicts with keys ``identifier`` (text of the
        header identifier) and ``metadata`` (the ``WIGOSMetadataRecord``
        element, or ``None`` when the record has none). When the response has
        no ``resumptionToken`` element the result is treated as complete:
        the token is ``None``, ``completeListSize`` is ``len(records)`` and
        ``cursor`` is ``0``.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        ValueError: The response has no ``ListRecords`` element.
        KeyError: A ``resumptionToken`` element is present but lacks the
            ``completeListSize`` or ``cursor`` attribute.
    """
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    params={"verb":"ListRecords","metadataPrefix":metadata_prefix}
    if set_spec is not None:
        params["set"] = set_spec
    response = requests.get(endpoint,params=params)
    response.raise_for_status()
    # Store the undecoded bytes so the parser applies the encoding declared in
    # the XML document itself.
    with open(output_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(output_file).getroot()
    list_records = root.find(oai_ns + "ListRecords")
    if list_records is None:
        raise ValueError("no ListRecords element in the OAI-PMH response saved to %s" % output_file)
    records = []
    for record in list_records.iter(oai_ns + "record"):
        identifier = record.find(oai_ns + "header/" + oai_ns + "identifier").text
        records.append({
            "identifier": identifier,
            "metadata" : record.find(oai_ns + "metadata/{http://def.wmo.int/wmdr/2017}WIGOSMetadataRecord")
        })
    # Look the token element up once instead of three times.
    token_element = list_records.find(oai_ns + "resumptionToken")
    if token_element is None:
        # Everything fitted into this single response.
        resumption_token = None
        completeListSize = len(records)
        cursor = 0
    else:
        resumption_token = token_element.text
        completeListSize = int(token_element.attrib["completeListSize"])
        cursor = int(token_element.attrib["cursor"])
    return records, resumption_token, completeListSize, cursor

In [10]:
# Set whose records are harvested in the cells below.
set_spec = "argentina-ina"
# Use the endpoint and metadata prefix configured at the top of the notebook
# instead of relying on the duplicated defaults in the function signature.
records, resumption_token, completeListSize, cursor = getRecords(endpoint=endpoint, metadata_prefix=metadata_prefix, set_spec=set_spec)

### resume list records, download iteratively

In [11]:

def getRecordsNextPage(resumption_token,endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",output_file="records.xml",output_dir="wigos_records"):
    """Fetch the next page of a ``ListRecords`` result and archive the raw page.

    The raw response is saved to ``output_file`` (overwriting it) and then
    copied to ``<output_dir>/records_<cursor>.xml``; ``output_dir`` is created
    when missing, including intermediate directories.

    Args:
        resumption_token: Token returned with the previous page.
        endpoint: URL of the OAI-PMH service.
        output_file: Path the raw XML response is written to.
        output_dir: Directory receiving the per-page copy.

    Returns:
        Tuple ``(records, cursor, new_token)``. ``records`` is a list of dicts
        with keys ``identifier`` (text of the header identifier) and
        ``metadata`` (the ``WIGOSMetadataRecord`` element, or ``None`` when the
        record has none). ``cursor`` is the integer ``cursor`` attribute of the
        ``resumptionToken`` element and ``new_token`` is that element's text
        (``None`` when it is empty).

    Raises:
        ValueError: ``resumption_token`` is empty, or the response has no
            ``ListRecords`` or no ``resumptionToken`` element.
        requests.HTTPError: The server answered with an HTTP error status.
        KeyError: The ``resumptionToken`` element has no ``cursor`` attribute.
    """
    if not resumption_token:
        # Without a token the request would be sent with no resumptionToken
        # argument at all; fail before touching the network.
        raise ValueError("resumption_token is empty: there is no further page to request")
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    response = requests.get(endpoint,params={"verb":"ListRecords","resumptionToken":resumption_token})
    response.raise_for_status()
    # Store the undecoded bytes so the parser applies the encoding declared in
    # the XML document itself.
    with open(output_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(output_file).getroot()
    list_records = root.find(oai_ns + "ListRecords")
    if list_records is None:
        raise ValueError("no ListRecords element in the OAI-PMH response saved to %s" % output_file)
    # Look the token element up once; it supplies both the cursor and the
    # token for the following page.
    token_element = list_records.find(oai_ns + "resumptionToken")
    if token_element is None:
        raise ValueError("no resumptionToken element in the OAI-PMH response saved to %s" % output_file)
    cursor = int(token_element.attrib["cursor"])
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, "records_%i.xml" % cursor)
    shutil.copyfile(output_file,filename)
    records = []
    for record in list_records.iter(oai_ns + "record"):
        identifier = record.find(oai_ns + "header/" + oai_ns + "identifier").text
        records.append({
            "identifier": identifier,
            "metadata" : record.find(oai_ns + "metadata/{http://def.wmo.int/wmdr/2017}WIGOSMetadataRecord")
        })
    new_token = token_element.text
    return records, cursor, new_token

In [12]:
# Fetch the page that follows the first ListRecords response, using the
# endpoint configured at the top of the notebook.
more_records, cursor, new_token = getRecordsNextPage(resumption_token, endpoint=endpoint)

In [13]:
# Fetch one more page with the token returned by the previous call. Each run of
# this cell advances new_token, so re-running it moves one page further.
more_records, cursor, new_token = getRecordsNextPage(new_token, endpoint=endpoint)

In [21]:
def getRecordsAllPages(endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh",max_pages=500,metadata_prefix="WIGOS-1.0RC6",output_file="records.xml",set_spec=None,output_dir="wigos_records"):
    """Download a ``ListRecords`` result page by page.

    The first page is fetched with ``getRecords``; every following page with
    ``getRecordsNextPage``, which also copies the raw page into ``output_dir``.

    Args:
        endpoint: URL of the OAI-PMH service.
        max_pages: Maximum number of follow-up pages requested after the first.
        metadata_prefix: Value sent as the ``metadataPrefix`` argument.
        output_file: Path each raw XML response is written to.
        set_spec: When not ``None``, restricts the first request to that set.
        output_dir: Directory receiving the per-page copies.

    Returns:
        List of the record dicts of all downloaded pages, in download order.
    """
    records, resumption_token, completeListSize, cursor = getRecords(endpoint=endpoint,metadata_prefix=metadata_prefix,output_file=output_file,set_spec=set_spec)
    page = 0
    # Stop as soon as the server hands back no token: asking for another page
    # with an empty token is not a valid request. The cursor/size comparison
    # alone does not detect the last page.
    while resumption_token and cursor < completeListSize and page < max_pages:
        page = page + 1
        more_records, cursor, resumption_token = getRecordsNextPage(resumption_token,endpoint=endpoint,output_file=output_file,output_dir=output_dir)
        records.extend(more_records)
    return records

In [22]:
# Harvest the set chosen above (at most 5 follow-up pages), using the endpoint
# and metadata prefix configured at the top of the notebook.
records = getRecordsAllPages(endpoint=endpoint, metadata_prefix=metadata_prefix, set_spec=set_spec, max_pages=5)

In [23]:
def saveRecord(record, output_dir="wigos_records"):
    """Write one harvested record to ``<output_dir>/<identifier>.xml``.

    Sets the ``xsi:schemaLocation`` attribute on the record's metadata element
    (the element is modified in place) and serialises it pretty-printed.
    ``output_dir`` is created when missing.

    Args:
        record: Dict with keys ``identifier`` (used as the file name) and
            ``metadata`` (the element to serialise).
        output_dir: Directory the file is written to.

    Raises:
        ValueError: ``record["metadata"]`` is ``None``.
    """
    metadata = record["metadata"]
    if metadata is None:
        raise ValueError("record %s has no metadata element to save" % record["identifier"])
    metadata.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = "http://def.wmo.int/wmdr/2017 http://schemas.wmo.int/wmdr/1.0RC9/wmdr.xsd"
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, "%s.xml" % record["identifier"])
    etree.ElementTree(metadata).write(filename, pretty_print=True)

In [24]:
for record in records:
    # getRecords/getRecordsNextPage store None as "metadata" when a record has
    # no WIGOSMetadataRecord element; there is nothing to write for those.
    if record["metadata"] is None:
        continue
    saveRecord(record)

### get individual record

In [25]:
def getRecord(identifier,output_dir="wigos_records",metadata_prefix = "WIGOS-1.0RC6",endpoint="https://whos.geodab.eu/gs-service/services/essi/token/my_token/view/whos-plata/oaipmh"):
    """Download a single record with ``GetRecord`` and save it as XML.

    The raw response is first written to ``<output_dir>/<identifier>.xml``.
    The ``WIGOSMetadataRecord`` element is then extracted, given an
    ``xsi:schemaLocation`` attribute, and written pretty-printed over that same
    file. ``output_dir`` is created when missing.

    Args:
        identifier: Record identifier; also used as the file name.
        output_dir: Directory the file is written to.
        metadata_prefix: Value sent as the ``metadataPrefix`` argument.
        endpoint: URL of the OAI-PMH service.

    Returns:
        The ``WIGOSMetadataRecord`` element.

    Raises:
        requests.HTTPError: The server answered with an HTTP error status.
        ValueError: The response contains no ``WIGOSMetadataRecord``; the raw
            response is left in the output file for inspection.
    """
    oai_ns = "{http://www.openarchives.org/OAI/2.0/}"
    os.makedirs(output_dir, exist_ok=True)
    record_file = os.path.join(output_dir, "%s.xml" % identifier)
    response = requests.get(endpoint,params={"verb":"GetRecord","metadataPrefix":metadata_prefix,"identifier":identifier})
    response.raise_for_status()
    # Store the undecoded bytes so the parser applies the encoding declared in
    # the XML document itself.
    with open(record_file,"wb") as f:
        f.write(response.content)
    root = etree.parse(record_file).getroot()
    el = root.find(oai_ns + "GetRecord/" + oai_ns + "record/" + oai_ns + "metadata/{http://def.wmo.int/wmdr/2017}WIGOSMetadataRecord")
    if el is None:
        raise ValueError("no WIGOSMetadataRecord for %s in the OAI-PMH response saved to %s" % (identifier, record_file))
    el.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = "http://def.wmo.int/wmdr/2017 http://schemas.wmo.int/wmdr/1.0RC9/wmdr.xsd"
    # Write into output_dir (previously hard-coded to "wigos_records/"), which
    # replaces the raw response with just the extracted record.
    etree.ElementTree(el).write(record_file, pretty_print=True)
    return el

In [26]:
# Example identifier of a single record to download.
identifier = "fcf413b2-703a-4950-b96f-76f310a1f7b0"
# Use the endpoint and metadata prefix configured at the top of the notebook
# instead of relying on the duplicated defaults in the function signature.
record = getRecord(identifier, metadata_prefix=metadata_prefix, endpoint=endpoint)

In [27]:
# Download every record listed by getIdentifiers, one GetRecord call each.
records = []
for identifier in identifiers:
    el = getRecord(identifier["identifier"], metadata_prefix=metadata_prefix, endpoint=endpoint)
    records.append({
        # Store the identifier string (not the whole header dict) so these
        # entries have the same shape as the ones built by getRecords and
        # expected by saveRecord.
        "identifier": identifier["identifier"],
        "metadata": el
    })