In [1]:
import json
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

import xarray

In [2]:
def _format_year_month(timestamp):
    '''
    Function: _format_year_month()
        Format a single time coordinate value as 'YYYYMM'

    Inputs:
    - timestamp: value exposing strftime(), or any value whose str() starts
        with 'YYYY-MM' (e.g. numpy.datetime64)

    Outputs:
    - (string): year and month, e.g. '201501'
    '''
    if hasattr(timestamp, 'strftime'):
        return timestamp.strftime('%Y%m')
    # numpy.datetime64 has no strftime(); its str() is ISO 8601 (YYYY-MM-DD...)
    return str(timestamp)[:7].replace('-', '')


def download_variable(
    experiment_id,
    source_id,
    variable_id,
    variant_label,
    frequency='mon',
    grid_label='gn',
    merge_files=False,
    save_to_local=False
):
    '''
    Function: download_variable()
        Search the CEDA ESGF index for CMIP6 datasets matching the given
        facets and print the id of every match. Optionally download the
        files of each match and merge them into a single NetCDF file.
        Search endpoint: `https://esgf-index1.ceda.ac.uk/esg-search/search/`
        (web front-end: `https://esgf-index1.ceda.ac.uk/search/cmip6-ceda/`)
    
    Inputs:
    - experiment_id (string): model experiment, e.g. 'historical', 'ssp585'
    - source_id (string): model source, e.g. 'UKESM1-0-LL'
    - variable_id (string): variable, e.g. 'pr'
    - variant_label (string): model realisation, e.g. 'r2i1p1f2'
    - frequency (string): value of the 'frequency' search facet
    - grid_label (string): value of the 'grid_label' search facet
    - merge_files (bool): when True (and save_to_local is True), datasets
        made of 2+ files are also written as one merged file next to them
    - save_to_local (bool): when True, download each dataset's files to
        `_data/cmip6/{source_id}` (relative to the working directory)
    
    Outputs:
    - None; results are printed and, if requested, written to disk
    
    Raises:
    - ConnectionError: the search request returned a non-2xx status
    
    TODO: regrid?
    '''
    base_url = 'https://esgf-index1.ceda.ac.uk/esg-search/search/'
    query = {
        'experiment_id': experiment_id,
        'format': 'application/solr+json',
        'frequency': frequency,
        'grid_label': grid_label,
        'latest': 'true',
        'limit': 20,
        'mip_era': 'CMIP6',
        'offset': 0,
        'replica': 'false',
        'source_id': source_id,
        'type': 'Dataset',
        'variable_id': variable_id,
        'variant_label': variant_label
    }
    url = f'{base_url}?{urllib.parse.urlencode(query)}'
    print('Requesting:')
    print(f'-> {url}')

    # Context manager releases the connection as soon as the body is parsed
    with urllib.request.urlopen(url) as response:
        if response.status < 200 or response.status > 299:
            msg = '\n'.join([
                'An error occurred making request:',
                f'-> URL: {url}',
                f'-> Status code: {response.status}',
            ])
            raise ConnectionError(msg)
        results = json.load(response)['response']['docs']

    if len(results) == 0:
        print('No results found')
        return

    print('Results:')
    for item in results:
        item_id = item['id']
        print(f'-> {item_id}')

        if not save_to_local:
            continue

        item_source_id = item['source_id'][0]
        item_local_path = f'_data/cmip6/{item_source_id}'
        local_filenames = download_remote_files(item, item_local_path)

        # Nothing to merge for single-file (or empty) datasets
        if not merge_files or len(local_filenames) < 2:
            continue

        # Create merged array; the context manager closes the source files
        # once the merged copy has been written
        with xarray.open_mfdataset(
            paths=local_filenames, combine='by_coords', autoclose=True
        ) as opened:
            merged_array = opened.chunk()

            # Set time encoding from first file if needed, so the merged file
            # is written with the same units/calendar as its inputs
            if 'time' in merged_array:
                with xarray.open_dataset(local_filenames[0]) as first_file:
                    time_units = first_file.time.encoding['units']
                    time_calendar = first_file.time.encoding['calendar']
                merged_array.time.encoding['units'] = time_units
                merged_array.time.encoding['calendar'] = time_calendar

            # Generate new filename with updated date ranges
            merged_array_dates = merged_array[variable_id].time.values
            first_filename = Path(local_filenames[0]).name
            filename_split = first_filename.split('_')[0:-1] # Remove date range
            date_start = _format_year_month(merged_array_dates[0])
            date_end = _format_year_month(merged_array_dates[-1])
            filename_split.append(f'{date_start}-{date_end}') # Add new date range
            combined_filename = '_'.join(filename_split) + '.nc'
            combined_path = Path(item_local_path, combined_filename)

            # Write to file
            print(f'Writing to {combined_path}')
            merged_array.to_netcdf(combined_path)

        # No early return here: every matching dataset is processed


def download_remote_files(item, local_path):
    '''
    Function: download_remote_files()
        Download remote files based off CEDA response item
        `https://esgf-index1.ceda.ac.uk/search_files/{item_id}/{item_index_node}/`
    
    Inputs:
    - item (dict): item from array response['response']['docs'] from request
        'https://esgf-index1.ceda.ac.uk/esg-search/search/'
        (its 'id' and 'index_node' keys are read)
    - local_path (string): path to save to (not including filename)
    
    Outputs:
    - (array): array of local paths (pathlib.Path), one per file that has an
        'HTTPServer' URL, whether newly downloaded or already present
    
    Raises:
    - ConnectionError: the file listing request returned a non-2xx status
    '''
    item_id = item['id']
    item_index_node = item['index_node']
    url = f'https://esgf-index1.ceda.ac.uk/search_files/{item_id}/{item_index_node}/'
    
    # NOTE(review): the listing may be paginated server-side; confirm that
    # every file of a large dataset is returned by a single request
    with urllib.request.urlopen(url) as response:
        if response.status < 200 or response.status > 299:
            msg = '\n'.join([
                'An error occurred making request:',
                f'-> URL: {url}',
                f'-> Status code: {response.status}',
            ])
            raise ConnectionError(msg)
        file_records = json.load(response)['response']['docs']
    
    local_filenames = []
    
    for file_record in file_records:
        # Each 'url' entry is '<address>|...' and is matched on 'HTTPServer';
        # keep only the address part of the plain-HTTP entries
        http_urls = [
            entry.split('|')[0]
            for entry in file_record['url']
            if 'HTTPServer' in entry
        ]
        if len(http_urls) == 0:
            continue
        file_url = http_urls[0]
        
        filename = urllib.parse.urlparse(file_url).path.split('/')[-1]
        local_filename = Path(local_path, filename)
        local_filename.parent.mkdir(parents=True, exist_ok=True)
        local_filenames.append(local_filename)
        
        if local_filename.exists():
            print(f'   -> Already exists, skipping: {local_filename}')
            continue

        print(f'   -> Downloading: {local_filename}')
        # Download to a temporary name and rename on success, so that an
        # interrupted transfer never leaves a truncated file that a later
        # run would treat as "Already exists"
        partial_filename = local_filename.with_name(local_filename.name + '.part')
        try:
            urllib.request.urlretrieve(file_url, partial_filename)
            partial_filename.replace(local_filename)
        finally:
            if partial_filename.exists():
                partial_filename.unlink()
    
    return local_filenames

In [3]:
# Search facets for one NorESM2-LM 'ssp585' realisation of 'siconc'.
# Both flags are off, so matching dataset ids are only printed:
# nothing is downloaded or merged.
kwargs = dict(
    experiment_id='ssp585',
    source_id='NorESM2-LM',
    variable_id='siconc',
    variant_label='r1i1p1f1',
    merge_files=False,
    save_to_local=False,
)

# download_variable() has no return value, so `siconc` is bound to None
siconc = download_variable(**kwargs)

Requesting:
-> https://esgf-index1.ceda.ac.uk/esg-search/search/?experiment_id=ssp585&format=application%2Fsolr%2Bjson&frequency=mon&grid_label=gn&latest=true&limit=20&mip_era=CMIP6&offset=0&replica=false&source_id=NorESM2-LM&type=Dataset&variable_id=siconc&variant_label=r1i1p1f1
Results:
-> CMIP6.ScenarioMIP.NCC.NorESM2-LM.ssp585.r1i1p1f1.SImon.siconc.gn.v20191108|noresg.nird.sigma2.no
