<font size = "12"> **MEASO space paper** </font>

# Loading relevant libraries

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dask.distributed import Client, progress
import UsefulFunctions as uf
import os
import netCDF4 as nc
from collections import OrderedDict
from glob import glob
from clef.code import *
%matplotlib inline

# Setting up parallelisation prior to analysis

In [2]:
#Create the dask distributed client used for the analysis, requesting eight workers
client = Client(n_workers = 8)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41431 instead


# Setting up dictionaries with supporting information for analysis

In [3]:
#Information about each variable, gathered in one place to make production of summaries easier
#Change the rows of the table below (or the shared settings inside the helper) as needed
def _variable_settings(var_id, long_name, short_name, folder):
    """Build the settings dictionary for a single variable.

    var_id: identifier of the variable used when searching for data
    long_name: descriptive name, used mainly to label graphs
    short_name: abbreviated name, used mainly to label graphs
    folder: name of the folder inside the summaries directory where the means,
            percentiles and standard deviations of the variable are stored
    """
    summaries = f'/g/data/v45/la6889/MEASO/Summaries/{folder}'
    return {'model': 'ACCESS-ESM1-5',
            'var_id': var_id,
            'exp': 'historical',
            'exp_future': 'esm-ssp585',
            'variant': 'r10i1p1f1',
            'freq': 'mon',
            'long_name': long_name,
            'short_name': short_name,
            'means': f'{summaries}/Means',
            'percentiles': f'{summaries}/Percentiles',
            'std_dev': f'{summaries}/StdDev'}

#Each row: key, (var_id, long_name, short_name, folder)
varDict = OrderedDict(
    (label, _variable_settings(*settings)) for label, settings in [
        ('temp', ('bigthetao', 'water temperature', 'temp', 'Temperature')),
        #pH
        ('ph_placeholder', None),
    ] if settings is not None
)
varDict = OrderedDict(
    (label, _variable_settings(*settings)) for label, settings in [
        ('temp', ('bigthetao', 'water temperature', 'temp', 'Temperature')),
        ('sal', ('so', 'water salinity', 'sal', 'Salinity')),
        #pH
        ('ph', ('ph', 'pH', 'pH', 'pH')),
        #Surface Aqueous Partial Pressure of CO2 [Pa] - only surface available
        ('pCO2', ('spco2', 'partial CO2 pressure at surface', 'pCO2', 'pCO2')),
        #Ocean Mixed Layer Thickness Defined by Sigma T [m]
        ('mld', ('mlotst', 'mixed layer depth', 'mld', 'MLD')),
        #sea ice area fraction (% of grid cell covered by ice)
        ('sic', ('siconc', 'sea ice concentration', 'sic', 'SIC')),
    ]
)

sea ice concentration:
- siareas - sea ice area South (total area of sea ice in the Southern Hemisphere)
- siextents - sea ice extent South (total area of Southern Hemisphere cells covered by at least 15% areal fraction of sea ice)
- **siconc - sea ice area fraction (% of grid cell covered by ice)**
  
mixed layer depth:
- **mlotst - MLD defined by sigma T (potential density)**. Max and min mean daily values also available.
- omldamax - Ocean MLD defined by mixing scheme. Mean daily maximum

partial CO2 pressure:
- **spco2 - surface partial pressure of CO2 in sea water**

PAR:
- **rsntds - net downward shortwave flux at sea surface**

cloud cover:
- clhcalipso - Cloud area fraction in atmosphere layer. % cloud cover in layer centered on 220hPa.


In [4]:
#Variable to be summarised - it is used as a key of varDict in the cells that follow
var = 'sic'

#Time periods of interest: years in steps of ten between the first and last year of each period
periods_interest = {label: np.arange(first_year, last_year + 1, 10)
                    for label, first_year, last_year in [('p1', 1890, 1910),
                                                         ('p2', 1940, 1960),
                                                         ('p3', 2000, 2020),
                                                         ('p4', 2040, 2060),
                                                         ('p5', 2090, 2100)]}

#Depths of interest: lower and upper limits of each depth range
depths_interest = dict(surface = [0, 10],
                       pelagic = [0, 200],
                       krill = [0, 400])

# Using clef to access ACCESS-ESM1.5 data
The `searchACCESS` function, available in the `UsefulFunctions` module (imported here as `uf`), uses the `clef` library from CLEX to search CMIP6 databases. The user can create a dictionary with search requirements to extract data that meets their needs.  
  
This function is used to find the data that will serve as weights when calculating summary statistics per MEASO area. Variables that can be used as weights include area (`areacello`), depth (`thkcello`) and volume (`volcello`).  
  
More information about the `clef` library and how to use it can be found in the [COECMS GitHub repository](https://github.com/coecms/clef).

## Accessing volume per pixel using the `clef` library

In [None]:
#Search for the volume per pixel (`volcello`), which is used for weighting
#Model, experiment and variant are taken from the entry of the selected variable in varDict
vol_file = uf.searchACCESS(
    var = 'volcello', 
    freq = 'fx', 
    model = varDict[var]['model'], 
    exp = varDict[var]['exp'], 
    variant = varDict[var]['variant'],
)

In [None]:
#Load the volume per pixel from the files found in the search above
#The same depth range is requested when loading the variable of interest
vol_df = uf.loadData(
    filelist = vol_file, 
    var_name = 'volcello',
    depth_range = [0, 401], 
    SO = True,
)

## Accessing area per pixel using the `clef` library

In [None]:
#Search for the area per pixel (`areacello`), used as weights when data has no depth levels
#Note that this search uses the future experiment listed in varDict
area_file = uf.searchACCESS(
    var = 'areacello', 
    freq = 'fx', 
    model = varDict[var]['model'], 
    exp = varDict[var]['exp_future'], 
    variant = varDict[var]['variant'],
)

In [None]:
#Load the area per pixel from the files found in the search above
area_df = uf.loadData(
    filelist = area_file, 
    var_name = 'areacello',
    SO = True,
)

# Using clef to extract ACCESS-ESM 1.5 data

## Finding datasets in `historical` experiment

In [7]:
#Connect to the CLEF database and open a session to access CMIP6 data
db = connect()
s = Session()

#Search parameters for CLEF, taken from the entry of the selected variable in varDict
#The experiment used here is the one stored under 'exp'
search_dict = dict(variable_id = varDict[var]['var_id'],
                   model = varDict[var]['model'],
                   frequency = varDict[var]['freq'],
                   experiment_id = varDict[var]['exp'],
                   variant_label = varDict[var]['variant'])

#Perform search
df = search(s, project = 'CMIP6', latest = True, **search_dict)

## Finding datasets in `SSP585` experiment

In [24]:
#Search parameters for CLEF: same as the previous search, but the experiment is the one
#stored under 'exp_future'
search_dict = dict(variable_id = varDict[var]['var_id'],
                   model = varDict[var]['model'],
                   frequency = varDict[var]['freq'],
                   experiment_id = varDict[var]['exp_future'],
                   variant_label = varDict[var]['variant'])

#Perform search (reuses the session opened for the previous search)
df_f = search(s, project = 'CMIP6', latest = True, **search_dict)

## Creating a single list of files for both experiments

In [25]:
#Stop early with a clear message if either search returned no datasets, instead of
#failing with an indexing error when the first path is requested
if len(df) == 0 or len(df_f) == 0:
    raise ValueError('No datasets were found for at least one experiment. '
                     'Check the search parameters before continuing.')

#Get folder path where files are located (first result of each search)
folder_path = df['path'][0]
folder_path_future = df_f['path'][0]

#Extract the file names inside the folders and sort them
filenames_hist = sorted(glob(os.path.join(folder_path, '*.nc')))
filenames_future = sorted(glob(os.path.join(folder_path_future, '*.nc')))

#Join all file names under one variable (files from the first search come first)
filenames = np.append(filenames_hist, filenames_future)

#Remove variables no longer in use
del folder_path, folder_path_future, filenames_hist, filenames_future

#List filenames - this must be the last expression in the cell to be displayed
filenames

## Separating lists into each period of interest

In [26]:
#Group the file names by time period of interest
file_lists = {}

if len(filenames) > 2:
    #More than two files available: choose files based on the year read from their names
    for key, period in periods_interest.items():
        #Accepted range of years: from five years before the period starts until its last year
        first_year, last_year = period[0] - 5, period[-1]
        #The four characters at [-16:-12] are read as a year - presumably the year in which
        #the file starts, if names end in 'YYYYMM-YYYYMM.nc' (TODO confirm)
        file_lists[key] = [fn for fn in filenames
                           if first_year <= int(fn[-16:-12]) <= last_year]
else:
    #Two files or fewer: the first file is used for periods starting before 2000, the second
    #file for later periods, and every file for the period that ends in 2020
    for key, period in periods_interest.items():
        if period[0] < 2000:
            file_lists[key] = [filenames[0]]
        elif period[-1] == 2020:
            file_lists[key] = filenames
        else:
            file_lists[key] = [filenames[1]]

## Checking data structure prior to further processing (Optional step)

In [53]:
#Inspect the first file of the first period: global attributes, dimensions and variables
#The dataset is given its own name because `var` holds the key of the variable being
#analysed and is used to index varDict in other cells, so it must not be overwritten
#The context manager closes the file once the information has been printed
with nc.Dataset(file_lists['p1'][0]) as sample_ds:
    print(sample_ds)
    print(sample_ds.__dict__)
    #Plain loops are used so that no list of None values is shown as cell output
    for dim in sample_ds.dimensions.values():
        print(dim)
    for variable in sample_ds.variables.values():
        print(variable)

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4_CLASSIC data model, file format HDF5):
    Conventions: CF-1.7 CMIP-6.2
    activity_id: CMIP
    branch_method: standard
    branch_time_in_child: 0.0
    branch_time_in_parent: 87658.0
    creation_date: 2020-06-05T05:02:30Z
    data_specs_version: 01.00.30
    experiment: all-forcing simulation of the recent past
    experiment_id: historical
    external_variables: areacello volcello
    forcing_index: 1
    frequency: mon
    further_info_url: https://furtherinfo.es-doc.org/CMIP6.CSIRO.ACCESS-ESM1-5.historical.none.r10i1p1f1
    grid: native atmosphere N96 grid (145x192 latxlon)
    grid_label: gn
    history: 2020-06-05T05:02:30Z ; CMOR rewrote data to be consistent with CMIP6, CF-1.7 CMIP-6.2 and CF standards.
    initialization_index: 1
    institution: Commonwealth Scientific and Industrial Research Organisation, Aspendale, Victoria 3195, Australia
    institution_id: CSIRO
    mip_era: CMIP6
    nominal_resolution: 250 km

[None, None, None, None, None, None, None, None, None, None, None]

# Creating a mask with regions of interest
Create a mask using a netcdf file with the `creatingMask` function. Masks can be created from shapefiles using the `0_CreatingMeasoMask.ipynb` notebook.

In [27]:
#Create the mask from a netcdf file. The two objects returned are passed to the summary
#functions further below as `mask_df` (MEASOregions) and `regions` (regionNames)
MEASOregions, regionNames = uf.creatingMask('MEASO_3Dmask.nc')

# Loading data for each period of interest

## Accessing data that will be used as weights
ACCESS-ESM-1.5 has the following variables that could be used as weights in summary calculations:
- area (`areacello`)
- depth (`thkcello`)
- volume (`volcello`)  
  
All the variables above provide data for each pixel in the area of interest.

## Applying functions to calculate summaries

In [None]:
def _save_summaries(var_sub, weights, years, depth = None):
    """Calculate all summary statistics for one period and save them as netCDF files.

    Weighted means, weighted and unweighted standard deviations, and percentiles are
    calculated with the functions in `uf` using the mask in `MEASOregions` and the
    names in `regionNames`. Results are saved under the folders listed in `varDict`
    for the variable currently selected in `var`, creating the folders if needed.

    Parameters
    ----------
    var_sub : data to be summarised (as returned by `uf.loadData`, optionally subset by depth)
    weights : weights given to `uf.weightedMeans` and `uf.std_dev`
    years : sequence whose first and last items are the first and last year of the
        period; they are only used to build the file names
    depth : key of `depths_interest`, or None (default) for data without depth levels.
        When given, files are saved in a subfolder named after the depth (title case)
        and the depth is included in the file names.

    Returns
    -------
    Tuple with the weighted means, unweighted standard deviation, weighted standard
    deviation and percentiles, in that order.
    """
    #Subfolder and file name ending depend on whether a depth range is being summarised
    if depth is None:
        subfolder = ()
        label = f'{var}_{years[0]}-{years[-1]}'
    else:
        subfolder = (depth.title(),)
        label = f'{var}_{depth}_{years[0]}-{years[-1]}'

    #Creating paths to save data and ensuring they exist
    mean_path_out = os.path.join(varDict[var]['means'], *subfolder)
    std_path_out = os.path.join(varDict[var]['std_dev'], *subfolder)
    per_path_out = os.path.join(varDict[var]['percentiles'], *subfolder)
    for path_out in (mean_path_out, std_path_out, per_path_out):
        os.makedirs(path_out, exist_ok = True)

    #Calculate weighted means and save them
    weighted_means = uf.weightedMeans(regions = regionNames, 
                                      var_df = var_sub, 
                                      mask_df = MEASOregions, 
                                      weights = weights)
    weighted_means.to_netcdf(os.path.join(mean_path_out, 
                                          f'WeightedMonthlyMeans_{label}.nc'))

    #Calculate weighted and unweighted standard deviation and save them
    #NOTE(review): these files hold standard deviations but keep the 'MonthlyMeans' file
    #names used so far; names are left untouched in case other notebooks read them - confirm
    unweighted_std, weighted_std = uf.std_dev(regions = regionNames,
                                              var_df = var_sub, 
                                              mask_df = MEASOregions, 
                                              weights = weights,
                                              weighted_means = weighted_means)
    weighted_std.to_netcdf(os.path.join(std_path_out, 
                                        f'WeightedMonthlyMeans_{label}.nc'))
    unweighted_std.to_netcdf(os.path.join(std_path_out, 
                                          f'UnweightedMonthlyMeans_{label}.nc'))

    #Calculate percentiles and save them
    percentiles = uf.perc_calc(regions = regionNames,
                               var_df = var_sub, 
                               mask_df = MEASOregions, 
                               percentiles = [.2, .4, .5, .6, .8])
    percentiles.to_netcdf(os.path.join(per_path_out, 
                                       f'MonthlyPercentiles_{label}.nc'))

    return weighted_means, unweighted_std, weighted_std, percentiles


for per in periods_interest:
    #First and last year of the period being processed
    years = [periods_interest[per][0], 
             periods_interest[per][-1]]
    
    # Loading variable of interest for each time period and depth of interest
    var_df = uf.loadData(filelist = file_lists[per], 
                      var_name = varDict[var]['var_id'],
                      SO = True, depth_range = [0, 401],
                      years = years, 
                      months = ['07', '06'])

    if 'lev' in var_df.coords:
        #Data has depth levels: summarise each depth range, weighting by volume per pixel
        for depth in depths_interest:
            #Subsetting data and weights to depth of interest
            lev_range = slice(depths_interest[depth][0],
                              depths_interest[depth][-1])
            var_sub = var_df.sel(lev = lev_range)
            vol_sub = vol_df.sel(lev = lev_range)
            
            #Summary statistics
            weightedvarMeans, un_std_calcs, w_std_calcs, per_calcs = _save_summaries(
                var_sub, vol_sub, years, depth)
    else:
        #No need to subset variable of interest data
        var_sub = var_df
        #Apply area per pixel instead of volume per pixel
        vol_sub = area_df
        
        #Summary statistics
        weightedvarMeans, un_std_calcs, w_std_calcs, per_calcs = _save_summaries(
            var_sub, vol_sub, years)

Exception during reset or similar
Traceback (most recent call last):
  File "/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 697, in _finalize_fairy
    fairy._reset(pool)
  File "/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 893, in _reset
    pool._dialect.do_rollback(self)
  File "/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/sqlalchemy/engine/default.py", line 558, in do_rollback
    dbapi_connection.rollback()
psycopg2.OperationalError: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

Exception during reset or similar
Traceback (most recent call last):
  File "/g/data/hh5/public/apps/miniconda3/envs/analysis3-21.10/lib/python3.9/site-packages/sqlalchemy/pool/base.py", line 697, in _finalize_fairy
    fairy._reset(pool)
  F

# Plotting results

In [108]:
#Manual version of the `uf.loadData` call that is commented out below: every file listed in
#`var_int` is opened with xarray and, when there is more than one, joined along time
#NOTE(review): `var_int` (and `months` in the commented call) are not defined in any of the
#cells visible here - confirm where they come from before running this on a fresh kernel
# uf.loadData(filelist = var_int, 
#                           var_name = varDict[var]['var_id'],
#                           SO = True, depth_range = depths_interest[depth],
#                           years = years, 
#                           months = months)
#Name of the variable to extract from the opened files
var_name = varDict[var]['var_id']
#Starts as a list of opened datasets and is then replaced by the final result
x= []
if len(var_int) > 1:
    #Looping through files and stacking them
    for f in var_int:
        x.append(xr.open_dataset(f, mask_and_scale = True))
    #Concatenating files across time dimension
    x = xr.concat(x, dim = 'time')
else:
    x = xr.open_dataset(var_int[0], mask_and_scale = True)
    #NOTE(review): only this branch selects the variable and takes its first element; the
    #branch above leaves `x` as the whole concatenated dataset - confirm this is intended
    x = x[var_name][0]