# Script to calculate average annual tas and pr for Ghana for the historical period and for ssp585

Author: Sarah Chapman

- adapted from Jess Baker's notebook (load CMIP6 models)
- includes code from Eszter's drought notebook
- adapted 'make_cmip6_filepath' function to accommodate multiple model fpaths
- code to get the institute for each model

- loads pre-processed pr and tas data
- calculates avg annual temp and pr for Ghana for mid century time period
- also calculates based on rainy season

Some useful links
https://towardsdatascience.com/basic-data-structures-of-xarray-80bab8094efa

In [335]:
import os
import numpy as np
import xarray as xr
import glob
import datetime
from datetime import datetime as dt
from pyhdf.SD import SD, SDC
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from pathlib import Path
import cf_units
import pandas as pd
import copy
import climate_indices
from climate_indices import compute, indices
import iris
from netCDF4 import date2num
import iris.coord_categorisation
from iris.experimental.equalise_cubes import equalise_attributes
from iris.util import unify_time_units

import os 

def get_dates(cube, verbose=False):
    """Return the time points of ``cube`` as plain ``datetime`` objects.

    The numeric points of the cube's ``time`` coordinate are decoded with the
    coordinate's own units, then rebuilt as ``datetime.datetime`` values that
    keep only the year, month and day of each decoded date.

    Parameters
    ----------
    cube
        Object exposing ``coord('time')`` whose result has ``points`` and
        ``units.num2date`` (an Iris cube in this notebook).
    verbose : bool
        ``True`` prints the complete list of dates; any other value prints
        only the first and the last date.

    Returns
    -------
    list of datetime.datetime
        One entry per time point, in the order of the coordinate.
    """
    time_coord = cube.coord('time')
    decoded = time_coord.units.num2date(time_coord.points)
    dates = [dt(d.year, d.month, d.day) for d in decoded]

    if verbose is not True:
        # short summary: only the covered date range
        print(dates[0], '–', dates[-1])
    else:
        print(dates)
    return dates

def make_cmip6_filepath(institute, model, scenario, variant, experiment, table_id, variable, grid, version, time_range,
                        data_root="/badc/cmip6/data/CMIP6/"):
    """
    Make a glob pattern matching the netCDF files of a CMIP6 dataset for a single variable.

    The directory is built as
    `<data_root>/<scenario>/<institute>/<model>/<experiment>/<variant>/<table_id>/<variable>/<grid>/<version>/`
    and the returned string is that directory followed by `*.nc`.

    Historical runs (1850-2014) are in `/badc/cmip6/data/CMIP6/CMIP/<institute>/<model>/historical/<variant>/<table_id>/<variable>/<grid>/<version>/`
    Scenario runs are in `/badc/cmip6/data/CMIP6/ScenarioMIP/<institute>/<model>/<scenario_name>/<variant>/<table_id>/<variable>/<grid>/<version>/`

    Parameters
    ----------
    institute : str
        Modelling centre directory name, e.g. `MOHC`.
    model : str
        Model directory name, e.g. `UKESM1-0-LL`.
    scenario : str
        Activity directory directly below `data_root`, e.g. `CMIP` or `ScenarioMIP`.
    variant : str or None
        Takes the form `r<realization_id>i<initialization_id>p<physics_id>f<forcing_id>`, e.g. `r1i1p1f2`,
        where the numbers are the indexes for **r**ealization, **i**nitialization, **p**hysics, **f**orcing.
        If None, the experiment directory is listed and a variant starting with `r1i1p1` is preferred;
        otherwise the first listed directory starting with `r` is used.
    experiment : str
        Experiment directory name, e.g. `historical`, or a scenario name such as
        ssp119, ssp126, ssp245, ssp370 or ssp585.
    table_id : str
        Generally indicates the frequency of the data, e.g. day, 3hr, Amon.
    variable : str
        Generally follows the list on https://pcmdi.llnl.gov/mips/cmip3/variableList.html, for example
           `tas`: air_temperature
           `pr`: precipitation_flux
           `ts`: surface_temperature
    grid : str or None
        The model grid being used, e.g. gn, where
           * `gm`: global mean data
           * `gn`: data reported on a model's native grid
           * `gr1`: regridded data reported on a grid other than the native grid and other than the preferred target grid
        It is likely the `grid` will be the native grid, i.e. `gn`.
        If None, the first listed directory starting with `g` is used (usually there is only one).
    version : str or None
        Normally in the form `v[YYYYMMDD]` or `latest`, e.g. `v20200203`.
        If None, the first listed directory starting with `v` is used.
    time_range : str
        Not used by this function; kept so existing calls keep working.
    data_root : str or Path
        Root of the CMIP6 archive.

    Returns
    -------
    str
        `<dataset directory>/*.nc`

    Raises
    ------
    FileNotFoundError
        If a directory on the way does not exist, or if no variant / grid / version
        directory with the expected prefix is found.

    Notes
    -----
    The intermediate paths and the content of the final directory are printed.
    When a component is None the choice follows the order returned by `os.listdir`,
    which is arbitrary. NOTE(review): pass explicit values if reproducibility matters.

    The following institutions have data in both historical and ScenarioMIPs:
    AS-RCEC, AWI, BCC, CAMS, CAS, CCCR-IITM, CCCma, CMCC, CNRM-CERFACS, CSIRO, CSIRO-ARCCSS, E3SM-Project, EC-Earth-Consortium, FIO-QLNM, HAMMOZ-Consortium, INM, IPSL, KIOST, MIROC, MOHC, MPI-M, MRI, NASA-GISS, NCAR, NCC, NIMS-KMA, NOAA-GFDL, NUIST, THU, UA
    """
    # get base path (built from the data_root argument rather than a module-level global)
    path = str(Path(data_root) / scenario / institute / model / experiment)
    print(path)

    # get candidate variants
    if variant is None:
        variant_list = [x for x in os.listdir(path) if x.startswith('r')]
    else:
        variant_list = [variant]

    # prefer an r1i1p1* variant, otherwise fall back to any variant
    var = [x for x in variant_list if x.startswith('r1i1p1')]
    if not var:
        print(variant_list)
        var = [x for x in variant_list if x.startswith('r')]
    if not var:
        raise FileNotFoundError("no variant directory (name starting with 'r') found for " + path)

    # update path
    path = path + '/' + var[0] + '/' + str(table_id) + '/' + str(variable)
    print(path)

    # get path for grid
    if grid is None:
        # select first grid (usually only 1)
        grid_list = [x for x in os.listdir(path) if x.startswith('g')]
        if not grid_list:
            raise FileNotFoundError("no grid directory (name starting with 'g') found in " + path)
    else:
        grid_list = [grid]

    # update path
    path = path + '/' + str(grid_list[0])
    print(path)

    # get version path
    if version is None:
        version_list = [x for x in os.listdir(path) if x.startswith('v')]
        if not version_list:
            raise FileNotFoundError("no version directory (name starting with 'v') found in " + path)
    else:
        version_list = [version]

    # update path
    path = path + '/' + str(version_list[0]) + '/'
    print(path)
    print(os.listdir(path))
    return path + '*.nc'

# Smoke test: resolve the monthly precipitation files of one model / scenario
DATA_ROOT = Path("/badc/cmip6/data/CMIP6/")
model, expt, scenario = "UKESM1-0-LL", 'ssp585', 'ScenarioMIP'
fp = make_cmip6_filepath(
    institute="MOHC",
    model=model,
    scenario=scenario,
    variant=None,       # let the function pick a variant
    experiment=expt,
    table_id="Amon",
    variable="pr",
    grid=None,          # let the function pick the grid
    version=None,       # let the function pick the version
    time_range="*",
)

/badc/cmip6/data/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/ssp585
/badc/cmip6/data/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/ssp585/r1i1p1f2/Amon/pr
/badc/cmip6/data/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/ssp585/r1i1p1f2/Amon/pr/gn
/badc/cmip6/data/CMIP6/ScenarioMIP/MOHC/UKESM1-0-LL/ssp585/r1i1p1f2/Amon/pr/gn/v20190507/
['pr_Amon_UKESM1-0-LL_ssp585_r1i1p1f2_gn_201501-204912.nc', 'pr_Amon_UKESM1-0-LL_ssp585_r1i1p1f2_gn_205001-210012.nc']


In [2]:
def merge_hist_future_cmip6(hist_cube, future_cube):
    """Join a historical and a future cube along time into one new cube.

    If the future cube starts in a year that the historical cube also covers,
    the historical cube is first cut back to the years before the first
    future year. A new time axis is then built on a 'gregorian' calendar with
    units 'days since <first historical year>-01-01', and the data of both
    cubes are copied into one array, historical first.

    The latitude / longitude points are taken from ``future_cube`` (looked up
    as 'latitude' and 'lon'). Both cubes are assumed to be ordered
    (time, latitude, longitude) on the same grid; this is not checked here.

    The result carries the units of ``future_cube`` and uses its
    ``standard_name`` as ``var_name``. The merged cube is printed before it is
    returned, and ``get_dates`` prints the date range of each input.

    Parameters
    ----------
    hist_cube, future_cube
        Iris cubes for the historical and the scenario period.

    Returns
    -------
    iris.cube.Cube
        New cube with dimensions (time, latitude, longitude).
    """
    hist_dates = get_dates(hist_cube)
    future_dates = get_dates(future_cube)

    calendar = 'gregorian'
    units = f'days since {hist_dates[0].year}-01-01 00:00:0.0'

    first_future_year = future_dates[0].year
    if first_future_year <= hist_dates[-1].year:
        # the two periods overlap: keep only the historical years before the scenario starts
        first_hist_year = hist_dates[0].year
        print(first_hist_year)
        print(first_future_year)
        no_overlap = iris.Constraint(
            time=lambda cell: first_hist_year <= cell.point.year < first_future_year)
        hist_cube = hist_cube.extract(no_overlap)
        hist_dates = get_dates(hist_cube)

    dates = hist_dates + future_dates

    # time dimension covering both periods
    time = iris.coords.DimCoord(
        date2num(dates, units=units, calendar=calendar),
        standard_name='time',
        units=cf_units.Unit(units, calendar=calendar))

    # horizontal dimensions, copied from the future cube
    lats = future_cube.coord('latitude').points
    lons = future_cube.coord('lon').points
    latitude = iris.coords.DimCoord(lats, standard_name='latitude', units='degrees')
    longitude = iris.coords.DimCoord(lons, standard_name='longitude', units='degrees')

    # historical block first, future block after it
    n_hist = hist_cube.shape[0]
    new_data = np.zeros((len(dates), len(lats), len(lons)))
    new_data[:n_hist, :, :] = hist_cube.data
    new_data[n_hist:, :, :] = future_cube.data

    # wrap the merged array in a cube
    cube = iris.cube.Cube(
        new_data,
        var_name=future_cube.standard_name,
        units=future_cube.units,
        dim_coords_and_dims=[(time, 0), (latitude, 1), (longitude, 2)])
    print(cube)
    return cube

In [3]:
# create dictionary of models and institutes (model name -> institute directory name)
basepath = '/badc/cmip6/data/CMIP6/CMIP/'
institute_list = os.listdir(basepath)
model_inst_dict = {}

# loop over institutes
for inst in institute_list:
    model_list = os.listdir(basepath + inst + '/')

    # for each institute list models and store in dictionary
    # (a model listed under several institutes keeps the last institute seen)
    for model_temp in model_list:
        model_inst_dict[model_temp] = inst

# correction for UKESM which is used by multiple centres - we want MOHC only
# (applied once, after all institutes have been read, so nothing can overwrite it)
model_inst_dict['UKESM1-0-LL'] = 'MOHC'
print(model_inst_dict)

{'TaiESM1': 'AS-RCEC', 'UKESM1-0-LL': 'MOHC', 'AWI-CM-1-1-MR': 'AWI', 'AWI-ESM-1-1-LR': 'AWI', 'BCC-CSM2-MR': 'BCC', 'BCC-ESM1': 'BCC', 'CAMS-CSM1-0': 'CAMS', 'CAS-ESM2-0': 'CAS', 'FGOALS-f3-L': 'CAS', 'FGOALS-g3': 'CAS', 'IITM-ESM': 'CCCR-IITM', 'CanESM5': 'CCCma', 'CanESM5-CanOE': 'CCCma', 'CMCC-CM2-HR4': 'CMCC', 'CMCC-CM2-SR5': 'CMCC', 'CMCC-ESM2': 'CMCC', 'CNRM-CM6-1': 'CNRM-CERFACS', 'CNRM-CM6-1-HR': 'CNRM-CERFACS', 'CNRM-ESM2-1': 'CNRM-CERFACS', 'ACCESS-ESM1-5': 'CSIRO', 'ACCESS-CM2': 'CSIRO-ARCCSS', 'E3SM-1-0': 'E3SM-Project', 'E3SM-1-1': 'E3SM-Project', 'E3SM-1-1-ECA': 'E3SM-Project', 'EC-Earth3': 'EC-Earth-Consortium', 'EC-Earth3-AerChem': 'EC-Earth-Consortium', 'EC-Earth3-CC': 'EC-Earth-Consortium', 'EC-Earth3-LR': 'EC-Earth-Consortium', 'EC-Earth3-Veg': 'EC-Earth-Consortium', 'EC-Earth3-Veg-LR': 'EC-Earth-Consortium', 'EC-Earth3P-VHR': 'EC-Earth-Consortium', 'FIO-ESM-2-0': 'FIO-QLNM', 'MPI-ESM-1-2-HAM': 'HAMMOZ-Consortium', 'INM-CM4-8': 'INM', 'INM-CM5-0': 'INM', 'IPSL-CM5A2

In [None]:
# Pre-processing cell: for every model in `models`, read monthly historical and
# ssp585 precipitation (pr) and near-surface temperature (tas), cut out a box over
# Ghana, merge historical + future into one cube and finally save the two
# {model: cube} dictionaries to disk.
# NOTE(review): the `assert False` below stops this cell immediately, so nothing
# after it runs - presumably a deliberate guard because the saved .npy files are
# loaded in a later cell; confirm before removing it.
assert False
# useful guide here: https://nci-data-training.readthedocs.io/en/latest/_notebook/climate/1_02_Xarray_subset_slicing_plot_CMIP6.html
DATA_ROOT = Path("/badc/cmip6/data/CMIP6/")
pr_datasets = {}
tas_datasets = {}
expt = 'ssp585'
scenario = 'ScenarioMIP'

# read in monthly data
models = ['ACCESS-CM2', 'ACCESS-ESM1-5', 'BCC-CSM2-MR', 'CAMS-CSM1-0', 'CanESM5',
          'CNRM-CM6-1', 'CNRM-ESM2-1', 'FGOALS-f3-L', 'FGOALS-g3', 'HadGEM3-GC31-MM',
          'GISS-E2-1-G', 'INM-CM5-0', 'INM-CM4-8',
          'MPI-ESM1-2-LR', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL'] 

# models without SSP585 pr OR tas or have issues with data ('MRI-ESM2-0', )
# 'BCC-ESM1', 'CESM2', 'MPI-ESM-1-2-HAM', 'MPI-ESM1-2-HR', 'NorCPM1', 'MRI-ESM2-0', 

#for model in ["HadGEM3-GC31-LL"]:
#for model in ['UKESM1-0-LL']:
#for model in ['FGOALS-g3']:
for model in models:
    print(model)
    # institute directory for this model, from the dictionary built in an earlier cell
    institute = model_inst_dict[model]
    
    if model in ['UKESM1-0-LL']:  #something wrong with UKESM r1i1p1 variant (hdf error)
        variant = 'r2i1p1f2'
    else:
        # None lets make_cmip6_filepath choose the variant itself
        variant = None
    
    # get historical precip data
    fp_hist = make_cmip6_filepath(
        institute=institute, scenario='CMIP', model=model, experiment='historical', variant=variant,
        table_id="Amon", variable="pr", grid=None, version=None, time_range="*"
    )
    # fp_hist is a glob pattern; open_mfdataset opens all matching files as one dataset
    hist_pr_cube = xr.open_mfdataset(fp_hist)
    hist_pr_cube = hist_pr_cube.assign_coords(lon=(((hist_pr_cube.lon + 180) % 360) - 180)).sortby('lon') # change lons from 0,360 to -180,180
    
    # select data over Ghana and convert to Iris cube
    # (box: 4.5 to 11.5 in lat, -3.5 to 1 in lon)
    hist_ghana_pr = hist_pr_cube.sel(lat=slice(4.5,11.5), lon=slice(-3.5,1))
    hist_ghana_pr = hist_ghana_pr.pr.to_iris()
    
    # same steps for the future (ssp585) precip data
    fp_future = make_cmip6_filepath(
        institute=institute, scenario=scenario, model=model, experiment=expt, variant=variant,
        table_id="Amon", variable="pr", grid=None, version=None, time_range="*"
    )
    
    future_pr_cube = xr.open_mfdataset(fp_future)
    future_pr_cube = future_pr_cube.assign_coords(lon=(((future_pr_cube.lon + 180) % 360) - 180)).sortby('lon') # change lons from 0,360 to -180,180
    
    # select data over Ghana and convert to Iris cube
    future_ghana_pr = future_pr_cube.sel(lat=slice(4.5,11.5), lon=slice(-3.5,1))
    future_ghana_pr = future_ghana_pr.pr.to_iris()
    
    # combine hist and future precip data into single cube
    cube = merge_hist_future_cmip6(hist_ghana_pr, future_ghana_pr)
    
    pr_datasets[model] = cube
    
    # repeat for tas
    fp_hist = make_cmip6_filepath(
        institute=institute, scenario='CMIP', model=model, experiment='historical', variant=variant,
        table_id="Amon", variable="tas", grid=None, version=None, time_range="*"
    )
    hist_tas_cube = xr.open_mfdataset(fp_hist)
    
    # change longitudes from 0,360 to -180,180
    hist_tas_cube = hist_tas_cube.assign_coords(lon=(((hist_tas_cube.lon + 180) % 360) - 180)).sortby('lon')
    
    # select data over Ghana
    hist_ghana_tas = hist_tas_cube.sel(lat=slice(4.5,11.5), lon=slice(-3.5,1))
    hist_ghana_tas = hist_ghana_tas.tas.to_iris()
    
    fp_future = make_cmip6_filepath(
        institute=institute, scenario=scenario, model=model, experiment=expt, variant=variant,
        table_id="Amon", variable="tas", grid=None, version=None, time_range="*"
    )
    
    future_tas_cube = xr.open_mfdataset(fp_future)
    
    # change longitudes from 0,360 to -180,180
    future_tas_cube = future_tas_cube.assign_coords(lon=(((future_tas_cube.lon + 180) % 360) - 180)).sortby('lon')
    
    # select data over Ghana
    future_ghana_tas = future_tas_cube.sel(lat=slice(4.5,11.5), lon=slice(-3.5,1))
    future_ghana_tas = future_ghana_tas.tas.to_iris()
    
    # combine hist and future data into single cube
    cube = merge_hist_future_cmip6(hist_ghana_tas, future_ghana_tas)
    
    tas_datasets[model] = cube
    #assert False
# save both dictionaries; np.save stores a dict as a pickled object array, so
# reading it back requires np.load(..., allow_pickle=True) on recent NumPy versions
outpath = '/home/users/train008/data/'
np.save(outpath + 'hist_plus_' + expt + '_ghana_pr_dict.npy', pr_datasets)
np.save(outpath + 'hist_plus_' + expt + '_ghana_tas_dict.npy', tas_datasets)

In [349]:
def cube_to_frame(cube_dict, expt='hist', region='all', var='pr'):  # set up in case calculate SPEI for multiple models
    """Turn a dictionary of cubes into one long-format DataFrame.

    Each cube is copied, converted to the target units ('kg m-2 month-1' for
    ``var='pr'``, 'celsius' for ``var='tas'``; any other ``var`` leaves the
    units untouched), averaged over latitude and longitude, and written out as
    one row per time step.

    Parameters
    ----------
    cube_dict : dict
        Maps a model name to an Iris cube with 'time', 'latitude' and
        'longitude' coordinates.
    expt : str
        Value written to the 'expt' column of every row.
    region : str
        Value written to the 'region' column of every row.
    var : str
        Selects the unit conversion, see above.

    Returns
    -------
    pandas.DataFrame
        Columns 'model', 'expt', 'year', 'month', 'value', 'region'. 'year' and
        'month' come from ``iris.coord_categorisation.add_year`` / ``add_month``.
        The row index restarts at 0 for every model (it is not reset).
        An empty ``cube_dict`` gives an empty frame with the same columns.
    """
    columns = ['model', 'expt', 'year', 'month', 'value', 'region']
    frames = []

    for model, source_cube in cube_dict.items():
        # work on a copy so the caller's cubes keep their original units
        cube = source_cube.copy()

        if var == 'pr':
            cube.convert_units('kg m-2 month-1')
        elif var == 'tas':
            cube.convert_units('celsius')

        # area mean over the whole box
        cube = cube.collapsed(['latitude', 'longitude'], iris.analysis.MEAN)
        values = cube.data.flatten()

        iris.coord_categorisation.add_year(cube, 'time', name='year')
        iris.coord_categorisation.add_month(cube, 'time', name='month')

        frames.append(pd.DataFrame(
            {
                'model': model,
                'expt': expt,
                'year': cube.coord('year').points,
                'month': cube.coord('month').points,
                'value': values,
                'region': region,
            },
            columns=columns,
        ))

    if not frames:
        return pd.DataFrame(columns=columns)

    # one concat instead of DataFrame.append in the loop (append was removed in pandas 2.0)
    return pd.concat(frames)

In [350]:
#calculate avg pr and temp in Ghana in future
expt = 'ssp585'
path = '/home/users/train008/data/'
# The .npy files hold a dictionary written with np.save, i.e. a pickled object array.
# np.load refuses pickled data unless allow_pickle=True (default since NumPy 1.16.3);
# .item() unwraps the 0-d array back into the dictionary.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
pr_datasets = np.load(path + 'hist_plus_' + expt + '_ghana_pr_dict.npy', allow_pickle=True).item()
tas_datasets = np.load(path + 'hist_plus_' + expt + '_ghana_tas_dict.npy', allow_pickle=True).item()
print(pr_datasets.keys())

dict_keys(['ACCESS-CM2', 'ACCESS-ESM1-5', 'BCC-CSM2-MR', 'CAMS-CSM1-0', 'CanESM5', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'FGOALS-f3-L', 'FGOALS-g3', 'HadGEM3-GC31-MM', 'GISS-E2-1-G', 'INM-CM5-0', 'INM-CM4-8', 'MPI-ESM1-2-LR', 'NorESM2-LM', 'NorESM2-MM', 'TaiESM1', 'UKESM1-0-LL'])


In [351]:
# Flatten the per-model cubes into long tables (one row per model and time step)
df_pr = cube_to_frame(cube_dict=pr_datasets, var='pr')
df_tas = cube_to_frame(cube_dict=tas_datasets, var='tas')

In [389]:
# Split pr and tas into the two analysis windows (both ends inclusive):
# historical baseline 1980-2010 and mid-century 2040-2050
df_pr_his = df_pr[df_pr['year'].between(1980, 2010)]
df_tas_his = df_tas[df_tas['year'].between(1980, 2010)]

df_pr_mid = df_pr[df_pr['year'].between(2040, 2050)]
df_tas_mid = df_tas[df_tas['year'].between(2040, 2050)]

In [400]:
# Historical baseline: df_pr_his / df_tas_his hold the years 1980 - 2010.
# Mean rainfall of each calendar month (over all models and years), summed over
# the months, gives the mean annual total.
x = df_pr_his.groupby(['month'])['value'].mean()

print('Total annual precipitation (mm) in 1980 - 2010 ', np.round(np.nansum(x.values),0))
print('Avg annual temperature (C) ', np.round(np.nanmean(df_tas_his['value']),0))

# Mid-century: df_pr_mid / df_tas_mid hold the years 2040 - 2050.
x = df_pr_mid.groupby(['month'])['value'].mean()

print('Total annual precipitation (mm) in 2040 - 2050 ', np.round(np.nansum(x.values),0))
print('Avg annual temperature (C) ', np.round(np.nanmean(df_tas_mid['value']),0))



Total annual precipitation (mm) in 2040 - 2050  1064.0
Avg annual temperature (C)  27.0
Total annual precipitation (mm) in 2040 - 2050  1112.0
Avg annual temperature (C)  29.0


In [381]:
# Keep only the wet-season months (June to September) in all four tables.
# The tables are overwritten in place, so later cells see wet-season rows only.
wet_season = ['Jun', 'Jul', 'Aug', 'Sep']

df_pr_his = df_pr_his.loc[df_pr_his['month'].isin(wet_season)]
df_tas_his = df_tas_his.loc[df_tas_his['month'].isin(wet_season)]

df_pr_mid = df_pr_mid.loc[df_pr_mid['month'].isin(wet_season)]
df_tas_mid = df_tas_mid.loc[df_tas_mid['month'].isin(wet_season)]

In [None]:
# Mean historical rainfall per month (result is not stored), then show the table itself
df_pr_his.groupby('month')['value'].mean()
df_pr_his

In [385]:
# Mean of the 'value' column, NaNs ignored, printed in this order:
# historical pr, mid-century pr, historical tas, mid-century tas
for frame in (df_pr_his, df_pr_mid, df_tas_his, df_tas_mid):
    print(np.nanmean(frame['value']))

170.80215186324713
177.0018462685923
26.520434846149612
28.077663215746863


In [383]:
# Highest temperature value of each (model, experiment) pair in both periods
group_keys = ['model', 'expt']
tas_his_max = df_tas_his.groupby(group_keys)['value'].max()
tas_mid_max = df_tas_mid.groupby(group_keys)['value'].max()

# average of those per-model maxima, historical first
for per_model_max in (tas_his_max, tas_mid_max):
    print(np.nanmean(per_model_max.values))

28.94483951639248
30.235563853857357


In [377]:
# Distinct month labels left in the historical precipitation table
np.unique(df_pr_his['month'].values)

array(['Aug', 'Jul', 'Jun', 'Sep'], dtype=object)