# Python Learn by Doing: Climate Change Indicators, Your Turn! Option 2
Developed By: Dr. Kerrie Geil, Mississippi State University

Date: May 2024

Link: notebook available to download at

# Import packages and define workspace

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import scipy.stats as ss
import warnings
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cf

In [None]:
# Folder that holds this notebook on your computer -- edit this for your own machine.
# Keep the trailing slash: the data path below is built by plain string concatenation.
work_dir = r'C://Users/kerrie/Documents/01_LocalCode/repos/MSU_py_training/learn_by_doing/climate_change_indicators/'

# Folder that holds the AgERA5 daily data files (a subfolder of work_dir)
data_dir = f'{work_dir}data/AgERA5_daily/'

# Data Cleaning

In [None]:
# open the daily minimum temperature file and remove any length-1 dimensions in a single step
tn = xr.open_dataarray(data_dir + 'tmin_AgERA5_Mississippi_Daily_1979-2023.nc').squeeze()
tn

In [None]:
#### check for daily temperature greater than 70C (158F) or less than -70C (-94F)
# this cell only counts such values (it does not change tn)
# |tn| > 70 is the same test as (tn > 70) or (tn < -70)
(abs(tn) > 70).data.sum()

In [None]:
#### leap days (i.e. Feb 29th)

# boolean array along dim 'time': True on Feb 29th, False on every other day
time_index = tn.time
isleapday = (time_index.dt.month == 2) & (time_index.dt.day == 29)

# replace every leap-day value with nan and keep all other days as they are
tn = xr.where(isleapday, np.nan, tn)

In [None]:
#### daily temperature outliers

# day-of-year label for every time step, used as the grouping key throughout this cell
doy = tn.time.dt.dayofyear

# time-mean for each day of the year
tn_daily_mean = tn.groupby(doy).mean('time')

# standard deviation for each day of the year
# (the degrees-of-freedom warning is silenced only inside this block)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message="Degrees of freedom <= 0 for slice")
    tn_stddev = tn.groupby(doy).std('time')

# a daily value is an outlier when it lies more than 5 standard deviations from the mean
tn_outlier_upper = tn_daily_mean + tn_stddev*5
tn_outlier_lower = tn_daily_mean - tn_stddev*5

# report how many values fall above the upper bound and below the lower bound
n_above = (tn.groupby(doy) > tn_outlier_upper).data.sum()
n_below = (tn.groupby(doy) < tn_outlier_lower).data.sum()
print('tn', n_above, n_below)

In [None]:
# look at the nan situation

# a function that sums the number of nans in each month of data
def get_nans_per_month(data_in):
    """Count the nan values in every month of a data array.

    Parameters
    ----------
    data_in : xarray.DataArray
        Array with a datetime 'time' coordinate. It is not modified: the
        grouping labels are attached to a new object, so callers no longer
        need to pass a copy (passing one is still harmless).

    Returns
    -------
    xarray.DataArray
        Number of nan values in each (year, month) group, with the 'time'
        dimension replaced by 'month_groups'.
    """
    # one (year, month) pair per time step, so each calendar month of each year is its own group
    month_groups = pd.MultiIndex.from_arrays(
        [data_in.time['time.year'].data, data_in.time['time.month'].data]
    )

    # assign_coords returns a new object instead of writing into the caller's array
    labelled = data_in.assign_coords(month_groups=('time', month_groups))

    # isnull() is True at every nan, so summing within each group counts the nans
    nancount = labelled.isnull().groupby('month_groups').sum()
    return nancount

# count the nans in every month (a copy of tn is passed so tn itself is never touched)
tn_nan_per_month=get_nans_per_month(tn.copy())

# the maximum number of nans per month at each grid cell
max_nan_per_month=tn_nan_per_month.max('month_groups')

# plot it, with a title and colorbar label so the figure can be read on its own
fig=plt.figure(figsize=(5,5))
ax=fig.add_subplot(111,projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"),lw=0.3)
max_nan_per_month.plot(ax=ax,cmap='summer',cbar_kwargs={'label':'number of nan values'})
ax.set_title('maximum number of nans in any one month')
plt.show()

# Monthly Maximum Value of Daily Minimum Temperature (TNx)

- max(each month of daily minimum temperature values) for each grid cell

Here we are inputting daily data and pulling out 1 value per month at each grid cell.

In [None]:
# one datetime per month (month-start frequency) between the first and last daily time step
first_day, last_day = tn.time.data[0], tn.time.data[-1]
time_months = pd.date_range(start=first_day, end=last_day, freq='MS')

In [None]:
# label every daily time step with its (year, month) pair so that
# each month of the timeseries gets its own index value
years = tn.time.dt.year.data
months = tn.time.dt.month.data
month_groups = pd.MultiIndex.from_arrays([years, months])

# attach the month_groups labels to tn as a coordinate along the time dimension
tn.coords['month_groups'] = ('time', month_groups)
tn

In [None]:
# split the daily data into its (year, month) groups, then keep the largest value of each group
monthly_groups = tn.groupby('month_groups')
TNx = monthly_groups.max(dim='time')
TNx

In [None]:
# Convert the 'month_groups' dimension of TNx into a datetime 'time' dimension.
# NOTE(review): 'time_level_0' and 'time_level_1' are presumably the names given to the two
# MultiIndex levels (year, month) -- confirm against the TNx display from the previous cell.
TNx=TNx.rename({'month_groups':'time'}) # rename the dimension (and its coordinate) to 'time'
TNx=TNx.drop_vars(['time_level_0','time_level_1']) # we don't need these leftovers
TNx.coords['time']=('time',time_months) # replace the coordinate labels with the monthly datetimes in time_months
TNx

Our TNx result is now 3-dimensional (time,lat,lon) instead of 1-dimensional like before when we were working with a single data point and had only the time dimension.

Let's select some of our 3D TNx array to plot

In [None]:
# plot the TNx timeseries at a single grid cell of the 3D array

# coordinate values of the first lat and the first lon
lat = TNx.lat[0]
lon = TNx.lon[0]

# # or this way will yield the same as above
# lat=TNx.lat.sel(lat=35.2,method='nearest')
# lon=TNx.lon.sel(lon=-91.8,method='nearest')

# pull out the timeseries at that grid cell
point_series = TNx.sel(lat=lat, lon=lon)

fig, ax = plt.subplots(figsize=(15, 2))
point_series.plot(ax=ax)
ax.set_title(f'monthly maximum value of daily minimum temperature (TNx) at lat {lat:.2f} lon {lon:.2f}')
ax.set_ylabel('degrees C')
plt.show()

In [None]:
# map of TNx for one month

ptime = '2020-01'  # January 2020

fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)

# draw on the map axes created above and label the colorbar with the units
TNx.sel(time=ptime).plot(ax=ax, cbar_kwargs={'label': 'degrees C'})
ax.set_title('TNx for ' + ptime)
plt.show()

# Trend analysis

In [None]:
# bin the monthly values into quarters ('QS-DEC'), then average each bin
# to get a timeseries of seasonal mean values
seasonal_bins = TNx.resample(time='QS-DEC')
TNx_seasonal = seasonal_bins.mean(dim='time')
TNx_seasonal

In [None]:
# take every 4th seasonal value starting with the first one, then drop (by label) the
# first and last of those, because they were created using less than 3 values
every_fourth = TNx_seasonal[::4]
TNx_DJF = every_fourth.drop_sel(time=['1978-12-01', '2023-12-01'])
TNx_DJF

`scipy.stats.linregress` only operates on 1-dimensional data arrays, so if we want to stick with this function to calculate the linear regression information, we will need to use either nested loops or `.stack` plus one loop. Note that it's best to avoid looping in Python if you can, because it will be much slower than vectorized computation. Here, I will use `.stack` and a single loop to build arrays for trend and pval with `scipy.stats.linregress`.

However, if you were operating on a larger dataset (higher spatial resolution or global extent) this technique of calculating trend and pval may be too slow. At the end of this notebook, I'll show a custom function for linear regression on 3D xarray data array that doesn't stack or loop, which would be much faster on big data.

In [None]:
# combine the lat and lon dimensions into one dimension named 'space'
TNx_DJF_stacked = TNx_DJF.stack({'space': ['lat', 'lon']})
TNx_DJF_stacked

In [None]:
# build two result arrays of dimension (space), both filled with nan

# a single time from the stacked array, without the time coordinate label that isn't relevant
template = TNx_DJF_stacked.isel(time=0, drop=True)

# independent copies of the template, initialized to nan
pval = template.copy()
pval[:] = np.nan

trend = template.copy()
trend[:] = np.nan

pval

In [None]:
# linear regression
# loop through every grid cell of the stacked (space) dimension

# the independent variable is the calendar year of each time label
xvals=TNx_DJF.time.dt.year

# select each grid cell by integer position; pval and trend share the stacked array's
# 'space' dimension, so position i refers to the same grid cell in all three arrays
for i in range(TNx_DJF_stacked.sizes['space']):
    reg_info=ss.linregress(xvals,TNx_DJF_stacked.isel(space=i))
    pval[i]=reg_info.pvalue
    trend[i]=reg_info.slope*100 # degrees C/year --> degrees C/century
trend

In [None]:
# split the 'space' dimension of both result arrays back into two dimensions (lat,lon)
pval, trend = pval.unstack('space'), trend.unstack('space')
trend

In [None]:
# side-by-side maps: the trend on the left and its p value on the right

fig = plt.figure(figsize=(10, 5))

# left panel: trend
ax = fig.add_subplot(1, 2, 1, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
trend.plot(ax=ax, cbar_kwargs={'label': 'C/Century'})
ax.set_title('TNx DJF trend')

# right panel: p value
ax = fig.add_subplot(1, 2, 2, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
pval.plot(ax=ax)
ax.set_title('TNx DJF trend p value')

plt.show()

In [None]:
# map the trend only where it is statistically significant at the 90% confidence level
# or greater; pval is used as a mask, so grid cells with p >= 0.1 are left blank

significant_trend = trend.where(pval < 0.1)

fig = plt.figure(figsize=(10, 5))

ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
significant_trend.plot(ax=ax, cbar_kwargs={'label': 'C/Century'})
ax.set_title('TNx DJF trend (p < 0.1)')

plt.show()

# Vectorized linear regression with p values (multiple dimensions, no stacking, no looping)

This is the approach I would take with bigger data

In [None]:
def linear_reg(x,y):
    """Least-squares linear regression of y on x along the 'time' dimension.

    The calculation is vectorized: every point of the non-time dimensions is
    regressed at once, with no stacking and no looping.

    Parameters
    ----------
    x : xarray.DataArray
        Independent variable with a 'time' dimension (e.g. the year of each time step).
    y : xarray.DataArray
        Dependent variable with a 'time' dimension and any other dimensions.

    Returns
    -------
    slope : xarray.DataArray
        Regression slope (change in y per unit of x) at each point.
    p : xarray.DataArray
        Two-sided p-value of the slope from a Student's t test with n-2
        degrees of freedom, where n is the number of valid samples at the point.

    Notes
    -----
    Only time steps where both x and y are valid are used, and n is counted
    separately at each point, so missing values in part of a timeseries do not
    bias the result. Points with fewer than 3 valid samples return nan.
    Autocorrelation is not accounted for.
    """

    # keep only the time steps where both variables are valid, so that every
    # statistic below is computed from exactly the same samples
    valid = x.notnull() & y.notnull()
    x = x.where(valid)
    y = y.where(valid)

    # number of valid samples at each point (0 where the timeseries is all nan)
    n     = valid.sum('time')

    # degrees of freedom of the t test, nan where the regression is undefined
    dof   = (n - 2).where(n > 2)

    # mean and standard deviation for x and y (nan values are skipped)
    xmean = x.mean('time')
    xstd  = x.std('time')

    ymean = y.mean('time')
    ystd  = y.std('time')

    # here's where you would compute equivalent sample size
    # if you wanted to account for autocorrelation

    # Compute covariance
    # including min_count=1 will return nan instead of 0 at points with no
    # valid data (e.g. the ocean points)
    cov   =  ((x - xmean)*(y - ymean)).sum('time',min_count=1)/n

    # Compute correlation
    cor   = cov/(xstd*ystd)

    # Compute regression slope
    slope     = cov/(xstd**2)

    # Compute t statistic and p-value
    tstats = cor*np.sqrt(dof)/np.sqrt(1-cor**2)

    # scipy works on the raw arrays, so make sure both inputs share one dimension order
    tstats, dof = xr.broadcast(tstats, dof)
    p   = ss.t.sf(abs(tstats), dof)*2 # x2 for two-sided test
    p   = xr.DataArray(p, dims=tstats.dims, coords=tstats.coords)

    return slope,p

In [None]:
# regress TNx_DJF on the year of each time label at every grid cell at once
years = TNx_DJF.time.dt.year
slope, p = linear_reg(years, TNx_DJF)

# convert the slope from C/year to C/century
slope = 100*slope

Don't worry about the degrees of freedom warning. It comes from the standard deviation calculation inside `linear_reg`, because a few of the grid cells in our data array are all nan (the ocean grid cells). If you want to turn off this warning, you could use `warnings.catch_warnings` and `warnings.filterwarnings` like we did in the data cleaning section.

In [None]:
# side-by-side maps: the vectorized trend on the left and its p value on the right

fig = plt.figure(figsize=(10, 5))

# left panel: trend
ax = fig.add_subplot(1, 2, 1, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
slope.plot(ax=ax, cbar_kwargs={'label': 'C/Century'})
ax.set_title('TNx DJF trend')

# right panel: p value
ax = fig.add_subplot(1, 2, 2, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
p.plot(ax=ax)
ax.set_title('TNx DJF trend p value')

plt.show()

In [None]:
# map the trend only where it is statistically significant at the 90% confidence level
# or greater; p is used as a mask, so grid cells with p >= 0.1 are left blank

significant_slope = slope.where(p < 0.1)

fig = plt.figure(figsize=(10, 5))

ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
ax.add_feature(cf.STATES.with_scale("50m"), lw=0.3)
significant_slope.plot(ax=ax, cbar_kwargs={'label': 'C/Century'})
ax.set_title('TNx DJF trend (p < 0.1)')

plt.show()

If we plot the difference in trend between our custom function and scipy.stats.linregress we would expect to see just noise (a random pattern of very small differences) 

In [None]:
# custom-function trend minus scipy.stats.linregress trend at each grid cell
trend_diff = slope - trend
trend_diff.plot()

We should also see very small differences in p values from our custom function and scipy.stats.linregress 

In [None]:
# custom-function p value minus scipy.stats.linregress p value at each grid cell
pval_diff = p - pval
pval_diff.plot()