In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import glob
from matplotlib import pyplot as plt
from numpy.polynomial import polynomial
from sklearn.linear_model import LinearRegression
import copy

# Functions

In [2]:
def doy_helper(ds):
    # Remove leap day (Feb 29)
    ds = ds.isel(
        time=~((pd.to_datetime(ds.time).day == 29)&(pd.to_datetime(ds.time).month == 2))
    )
    
    # Create day of year (DOY) array for indexing
    doy = np.tile(
        np.arange(1, 366, 1, dtype='int32'),
        len(pd.to_datetime(ds.time).year.unique())
    )

    # rework dims/coords for climo
    return ds.rename({'time':'doy'}).assign_coords({'doy':('doy', doy)})

def return_from_doy_helper(dswdoy, dswtime):

    # Remove leap day (Feb 29)
    dswtime = dswtime.isel(
        time=~((pd.to_datetime(dswtime.time).day == 29)&(pd.to_datetime(dswtime.time).month == 2))
    )

    # save time coord
    time_arr = pd.to_datetime(dswtime.time.values)

    # reverse to time instead of doy
    return dswdoy.assign_coords({'doy':('doy', time_arr)}).rename({'doy':'time'})

def get_climatology_smoothed(ds, var, window=60, fileauthor='Jhayron S. Pérez-Carrasquilla'):
    
    # Subset dataset for the period 1981-2020
    ds_clima = ds

    # adjust to doy coords/dims
    ds_clima = doy_helper(ds_clima)
    
    # Compute the daily mean for each day of the year
    climo = []
    for i in range(1, 366):
        daily_mean = ds_clima.sel(doy=i)[var].mean('doy')
        climo.append(daily_mean)
    
    # Convert to xarray Dataset with the appropriate dimensions
    attrs = ds[var].attrs
    attrs['File Author'] = fileauthor
    
    climo = xr.Dataset({
        f'{var}_climo': (['doy', 'lat', 'lon'], np.array(climo)),
    }, 
    coords={
        'doy': np.arange(1, 366, 1),
        'lat': ds.lat.values,
        'lon': ds.lon.values,
    },
    attrs=attrs
    )

    # reorder dims
    climo = climo.transpose('doy', 'lat', 'lon')

    # sanity check
    # print(climo[f'{var}_climo'].shape)
    
    # Stack climatology 3 times to handle edges
    climo_extended = xr.concat([climo, climo, climo], dim='doy')

    # Adjust coordinates after stacking to represent a larger time span
    climo_extended['doy'] = np.arange(1, 365 * 3 + 1, 1)

    # Apply rolling mean with a 60-day window for smoothing
    climo_smoothed = climo_extended.rolling(doy=window, center=True, min_periods=1).mean(skipna=True)

    # Extract the middle portion, corresponding to the original 365 days
    climo_smoothed = climo_smoothed.isel(doy=slice(365, 365 + 365))

    # sanity check
    # print(climo_smoothed[f'{var}_climo'].shape)
    
    # Reset 'day_of_year' coordinate to original range
    climo_smoothed['doy'] = np.arange(1, 366, 1)

    return climo_smoothed

def get_anomalies(ds, var, climo):

    # deep copy dataset
    anom = copy.deepcopy(ds)

    # adjust to doy coords/dims
    dstmp = doy_helper(ds)
    anom = doy_helper(anom)

    # compute doy anomalies
    for day in range(1, 366):
        
        anom[var][{'doy':(dstmp.doy == day)}] = (dstmp[var].sel(doy=day) - climo[f'{var}_climo'].sel(doy=day))

    # back to dataset
    anom = anom.rename({var:f'{var}_anom'})

    # add original time dim
    return return_from_doy_helper(anom, ds)

def fourierfilter(da, cutoff_period=10):
    
    # Compute the Fourier transform along the time axis
    fft_data = np.fft.fft(da, axis=0)
    
    # Get the frequencies corresponding to the FFT components
    freqs = np.fft.fftfreq(da.shape[0], d=1)
    # d=1 assumes daily data; adjust if different
    
    # Compute the corresponding periods (in days)
    periods = np.abs(1 / freqs)
    
    # Define the cutoff period for low-pass filter (10 days)
    cutoff_period = cutoff_period
    
    # Create a mask to filter out high-frequency components (shorter than 10 days)
    high_pass_mask = periods < cutoff_period
    
    # Apply the mask to the FFT data (set high-frequency components to zero)
    fft_data_filtered = fft_data.copy()
    fft_data_filtered[high_pass_mask, :, :] = 0
    
    # Perform the inverse FFT to get the filtered data back in the time domain
    filtered_data = np.fft.ifft(fft_data_filtered, axis=0).real
    
    # Create a new xarray DataArray to store the filtered data
    filtered_anom = xr.DataArray(
        filtered_data,
        dims=da.dims,
        coords=da.coords,
        attrs=da.attrs
    )
    return filtered_anom

# Get average time series of the region
def get_weighted_area_average(da):
    
    # Compute the cosine of the latitudes (in radians) for weighting
    # For a rectangular grid the cosine of the latitude is proportional to the grid cell area
    weights = np.cos(np.deg2rad(da.lat))
    weights.name = "weights"

    # Apply the weights and compute the mean across lat/lon
    da_weighted = da.weighted(weights)
    weighted_mean = da_weighted.mean(dim=['lat', 'lon'])
    
    return weighted_mean


def create_doy_dummy(num_yr=84):
    """Creates dummy array for indexing the non-leap year based doy time series"""
    days_per_year = 365
    years = num_yr
    day_indices = np.arange(
        0, 
        years * days_per_year, 
        days_per_year,
        dtype=int
    ).reshape(-1, 1) + np.arange(days_per_year)
    return day_indices


def get_climatology_std_smoothed(ds, var, window=60, fileauthor='Jhayron S. Pérez-Carrasquilla'):

    # Get the day of year (DOY)
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(ds.time).year)))
    
    # Compute the daily standard deviation for each day of the year
    climo = []
    for i in range(0, 365):
        # grab indices for doy
        doy_indx = doy_tmp[:, i]
        
        # ensure time is first axis with transpose
        array_temp = ds[var].transpose('time', 'lat', 'lon')[doy_indx]

        # compute std for doy
        std = np.nanstd(array_temp, axis=0)
        # make nan where 0
        std[std == 0] = np.nan
        climo.append(std)
    
    # Convert to xarray Dataset with the appropriate dimensions
    attrs = ds[var].attrs
    attrs['File Author'] = fileauthor
    
    climo = xr.Dataset({
        f'{var}_climo_std': (['doy', 'lat', 'lon'], np.array(climo)),
    }, 
    coords={
        'doy': np.arange(1, 366, 1),
        'lat': ds.lat.values,
        'lon': ds.lon.values,
    },
    attrs=attrs)

    climo = climo.transpose('doy', 'lat', 'lon')
    
    # Stack climatology 3 times to handle edges
    climo_extended = xr.concat([climo, climo, climo], dim='doy')

    # Adjust coordinates after stacking to represent a larger time span
    climo_extended['doy'] = np.arange(1, 365 * 3 + 1, 1)

    # Apply rolling mean with a 60-day window for smoothing
    climo_smoothed = climo_extended.rolling(doy=window, center=True, min_periods=1).mean(skipna=True)

    # Extract the middle portion, corresponding to the original 365 days
    climo_smoothed = climo_smoothed.isel(doy=slice(365, 365 + 365))

    # Reset 'day_of_year' coordinate to original range
    climo_smoothed['doy'] = np.arange(1, 366, 1)

    return climo_smoothed

# this func is not used (below)
def standardize_anomalies(anom, var, climo_std):

    # ensure time is first dim/axis
    anom = anom.transpose('time', 'lat', 'lon')
    std_anom = copy.deepcopy(anom)
    
    # Get the day of year (DOY)
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(anom.time).year)))

    for i in range(0, 365):
        
        doy_indx = doy_tmp[:, i]

        std_anom[var][doy_indx] = (
            anom[var][doy_indx] / climo_std[f'{var}_climo_std'].sel(doy=i + 1)
        )
    return std_anom

In [3]:
def extractz500_several_files(filestemp):
    listxarrays = []
    for file in filestemp:
        dstemp = xr.open_dataset(file)
        dstemp = regridz500intolens2grid(dstemp,lat_lens2,lon_lens2)
        dstemp = dstemp.sel(time=slice('1940-01-01', '2100-12-31'))
        # dstemp = dstemp.Z500.where((dstemp.lat>=10),drop=True)
        # Transpose the data to match the desired dimension order
        dstemp_transposed = dstemp.transpose('time', 'lat', 'lon')
        lat0=20; lat1=80; lon0=180; lon1=330
        dstemp_transposed = dstemp_transposed.where((dstemp_transposed.lat>=lat0)&(dstemp_transposed.lat<=lat1)&\
                               (dstemp_transposed.lon>=lon0)&(dstemp_transposed.lon<=lon1),drop=True)
        
        # Recreate the DataArray with the coordinates in the desired order
        dstemp_reordered = xr.DataArray(
            dstemp_transposed.values,
            dims=['time', 'lat', 'lon'],
            coords={
                'time': dstemp_transposed.coords['time'],
                'lat': dstemp_transposed.coords['lat'],
                'lon': dstemp_transposed.coords['lon']
            },
            attrs=dstemp.attrs,
            name=dstemp.name
        )
        listxarrays.append(dstemp_reordered)
    xarrayfull = xr.concat(listxarrays, dim='time')
    xarrayfull = xarrayfull.drop_duplicates('time',keep='first')
    xarrayfull = xarrayfull.sortby('time')
    cftime_index = xr.coding.cftimeindex.CFTimeIndex(xarrayfull['time'].values)
    datetime_index = cftime_index.to_datetimeindex()
    xarrayfull.coords['time'] = datetime_index
    return xarrayfull.to_dataset()

In [4]:
def compute_anoms_experiment_complete(id_experiment):
    name_experiment = unique_names_experiments[id_experiment]
    print(name_experiment)
    where_files = np.where(names_experiments_all==name_experiment)[0]
    files_temp = filenames[where_files]
    
    dataset_raw = extractz500_several_files(files_temp)
    
    lat0=20; lat1=80; lon0=180; lon1=330
    dataset_region = dataset_raw.where(
        (
            dataset_raw.lat>=lat0)&(
                dataset_raw.lat<=lat1)&(
                dataset_raw.lon>=lon0)&(
                dataset_raw.lon<=lon1),
        drop=True
    )
    clima = get_climatology_smoothed(dataset_region, 'Z500')
    anoms = get_anomalies(dataset_region, 'Z500', clima)
    anoms_filtered = fourierfilter(anoms.Z500_anom)
    del(anoms)
    del(clima)
    anoms_filtered = anoms_filtered.to_dataset(name='Z_anom')
    anoms_smooth = copy.deepcopy(anoms_filtered).rolling(time=60, center=True, min_periods=1).mean(skipna=True)
    mean_series = get_weighted_area_average(anoms_smooth.Z_anom)
    
    full_curve = copy.deepcopy(mean_series)
    full_curve.data = np.zeros(len(full_curve))
    npoly=3
    
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(mean_series.time).year)))
    
    for i in range(0, 365): #Iterate through every day of the year
    
        doy_indx = doy_tmp[:, i]
        
        # fit a polynomial for the trend of each DOY
        params_curve = polynomial.polyfit(
            np.arange(0, mean_series[doy_indx].shape[0]), 
            mean_series[doy_indx], 
            npoly
        )
        curve = polynomial.polyval(
            np.arange(0, mean_series[doy_indx].shape[0]), 
            params_curve, 
            tensor=True
        )
        
        ## Center curve in zero
        full_curve.loc[{'time': mean_series[doy_indx].time}] = curve
    
    # plt.plot(mean_series.time,mean_series)
    # plt.plot(mean_series.time,full_curve)
    # plt.show()
    # plt.close('all')
    # aaaa
    anoms_detrended = anoms_filtered.Z_anom - full_curve.data[:, np.newaxis, np.newaxis]
    anoms_detrended = anoms_detrended.to_dataset()
    
    del(anoms_filtered)
    
    clima_std = get_climatology_std_smoothed(anoms_detrended, 'Z_anom')
    clima_std_average_region_series = get_weighted_area_average(clima_std)
    
    anoms_standardized = copy.deepcopy(anoms_detrended).Z_anom
    anoms_standardized.data = np.zeros_like(anoms_standardized.data)
    
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(mean_series.time).year)))
    
    for i in range(0, 365): #Iterate through every day of the year
    
        doy_indx = doy_tmp[:, i]
    
        # grab doy std (climo)
        std_temp = clima_std_average_region_series.Z_anom_climo_std[i].data
        # standardize the detrended anoms by std climo
        standardized_temp = anoms_detrended.Z_anom[doy_indx].data / std_temp
        anoms_standardized.loc[{'time': anoms_detrended.Z_anom[doy_indx].time}] = standardized_temp
    
    anoms_standardized = anoms_standardized.to_dataset()
    anoms_standardized.to_netcdf(f'{path_outputs_anoms}anoms_{name_experiment}.nc')

In [5]:
from scipy.spatial import cKDTree

def regridz500intolens2grid(ds,lat_lens2,lon_lens2):
    # Step 1: Flatten original coordinates
    lat_flat = ds.lat.values.flatten()  # shape: (ncol,)
    lon_flat = ds.lon.values.flatten()  # shape: (ncol,)
    
    # Step 2: Prepare new grid
    lon_grid, lat_grid = np.meshgrid(lon_lens2, lat_lens2)
    target_points = np.column_stack([lat_grid.ravel(), lon_grid.ravel()])  # shape: (new_ncol, 2)
    
    # Step 3: KDTree for nearest neighbor search
    tree = cKDTree(np.column_stack([lat_flat, lon_flat]))
    _, nearest_idx = tree.query(target_points)  # indices of original points closest to new grid
    
    # Step 4: Rebuild new DataArray
    # Z500: shape (time, ncol)
    Z500_new_flat = ds.Z500.values[:, nearest_idx]  # shape (time, new_ncol)
    Z500_new = Z500_new_flat.reshape((ds.Z500.shape[0], len(lat_lens2), len(lon_lens2)))  # shape: (time, lat, lon)
    
    # Step 5: Wrap in xarray
    Z500_regridded = xr.DataArray(
        Z500_new,
        coords={
            "time": ds.Z500.time,
            "lat": lat_lens2,
            "lon": lon_lens2
        },
        dims=["time", "lat", "lon"],
        name="Z500"
    )
    return Z500_regridded

# Compute anomalies

In [6]:
path_folders_exps_historic = np.sort(glob.glob(f'/glade/campaign/collections/rda/data/d651007/*/'))
path_folders_exps_rcp85 = np.sort(glob.glob(f'/glade/campaign/collections/rda/data/d651009/*/'))

In [7]:
filenames = []

for folderi in range(len(path_folders_exps_historic)):
    listfilestemp = glob.glob(f'{path_folders_exps_historic[folderi]}atm/proc/tseries/day_1/*Z500*.nc')
    filenames.extend(listfilestemp)
    
for folderi in range(len(path_folders_exps_rcp85)):
    listfilestemp = glob.glob(f'{path_folders_exps_rcp85[folderi]}atm/proc/tseries/day_1/*Z500*.nc')
    filenames.extend(listfilestemp)

filenames = np.array(filenames)

In [8]:
filenames

array(['/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002.cam.h1.Z500.19450101-19491231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002.cam.h1.Z500.19850101-19891231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002.cam.h1.Z500.19500101-19541231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.30-1920-2005.002.cam.h1.Z500.19950101-19991231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.n

In [9]:
names_experiments_all = np.array([filenames[fi].split('.')[-6] for fi in range(len(filenames))])

In [10]:
unique_names_experiments = np.unique(names_experiments_all)

In [11]:
path_outputs_anoms = '/glade/derecho/scratch/jhayron/Data4WRsClimateChange/CESM1_HR_Anoms/'

In [12]:
ds_lens2 = xr.open_dataset('/glade/derecho/scratch/jhayron/Data4WRsClimateChange/LENS2_full_Z500_Anoms/anoms_LE2-1001.001_cmip6.nc')
lat_lens2 = ds_lens2.lat
lon_lens2 = ds_lens2.lon

In [13]:
from multiprocessing import Pool

num_ids = len(unique_names_experiments)
num_processors = 5

# Create a Pool of worker processes
with Pool(processes=num_processors) as pool:
    # Map the function to the range of IDs
    pool.map(compute_anoms_experiment_complete, range(num_ids))

003002004005001






  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()


006


  datetime_index = cftime_index.to_datetimeindex()
  periods = np.abs(1 / freqs)
  periods = np.abs(1 / freqs)
  periods = np.abs(1 / freqs)
  periods = np.abs(1 / freqs)


007
008
009
010


  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()
  periods = np.abs(1 / freqs)
  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()
  datetime_index = cftime_index.to_datetimeindex()
  periods = np.abs(1 / freqs)
  periods = np.abs(1 / freqs)
  periods = np.abs(1 / freqs)


ValueError: conflicting sizes for dimension 'doy': length 58765 on 'doy' and length 58551 on {'doy': 'Z500', 'lat': 'Z500', 'lon': 'Z500'}

In [37]:
id_experiment = 8

In [38]:
name_experiment = unique_names_experiments[id_experiment]
print(name_experiment)
where_files = np.where(names_experiments_all==name_experiment)[0]
files_temp = filenames[where_files]

dataset_raw = extractz500_several_files(files_temp)

009


  datetime_index = cftime_index.to_datetimeindex()


In [44]:
files_temp

array(['/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009.cam.h1.Z500.19300101-19341231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009.cam.h1.Z500.19450101-19491231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009.cam.h1.Z500.19700101-19741231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009/atm/proc/tseries/day_1/b.e13.BHISTC5.ne120_t12.cesm-ihesp-hires1.0.46-1920-2005.009.cam.h1.Z500.19900101-19941231.nc',
       '/glade/campaign/collections/rda/data/d651007/b.e13.BHISTC5.n

In [42]:
count=0
for year in range(1940,2101):
    print(year, len(dataset_raw.sel(time=f'{year}').time))
    count+=len(dataset_raw.sel(time=f'{year}').time)

1940 365
1941 365
1942 365
1943 365
1944 365
1945 365
1946 365
1947 365
1948 365
1949 365
1950 365
1951 365
1952 365
1953 365
1954 365
1955 365
1956 365
1957 365
1958 365
1959 365
1960 365
1961 365
1962 365
1963 365
1964 365
1965 365
1966 365
1967 365
1968 365
1969 365
1970 365
1971 365
1972 365
1973 365
1974 365
1975 365
1976 365
1977 365
1978 365
1979 365
1980 365
1981 365
1982 365
1983 365
1984 365
1985 365
1986 365
1987 365
1988 365
1989 365
1990 365
1991 365
1992 365
1993 365
1994 365
1995 365
1996 365
1997 365
1998 365
1999 365
2000 365
2001 365
2002 365
2003 365
2004 365
2005 365
2006 1


KeyError: "not all values found in index 'time'. Try setting the `method` keyword argument (example: method='nearest')."

In [45]:
## In exp 001 -----> 2062 only has 152 days
## In exp 009 -----> does not have data after 2005