In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import glob
from matplotlib import pyplot as plt
from numpy.polynomial import polynomial
from sklearn.linear_model import LinearRegression
import copy

# Functions

In [2]:
def doy_helper(ds):
    """Replace the ``time`` dimension of ``ds`` with a day-of-year (``doy``) dimension.

    Every Feb 29 sample is dropped, then the remaining samples are labelled
    1..365 repeatedly, once for each distinct calendar year found in ``ds.time``.

    Parameters
    ----------
    ds : xarray object with a ``time`` coordinate
        NOTE(review): the labels are produced by tiling 1..365, so this
        presumably expects daily data made of whole calendar years starting
        on Jan 1 -- confirm against the callers.

    Returns
    -------
    ``ds`` without leap days, with ``time`` renamed to ``doy`` and the ``doy``
    coordinate holding the tiled int32 labels.
    """
    # Flag the Feb 29 samples and keep everything else
    timestamps = pd.to_datetime(ds.time)
    is_leap_day = (timestamps.month == 2) & (timestamps.day == 29)
    ds = ds.isel(time=~is_leap_day)

    # One run of 1..365 per distinct year in the (leap-day free) record
    n_years = len(pd.to_datetime(ds.time).year.unique())
    doy_labels = np.tile(np.arange(1, 366, 1, dtype='int32'), n_years)

    # Swap the dimension name and attach the DOY labels as its coordinate
    renamed = ds.rename({'time': 'doy'})
    return renamed.assign_coords({'doy': ('doy', doy_labels)})

def return_from_doy_helper(dswdoy, dswtime):
    """Undo ``doy_helper``: put real timestamps back on a ``doy``-indexed object.

    Parameters
    ----------
    dswdoy : xarray object with a ``doy`` dimension
        NOTE(review): presumably the output of ``doy_helper`` (or something
        derived from it), with one entry per non-leap-day sample of
        ``dswtime`` -- confirm.
    dswtime : xarray object with a ``time`` coordinate
        Source of the timestamps; its Feb 29 entries are ignored.

    Returns
    -------
    ``dswdoy`` with the ``doy`` coordinate replaced by the leap-day-free
    timestamps and the dimension renamed back to ``time``.
    """
    # Same Feb 29 mask as in doy_helper
    stamps = pd.to_datetime(dswtime.time)
    keep = ~((stamps.day == 29) & (stamps.month == 2))

    # Only the time coordinate is needed from dswtime
    time_arr = pd.to_datetime(dswtime.time.isel(time=keep).values)

    # Overwrite the DOY labels with timestamps, then restore the dimension name
    with_time = dswdoy.assign_coords({'doy': ('doy', time_arr)})
    return with_time.rename({'doy': 'time'})

def get_climatology_smoothed(ds, var, window=60, fileauthor='Jhayron S. Pérez-Carrasquilla'):
    """Compute a smoothed day-of-year mean climatology of ``ds[var]``.

    The record is converted to DOY labels with ``doy_helper``, averaged per
    DOY, and smoothed with a centered rolling mean. To let the rolling window
    wrap around the year boundary, the 365-day climatology is stacked three
    times and only the middle copy is returned.

    Parameters
    ----------
    ds : xarray.Dataset
        Must provide ``time``, ``lat`` and ``lon``; the whole record is used
        (no period subsetting is done here).
    var : str
        Name of the variable to average.
    window : int, default 60
        Length of the rolling window, in DOY steps.
    fileauthor : str
        Stored under the ``'File Author'`` attribute.

    Returns
    -------
    xarray.Dataset
        Variable ``f'{var}_climo'`` on dims ``(doy, lat, lon)`` with
        ``doy = 1..365``.

    Notes
    -----
    Prints the climatology shape before and after smoothing.
    """
    # Re-index the full input record by day of year
    by_doy = doy_helper(ds)

    # Mean over all samples that share each DOY label
    daily_means = [
        by_doy.sel(doy=day)[var].mean('doy')
        for day in range(1, 366)
    ]

    # NOTE(review): attrs is not copied, so the author tag is most likely
    # written onto ds[var] as well -- confirm whether that is intended.
    attrs = ds[var].attrs
    attrs['File Author'] = fileauthor

    # Assemble the per-DOY means into a Dataset
    climo = xr.Dataset(
        {f'{var}_climo': (['doy', 'lat', 'lon'], np.array(daily_means))},
        coords={
            'doy': np.arange(1, 366, 1),
            'lat': ds.lat.values,
            'lon': ds.lon.values,
        },
        attrs=attrs,
    )
    climo = climo.transpose('doy', 'lat', 'lon')

    # sanity check
    print(climo[f'{var}_climo'].shape)

    # Three back-to-back copies so the window has data on both sides of the
    # year boundary; relabel the stacked axis 1..1095
    tripled = xr.concat([climo] * 3, dim='doy')
    tripled['doy'] = np.arange(1, 365 * 3 + 1, 1)

    # Centered rolling mean of `window` steps
    smoothed = tripled.rolling(doy=window, center=True, min_periods=1).mean(skipna=True)

    # Keep the middle copy only
    smoothed = smoothed.isel(doy=slice(365, 365 + 365))

    # sanity check
    print(smoothed[f'{var}_climo'].shape)

    # Restore the 1..365 labels
    smoothed['doy'] = np.arange(1, 366, 1)

    return smoothed

def get_anomalies(ds, var, climo):
    """Subtract a day-of-year climatology from ``ds[var]``.

    Parameters
    ----------
    ds : xarray.Dataset
        Input record with a ``time`` dimension; it is deep-copied, not modified.
    var : str
        Variable to process.
    climo : xarray.Dataset
        Must contain ``f'{var}_climo'`` indexed by ``doy`` (1..365).

    Returns
    -------
    xarray.Dataset
        Leap-day-free copy of ``ds`` in which ``var`` is replaced by
        ``f'{var}_anom'``, back on the ``time`` dimension.
    """
    # DOY-indexed view of the input (read side) and of a deep copy (write side)
    source_doy = doy_helper(ds)
    anom = doy_helper(copy.deepcopy(ds))

    climo_field = climo[f'{var}_climo']

    # Fill the copy one calendar day at a time
    for day in range(1, 366):
        same_day = source_doy.doy == day
        anom[var][{'doy': same_day}] = (
            source_doy[var].sel(doy=day) - climo_field.sel(doy=day)
        )

    # Name the variable as an anomaly and restore real timestamps
    anom = anom.rename({var: f'{var}_anom'})
    return return_from_doy_helper(anom, ds)

def fourierfilter(da, cutoff_period=10):
    """Low-pass filter ``da`` along its first axis using an FFT mask.

    Every Fourier component whose period is shorter than ``cutoff_period``
    is set to zero before transforming back.

    Parameters
    ----------
    da : xarray.DataArray
        Data to filter. The transform is taken along axis 0, so the time
        dimension must come first. Any number of trailing dimensions is
        accepted.
    cutoff_period : float, default 10
        Shortest period that is kept, in units of the sample spacing
        (``d=1`` is used for the frequencies, i.e. days for daily data).

    Returns
    -------
    xarray.DataArray
        Real part of the filtered signal with the dims, coords and attrs of
        ``da``. The result is created without a name.
    """
    # Forward transform along the leading axis
    spectrum = np.fft.fft(da, axis=0)

    # d=1 assumes one sample per time unit (daily data); adjust if different
    freqs = np.fft.fftfreq(da.shape[0], d=1)

    # Period of each component. The zero-frequency (mean) term maps to inf,
    # which is never below the cutoff; silence the divide-by-zero warning
    # that this single element would otherwise raise.
    with np.errstate(divide='ignore'):
        periods = np.abs(1 / freqs)

    # Components faster than the cutoff are the ones to remove
    too_fast = periods < cutoff_period

    # `spectrum` is a fresh array, so it can be masked in place; indexing
    # only the first axis works for any number of trailing dimensions
    spectrum[too_fast] = 0

    # Back to the time domain; the imaginary part is discarded
    filtered_data = np.fft.ifft(spectrum, axis=0).real

    return xr.DataArray(
        filtered_data,
        dims=da.dims,
        coords=da.coords,
        attrs=da.attrs
    )

# Get average time series of the region
def get_weighted_area_average(da):
    """Return the cos(latitude)-weighted mean of ``da`` over ``lat`` and ``lon``.

    Parameters
    ----------
    da : xarray object with ``lat`` (degrees) and ``lon`` dimensions

    Returns
    -------
    ``da`` reduced over ``lat`` and ``lon``; other dimensions are kept.
    """
    # On a regular lat/lon grid the cell area is proportional to cos(lat)
    lat_radians = np.deg2rad(da.lat)
    weights = np.cos(lat_radians)
    weights.name = "weights"

    # Weighted mean over both horizontal dimensions
    return da.weighted(weights).mean(dim=['lat', 'lon'])


def create_doy_dummy(num_yr=84):
    """Build a (num_yr, 365) table of positions into a no-leap daily series.

    Row ``y`` and column ``d`` hold ``y * 365 + d``, so column ``d`` lists the
    positions of day-of-year ``d + 1`` in every year of a series that has
    exactly 365 samples per year.
    """
    days_in_year = 365

    # Position of Jan 1 of each year
    year_offsets = np.arange(0, num_yr * days_in_year, days_in_year, dtype=int)

    # Offset of each day inside a year
    within_year = np.arange(days_in_year)

    # Broadcast (num_yr, 1) + (365,) -> (num_yr, 365)
    return year_offsets[:, np.newaxis] + within_year


def get_climatology_std_smoothed(ds, var, window=60, fileauthor='Jhayron S. Pérez-Carrasquilla'):
    """Compute a smoothed day-of-year standard-deviation climatology of ``ds[var]``.

    For each day of year the NaN-aware standard deviation across years is
    taken, zeros are turned into NaN, and the 365-day result is smoothed with
    a centered rolling mean applied to three stacked copies (so the window
    wraps around the year boundary).

    Parameters
    ----------
    ds : xarray.Dataset
        Must provide ``time``, ``lat`` and ``lon``.
        NOTE(review): samples are selected by position via
        ``create_doy_dummy``, so the time axis presumably has to contain
        exactly 365 samples per year (leap days already removed) -- confirm.
    var : str
        Name of the variable to process.
    window : int, default 60
        Length of the rolling window, in DOY steps.
    fileauthor : str
        Stored under the ``'File Author'`` attribute.

    Returns
    -------
    xarray.Dataset
        Variable ``f'{var}_climo_std'`` on dims ``(doy, lat, lon)`` with
        ``doy = 1..365``.
    """
    # Positional lookup table: one column of sample positions per DOY
    n_years = len(np.unique(pd.to_datetime(ds.time).year))
    doy_tmp = create_doy_dummy(n_years)

    # Time must be the leading axis for the positional indexing below
    field = ds[var].transpose('time', 'lat', 'lon')

    # Standard deviation across years, one map per DOY
    daily_std = []
    for doy_indx in doy_tmp.T:
        std = np.nanstd(field[doy_indx], axis=0)
        # make nan where 0
        std[std == 0] = np.nan
        daily_std.append(std)

    # NOTE(review): attrs is not copied, so the author tag is most likely
    # written onto ds[var] as well -- confirm whether that is intended.
    attrs = ds[var].attrs
    attrs['File Author'] = fileauthor

    # Assemble the per-DOY maps into a Dataset
    climo = xr.Dataset(
        {f'{var}_climo_std': (['doy', 'lat', 'lon'], np.array(daily_std))},
        coords={
            'doy': np.arange(1, 366, 1),
            'lat': ds.lat.values,
            'lon': ds.lon.values,
        },
        attrs=attrs,
    )
    climo = climo.transpose('doy', 'lat', 'lon')

    # Three back-to-back copies, relabelled 1..1095, to handle the edges
    tripled = xr.concat([climo] * 3, dim='doy')
    tripled['doy'] = np.arange(1, 365 * 3 + 1, 1)

    # Centered rolling mean of `window` steps
    smoothed = tripled.rolling(doy=window, center=True, min_periods=1).mean(skipna=True)

    # Keep the middle copy and restore the 1..365 labels
    smoothed = smoothed.isel(doy=slice(365, 365 + 365))
    smoothed['doy'] = np.arange(1, 366, 1)

    return smoothed

# NOTE: the function below (standardize_anomalies) is currently not used.
def standardize_anomalies(anom, var, climo_std):
    """Divide ``anom[var]`` by the matching day-of-year standard deviation.

    Parameters
    ----------
    anom : xarray.Dataset
        Anomalies with ``time``, ``lat`` and ``lon`` dimensions.
        NOTE(review): samples are selected by position via
        ``create_doy_dummy``, so the time axis presumably has to contain
        exactly 365 samples per year -- confirm.
    var : str
        Variable to standardize.
    climo_std : xarray.Dataset
        Must contain ``f'{var}_climo_std'`` indexed by ``doy`` (1..365).

    Returns
    -------
    xarray.Dataset
        Deep copy of ``anom`` (time first) with ``var`` standardized.
    """
    # ensure time is first dim/axis
    anom = anom.transpose('time', 'lat', 'lon')
    std_anom = copy.deepcopy(anom)

    # Positional lookup table: one column of sample positions per DOY
    n_years = len(np.unique(pd.to_datetime(anom.time).year))
    doy_tmp = create_doy_dummy(n_years)

    std_field = climo_std[f'{var}_climo_std']

    # Column k of the table corresponds to DOY label k + 1
    for day, doy_indx in enumerate(doy_tmp.T, start=1):
        std_anom[var][doy_indx] = anom[var][doy_indx] / std_field.sel(doy=day)

    return std_anom

# Compute anomalies

In [3]:
path_origins = '/glade/derecho/scratch/jhayron/Data4WRsClimateChange/ProcessedDataReanalyses/'

# Analysis period; the years in the output file name are taken from these dates
period_start = '1979-01-01'
period_end = '2023-12-31'

for name_reanalysis in ['ERA5']:
    # --- Load the raw field and cut out the period and region ---
    dataset_raw = xr.open_dataset(
        f'{path_origins}Z500_{name_reanalysis}.nc'
    )
    dataset_raw = dataset_raw.sel(time=slice(period_start, period_end))
    lat0=20; lat1=80; lon0=180; lon1=330
    dataset_region = dataset_raw.where(
        (
            dataset_raw.lat>=lat0)&(
                dataset_raw.lat<=lat1)&(
                dataset_raw.lon>=lon0)&(
                dataset_raw.lon<=lon1),
        drop=True
    )
    # Divide by gravity to convert from m2/s2 to m.
    # NOTE(review): 9.82 is used here while standard gravity is 9.80665;
    # left unchanged so existing outputs stay reproducible.
    dataset_region.Z.data = dataset_region.Z.data / 9.82

    # --- Smoothed DOY climatology and anomalies ---
    clima = get_climatology_smoothed(dataset_region, 'Z')
    anoms = get_anomalies(dataset_region, 'Z', clima)

    # --- Fourier low-pass filter, result kept as a Dataset ---
    anoms_filtered = fourierfilter(anoms.Z_anom)
    anoms_filtered = anoms_filtered.to_dataset(name='Z_anom')

    # --- Regional-mean trend per day of year ---
    # rolling() does not modify its input, so no defensive copy is needed
    anoms_smooth = anoms_filtered.rolling(time=60, center=True, min_periods=1).mean(skipna=True)
    mean_series = get_weighted_area_average(anoms_smooth.Z_anom)
    full_curve = copy.deepcopy(mean_series)
    full_curve.data = np.zeros(len(full_curve))
    npoly=3

    # Rows are years, columns are days of year; entries are positions along
    # the leap-day-free time axis. Reused by both loops below.
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(mean_series.time).year)))

    for i in range(0, 365): #Iterate through every day of the year

        doy_indx = doy_tmp[:, i]
        series_doy = mean_series[doy_indx]
        year_axis = np.arange(0, series_doy.shape[0])

        # fit a polynomial for the trend of each DOY
        params_curve = polynomial.polyfit(year_axis, series_doy, npoly)
        curve = polynomial.polyval(year_axis, params_curve, tensor=True)

        # Store the fitted trend at this DOY's time steps
        full_curve.loc[{'time': series_doy.time}] = curve

    # --- Remove the trend (same value at every grid point of a time step) ---
    anoms_detrended = anoms_filtered.Z_anom - full_curve.data[:, np.newaxis, np.newaxis]
    anoms_detrended = anoms_detrended.to_dataset()

    # --- Standardize by the regional-mean DOY standard deviation ---
    clima_std = get_climatology_std_smoothed(anoms_detrended, 'Z_anom')
    clima_std_average_region_series = get_weighted_area_average(clima_std)

    # Zero-filled container with the same shape, dtype and coords
    anoms_standardized = xr.zeros_like(anoms_detrended.Z_anom)

    for i in range(0, 365): #Iterate through every day of the year

        doy_indx = doy_tmp[:, i]

        # grab doy std (climo)
        std_temp = clima_std_average_region_series.Z_anom_climo_std[i].data

        # standardize the detrended anoms by std climo
        detrended_doy = anoms_detrended.Z_anom[doy_indx]
        standardized_temp = detrended_doy.data / std_temp

        anoms_standardized.loc[{'time': detrended_doy.time}] = standardized_temp

    # --- Save ---
    anoms_standardized = anoms_standardized.to_dataset()
    anoms_standardized.to_netcdf(
        f'{path_origins}Z500Anoms_{name_reanalysis}_{period_start[:4]}{period_end[:4]}.nc'
    )

(365, 61, 151)
(365, 61, 151)


  periods = np.abs(1 / freqs)


In [4]:
path_origins = '/glade/derecho/scratch/jhayron/Data4WRsClimateChange/ProcessedDataReanalyses/'

# Analysis period; the years in the output file name are taken from these dates
period_start = '1940-01-01'
period_end = '1978-12-31'

for name_reanalysis in ['ERA5']:
    # --- Load the raw field and cut out the period and region ---
    dataset_raw = xr.open_dataset(
        f'{path_origins}Z500_{name_reanalysis}.nc'
    )
    lat0=20; lat1=80; lon0=180; lon1=330
    dataset_raw = dataset_raw.sel(time=slice(period_start, period_end))

    dataset_region = dataset_raw.where(
        (
            dataset_raw.lat>=lat0)&(
                dataset_raw.lat<=lat1)&(
                dataset_raw.lon>=lon0)&(
                dataset_raw.lon<=lon1),
        drop=True
    )

    # Divide by gravity to convert from m2/s2 to m.
    # NOTE(review): 9.82 is used here while standard gravity is 9.80665;
    # left unchanged so existing outputs stay reproducible.
    dataset_region.Z.data = dataset_region.Z.data / 9.82

    # --- Smoothed DOY climatology and anomalies ---
    clima = get_climatology_smoothed(dataset_region, 'Z')
    anoms = get_anomalies(dataset_region, 'Z', clima)

    # --- Fourier low-pass filter, result kept as a Dataset ---
    anoms_filtered = fourierfilter(anoms.Z_anom)
    anoms_filtered = anoms_filtered.to_dataset(name='Z_anom')

    # --- Regional-mean trend per day of year ---
    # rolling() does not modify its input, so no defensive copy is needed
    anoms_smooth = anoms_filtered.rolling(time=60, center=True, min_periods=1).mean(skipna=True)
    mean_series = get_weighted_area_average(anoms_smooth.Z_anom)
    full_curve = copy.deepcopy(mean_series)
    full_curve.data = np.zeros(len(full_curve))
    npoly=3

    # Rows are years, columns are days of year; entries are positions along
    # the leap-day-free time axis. Reused by both loops below.
    doy_tmp = create_doy_dummy(len(np.unique(pd.to_datetime(mean_series.time).year)))

    for i in range(0, 365): #Iterate through every day of the year

        doy_indx = doy_tmp[:, i]
        series_doy = mean_series[doy_indx]
        year_axis = np.arange(0, series_doy.shape[0])

        # fit a polynomial for the trend of each DOY
        params_curve = polynomial.polyfit(year_axis, series_doy, npoly)
        curve = polynomial.polyval(year_axis, params_curve, tensor=True)

        # Store the fitted trend at this DOY's time steps
        full_curve.loc[{'time': series_doy.time}] = curve

    # --- Remove the trend (same value at every grid point of a time step) ---
    anoms_detrended = anoms_filtered.Z_anom - full_curve.data[:, np.newaxis, np.newaxis]
    anoms_detrended = anoms_detrended.to_dataset()

    # --- Standardize by the regional-mean DOY standard deviation ---
    clima_std = get_climatology_std_smoothed(anoms_detrended, 'Z_anom')
    clima_std_average_region_series = get_weighted_area_average(clima_std)

    # Zero-filled container with the same shape, dtype and coords
    anoms_standardized = xr.zeros_like(anoms_detrended.Z_anom)

    for i in range(0, 365): #Iterate through every day of the year

        doy_indx = doy_tmp[:, i]

        # grab doy std (climo)
        std_temp = clima_std_average_region_series.Z_anom_climo_std[i].data

        # standardize the detrended anoms by std climo
        detrended_doy = anoms_detrended.Z_anom[doy_indx]
        standardized_temp = detrended_doy.data / std_temp

        anoms_standardized.loc[{'time': detrended_doy.time}] = standardized_temp

    # --- Save ---
    anoms_standardized = anoms_standardized.to_dataset()
    anoms_standardized.to_netcdf(
        f'{path_origins}Z500Anoms_{name_reanalysis}_{period_start[:4]}{period_end[:4]}.nc'
    )

(365, 61, 151)
(365, 61, 151)


  periods = np.abs(1 / freqs)


# Compute PCs

In [5]:
from sklearn.decomposition import PCA
import joblib

In [6]:
def compute_pcs(dataarray, n_components=12, random_state=0):
    """Project the ``Z_anom`` field onto its leading whitened principal components.

    Parameters
    ----------
    dataarray : xarray.Dataset
        Must expose ``Z_anom`` with ``time``, ``lat`` and ``lon`` dimensions.
        ``lat`` and ``lon`` are stacked into one feature axis, so every time
        step is one sample.
    n_components : int, default 12
        Number of principal components to keep.
    random_state : int or None, default 0
        Seed passed to ``PCA``. With the default ``svd_solver='auto'``
        scikit-learn may choose a randomized solver for large inputs; a fixed
        seed keeps the saved PCs identical between runs. Pass ``None`` for
        unseeded behaviour.

    Returns
    -------
    datatransformed : numpy.ndarray
        PC scores, one row per time step and one column per component.
    variance_explained : float
        Percentage of variance explained by the retained components together.
    pca_obj : sklearn.decomposition.PCA
        The fitted PCA object.
    """
    # (time, lat, lon) -> (time, lat*lon)
    dataflattened = dataarray.Z_anom.stack(flat=('lat','lon')).transpose('time','flat')

    # Fit and project with the same object; whiten=True is set on the PCA
    pca_obj = PCA(n_components, whiten=True, random_state=random_state)
    pca_obj = pca_obj.fit(dataflattened)
    datatransformed = pca_obj.transform(dataflattened)

    variance_explained = np.sum(pca_obj.explained_variance_ratio_) * 100
    return datatransformed, variance_explained, pca_obj

In [7]:
path_pcs = '/glade/derecho/scratch/jhayron/Data4WRsClimateChange/PCs_Z500/'
path_files = '/glade/derecho/scratch/jhayron/Data4WRsClimateChange/ProcessedDataReanalyses/'
reanalysis = 'ERA5'
for timeperiod in ['19401978','19792023']:
    # load_dataset reads the file into memory and closes it, so the anomaly
    # file is not left open and can be rewritten by re-running the
    # "Compute anomalies" cells in the same kernel.
    anoms = xr.load_dataset(f'{path_files}Z500Anoms_{reanalysis}_{timeperiod}.nc')
    pc_scores, variance_explained_temp, pca_obj = compute_pcs(anoms)

    # Persist the fitted PCA object
    filename = f'{path_pcs}PCs_{reanalysis}_{timeperiod}.pca_obj'
    joblib.dump(pca_obj, filename)

    # PC scores as a table indexed by time, written as CSV.
    # path_pcs already ends with '/', so this path contains '//'; kept as is
    # so the printed path matches earlier runs.
    pcs = pd.DataFrame(pc_scores,index = anoms.time)
    csv_path = f'{path_pcs}/PCs_{reanalysis}_{timeperiod}.csv'
    pcs.to_csv(csv_path)
    print(csv_path)

/glade/derecho/scratch/jhayron/Data4WRsClimateChange/PCs_Z500//PCs_ERA5_19401978.csv
/glade/derecho/scratch/jhayron/Data4WRsClimateChange/PCs_Z500//PCs_ERA5_19792023.csv
