In [1]:
import rioxarray as riox
import rasterio as rio
import xarray as xr
import os
import re
import numpy as np
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from src.hls_funcs.masks import shp2mask
from tqdm import tqdm

In [2]:
prefix = 'cper'
yr = 2023
keep_recent_days = False

cluster_loc = 'local'

In [3]:
if cluster_loc == 'local':
    #os.chdir(wkDIR)
    print('   setting up Local cluster...')
    from dask.distributed import LocalCluster, Client
    import dask
    cluster = LocalCluster(n_workers=8, threads_per_worker=2)
    client = Client(cluster)
    display(client)
    inDIR = 'data/'
    hlsDIR = 'data/hls_nrt/'
elif cluster_loc == 'hpc':
    from dask.distributed import LocalCluster, Client
    import dask_jobqueue as jq
    import dask
    from jupyter_server import serverapp
    wkDIR = '/project/cper_neon_aop/hls_nrt/'
    inDIR = '/90daydata/cper_neon_aop/hls_nrt/'
    hlsDIR = inDIR
    os.chdir(wkDIR)
    # get the server address for porting
    try:
        jupServer = [x for x in serverapp.list_running_servers()][0]
    except IndexError:
        # manually copy/paste the server address
        jupServer = {'base_url': '/node/ceres19-compute-98-eth.scinet.local/17710/'}
    print('   setting up cluster on HPC...')
    dask.config.set({'distributed.dashboard.link': jupServer['base_url'] + 'proxy/{port}/status'})
    partition='short',#'short','debug', 'mem', 'mem-low',
    num_processes = 4
    num_threads_per_processes = 2
    mem = 2.5*num_processes*num_threads_per_processes
    n_cores_per_job = num_processes*num_threads_per_processes
    clust = jq.SLURMCluster(queue=partition,
                            processes=num_processes,
                            cores=n_cores_per_job,
                            memory=str(mem)+'GB',
                            interface='ib0',
                            #interface='enp24s0f0',
                            local_directory='$TMPDIR',
                            death_timeout=30,
                            walltime='02:00:00',
                            job_extra=["--output=/dev/null","--error=/dev/null"])
    client=Client(clust)
    #Scale Cluster 
    num_jobs=16
    clust.scale(jobs=num_jobs)
    try:
        client.wait_for_workers(n_workers=num_jobs*num_processes, timeout=60)
    except dask.distributed.TimeoutError as e:
        print(str(num_jobs*num_processes) + ' workers not available. Continuing with available workers.')
        #print(e)
        pass
    display(client)

   setting up Local cluster...


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 16,Total memory: 11.85 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:32977,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 11.85 GiB

0,1
Comm: tcp://127.0.0.1:40381,Total threads: 2
Dashboard: http://127.0.0.1:44729/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:35147,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-4aj_zahb,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-4aj_zahb

0,1
Comm: tcp://127.0.0.1:43713,Total threads: 2
Dashboard: http://127.0.0.1:40097/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:37483,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-f7y7nwj_,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-f7y7nwj_

0,1
Comm: tcp://127.0.0.1:40575,Total threads: 2
Dashboard: http://127.0.0.1:39297/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:38629,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-equz9q0r,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-equz9q0r

0,1
Comm: tcp://127.0.0.1:41033,Total threads: 2
Dashboard: http://127.0.0.1:34737/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:43403,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-z01kyci1,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-z01kyci1

0,1
Comm: tcp://127.0.0.1:44347,Total threads: 2
Dashboard: http://127.0.0.1:40695/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:43865,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-7sp0pst2,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-7sp0pst2

0,1
Comm: tcp://127.0.0.1:34965,Total threads: 2
Dashboard: http://127.0.0.1:45447/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:41521,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-7uqcixa9,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-7uqcixa9

0,1
Comm: tcp://127.0.0.1:38357,Total threads: 2
Dashboard: http://127.0.0.1:33255/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:41271,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-965jucwc,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-965jucwc

0,1
Comm: tcp://127.0.0.1:45699,Total threads: 2
Dashboard: http://127.0.0.1:41025/status,Memory: 1.48 GiB
Nanny: tcp://127.0.0.1:37817,
Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-mgqkbawt,Local directory: /mnt/c/Users/Sean.Kearney/git_repos/hls_nrt/dask-worker-space/worker-mgqkbawt


In [4]:
ds = riox.open_rasterio(os.path.join(inDIR, 'gcloud', 'hls_' + prefix + '_' + str(yr) + '_gcloud.nc'), masked=True)
ds_ndvi_lta = riox.open_rasterio(os.path.join(inDIR, 'ee_lta', prefix + '_ee_ndvi_landsat_wkly_lta.nc'), masked=True)
ds_ndvi_lta['date'] = [datetime.strptime(re.sub('2020', '2099', str(x)),'%Y-%m-%d %H:%M:%S') for x in ds_ndvi_lta['date'].values]
#ds_ndvi_lta['date'] = ds_ndvi_lta['date'].dt.date
ds_ndvi_lta = ds_ndvi_lta.reindex({'y': ds.y, 'x': ds.x}, method='nearest', tolerance=30)#.isnull().all()

In [5]:
if prefix == 'cper':
    cper_f = 'data/ground/cper_pastures_2017_dissolved.shp'
    cper = gpd.read_file(cper_f).to_crs(ds.rio.crs.to_epsg())
    cper_info = cper[['Pasture', 'geometry']].reset_index(drop=True).reset_index().rename(columns={'index': 'id'})
    past_dict = {row.id+1: row.Pasture for _, row in cper_info.iterrows()}
    past_dict[0] = 'UNK'
    cper_mask_shp = [(row.geometry, row.id+1) for _, row in cper_info.iterrows()]
    cper_mask = shp2mask(shp=cper_mask_shp, 
                         transform=ds.rio.transform(), 
                         outshape=ds['NDVI'].shape[1:], 
                         xr_object=ds['NDVI'])
    past_mask = np.array([past_dict[i] for i in cper_mask.values.flatten()]).reshape(cper_mask.shape)

In [6]:
ds = ds.assign(Pasture=(['y', 'x'], past_mask)).chunk({'y': 50, 'x': 50})
ds = ds.set_coords('Pasture')

In [7]:
#ds_ndvi_lta['date'] = pd.to_datetime(ds_ndvi_lta['date']) + timedelta(days=3)

In [8]:
df_ndvi_lta = ds_ndvi_lta.groupby(ds['Pasture']).mean(dim='stacked_y_x').to_dataframe().reset_index().drop(columns='spatial_ref')
df_ndvi_lta['Year'] = '30-yr avg.'

In [10]:
yr_list = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
for idx, yr_i in enumerate(tqdm(yr_list)):
    mon_day = (8 - datetime(2020, 1, 1).weekday()) % 7
    yr_dates_tmp = [datetime(yr_i, 1, mon_day) + timedelta(weeks=w) for w in range(52)]
    if yr_i == yr:
        ds_i = ds
    else:
        ds_i = riox.open_rasterio(os.path.join(inDIR, 'gcloud', 'hls_' + prefix + '_' + str(yr_i) + '_gcloud.nc'), masked=True)
    ds_i['date'] = [datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S') for x in ds_i['date'].values]
    
    # update the date range to match the dataset
    yr_dates_tmp = [x for x in yr_dates_tmp if (x <= pd.to_datetime(ds_i['NDVI'].date.max().values) + timedelta(days=1)) and 
                    (x >= pd.to_datetime(ds_i['NDVI'].date.min().values) - timedelta(days=1))]
    
    if (yr_i != yr_list[-1]) or not keep_recent_days:
        ds_ndvi_yr_wkly = ds_i['NDVI'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        ds_bm_yr_wkly = ds_i['Biomass'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        ds_bare_yr_wkly = ds_i['BARE'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        ds_sd_yr_wkly = ds_i['SD'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        ds_green_yr_wkly = ds_i['GREEN'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        ds_litt_yr_wkly = ds_i['LITT'].sel(date=yr_dates_tmp, method='nearest', tolerance=timedelta(days=1), drop=True)
        df_yr_wkly = xr.merge([ds_ndvi_yr_wkly,
                               ds_bm_yr_wkly,
                               ds_bare_yr_wkly,
                               ds_sd_yr_wkly,
                               ds_green_yr_wkly,
                               ds_litt_yr_wkly]).groupby(
            ds['Pasture']).mean(
            dim='stacked_y_x').to_dataframe().reset_index().drop(
            columns='spatial_ref')
    else:
        df_yr_wkly = xr.merge([ds_i['NDVI'],
                           ds_i['Biomass'],
                           ds_i['BARE'],
                           ds_i['SD'],
                           ds_i['GREEN'],
                           ds_i['LITT']]).groupby(
        ds['Pasture']).mean(
        dim='stacked_y_x').to_dataframe().reset_index().drop(
        columns='spatial_ref')
    df_yr_wkly['Year'] = str(yr_i)
    
    if idx == 0:
        df_out = df_yr_wkly.copy()
    else:
        df_out = pd.concat([df_out, df_yr_wkly])

100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:42<00:00,  3.88s/it]


In [11]:
df_out = pd.concat([df_ndvi_lta, df_out])

In [12]:
df_out_aoi = df_out.groupby('date').mean().reset_index()

In [13]:
df_out_aoi['Pasture'] = prefix
df_out_aoi['Year'] = df_out_aoi['date'].dt.isocalendar().year.transform(lambda x: '30-yr avg.' if x == 2099 else str(x))
df_out = pd.concat([df_out, df_out_aoi])

In [14]:
df_out = df_out.reset_index(drop=True)

In [15]:
df_out['cov_sum'] = df_out[['BARE', 'SD', 'GREEN', 'LITT']].sum(axis=1)

In [16]:
# make sure pasture-scale means of fractional cover sum to 1
for c in tqdm(['BARE', 'SD', 'GREEN', 'LITT']):
    df_out[c] = df_out.groupby(['date', 'Year', 'Pasture']).apply(lambda x: x[c]/x['cov_sum']).reset_index(level=[0, 1, 2])[0]
#df_out = df_out.drop(columns=['cov_sum'])

100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:42<00:00, 10.64s/it]


In [17]:
df_out = df_out.drop(columns=['cov_sum'])

In [18]:
df_out[['NDVI', 
        'Biomass',
        'BARE',
        'SD', 
        'GREEN',
        'LITT']] = df_out.transform({'NDVI': lambda x: np.round(x, 3),
                  'Biomass': lambda x: np.round(x, 0),
                  'BARE': lambda x: np.round(x * 100, 1),
                  'SD': lambda x: np.round(x * 100, 1),
                  'GREEN': lambda x: np.round(x * 100, 1),
                  'LITT': lambda x: np.round(x * 100, 1)})

In [19]:
df_out.to_csv(os.path.join(inDIR, 'gcloud', 'hls_' + prefix + '_means.csv'), index=False)