In [1]:
import numpy as np
import os
import re
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from tqdm import tqdm
tqdm.pandas()
import time
from hlsstack.hls_funcs.smooth import despike_ts, double_savgol
import gc
from glob import glob
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns

ERROR 1: PROJ: proj_create_from_database: Open of /project/cper_neon_aop/conda_envs/hls_nrt_env/share/proj failed


In [2]:
import sys
sys.path.insert(1, '/project/cper_neon_aop/hls_nrt/params/co_wss_params')
from co_wss_params_smooth import *

In [3]:
#from importlib import reload
#import sys
#reload(sys.modules["ltar_reno_params_smooth"])

In [4]:
def adapt_smooth(ts, dates, despike=True, dat_thresh=None):
    ct_valid = sum(~np.isnan(ts))
    ct_total = len(ts)
    avg_gap = ct_total/ct_valid
    if ct_valid > 0:
        if avg_gap > 15:
            despike = False
        if despike:
            if dat_thresh is None:
                _dat_thresh = np.ptp(ts.values) * 0.10
            else:
                _dat_thresh = dat_thresh
            ts_ds = despike_ts(ts.values, dat_thresh=_dat_thresh, days_thresh=45)
        else:
            ts_ds = ts.values
        if avg_gap > 10:
            ts_smooth = double_savgol(ts_ds, double=True, window1_max=7, window2=31, limit=91)
        elif avg_gap > 7:
             ts_smooth = double_savgol(ts_ds, double=True, window1_max=5, window2=41, limit=91)
        elif avg_gap > 5:
             ts_smooth = double_savgol(ts_ds, double=True, window1_max=5, window2=51, limit=91)
        else:
            ts_smooth = double_savgol(ts_ds, double=False, window2=51, limit=91)
    else:
        ts_smooth = ts
    return pd.Series(ts_smooth, ts.index.get_level_values(-1))

In [5]:
fLIST = glob(inPATH_wc)
df = pd.read_csv(inPATH_dat, parse_dates=[date_col])

In [6]:
if preprocess is not None:
    df = preprocess(df)

In [7]:
df.head(5)

Unnamed: 0,plot_id,year,tmax_Fall,tmax_Spring,tmax_Summer,tmax_Winter,tmin_Fall,tmin_Spring,tmin_Summer,tmin_Winter,...,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,Longitude,Latitude,date
0,90,2013,18.093,15.469333,31.385667,6.890667,2.944667,-0.626333,13.051,-10.074666,...,198122,0500000US08013,8013,Boulder,6,1881290197,36512644,-105.08369,40.2155,2013-08-01
1,19,2014,19.764,17.866667,30.010334,6.795667,2.328,1.292333,12.485333,-8.753333,...,198122,0500000US08013,8013,Boulder,6,1881290197,36512644,-105.10656,40.218361,2014-08-01
2,19,2015,20.734333,17.272666,30.867,8.573,4.229667,1.912333,13.297667,-6.749,...,198122,0500000US08013,8013,Boulder,6,1881290197,36512644,-105.10467,40.2178,2015-08-01
3,19,2016,22.2275,16.65,31.645334,7.93,3.820667,1.422333,13.247667,-7.922,...,198122,0500000US08013,8013,Boulder,6,1881290197,36512644,-105.106776,40.218197,2016-08-01
4,19,2018,18.544333,18.427667,30.664,7.45,2.399667,1.957667,13.454,-8.104,...,198122,0500000US08013,8013,Boulder,6,1881290197,36512644,-105.1036,40.2177,2018-08-01


In [8]:
df_out = None

In [9]:
for yr in df[date_col].dt.year.unique():
    print(yr)

    # get list of files that match year based on path
    fLIST_yr = [x for x in fLIST if str(yr) in os.path.basename(x).split('_')]

    # combine all individual tile files into one dataframe
    df_yr = pd.concat([pd.read_csv(x, parse_dates=[1]) for x in fLIST_yr])
    
    # rename columns to match ground data
    df_yr = df_yr.rename(columns={'time': date_col,
                                  'Plot': id_col})
    
    # convert date to datetime
    df_yr[date_col] = pd.to_datetime(df_yr[date_col])
    
    # remove all non-plot data
    #df_yr = df_yr[df_yr[id_col] != 'UNK']
    
    # remove any dates where bands are negative
    df_yr = df_yr[~((df_yr[band_list] < 0).any(axis=1))]

    # drop any duplicates by taking mean for each plot and date (might exist e.g., from plot on edge of tile during extraction)
    df_yr = df_yr.groupby([id_col, date_col]).mean().reset_index()
    
    # get missing dates for gap-filling
    dates_missing = [x for x in pd.date_range(df_yr[date_col].min(), df_yr[date_col].max()).date if x not in df_yr[date_col].dt.date.unique()]
    
    # convert missing dates to a dataframe for combining with veg index data
    df_missing = pd.DataFrame({
        id_col: list(chain.from_iterable([list(np.repeat(x, len(dates_missing))) for x in df_yr[id_col].unique()])),
        date_col: list(chain.from_iterable(dates_missing for x in df_yr[id_col].unique()))})
    
    df_missing[date_col] = pd.to_datetime(df_missing[date_col])
    
    # combine into one dataframe for gapfilling
    df_yr_ts = pd.concat([df_yr, df_missing]).sort_values([id_col, date_col])
    
    # apply Bolton mask to extracted values
    if mask_bolton_by_id:
        ps_bolton = df_yr_ts.groupby(id_col).progress_apply(lambda x: bolton_mask_np(x['BLUE'], x['SWIR2']))
        df_yr_ts.loc[df_yr_ts[ps_bolton.droplevel(0) == 1.0].index, list(veg_dict.keys()) + band_list] = np.nan
    
    # smooth all vegetation indices to gapfill
    print('Smoothing vegetation indices')
    for vegidx in tqdm(veg_list):
        #df_yr_ts[vegidx + '_smooth'] = df_yr_ts.groupby(id_col)[vegidx].transform(lambda x: double_savgol(x.values))
        vals_smooth_yr = df_yr_ts.groupby(id_col).apply(lambda x: adapt_smooth(x[vegidx], pd.to_datetime(x[date_col])))
        if vals_smooth_yr.index.nlevels > 1:
            df_yr_ts[vegidx + '_smooth'] = vals_smooth_yr.droplevel(list(np.arange(vals_smooth_yr.index.nlevels-1)))
        else:
            df_yr_ts[vegidx + '_smooth'] = vals_smooth_yr.values.squeeze()
    print('Smoothing individual bands')
    for band in tqdm(band_list):
        #df_yr_ts[band + '_smooth'] = df_yr_ts.groupby(id_col)[band].transform(lambda x: double_savgol(x.values))
        vals_smooth_yr = df_yr_ts.groupby(id_col).apply(lambda x: adapt_smooth(x[band], pd.to_datetime(x[date_col])))
        if vals_smooth_yr.index.nlevels > 1:
            df_yr_ts[band + '_smooth'] = vals_smooth_yr.droplevel(list(np.arange(vals_smooth_yr.index.nlevels-1)))
        else:
            df_yr_ts[band + '_smooth'] = vals_smooth_yr.values.squeeze()
    
    # rename smoothed columns and drop originals
    df_yr_ts = df_yr_ts.drop(columns=veg_list + band_list)
    col_rename_dict = {c: re.sub('_smooth', '', c) for c in df_yr_ts.columns if '_smooth' in c}
    df_yr_ts = df_yr_ts.rename(columns=col_rename_dict)

    # merge with the ground data
    df_out_yr = pd.merge(df[df[date_col].dt.year == yr], 
                         df_yr_ts[[id_col, date_col] + veg_list + band_list], 
                         on=[id_col, date_col],
                         how='left')

    # create or append to final output
    if df_out is not None:
        # merge with existing ouptput dataset
        df_out = pd.concat([df_out, df_out_yr])
    else:
        # create output dataset
        df_out = df_out_yr.copy()

2013
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 135.85it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 289.71it/s]


2014
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 410.12it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 406.50it/s]


2015
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 398.04it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 370.81it/s]


2016
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 163.32it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 165.91it/s]


2018
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 141.22it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 145.16it/s]


2019
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 188.78it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 225.19it/s]


2020
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 247.89it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 236.54it/s]


2021
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 143.87it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 144.49it/s]


2022
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 183.42it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 204.54it/s]


2017


ValueError: No objects to concatenate

In [10]:
df_out.head(5)

Unnamed: 0,plot_id,year,tmax_Fall,tmax_Spring,tmax_Summer,tmax_Winter,tmin_Fall,tmin_Spring,tmin_Summer,tmin_Winter,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2
0,90,2013,18.093,15.469333,31.385667,6.890667,2.944667,-0.626333,13.051,-10.074666,...,86.980117,198.244553,84.433554,77.98036,398.650372,705.335429,604.349792,3937.752002,1961.913096,1074.808703
1,77,2013,17.437667,14.070667,29.778334,5.885667,3.721,-0.048,14.073667,-8.409,...,,,,,,,,,,
2,78,2013,17.525667,14.217333,30.109333,5.803667,3.550667,-0.092,14.091667,-8.854667,...,,,,,,,,,,
3,79,2013,17.941,14.422334,30.668666,5.902,2.449333,-0.891333,13.485333,-10.618666,...,,,,,,,,,,
4,106,2013,17.532,14.942666,30.469334,6.007333,2.127333,-1.608333,12.058667,-11.162,...,,,,,,,,,,


In [None]:
# look for any missing data
df_out[df_out['NDVI'].isnull()].sort_values(id_col)

In [55]:
# look for any duplicated ID's
df_out[df_out.duplicated(subset=[id_col, date_col], keep=False)].sort_values(id_col)

Unnamed: 0.1,Unnamed: 0,Block_name,Treatment,Date,Site,Ranch,Year,Low.reading_mean,Low.reading_sd,High.reading_mean,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2


In [12]:
# look for any duplicated ID's
df_out[df_out.duplicated(subset=[id_col, date_col, 'NDVI'], keep=False)].sort_values(id_col)

Unnamed: 0.1,Unnamed: 0,Block_name,Treatment,Date,Site,Ranch,Year,Low.reading_mean,Low.reading_sd,High.reading_mean,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2


In [13]:
# get mean or drop duplicated ID's
df_out = df_out.drop_duplicates(subset=[id_col, date_col], keep='first')

In [14]:
df_out.to_csv(outPATH, index=False)