In [1]:
import numpy as np
import os
import re
import pandas as pd
import geopandas as gpd
from datetime import datetime, timedelta
from tqdm import tqdm
tqdm.pandas()
import time
from hlsstack.hls_funcs.smooth import despike_ts, double_savgol
import gc
from glob import glob
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns

ERROR 1: PROJ: proj_create_from_database: Open of /project/cper_neon_aop/conda_envs/hls_nrt_env/share/proj failed


In [12]:
import sys
sys.path.insert(1, '/project/cper_neon_aop/hls_nrt/params')
from ltar_reno_params_smooth import *

In [11]:
#from importlib import reload
#import sys
#reload(sys.modules["ltar_reno_params_smooth"])

<module 'ltar_reno_params_smooth' from '/project/cper_neon_aop/hls_nrt/params/ltar_reno_params_smooth.py'>

In [13]:
def adapt_smooth(ts, dates, despike=True, dat_thresh=None):
    ct_valid = sum(~np.isnan(ts))
    ct_total = len(ts)
    avg_gap = ct_total/ct_valid
    if ct_valid > 0:
        if avg_gap > 15:
            despike = False
        if despike:
            if dat_thresh is None:
                _dat_thresh = np.ptp(ts.values) * 0.10
            else:
                _dat_thresh = dat_thresh
            ts_ds = despike_ts(ts.values, dat_thresh=_dat_thresh, days_thresh=45)
        else:
            ts_ds = ts.values
        if avg_gap > 10:
            ts_smooth = double_savgol(ts_ds, double=True, window1_max=7, window2=31, limit=91)
        elif avg_gap > 7:
             ts_smooth = double_savgol(ts_ds, double=True, window1_max=5, window2=41, limit=91)
        elif avg_gap > 5:
             ts_smooth = double_savgol(ts_ds, double=True, window1_max=5, window2=51, limit=91)
        else:
            ts_smooth = double_savgol(ts_ds, double=False, window2=51, limit=91)
    else:
        ts_smooth = ts
    return pd.Series(ts_smooth, ts.index.get_level_values(-1))

In [14]:
fLIST = glob(inPATH_wc)
df = pd.read_csv(inPATH_dat, parse_dates=[date_col])

In [15]:
if preprocess is not None:
    df = preprocess(df)

In [16]:
df.head(5)

Unnamed: 0,Pasture,Date,Sample#,DW kg/ha,%moist
0,P11,2020-05-11,3.0,703.2,77.2
1,P11,2020-06-18,3.0,2154.4,51.0
2,P11,2020-08-12,3.0,4060.0,67.8
3,P11,2020-10-15,3.0,2824.0,47.6
4,P11,2021-05-26,3.0,652.8,77.2


In [36]:
df_out = None

In [37]:
for yr in df[date_col].dt.year.unique():
    print(yr)

    # get list of files that match year based on path
    fLIST_yr = [x for x in fLIST if str(yr) in os.path.basename(x).split('_')]

    # combine all individual tile files into one dataframe
    df_yr = pd.concat([pd.read_csv(x, parse_dates=[1]) for x in fLIST_yr])
    
    # rename columns to match ground data
    df_yr = df_yr.rename(columns={'time': date_col,
                                  'Plot': id_col})
    
    # convert date to datetime
    df_yr[date_col] = pd.to_datetime(df_yr[date_col])
    
    # remove all non-plot data
    #df_yr = df_yr[df_yr[id_col] != 'UNK']
    
    # remove any dates where bands are negative
    df_yr = df_yr[~((df_yr[band_list] < 0).any(axis=1))]

    # drop any duplicates by taking mean for each plot and date (might exist e.g., from plot on edge of tile during extraction)
    df_yr = df_yr.groupby([id_col, date_col]).mean().reset_index()
    
    # get missing dates for gap-filling
    dates_missing = [x for x in pd.date_range(df_yr[date_col].min(), df_yr[date_col].max()).date if x not in df_yr[date_col].dt.date.unique()]
    
    # convert missing dates to a dataframe for combining with veg index data
    df_missing = pd.DataFrame({
        id_col: list(chain.from_iterable([list(np.repeat(x, len(dates_missing))) for x in df_yr[id_col].unique()])),
        date_col: list(chain.from_iterable(dates_missing for x in df_yr[id_col].unique()))})
    
    df_missing[date_col] = pd.to_datetime(df_missing[date_col])
    
    # combine into one dataframe for gapfilling
    df_yr_ts = pd.concat([df_yr, df_missing]).sort_values([id_col, date_col])
    
    # apply Bolton mask to extracted values
    if mask_bolton_by_id:
        ps_bolton = df_yr_ts.groupby(id_col).progress_apply(lambda x: bolton_mask_np(x['BLUE'], x['SWIR2']))
        df_yr_ts.loc[df_yr_ts[ps_bolton.droplevel(0) == 1.0].index, list(veg_dict.keys()) + band_list] = np.nan
    
    # smooth all vegetation indices to gapfill
    print('Smoothing vegetation indices')
    for vegidx in tqdm(veg_list):
        #df_yr_ts[vegidx + '_smooth'] = df_yr_ts.groupby(id_col)[vegidx].transform(lambda x: double_savgol(x.values))
        vals_smooth_yr = df_yr_ts.groupby(id_col).apply(lambda x: adapt_smooth(x[vegidx], pd.to_datetime(x[date_col])))
        if vals_smooth_yr.index.nlevels > 1:
            df_yr_ts[vegidx + '_smooth'] = vals_smooth_yr.droplevel(list(np.arange(vals_smooth_yr.index.nlevels-1)))
        else:
            df_yr_ts[vegidx + '_smooth'] = vals_smooth_yr.values.squeeze()
    print('Smoothing individual bands')
    for band in tqdm(band_list):
        #df_yr_ts[band + '_smooth'] = df_yr_ts.groupby(id_col)[band].transform(lambda x: double_savgol(x.values))
        vals_smooth_yr = df_yr_ts.groupby(id_col).apply(lambda x: adapt_smooth(x[band], pd.to_datetime(x[date_col])))
        if vals_smooth_yr.index.nlevels > 1:
            df_yr_ts[band + '_smooth'] = vals_smooth_yr.droplevel(list(np.arange(vals_smooth_yr.index.nlevels-1)))
        else:
            df_yr_ts[band + '_smooth'] = vals_smooth_yr.values.squeeze()
    
    # rename smoothed columns and drop originals
    df_yr_ts = df_yr_ts.drop(columns=veg_list + band_list)
    col_rename_dict = {c: re.sub('_smooth', '', c) for c in df_yr_ts.columns if '_smooth' in c}
    df_yr_ts = df_yr_ts.rename(columns=col_rename_dict)

    # merge with the ground data
    df_out_yr = pd.merge(df[df[date_col].dt.year == yr], 
                         df_yr_ts[[id_col, date_col] + veg_list + band_list], 
                         on=[id_col, date_col],
                         how='left')

    # create or append to final output
    if df_out is not None:
        # merge with existing ouptput dataset
        df_out = pd.concat([df_out, df_out_yr])
    else:
        # create output dataset
        df_out = df_out_yr.copy()

2020
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 26.61it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 25.21it/s]


2021
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 39.61it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 39.52it/s]


2022
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 55.03it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 55.28it/s]


2023
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 37.50it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 37.17it/s]


2019
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 35.12it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 34.88it/s]


2016
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 374.86it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 353.41it/s]


2017
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 317.89it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 296.05it/s]


2018
Smoothing vegetation indices


100%|██████████| 22/22 [00:00<00:00, 176.14it/s]


Smoothing individual bands


100%|██████████| 6/6 [00:00<00:00, 168.41it/s]


In [38]:
df_out.head(5)

Unnamed: 0,Pasture,Date,Sample#,DW kg/ha,%moist,NDVI,DFI,NDTI,SATVI,NDII7,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2
0,P11,2020-05-11,3.0,703.2,77.2,0.761241,6.460799,0.313401,0.296264,0.500965,...,88.803598,221.816604,86.423693,78.801794,404.518993,783.583046,519.463895,3816.44824,2431.796455,1275.180726
1,P11,2020-06-18,3.0,2154.4,51.0,0.684892,9.194049,0.329653,0.263256,0.490361,...,91.444944,191.911483,89.322682,82.438767,302.416055,709.105194,661.663863,3602.577144,2403.178259,1214.134063
2,P11,2020-08-12,3.0,4060.0,67.8,0.758583,7.466907,0.373623,0.273021,0.603621,...,80.540618,218.592972,78.894778,72.047286,312.472266,845.438721,585.274542,4260.327724,2294.229036,1046.967021
3,P11,2020-10-15,3.0,2824.0,47.6,0.541829,13.257943,0.28764,0.20345,0.413456,...,92.287918,160.471711,90.496135,84.622399,542.341911,1006.067367,1197.240371,3854.55003,2920.11996,1627.810749
4,P13,2020-05-11,3.0,1190.4,59.6,0.55176,12.512005,0.278206,0.292958,0.255567,...,107.912871,184.866437,105.908365,98.225415,499.876336,861.009288,886.917173,3078.469706,3219.755159,1826.204741


In [39]:
# look for any missing data
df_out[df_out['NDVI'].isnull()].sort_values(id_col)

Unnamed: 0,Pasture,Date,Sample#,DW kg/ha,%moist,NDVI,DFI,NDTI,SATVI,NDII7,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2
6,P18,2018-07-23,2.0,2196.0,48.666667,,,,,,...,,,,,,,,,,
7,P18,2018-08-07,3.0,1942.4,36.6,,,,,,...,,,,,,,,,,
8,P18,2018-09-18,3.0,4050.4,58.8,,,,,,...,,,,,,,,,,


In [40]:
# look for any duplicated ID's
df_out[df_out.duplicated(subset=[id_col, date_col], keep=False)].sort_values(id_col)

Unnamed: 0,Pasture,Date,Sample#,DW kg/ha,%moist,NDVI,DFI,NDTI,SATVI,NDII7,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2


In [41]:
# look for any duplicated ID's
df_out[df_out.duplicated(subset=[id_col, date_col, 'NDVI'], keep=False)].sort_values(id_col)

Unnamed: 0,Pasture,Date,Sample#,DW kg/ha,%moist,NDVI,DFI,NDTI,SATVI,NDII7,...,BAI_146,BAI_236,BAI_246,BAI_346,BLUE,GREEN,RED,NIR1,SWIR1,SWIR2


In [42]:
# get mean or drop duplicated ID's
#df_out = df_out.drop_duplicates(subset=[id_col, date_col], keep='first')

In [51]:
df_out.to_csv(outPATH, index=False)