# Combine WRF and Landsat predictors into site and fuel specific dataframes
Last updated: Kevin Varga, 11/21/2024

**Inputs:**
* National Fuel Moisture Database LFM observations
* WRF predictor variable netcdf files
* Landsat NIRv predictor variable netcdf file

**Outputs:**
* Fuel-type-specific dataframes (saved as one CSV per fuel type) with all predictor variables aligned to every LFM observation time and location

In [1]:
import numpy as np
import pandas as pd
import xarray as xr

In [2]:
# All inputs and outputs live under one shared data directory
data_root = '/home/sbarc/students/varga/nasa/ch1/data/'
# Predictor variables calculated on a daily basis
predictor_path = data_root + 'predictors/daily/'
# LFM observations
obs_path = data_root + 'lfm_obs/'
# Destination for the fuel specific dataframes
output_path = data_root + 'site_predictors/'

In [3]:
# Load the LFM observations, drop the listed columns, and parse the sampling dates
dropped_columns = ['slope', 'aspect', 'elevation', 'gacc', 'category']
obs_df = pd.read_csv(obs_path + 'lfm_crop.csv').drop(columns=dropped_columns)
obs_df['date'] = pd.to_datetime(obs_df['date'])

In [7]:
# Names of the predictor variables; each one is used to build a '<name>_daily.nc' file name
predictors = [
    'cwd90sum', 'daylength', 'nirv', 'precip30sum', 'precip90sum',
    'rad150mean', 'rh150mean', 'somo7mean', 'temp90mean',
]

# Keep only fuel types with at least 500 observations (our random forest model threshold)
fuel_counts = obs_df['fuel'].value_counts()
fuels = fuel_counts.index[fuel_counts >= 500].tolist()

In [13]:
%%time
# Loop through each fuel type
for fuel_type in fuels:
    # Subset fuel type; copy so the predictor columns added below never touch obs_df
    fuel_obs_df = obs_df.loc[obs_df['fuel'] == fuel_type].copy()
    fuel_obs_df = fuel_obs_df.reset_index(drop=True)
    # Identify individual LFM sampling sites for each fuel, in order of first appearance
    sites = fuel_obs_df['site'].drop_duplicates()
    # Positional row numbers of every observation belonging to each site.
    # Predictor values are written back through these positions, so they stay
    # aligned with their observations even when a site's rows are not contiguous.
    site_labels = fuel_obs_df['site'].to_numpy()
    site_rows = {site_name: np.flatnonzero(site_labels == site_name) for site_name in sites}
    # Row numbers in the same site-by-site order in which predictor values are collected
    row_order = np.concatenate(list(site_rows.values()))
    fuel_obs_df = fuel_obs_df.set_index(['site', 'date'])

    # Loop through each predictor variable
    for predictor_name in predictors:
        # One array of predictor values per site, collected in site_rows order
        site_values = []

        # Open predictor data array covering entire domain; the context manager
        # closes the netCDF file once every site has been sampled
        with xr.open_dataarray(predictor_path + predictor_name + '_daily.nc') as pred_da:
            # Loop through each LFM sampling site
            for row_positions in site_rows.values():
                # Subset site
                site_fuel_obs_df = fuel_obs_df.iloc[row_positions]
                # Extract predictor variable data at site location and LFM sampling times.
                # The location is read from the site's first row with .iloc[0]: the subset is
                # indexed by (site, date), so an integer key must be explicitly positional.
                site_pred_da = pred_da.interp(
                    time=site_fuel_obs_df.index.get_level_values('date'),
                    latitude=site_fuel_obs_df['latitude'].iloc[0],
                    longitude=site_fuel_obs_df['longitude'].iloc[0],
                )

                if predictor_name == 'nirv':
                    # NIRv not saved on daily temporal resolution, only saved at SMS resolution
                    # Interpolate nan values up to a limit of 40 days for NIRv due to sporadic missing values
                    site_pred_da = site_pred_da.interpolate_na(dim='time', method='quadratic',
                                                               max_gap=pd.Timedelta(40, 'd'), fill_value='extrapolate')

                # Convert to numpy while the file is still open
                site_values.append(site_pred_da.to_numpy())

        # Concatenate all sampling sites once, then scatter the values back to their original rows
        all_pred_values = np.concatenate(site_values)
        aligned_pred_values = np.empty_like(all_pred_values)
        aligned_pred_values[row_order] = all_pred_values

        # Save all predictor values in the fuel specific observation dataframe
        fuel_obs_df[predictor_name] = aligned_pred_values

    # Save each fuel specific dataframe with all predictor variables at every sampling site
    fuel_obs_df.to_csv(output_path + fuel_type + '.csv', index_label=['site','date'])

CPU times: user 11.5 s, sys: 59 s, total: 1min 10s
Wall time: 1min 10s


### Additional code for testing NIRv interpolation

In [4]:
# Load the NIRv predictor data array used by the interpolation test cells
nirv_file = predictor_path + 'nirv_daily.nc'
pred_da = xr.open_dataarray(nirv_file)

In [7]:
# Keep only the chamise observations and renumber the rows from zero
is_chamise = obs_df['fuel'] == 'chamise'
fuel_obs_df = obs_df.loc[is_chamise].reset_index(drop=True)
# Unique chamise LFM sampling sites, in order of first appearance
sites = fuel_obs_df['site'].drop_duplicates()
# Index the observations by (site, date)
fuel_obs_df = fuel_obs_df.set_index(['site', 'date'])

In [8]:
# Create dataframe to store stats: nan count, min and max per site,
# before ('pre...') and after ('post...') the NIRv gap-filling interpolation
qstats = pd.DataFrame(index = sites, columns = ['prenan','premin','premax','postnan','postmin','postmax'])

for site_name in sites:
    # Subset site
    site_fuel_obs_df = fuel_obs_df.loc[site_name, :]
    # Extract predictor variable data at site location and LFM sampling times.
    # The location is read from the site's first row with .iloc[0]: the subset is
    # indexed by date, so an integer key must be explicitly positional.
    site_pred_da = pred_da.interp(time = site_fuel_obs_df.index,
                                  latitude = site_fuel_obs_df['latitude'].iloc[0],
                                  longitude = site_fuel_obs_df['longitude'].iloc[0])
    # Record how many nan values there were before data filling interpolation, as well as the min/max values
    qstats.loc[site_name, 'prenan'] = (len(site_pred_da) - site_pred_da.count().to_numpy())
    qstats.loc[site_name, 'premin'] = np.round(site_pred_da.min().values, 2)
    qstats.loc[site_name, 'premax'] = np.round(site_pred_da.max().values, 2)

    # Interpolate nan values up to a limit of 40 days
    site_pred_da = site_pred_da.interpolate_na(dim='time', method='quadratic',
                                               max_gap=pd.Timedelta(40, 'd'), fill_value='extrapolate')
    # Record how many nan values there were after interpolation, as well as the min/max values
    qstats.loc[site_name, 'postnan'] = (len(site_pred_da) - site_pred_da.count().to_numpy())
    qstats.loc[site_name, 'postmin'] = np.round(site_pred_da.min().values, 2)
    qstats.loc[site_name, 'postmax'] = np.round(site_pred_da.max().values, 2)

print(qstats)

                               prenan premin premax postnan postmin postmax
site                                                                       
bitter_canyon_castaic               1   0.02   0.11       0    0.02    0.11
cachuma                             0   0.04   0.13       0    0.04    0.13
circle_x_malibu                     0   0.05   0.13       0    0.05    0.13
clark_motorway_malibu               0   0.04   0.16       0    0.04    0.16
gifford                             1   0.04   0.12       0    0.04    0.12
harris_grade                        0   0.06   0.12       0    0.06    0.12
irish_hills                         1   0.06   0.14       0    0.06    0.14
laguna_ridge_casitas                1   0.05   0.26       0    0.05    0.26
lopez_lake                          1   0.05   0.16       0    0.05    0.16
los_alamos                          1   0.03   0.08       0    0.03    0.08
los_robles_thousand_oaks            1   0.05   0.12       0    0.05    0.12
oak_flat    