## Run fuel specific, LFM random forest models over entire spatiotemporal domain
Last updated: Kevin Varga, 11/27/2024

**Inputs:**
* Gridded predictor variables
* Fuel specific random forest models

**Outputs:**
* Netcdf file of predicted LFM of all fuels

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
import pandas as pd
from pathlib import Path

import joblib
from sklearn.preprocessing import StandardScaler

In [6]:
# Set paths
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/predictors/'
rf_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
grid_path = '/home/sbarc/students/varga/nasa/ch1/data/lfm_model/'

In [7]:
# Open land/water data array and mask to land
land_mask = xr.open_dataarray('/home/sbarc/students/varga/nasa/ch1/data/wrf/vars/land_mask_crop.nc')
land_mask = land_mask.where(land_mask == 1, np.nan)

# Find where the land_mask is not NaN
valid_indices = np.argwhere(~np.isnan(land_mask.values[:, :]))

In [8]:
# Create list of all predictors
pred_list = sorted(list(Path(pred_path).glob('*.nc')))

In [9]:
# Open each predictor netCDF file and store the DataArrays in a list
data_arrays = [xr.open_dataarray(file) for file in pred_list]

# Merge predictor data arrays
ds = xr.merge(data_arrays)
# Subset to time domain
ds = ds.sel(time = slice('1987-12-01','2019-06-15'))
# Mask to land
ds = ds.where(land_mask)

In [10]:
# Create a new dataset with the same coordinates and desired fuels
time = ds['time']
latitude = land_mask['latitude']
longitude = land_mask['longitude']
fuel_vars = {
    'chamise': (['time', 'latitude', 'longitude'], np.full((len(ds['time']),len(latitude),len(longitude)), np.nan)),
    'chamise_old_growth': (['time', 'latitude', 'longitude'], np.full((len(ds['time']),len(latitude),len(longitude)), np.nan)),
    'sage_black': (['time', 'latitude', 'longitude'], np.full((len(ds['time']),len(latitude),len(longitude)), np.nan)),
    'ceanothus_bigpod': (['time', 'latitude', 'longitude'], np.full((len(ds['time']),len(latitude),len(longitude)), np.nan))
}
lfm_ds = xr.Dataset(fuel_vars, coords={'time': time, 'latitude': latitude, 'longitude': longitude})

In [11]:
# Create lists of fuel types and predictor variables
fuels = ['chamise', 'chamise_old_growth', 'sage_black', 'ceanothus_bigpod']

In [26]:
%%time
for fuel_type in fuels:
    print('started ' + fuel_type)
    # Load fuel specific random forest model
    fuel_rf = joblib.load(rf_path + fuel_type + '.rf.joblib')

    for i, (lat,lon) in enumerate(valid_indices):
        # Use to monitor
        #if ((i/1000).is_integer() == True): print(i)
        # Convert time step to pandas dataframe
        df = ds.isel(latitude = lat, longitude=lon).to_dataframe()
        # Save original index
        ridx = df.index
        # Drop location columns and NaN values
        df.drop(columns=['latitude','longitude'], inplace=True)
        df.dropna(inplace=True)
        # Scale predictor variables
        scaler = StandardScaler().fit(df)
        predictors_scaled = pd.DataFrame(scaler.transform(df),
                                         index=df.index,
                                         columns=df.columns.values)
        # Run the random forest
        predicted_lfm = fuel_rf.predict(predictors_scaled)
        # Save predicted LFM as dataframe and reindex to original index
        predicted_s = pd.Series(predicted_lfm, index = df.index)
        predicted_s = predicted_s.reindex(ridx)
        # Save values into dataset
        lfm_ds[fuel_type][:,lat,lon] = predicted_s.values

lfm_ds.to_netcdf(grid_path + 'predicted_ds.nc')

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
CPU times: user 9h 12min 47s, sys: 2h 3min 44s, total: 11h 16min 32s
Wall time: 8h 6min 42s
