## Calculate fuel and observation site specific model statistics
Last updated: Kevin Varga, 12/31/2024

**Inputs:**
* NetCDF file of LFM of all four fuels
* LFM observation dataframe

**Outputs:**
* Fuel specific stats for each observation site
* Dataframe containing all fuels' observations and corresponding model outputs
* Summary stats of all fuels

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import math
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

In [2]:
# Directories used below: observation CSV input, and gridded model output
# (the stats files are also written beneath the model directory)
obs_path = '/home/sbarc/students/varga/nasa/ch1/data/lfm_obs/'
grid_path = '/home/sbarc/students/varga/nasa/ch1/data/lfm_model/'

In [3]:
# Open the gridded LFM dataset
lfm_ds = xr.open_dataset(grid_path + 'sba_lfm_1987-2019.nc')

# Read in LFM observations, indexed by (fuel, site, date).
# infer_datetime_format is intentionally not passed: it is deprecated as of
# pandas 2.0, where a consistent date format is inferred by default.
obs_df = pd.read_csv(obs_path + 'lfm_crop.csv', index_col=['fuel', 'site', 'date'],
                     parse_dates=True)

# Clean up: drop the metadata columns that are not needed below and rename
# the measurement column so it pairs with the 'predicted' column added later
obs_df = obs_df.drop(columns=['slope', 'aspect', 'elevation', 'gacc', 'category'])
obs_df = obs_df.rename(columns={'percent': 'observed'})

In [4]:
# Tally observation rows per fuel, grouping on the 'fuel' index level
fuel_counts = obs_df.groupby(level='fuel').size()

# Fuels that have at least 500 observation rows
valid_fuels = fuel_counts.index[(fuel_counts >= 500).to_numpy()]

# Keep only the rows belonging to those fuels, then sort the index
fuel_level = obs_df.index.get_level_values('fuel')
obs_df = obs_df[fuel_level.isin(valid_fuels)].sort_index()

In [5]:
# Shorthand for slicing the (fuel, site, date) index
ix_slice = pd.IndexSlice

# Cropping one column of the easternmost grid just barely cut off the
# bitter_canyon_castaic observation site, so assign it the easternmost
# longitude value of the grid.
# The distance between the actual lat/lon and converted lat/lon is 45 meters
east_edge = lfm_ds['longitude'].max().values
obs_df.loc[ix_slice[:, 'bitter_canyon_castaic', :], 'longitude'] = east_edge

# Add empty (NaN) columns that will be filled in with model predictions
obs_df = obs_df.assign(predicted=np.nan, no_bias_correction=np.nan)

# Keep only observations dated on or before the last prediction time step
end_date = lfm_ds['time'].max().values
obs_df = obs_df.loc[ix_slice[:, :, :end_date], :]

### Calculate fuel and observation site specific statistics

In [6]:
# Determine fuel types (each key of the model dataset is treated as a fuel)
fuels = list(lfm_ds.keys())

for fuel_type in fuels:
    # The non bias corrected chamise output is evaluated against the plain
    # 'chamise' observations and stored in its own prediction column
    is_uncorrected = fuel_type == 'chamise_no_bias_correction'
    obs_label = 'chamise' if is_uncorrected else fuel_type
    pred_col = 'no_bias_correction' if is_uncorrected else 'predicted'

    # Extract observation site names
    sites = obs_df.loc[obs_label].index.get_level_values(0).unique()
    # Create dataframe to store fuel specific site stats
    fuel_stats_df = pd.DataFrame(index=sites,
                                 columns=['mae', 'mbe', 'bias_std', 'rmse', 'r2', 'cc'])

    for site_name in sites:
        # Extract site data (indexed by observation date)
        site_df = obs_df.loc[(obs_label, site_name)]
        observed = site_df['observed']

        # Interpolate gridded output to site observation times and location.
        # The site's first latitude/longitude row is used as its location.
        # .iloc[0] is used because integer-key positional access (series[0])
        # on a non-integer index is deprecated in pandas and will become a
        # label lookup.
        predicted = lfm_ds[fuel_type].interp(time=site_df.index,
                                             latitude=site_df['latitude'].iloc[0],
                                             longitude=site_df['longitude'].iloc[0]
                                            ).values

        # Save the predictions alongside the observations
        obs_df.loc[(obs_label, site_name), pred_col] = predicted

        # Site bias (model minus observation) and absolute errors
        bias = predicted - observed
        errors = abs(bias)

        # Mean absolute error
        fuel_stats_df.loc[site_name, 'mae'] = np.mean(errors)
        # Mean bias error
        fuel_stats_df.loc[site_name, 'mbe'] = np.mean(bias)
        # Bias standard deviation
        fuel_stats_df.loc[site_name, 'bias_std'] = np.std(bias)
        # Root mean square error
        fuel_stats_df.loc[site_name, 'rmse'] = math.sqrt(np.square(errors).mean())
        # Coefficient of determination
        fuel_stats_df.loc[site_name, 'r2'] = r2_score(observed, predicted)
        # Pearson correlation coefficient
        fuel_stats_df.loc[site_name, 'cc'] = pearsonr(observed, predicted)[0]

    # Save fuel specific dataframes
    fuel_stats_df.to_csv(grid_path + '/stats/' + fuel_type + '_site_stats.csv')

# Save observations with model predictions dataframe
obs_df.to_csv(grid_path + '/stats/all_fuels_obs_predictions.csv')

### Calculate final fuel specific overall statistics

In [7]:
# Create dataframe to store overall (all sites pooled) statistics per fuel
stats_df = pd.DataFrame(index=fuels, columns=['mae', 'mbe', 'bias_std', 'rmse', 'r2', 'cc'])

for fuel_type in fuels:
    # Non bias corrected chamise is scored against the chamise observations
    # using its dedicated prediction column; every other fuel uses its own
    # observations and the 'predicted' column.
    if fuel_type == 'chamise_no_bias_correction':
        fuel_key, pred_col = 'chamise', 'no_bias_correction'
    else:
        fuel_key, pred_col = fuel_type, 'predicted'

    fuel_df = obs_df.loc[fuel_key]
    observed = fuel_df['observed']
    modeled = fuel_df[pred_col]

    # Fuel-wide bias (model minus observation) and absolute errors
    bias = modeled - observed
    errors = abs(bias)

    # Mean absolute error
    stats_df.loc[fuel_type, 'mae'] = np.mean(errors)
    # Mean bias error
    stats_df.loc[fuel_type, 'mbe'] = np.mean(bias)
    # Bias standard deviation
    stats_df.loc[fuel_type, 'bias_std'] = np.std(bias)
    # Root mean square error
    stats_df.loc[fuel_type, 'rmse'] = math.sqrt(np.square(errors).mean())
    # Coefficient of determination
    stats_df.loc[fuel_type, 'r2'] = r2_score(observed, modeled)
    # Pearson correlation coefficient
    stats_df.loc[fuel_type, 'cc'] = pearsonr(observed, modeled)[0]

# Save summary stats dataframe
stats_df.to_csv(grid_path + '/stats/all_fuels_stats.csv')