# Conduct quantile mapping bias correction at each site, <br> testing different numbers of quantiles
Last updated: Kevin Varga, 11/27/2024

**Inputs:**
* Chamise LFM observations
* Chamise random forest model
* Predictor variables at each LFM obs site for entire temporal domain

**Outputs:**
* Dataframe of observed, predicted, and corrected LFM values at each LFM observation site for each number of quantiles tested
* Dataframe showing number of actual and interpolated observations, as well as initial and corrected MBE for each site/quantile
* Time series plot of each site/quantile showing observed, predicted, and corrected LFM
* Dataframe summarizing MBE change at each site/quantile

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import StandardScaler

In [2]:
# Directory containing the LFM observation CSV (lfm_crop.csv is read from here)
obs_path = '/home/sbarc/students/varga/nasa/ch1/data/lfm_obs/'
# Directory containing the saved random forest models (<fuel_type>.rf.joblib)
rf_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
# Directory containing the fuel specific predictor dataframes (<fuel_type>.csv)
# Note that bias correction is only performed for chamise due to observation spatial coverage
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/bias_correction/site_predictors/'
# Base directory for outputs; a <fuel_type>/ subdirectory is appended when
# the stats, model output, and plots are saved
output_path = '/home/sbarc/students/varga/nasa/ch1/data/bias_correction/'

In [3]:
# Load the LFM observation table and discard the columns that are dropped
# here (slope, aspect, elevation, gacc, category)
obs_df = (
    pd.read_csv(obs_path + 'lfm_crop.csv')
      .drop(columns=['slope', 'aspect', 'elevation', 'gacc', 'category'])
)
# Convert the date column to datetimes so it can later serve as a time index
obs_df['date'] = pd.to_datetime(obs_df['date'])

In [4]:
# Read in random forest model and set up dataframe and variables for quantile mapping

# Set fuel type
fuel_type = 'chamise'
# Read in fuel specific random forest model and predictor dataframe
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source
fuel_rf = joblib.load(rf_path + fuel_type + '.rf.joblib')
# The first two CSV columns form the (site, date) MultiIndex.
# infer_datetime_format was dropped: it is deprecated since pandas 2.0, where
# consistent-format inference is already the default parsing behaviour.
fuel_df = pd.read_csv(pred_path + fuel_type + '.csv', index_col=[0, 1], parse_dates=True)
# Set up dataframe to plug in observed, predicted, and corrected values.
# Created as float (NaN-filled) so that the later percentile, qcut, and mean
# calculations operate on numeric columns rather than object columns.
predict_df = pd.DataFrame(index=fuel_df.index,
                          columns=['observed', 'predicted', 'corrected'],
                          dtype=float)
# Drop latitude and longitude, as they are not predictors
fuel_df = fuel_df.drop(columns=['latitude', 'longitude'])
# Save LFM observation site names (first index level)
sites = pd.unique(predict_df.index.get_level_values(0))

In [12]:
%%time

# Create array of quantile values to test
all_quantiles = np.arange(10, 110, 10)

# Create dataframe to store stats summary (one row per site, one column per
# number of quantiles tested)
stat_sum_df = pd.DataFrame(index=sites, columns=all_quantiles)

# All outputs of this cell are written below output_path/<fuel_type>/
fuel_output_path = output_path + fuel_type + '/'

# ---------------------------------------------------------------------------
# Pass 1: per-site work that does not depend on the number of quantiles.
# The model prediction and the observation interpolation are identical for
# every quantile count, so they are computed once per site here instead of
# once per site per quantile count.
# ---------------------------------------------------------------------------
site_data = {}
for site_name in sites:
    # Extract predictors for individual sites
    site_predictors = fuel_df.loc[site_name]
    # Standardize predictors (the scaler is fit on this site's own record)
    scaler = StandardScaler().fit(site_predictors)
    predictors_scaled = pd.DataFrame(scaler.transform(site_predictors),
                                     index=site_predictors.index,
                                     columns=site_predictors.columns.values)
    # Run the random forest
    predicted_lfm = fuel_rf.predict(predictors_scaled)

    # Plug in predicted values to df
    predict_df.loc[site_name, 'predicted'] = predicted_lfm

    # Model time steps for this site, and the prediction as a float series
    site_dates = predict_df.loc[site_name].index
    predicted = pd.Series(np.asarray(predicted_lfm, dtype=float),
                          index=site_dates, name='predicted')

    # Extract observations for each site, indexed by date.
    # set_index returns a new frame, so the obs_df slice is never mutated
    # (the in-place form raises SettingWithCopyWarning).
    site_obs = obs_df.loc[(obs_df['site'] == site_name)
                          & (obs_df['fuel'] == fuel_type)].set_index('date')

    # Reindex and interpolate observations to match temporal frequency of model output
    # Limit interpolation in either direction to two time steps
    # Allowing interpolation helps bias correction by extrapolating values
    obs_int = (site_obs['percent']
               .reindex(site_obs.index.union(site_dates))
               .interpolate(method='quadratic', limit=2, limit_direction='both')
               .reindex(site_dates))

    # Plug in interpolated observation values to df
    predict_df.loc[site_name, 'observed'] = obs_int.values

    # Extract interpolated observed and model predicted values during the
    # observation temporal domain (first to last observation date)
    obs_start, obs_end = site_obs.index[0], site_obs.index[-1]
    obs = obs_int.loc[obs_start:obs_end].dropna()
    compare = predicted.loc[obs.index]

    site_data[site_name] = {
        'predicted': predicted,
        'obs': obs,
        'compare': compare,
        'obs_window': (obs_start, obs_end),
        'n_obs': len(site_obs['percent']),
        'n_obs_int': len(obs_int.dropna()),
        # Starting MBE using interpolated observations
        'starting_mbe': round(float(np.mean(compare - obs)), 2),
    }

# ---------------------------------------------------------------------------
# Pass 2: quantile mapping bias correction for each number of quantiles.
# ---------------------------------------------------------------------------
for n_quantiles in all_quantiles:
    print(n_quantiles)

    # Create dataframe to store site specific stats for each number of quantiles
    stats_df = pd.DataFrame(index=sites, columns=['n_obs', 'n_obs_int', 'MBE', 'Corrected', 'Change'])

    # Quantile edges (n_quantiles + 1 values from 0 to 1), expressed as percentiles
    percentiles = np.linspace(0, 1, n_quantiles + 1) * 100

    for site_name in sites:
        site = site_data[site_name]
        predicted = site['predicted']
        obs = site['obs']
        compare = site['compare']
        obs_start, obs_end = site['obs_window']
        starting_mbe = site['starting_mbe']

        # Plug in number of observations, number of interpolated observations,
        # and starting MBE to stats_df
        stats_df.loc[site_name, 'n_obs'] = site['n_obs']
        stats_df.loc[site_name, 'n_obs_int'] = site['n_obs_int']
        stats_df.loc[site_name, 'MBE'] = starting_mbe

        # Perform quantile mapping bias correction

        # Correction at each quantile edge: observed minus modeled value,
        # both evaluated over the dates that have (interpolated) observations
        observed_q = np.percentile(obs.values, percentiles)
        modeled_q = np.percentile(compare.values, percentiles)
        correction_by_edge = observed_q - modeled_q

        # Assign every model output (full record) to one of n_quantiles
        # equal-frequency bins; labels=False returns the integer bin number
        bin_number = pd.qcut(predicted, n_quantiles, labels=False).to_numpy().astype(int)
        # Each bin takes the correction of its upper quantile edge, hence the + 1
        corrected = predicted + correction_by_edge[bin_number + 1]

        # Plug in corrected values to larger df.
        # A single .loc assignment is used because chained indexing
        # (df.loc[...][...] = ...) may write to a temporary copy and leave
        # predict_df unchanged.
        predict_df.loc[site_name, 'corrected'] = corrected.values

        # Calculate and plug in final corrected mbe
        final_mbe = round(float(np.mean(corrected.loc[obs.index] - obs)), 2)
        stats_df.loc[site_name, 'Corrected'] = final_mbe
        # Negative change means the magnitude of the bias was reduced
        stats_df.loc[site_name, 'Change'] = round(abs(final_mbe) - abs(starting_mbe), 2)

        # Plot and save the corrected time series within the observed temporal domain
        for column in ('observed', 'predicted', 'corrected'):
            predict_df.loc[site_name, column][obs_start:obs_end].plot(
                figsize=(20, 10), legend=True, grid=True, lw=2, fontsize=16)
        plt.title(site_name.replace('_', ' ').title() + ' -- Model MBE: ' + str(starting_mbe)
                  + ' -- Corrected MBE: ' + str(final_mbe), fontsize=16)
        plt.legend(fontsize=16)
        plt.ylabel('LFM %', fontsize=16)
        plt.xlabel('')
        plt.savefig(fuel_output_path + str(n_quantiles) + '_' + site_name + '.png', bbox_inches='tight')
        # Close the figure so the next site starts from a fresh one
        plt.close()

    # Save the post bias correction MBE change for each site for each quantile value
    stat_sum_df.loc[:, n_quantiles] = stats_df.loc[:, 'Change'].values
    # Save stats and model output dataframes for each quantile value
    stats_df.to_csv(fuel_output_path + str(n_quantiles) + '_stats.csv', index_label='site')
    predict_df.to_csv(fuel_output_path + str(n_quantiles) + '_model_output.csv', index_label=['site', 'date'])

# Calculate means and add it as a row at the bottom of stats summary dataframe
means = [stat_sum_df.loc[:, n_quantiles].mean() for n_quantiles in all_quantiles]
stat_sum_df.loc['mean'] = means
# Save stats summary dataframe
stat_sum_df.to_csv(fuel_output_path + 'q_stats_summary.csv', index_label='site')

10
20
30
40
50
60
70
80
90
100
CPU times: user 8min 44s, sys: 1min 11s, total: 9min 56s
Wall time: 7min 35s
