# Observation site cross validation random forest algorithm for individual fuel types
Last updated: Kevin Varga, 11/27/2024

**Inputs:**
* Fuel-specific dataframes with predictor variables for every LFM observation

**Outputs:**
* Fuel-specific CSV files with observation-site-specific statistics, per-site test predictions, and predictor importances

In [1]:
import numpy as np
import math
import pandas as pd
from pathlib import Path

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr

In [3]:
# Directory locations: per-fuel predictor tables, tuned hyperparameters, and cross validation output
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/site_predictors/'
param_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
output_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/site_cv/'

# Collect every csv file found in the predictor directory (the fuel specific dataframes)
fuel_list = [*Path(pred_path).glob('*.csv')]

# Load the hyperparameter tuning table, indexed by its 'fuel' column
param_file = param_path + 'param_tuning.csv'
param_df = pd.read_csv(param_file, index_col='fuel')

In [6]:
%%time

# Leave-one-site-out cross validation: for every fuel type, each observation site is held out
# in turn, a random forest is trained on the remaining sites, and the held-out site is predicted.

# Column names of the per-site statistics table, in the same order as the values appended below
stat_keys = ['test_site', 'train_obs', 'test_obs', 'n_trees', 'model_var',
             'target_var', 'mae', 'mae_norm',
             'djf_mae', 'jja_mae', 'mbe',
             'mbe_norm', 'djf_mbe',
             'jja_mbe', 'rmse', 'djf_rmse', 'jja_rmse',
             'test_r2', 'spearman_cc', 'pearson_cc']

# Calendar months used for the winter (DJF) and summer (JJA) subsets
djf_months = [12, 1, 2]
jja_months = [6, 7, 8]

for file in fuel_list:
    # Read in the fuel type LFM observations and associated predictor variables.
    # The first two columns form the index; parse_dates=True converts the date level to datetimes.
    pre_features = pd.read_csv(file, index_col=[0, 1], parse_dates=True)
    # Nothing to cross validate in an empty file
    if pre_features.empty:
        continue

    # Identify the fuel type
    fuel = pre_features['fuel'].iloc[0]
    # Identify all of the LFM observation sites (first index level)
    sites = pd.unique(pre_features.index.get_level_values(0))

    # Extract LFM percent as target, and drop unneeded columns from df for random forest
    targets = pre_features['percent']
    pre_features = pre_features.drop(columns=['latitude', 'longitude', 'percent', 'fuel'])
    feature_list = pre_features.columns

    # Extract optimized parameters from param_df
    n_est = int(param_df.loc[fuel, 'n_estimators'])
    n_split = param_df.loc[fuel, 'min_samples_split']
    n_leaf = param_df.loc[fuel, 'min_samples_leaf']
    max_feature_style = param_df.loc[fuel, 'max_features']
    # A missing max_depth means unlimited depth; otherwise it must be an integer, but a column
    # that contains NaN is read as float, so cast it back
    n_depth = param_df.loc[fuel, 'max_depth']
    n_depth = None if pd.isna(n_depth) else int(n_depth)
    bootstrap_style = param_df.loc[fuel, 'bootstrap']

    # Make sure the output directories exist before any file is written
    fuel_dir = output_path + fuel + '/'
    site_dir = fuel_dir + 'sites/'
    Path(site_dir).mkdir(parents=True, exist_ok=True)

    # One row per held-out site; the dataframes are built once after the site loop
    stats_rows = []
    importance_rows = []

    # Loop through all observation sites, using each one as the test data
    for site_name in sites:
        # Predictors and LFM observations for training come from all but the selected site
        feature_train = pre_features.drop(site_name, level='site')
        target_train = targets.drop(site_name, level='site')
        # Predictors and LFM observations for testing come from the selected site only
        feature_test = pre_features.loc[site_name]
        target_test = targets.loc[site_name]

        # Standardize predictors with a scaler fitted on the training sites only,
        # then apply the same transformation to the held-out site
        scaler = StandardScaler().fit(feature_train)
        feature_train_scaled = pd.DataFrame(scaler.transform(feature_train),
                                            index=feature_train.index,
                                            columns=feature_train.columns.values)
        feature_test_scaled = pd.DataFrame(scaler.transform(feature_test),
                                           index=feature_test.index,
                                           columns=feature_test.columns.values)

        # Initiate random forest model with the tuned hyperparameters
        rf = RandomForestRegressor(n_estimators=n_est, min_samples_split=n_split,
                                   min_samples_leaf=n_leaf, max_features=max_feature_style,
                                   max_depth=n_depth, bootstrap=bootstrap_style,
                                   n_jobs=-1, random_state=42)

        # Train the model on training data
        rf.fit(feature_train_scaled, target_train)

        # Use model to predict targets on the held-out site
        predicted_test = rf.predict(feature_test_scaled)

        # Bias (signed error) and absolute error of every test prediction
        bias = predicted_test - target_test
        errors = bias.abs()

        # Boolean masks selecting the winter and summer observations
        months = bias.index.month
        is_djf = months.isin(djf_months)
        is_jja = months.isin(jja_months)
        djf_errors, jja_errors = errors[is_djf], errors[is_jja]
        djf_bias, jja_bias = bias[is_djf], bias[is_jja]

        # Calculate root mean square error (NaN when a season has no observations)
        rmse = math.sqrt(np.square(errors).mean())
        djf_rmse = math.sqrt(np.square(djf_errors).mean())
        jja_rmse = math.sqrt(np.square(jja_errors).mean())

        # Variance of the observed and of the predicted test data
        target_noise = np.var(target_test)
        predicted_var = np.var(predicted_test)

        # Calculate r2, spearman, and pearson correlations between target test data and predicted test data
        r2_value = r2_score(target_test, predicted_test)
        spearman = spearmanr(target_test, predicted_test)
        pearson = pearsonr(target_test, predicted_test)

        # Create dataframe of predictions, errors, and bias, and save
        predicted_test_s = pd.Series(predicted_test, name='model_percent', index=target_test.index)
        predicted_test_df = pd.concat([target_test, predicted_test_s], axis=1)
        predicted_test_df['site'] = site_name
        predicted_test_df['errors'] = errors.to_numpy()
        predicted_test_df['bias'] = bias.to_numpy()
        predicted_test_df.to_csv(site_dir + site_name + '_test' + '.csv', index_label='date')

        # Store the statistics of this site, ordered as in stat_keys
        stats_rows.append([
            site_name, len(target_train), len(target_test), n_est,
            round(predicted_var, 2), round(target_noise, 2),
            round(np.mean(errors), 2), round(np.mean(errors / target_test), 2),
            round(np.mean(djf_errors), 2), round(np.mean(jja_errors), 2),
            round(np.mean(bias), 2), round(np.mean(bias / target_test), 2),
            round(np.mean(djf_bias), 2), round(np.mean(jja_bias), 2),
            round(rmse, 2), round(djf_rmse, 2), round(jja_rmse, 2),
            round(r2_value, 2), round(spearman[0], 2), round(pearson[0], 2),
        ])

        # Store the numerical feature importances of this site's model
        importance_rows.append(list(rf.feature_importances_))

    # Save statistics and importances once all sites have been analyzed
    stats_df = pd.DataFrame(stats_rows, columns=stat_keys)
    stats_df.to_csv(fuel_dir + 'stats_df.csv', index=False)
    im_df = pd.DataFrame(importance_rows, columns=feature_list)
    im_df.insert(0, 'test_site', sites)
    im_df.to_csv(fuel_dir + 'im_df.csv', index=False)

CPU times: user 45min 7s, sys: 2min 23s, total: 47min 30s
Wall time: 6min 1s
