# 5-fold cross validation random forest algorithm for individual fuel types
Last updated: Kevin Varga, 11/27/2024

**Inputs:**
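* Fuel-specific CSV files (one per fuel type) containing the predictor variables for every LFM observation
* A hyperparameter tuning CSV (`param_tuning.csv`) indexed by fuel type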

**Outputs:**
* Fuel-specific CSV files with the 5-fold cross-validation results (one row per fold: fit/score times and each scoring metric)

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
# Directory of per-fuel predictor CSVs (input), directory holding the tuned
# hyperparameters, and directory that receives the cross-validation results.
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running elsewhere. They are kept as strings with a trailing slash because
# they are joined to file names by string concatenation.
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/site_predictors/'
param_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
output_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/global/'

# Sort the glob result so the fuel files are processed in the same order on
# every run; the order returned by the filesystem is otherwise arbitrary.
fuel_list = sorted(Path(pred_path).glob('*.csv'))

# Read in hyperparameter tuning dataframe (one row per fuel, indexed by fuel name)
param_df = pd.read_csv(param_path + 'param_tuning.csv', index_col='fuel')

In [5]:
%%time
# Create list of desired cross validation scoring
scoring_list = ['explained_variance','max_error','neg_mean_absolute_error','neg_root_mean_squared_error','r2']

# Loop through individual fuel type dataframes
for file in fuel_list:
    # Read in the fuel type LFM observations and associated predictor variables
    pre_features = pd.read_csv(file, index_col=[0,1], parse_dates=True, infer_datetime_format=True)
    # Identify the fuel type
    fuel = pre_features['fuel'].iloc[0]
    
    # Reset index, extract LFM percent as target, and drop unneeded columns from df for random forest
    pre_features.reset_index(drop=True, inplace=True)
    targets = pre_features['percent']
    pre_features.drop(columns=['latitude', 'longitude', 'percent', 'fuel'], inplace=True)

    # Create standardized scaler and scale predictors to a mean zero scale to reduce bias
    scaler = StandardScaler().fit(pre_features)
    features = pd.DataFrame(scaler.transform(pre_features), index=pre_features.index.values, columns=pre_features.columns.values)

    # Extract optimized parameters from param_df
    n_est = param_df.loc[fuel]['n_estimators']
    n_split = param_df.loc[fuel]['min_samples_split']
    n_leaf = param_df.loc[fuel]['min_samples_leaf']
    max_feature_style = param_df.loc[fuel]['max_features']
    n_depth = param_df.loc[fuel]['max_depth']
    if pd.isna(n_depth) == True: n_depth = None
    bootstrap_style = param_df.loc[fuel]['bootstrap']

    # Initiate random forest model
    rf = RandomForestRegressor(n_est, min_samples_split=n_split, min_samples_leaf = n_leaf, max_features=max_feature_style,
                               max_depth=n_depth, bootstrap=bootstrap_style, n_jobs=-1, random_state=42)

    # Run 5-fold cross validation with model
    results = cross_validate(rf, features, targets,
                                  cv=5,
                                  scoring=scoring_list,
                                  n_jobs=-1)
    
    # Save results to a dataframe/csv
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path + fuel + '.csv', index=False)

CPU times: user 876 ms, sys: 853 ms, total: 1.73 s
Wall time: 41 s
