# Creation of final random forest models for individual fuel types
Last updated: Kevin Varga, 11/27/2024

**Inputs:**
* Fuel-specific dataframes (one CSV file per fuel type) with predictor variables for every LFM observation

**Outputs:**
* Fuel-specific random forest models (saved as `<fuel>.rf.joblib`) to be used for dataset creation

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [3]:
# Directory that is searched for the predictor CSV files (one file is read per loop iteration below)
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/site_predictors/'
# Directory containing the hyperparameter tuning results (param_tuning.csv)
param_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
# Directory the trained models are written to (currently the same directory as param_path)
output_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'

In [3]:
# Collect every predictor CSV found in the predictor directory
fuel_list = [csv_file for csv_file in Path(pred_path).glob('*.csv')]
# Load the hyperparameter tuning table, indexed by its 'fuel' column
param_df = pd.read_csv(f'{param_path}param_tuning.csv', index_col='fuel')

In [4]:
%%time

def _int_param(value, name):
    """Coerce a tuned hyperparameter that must be an integer to a built-in ``int``.

    Values read back from the tuning CSV can arrive as floats (a column holding
    a blank entry is stored as float64), and scikit-learn's parameter validation
    rejects floats for integer-only arguments such as ``n_estimators`` and
    ``max_depth``.

    Parameters
    ----------
    value : int or float (Python or NumPy scalar)
        The value read from the tuning table.
    name : str
        Parameter name, used only in the error message.

    Returns
    -------
    int

    Raises
    ------
    ValueError
        If ``value`` is not a whole number, so nothing is silently truncated.
    """
    as_float = float(value)
    if not as_float.is_integer():
        raise ValueError(f'{name} must be a whole number, got {value!r}')
    return int(as_float)


def _count_or_fraction(value):
    """Normalise a ``min_samples_*`` value that may be a count or a fraction.

    Whole-number floats greater than 1 (e.g. ``2.0``) can only be sample counts,
    so they are converted to ``int``. Everything else (integers, fractions
    in (0, 1], NaN) is returned unchanged for scikit-learn to validate.
    """
    if isinstance(value, (float, np.floating)) and value > 1 and float(value).is_integer():
        return int(value)
    return value


for file in fuel_list:
    # Read in the fuel type LFM observations and associated predictor variables.
    # The first two columns form the index; parse_dates=True asks pandas to parse
    # date-like index levels. (infer_datetime_format was dropped: it has been
    # deprecated since pandas 2.0 and only triggers a warning.)
    pre_features = pd.read_csv(file, index_col=[0, 1], parse_dates=True)
    # Identify the fuel type (taken from the first row of the 'fuel' column)
    fuel_type = pre_features['fuel'].iloc[0]

    # Extract LFM percent as target, and drop unneeded columns from df for random forest
    targets = pre_features['percent']
    pre_features = pre_features.drop(columns=['latitude', 'longitude', 'percent', 'fuel'])

    # Extract optimized parameters from param_df with a single row lookup
    fuel_params = param_df.loc[fuel_type]
    n_est = _int_param(fuel_params['n_estimators'], 'n_estimators')
    n_split = _count_or_fraction(fuel_params['min_samples_split'])
    n_leaf = _count_or_fraction(fuel_params['min_samples_leaf'])
    max_feature_style = fuel_params['max_features']
    # A blank max_depth in the CSV is read back as NaN and means "no depth limit"
    n_depth = fuel_params['max_depth']
    n_depth = None if pd.isna(n_depth) else _int_param(n_depth, 'max_depth')
    bootstrap_style = fuel_params['bootstrap']

    # Standardize the predictors (zero mean, unit variance) before fitting.
    # NOTE(review): the fitted scaler is not saved anywhere in this cell, so any
    # code that later predicts with the saved model has to reproduce the same
    # scaling itself - confirm the downstream notebooks do this.
    scaler = StandardScaler().fit(pre_features)
    features = pd.DataFrame(scaler.transform(pre_features), index=pre_features.index,
                            columns=pre_features.columns.values)

    # Initiate random forest model (fixed random_state for reproducible forests)
    rf = RandomForestRegressor(n_estimators=n_est, min_samples_split=n_split, min_samples_leaf=n_leaf,
                               max_features=max_feature_style, max_depth=n_depth,
                               bootstrap=bootstrap_style, n_jobs=-1, random_state=42)

    # Train the model on all of the data (no hold-out set is kept here)
    rf.fit(features, targets)
    # Save the model as <fuel_type>.rf.joblib inside output_path
    joblib.dump(rf, Path(output_path) / f'{fuel_type}.rf.joblib', compress=3)

CPU times: user 2min 31s, sys: 13.1 s, total: 2min 44s
Wall time: 42.7 s
