# Random forest hyperparameter tuning for individual fuel types
Last updated: Kevin Varga, 11/21/2024

**Inputs:**
* Fuel specific dataframes with predictor variables for every LFM observation

**Outputs:**
* Dataframe with optimized random forest parameters for each fuel type

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint

In [2]:
# Hyper parameter tuning packages
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
pred_path = '/home/sbarc/students/varga/nasa/ch1/data/site_predictors/'
output_path = '/home/sbarc/students/varga/nasa/ch1/data/random_forest/'
fuel_list = list(Path(pred_path).glob('*.csv'))

## Hyperparameter Tuning with Random Grid

In [4]:
# General process was pulled from: 
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

#To use RandomizedSearchCV, we first need to create a parameter grid to sample from during fitting:

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators, \
               'max_features': max_features, \
               'max_depth': max_depth, \
               'min_samples_split': min_samples_split, \
               'min_samples_leaf': min_samples_leaf, \
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [5]:
%%time

# Create dataframe to save fuel specific tuned parameters
param_df = pd.DataFrame(columns = random_grid.keys())

# Loop through all fuel type csv files
for file in fuel_list:
    
    # Read in fuel type csv
    pre_features = pd.read_csv(file, index_col=[0,1], parse_dates=True, infer_datetime_format=True)
    fuel = pre_features['fuel'].iloc[0]

    # Drop rows with nan as nan values cannot be included in the model
    #print(fuel, '# obs before dropna: ', len(pre_features))
    pre_features = pre_features.dropna()
    #print(fuel, '# obs: ', len(pre_features))

    # Reset index, extract LFM percent as target, and drop unneeded columns from df for random forest
    pre_features = pre_features.reset_index(drop=True)
    targets = pre_features['percent']
    pre_features = pre_features.drop(columns=['latitude', 'longitude', 'percent', 'fuel'])

    # Scale the features to have a zero mean
    scaler = StandardScaler().fit(pre_features)
    features = pd.DataFrame(scaler.transform(pre_features), index=pre_features.index.values, columns=pre_features.columns.values)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 5 fold cross validation, 
    # search across 100 different combinations, and use all available cores    
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, random_state=42, n_jobs = -1)    
    # Fit the random search model
    #print('Started: ' + dt.now().strftime('%H:%M:%S'))
    rf_random.fit(features, targets)
    #print('Finished: ' + dt.now().strftime('%H:%M:%S') + '\n')

    param_df.loc[fuel] = rf_random.best_params_


param_df.to_csv(output_path + 'param_tuning.csv', index_label='fuel')



CPU times: user 1min 2s, sys: 2.06 s, total: 1min 4s
Wall time: 15min 6s
