## Imports, Reading Data, Setting up tests


NOte: The majority of this notebook was copied from SE_RandomForest/HealthCare

In [163]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV


In [164]:
training_data = pd.read_csv("../data/train_3.csv")
test_data = pd.read_csv("../data/test_3.csv")

In [186]:
pd.set_option('display.max_rows', None)
training_data.columns.to_list()

['Unnamed: 0',
 'tract_geoid',
 'All_Provider_Count',
 'MaxAdDown',
 'MaxAdUp',
 'Wired_Provider_Count',
 'Satellite_Provider_Count',
 'Fixed_Wireless_Provider_Count',
 'All_Provider_Count_25',
 'All_Provider_Count_100',
 'Fixed_Wireless_Provider_Count_25',
 'Wired_Provider_Count_25',
 'Satellite_Provider_Count_25',
 'Fixed_Wireless_Provider_Count_100',
 'Wired_Provider_Count_100',
 'Satellite_Provider_Count_100',
 'NAME',
 'median_age_overall',
 'median_age_male',
 'median_age_female',
 'state',
 'county',
 'tract',
 'employment_rate',
 'median_income',
 'total_households',
 'ave_household_size',
 'ave_family_size',
 'pct_health_ins_children',
 'pct_health_ins_19_64',
 'pct_health_ins_65+',
 'total_population',
 'median_house_value',
 'pct_white',
 'pct_hisp_latino',
 'pct_black',
 'pct_native',
 'pct_asian',
 'pct_hi_pi',
 'pct_other_race',
 'pct_two+_race',
 'pct_rent_burdened',
 'poverty_rate',
 'pct_pop_bachelors+',
 'pct_pop_hs+',
 'pct_internet',
 'pct_internet_dial_up',
 'pct_i

In [247]:
y_variable = 'pct_health_ins_19_64'

broadband_vars3 = ['pct_internet',  'pct_only_smartphone', 'pct_internet_broadband_any_type',
                   'pct_internet_broadband_fiber','Ookla Median Download Speed (Mbps)']

broadband_vars = broadband_vars3

In [258]:
covariates = ['log_median_income', 'pct_hisp_latino', 'pct_white',
              'log_median_income_over_log_median_house',  'median_income_over_median_rent',
             'pct_pop_foreign_born', 'pct_pop_disability', 'employment_rate', 'median_age_overall',
             'pct_ages_lt_19',
 'ruca_metro',
 'ruca_micro',
 'ruca_small_town',
 'ruca_rural',]

In [259]:
all_vars = broadband_vars.copy()
all_vars.extend(covariates)

## Modeling

### Functions

In [260]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [261]:
Y_train = np.array(training_data[y_variable])
Y_test = np.array(test_data[y_variable])

def RF_model(train_df, 
             test_df, 
             variable_list,
             Y_train = Y_train,  
             Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


## Use Grid Search to find best hyperparameters

In [262]:
[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

In [263]:
# Number of trees in random forest
n_estimators = [i for i in range(5, 120, 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in range(5, 30, 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3, 4, 5],
 'min_samples_split': [2, 3, 4, 5, 6],
 'n_estimators': [5,
                  10,
                  15,
                  20,
                  25,
                  30,
                  35,
                  40,
                  45,
                  50,
                  55,
                  60,
                  65,
                  70,
                  75,
                  80,
                  85,
                  90,
                  95,
                  100,
                  105,
                  110,
                  115]}


In [264]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 20 different combinations, and us

rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 20, 
                               cv = 3, 
                               verbose=10, 
                               random_state=42, 
                               n_jobs = -1)

In [265]:
# Fit the random search model
rf_random.fit(training_data[all_vars], Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 3, 4, 5, 6],
                                        'n_estimators': [5, 10, 15, 20, 25, 30,
                                                         35, 40, 45, 50, 55, 60,
                                                         65, 70, 75, 80, 85, 90,
                                                         95, 100, 105, 110,
                                                         115]},
                   random_state=42, verbose=10)

In [266]:
best = rf_random.best_params_
best

{'n_estimators': 75,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [267]:
predictions = rf_random.predict(test_data[all_vars])

In [268]:
regression_results(Y_train, rf_random.predict(training_data[all_vars]))

explained_variance:  0.9241
r2:  0.9241
MAE:  1.9064
MSE:  7.0394
RMSE:  2.6532


In [269]:
regression_results(Y_test, predictions)

explained_variance:  0.6348
r2:  0.6348
MAE:  4.3434
MSE:  34.534
RMSE:  5.8766


In [270]:
#Now use these "best" parameters in a new model

rf = RandomForestRegressor(n_estimators = best['n_estimators'], min_samples_split = best['min_samples_split'],
                           min_samples_leaf = best['min_samples_leaf'], max_features= best['max_features'], 
                           max_depth = best['max_depth'], bootstrap = best['bootstrap'])



In [271]:
rf.fit(training_data[all_vars], training_data[y_variable])

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=75)

In [272]:
regression_results(Y_train, rf.predict(training_data[all_vars]))

explained_variance:  0.9245
r2:  0.9245
MAE:  1.9015
MSE:  7.0022
RMSE:  2.6462


In [273]:
regression_results(Y_test, rf.predict(test_data[all_vars]))

explained_variance:  0.6339
r2:  0.6338
MAE:  4.3501
MSE:  34.6257
RMSE:  5.8844


In [274]:
print(rf.feature_importances_)
print(rf.feature_names_in_)

[0.06752524 0.14465718 0.06069021 0.02206698 0.02855254 0.13683936
 0.13816946 0.09522695 0.05896423 0.0404487  0.05901829 0.03503248
 0.03311241 0.03821097 0.03546767 0.00316834 0.00109738 0.00041114
 0.00134048]
['pct_internet' 'pct_only_smartphone' 'pct_internet_broadband_any_type'
 'pct_internet_broadband_fiber' 'Ookla Median Download Speed (Mbps)'
 'log_median_income' 'pct_hisp_latino' 'pct_white'
 'log_median_income_over_log_median_house'
 'median_income_over_median_rent' 'pct_pop_foreign_born'
 'pct_pop_disability' 'employment_rate' 'median_age_overall'
 'pct_ages_lt_19' 'ruca_metro' 'ruca_micro' 'ruca_small_town' 'ruca_rural']


In [275]:
for i in rf.feature_importances_:
    print(i)

0.06752523935097983
0.14465718091457966
0.06069021172126821
0.022066981228236558
0.028552539021217266
0.1368393598646889
0.13816946335819083
0.09522695199043292
0.058964227504565925
0.040448696961471714
0.05901828582216558
0.03503247952910599
0.033112408294957055
0.03821097442049578
0.03546766857291562
0.0031683371537359365
0.0010973802816873243
0.00041113592003863976
0.0013404780892661613


In [276]:
for i in rf.feature_names_in_:
    print(i)

pct_internet
pct_only_smartphone
pct_internet_broadband_any_type
pct_internet_broadband_fiber
Ookla Median Download Speed (Mbps)
log_median_income
pct_hisp_latino
pct_white
log_median_income_over_log_median_house
median_income_over_median_rent
pct_pop_foreign_born
pct_pop_disability
employment_rate
median_age_overall
pct_ages_lt_19
ruca_metro
ruca_micro
ruca_small_town
ruca_rural
