## Imports, Reading Data, Setting up tests


NOte: The majority of this notebook was copied from SE_RandomForest/HealthCare

In [3]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV


In [4]:
training_data = pd.read_csv("../../../data/current_data/train_3.csv")
test_data = pd.read_csv("../../../data/current_data/test_3.csv")

In [7]:
pd.set_option('display.max_rows', None)
training_data.columns.to_list()

['Unnamed: 0',
 'tract_geoid',
 'All_Provider_Count',
 'MaxAdDown',
 'MaxAdUp',
 'Wired_Provider_Count',
 'Satellite_Provider_Count',
 'Fixed_Wireless_Provider_Count',
 'All_Provider_Count_25',
 'All_Provider_Count_100',
 'Fixed_Wireless_Provider_Count_25',
 'Wired_Provider_Count_25',
 'Satellite_Provider_Count_25',
 'Fixed_Wireless_Provider_Count_100',
 'Wired_Provider_Count_100',
 'Satellite_Provider_Count_100',
 'NAME',
 'median_age_overall',
 'median_age_male',
 'median_age_female',
 'state',
 'county',
 'tract',
 'employment_rate',
 'median_income',
 'total_households',
 'ave_household_size',
 'ave_family_size',
 'pct_health_ins_children',
 'pct_health_ins_19_64',
 'pct_health_ins_65+',
 'total_population',
 'median_house_value',
 'pct_white',
 'pct_hisp_latino',
 'pct_black',
 'pct_native',
 'pct_asian',
 'pct_hi_pi',
 'pct_other_race',
 'pct_two+_race',
 'pct_rent_burdened',
 'poverty_rate',
 'pct_pop_bachelors+',
 'pct_pop_hs+',
 'pct_internet',
 'pct_internet_dial_up',
 'pct_i

In [132]:
y_variable = 'pct_health_ins_19_64'

broadband_vars3 = ['pct_internet',  'pct_only_smartphone', 'pct_internet_broadband_any_type',
                   'pct_internet_broadband_fiber','Ookla Median Download Speed (Mbps)', 'Wired_Count_100']

broadband_vars_group = ['pct_only_cellular', 'pct_internet_broadband_fiber', 'pct_computing_device_with_broadband', 
                        'pct_internet_broadband_satellite', 'pct_computing_device',
                        'Ookla Median Download Speed (Mbps)','pct_internet_broadband_any_type', 'pct_internet',
                        'Wired_Provider_Count_25']

broadband_vars = broadband_vars_group

In [None]:
covariates = ['log_median_income', 'pct_hisp_latino', 'pct_white',
              'log_median_income_over_log_median_house',  'median_income_over_median_rent',
             'pct_pop_foreign_born', 'pct_pop_disability', 'employment_rate', 'median_age_overall',
             'pct_ages_lt_19',
 'ruca_metro',
 'ruca_micro',
 'ruca_small_town',
 'ruca_rural',]

#Top vars from the all vars feature importance list
covariates = ['pct_only_smartphone', 'pct_hisp_latino', 'pct_pop_income_gt_100k',
       'pct_pop_foreign_born', 'ave_family_size', 'log_median_house_value',
       'median_house_value', 'pct_white', 'log_median_income', 'pct_pop_hs+',
       'median_income', 'ATT_present', 'employment_rate', 'ave_household_size',
       'pct_pop_some_college', 'population_density']

In [114]:
all_vars = broadband_vars.copy()
all_vars.extend(covariates)

## Modeling

### Functions

In [115]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [116]:
Y_train = np.array(training_data[y_variable])
Y_test = np.array(test_data[y_variable])

def RF_model(train_df, 
             test_df, 
             variable_list,
             Y_train = Y_train,  
             Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


## Use Grid Search to find best hyperparameters

In [117]:
[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

In [118]:
# Number of trees in random forest
n_estimators = [i for i in range(5, 120, 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in range(5, 30, 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3, 4, 5],
 'min_samples_split': [2, 3, 4, 5, 6],
 'n_estimators': [5,
                  10,
                  15,
                  20,
                  25,
                  30,
                  35,
                  40,
                  45,
                  50,
                  55,
                  60,
                  65,
                  70,
                  75,
                  80,
                  85,
                  90,
                  95,
                  100,
                  105,
                  110,
                  115]}


In [119]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 20 different combinations, and us

rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 20, 
                               cv = 3, 
                               verbose=10, 
                               random_state=42, 
                               n_jobs = -1)

In [120]:
# Fit the random search model
rf_random.fit(training_data[all_vars], Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 3, 4, 5, 6],
                                        'n_estimators': [5, 10, 15, 20, 25, 30,
                                                         35, 40, 45, 50, 55, 60,
                                                         65, 70, 75, 80, 85, 90,
                                                         95, 100, 105, 110,
                                                         115]},
                   random_state=42, verbose=10)

In [121]:
best = rf_random.best_params_
best

{'n_estimators': 75,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [122]:
predictions = rf_random.predict(test_data[all_vars])

In [123]:
regression_results(Y_train, rf_random.predict(training_data[all_vars]))

explained_variance:  0.9467
r2:  0.9467
MAE:  1.5726
MSE:  4.9439
RMSE:  2.2235


In [124]:
regression_results(Y_test, predictions)

explained_variance:  0.6749
r2:  0.6749
MAE:  4.0847
MSE:  30.7395
RMSE:  5.5443


In [125]:
#Now use these "best" parameters in a new model

rf = RandomForestRegressor(n_estimators = best['n_estimators'], min_samples_split = best['min_samples_split'],
                           min_samples_leaf = best['min_samples_leaf'], max_features= best['max_features'], 
                           max_depth = best['max_depth'], bootstrap = best['bootstrap'])



In [126]:
rf.fit(training_data[all_vars], training_data[y_variable])

RandomForestRegressor(bootstrap=False, max_depth=20, max_features='sqrt',
                      min_samples_leaf=4, n_estimators=75)

In [127]:
regression_results(Y_train, rf.predict(training_data[all_vars]))

explained_variance:  0.9467
r2:  0.9467
MAE:  1.5737
MSE:  4.9456
RMSE:  2.2239


In [128]:
regression_results(Y_test, rf.predict(test_data[all_vars]))

explained_variance:  0.6767
r2:  0.6767
MAE:  4.0828
MSE:  30.5759
RMSE:  5.5296


In [129]:
print(rf.feature_importances_)
print(rf.feature_names_in_)

[0.01644935 0.01236166 0.0195313  0.01220692 0.01342668 0.01617829
 0.0203502  0.02661637 0.00359594 0.10543019 0.11002846 0.0793264
 0.04918959 0.04327788 0.03228048 0.03822495 0.05891536 0.07744798
 0.05960183 0.07941697 0.01852849 0.02505449 0.03630967 0.02104764
 0.0252029 ]
['pct_only_cellular' 'pct_internet_broadband_fiber'
 'pct_computing_device_with_broadband' 'pct_internet_broadband_satellite'
 'pct_computing_device' 'Ookla Median Download Speed (Mbps)'
 'pct_internet_broadband_any_type' 'pct_internet'
 'Wired_Provider_Count_100' 'pct_only_smartphone' 'pct_hisp_latino'
 'pct_pop_income_gt_100k' 'pct_pop_foreign_born' 'ave_family_size'
 'log_median_house_value' 'median_house_value' 'pct_white'
 'log_median_income' 'pct_pop_hs+' 'median_income' 'ATT_present'
 'employment_rate' 'ave_household_size' 'pct_pop_some_college'
 'population_density']


In [130]:
for i in rf.feature_importances_:
    print(i)

0.01644934706630692
0.012361656495057492
0.01953130160935779
0.012206915838953825
0.013426681319587595
0.01617829439812445
0.020350204970148004
0.026616372007287108
0.003595936432607385
0.10543019090389144
0.1100284551179577
0.07932640319505602
0.04918959357330891
0.04327788314229213
0.032280483310593584
0.03822494828332262
0.05891536102635197
0.07744797520217904
0.05960183096658747
0.0794169678634357
0.018528494185250214
0.025054491435612433
0.03630967150349502
0.0210476397739332
0.02520290037930196


In [131]:
for i in rf.feature_names_in_:
    print(i)

pct_only_cellular
pct_internet_broadband_fiber
pct_computing_device_with_broadband
pct_internet_broadband_satellite
pct_computing_device
Ookla Median Download Speed (Mbps)
pct_internet_broadband_any_type
pct_internet
Wired_Provider_Count_100
pct_only_smartphone
pct_hisp_latino
pct_pop_income_gt_100k
pct_pop_foreign_born
ave_family_size
log_median_house_value
median_house_value
pct_white
log_median_income
pct_pop_hs+
median_income
ATT_present
employment_rate
ave_household_size
pct_pop_some_college
population_density
