## Imports, Reading Data, Setting up tests


NOte: The majority of this notebook was copied from SE_RandomForest/HealthCare

In [119]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder


In [120]:
training_data = pd.read_csv("../../../data/current_data/train_3.csv", converters= {"state" : lambda x: str(x)})
test_data = pd.read_csv("../../../data/current_data/test_3.csv", converters= {"state" : lambda x: str(x)})

In [121]:
y_variable = 'pct_health_ins_19_64'

broadband_vars3 = ['pct_internet',  'pct_only_smartphone', 'pct_internet_broadband_any_type',
                   'pct_internet_broadband_fiber','Ookla Median Download Speed (Mbps)', 'Wired_Count_100']

broadband_vars_group = ['pct_only_cellular', 'pct_internet_broadband_fiber', 'pct_computing_device_with_broadband', 
                        'pct_internet_broadband_satellite', 'pct_computing_device',
                        'Ookla Median Download Speed (Mbps)','pct_internet_broadband_any_type', 'pct_internet',
                        'Wired_Provider_Count_100', 'pct_only_smartphone']

broadband_vars = broadband_vars_group

pct_only_cellular
pct_internet_broadband_fiber
pct_computing_device_with_broadband
pct_internet_broadband_satellite
pct_computing_device
Ookla Median Download Speed (Mbps)
pct_internet_broadband_any_type
pct_internet
Wired_Provider_Count_100
pct_only_smartphone
pct_hisp_latino
pct_pop_income_gt_100k
pct_pop_foreign_born
ave_family_size
log_median_house_value
median_house_value
pct_white
log_median_income
pct_pop_hs+
median_income
ATT_present
employment_rate
ave_household_size
pct_pop_some_college
population_density

In [122]:
covariates = ['log_median_income', 'pct_hisp_latino', 'pct_white',
              'log_median_income_over_log_median_house',  'median_income_over_median_rent',
             'pct_pop_foreign_born', 'pct_pop_disability', 'employment_rate', 'median_age_overall',
             'pct_ages_lt_19',
 'ruca_metro',
 'ruca_micro',
 'ruca_small_town',
 'ruca_rural', 'state']

#Top vars from the all vars feature importance list
covariates = [ 'pct_hisp_latino', 'pct_pop_income_gt_100k',
       'pct_pop_foreign_born', 'ave_family_size', 'log_median_house_value',
       'median_house_value', 'pct_white', 'log_median_income', 'pct_pop_hs+',
       'median_income', 'ATT_present', 'employment_rate', 'ave_household_size',
       'pct_pop_some_college', 'population_density']

In [143]:
all_vars = broadband_vars.copy()
all_vars.extend(covariates)

In [144]:
def get_state(name):
    state = name.split(", ")[2]
    return state

training_data["State_Name"] = training_data.NAME.apply(get_state)
test_data["State_Name"] = test_data.NAME.apply(get_state)

In [145]:
dummies = pd.get_dummies(training_data['State_Name'], prefix = "state")
dummies_test = pd.get_dummies(test_data['State_Name'], prefix = "state")

In [146]:
dummies.head()

Unnamed: 0,state_Alabama,state_Alaska,state_Arizona,state_Arkansas,state_California,state_Colorado,state_Connecticut,state_Delaware,state_District of Columbia,state_Florida,...,state_South Dakota,state_Tennessee,state_Texas,state_Utah,state_Vermont,state_Virginia,state_Washington,state_West Virginia,state_Wisconsin,state_Wyoming
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
data_train = training_data[all_vars].join(dummies)

In [148]:
data_test = test_data[all_vars].join(dummies_test)

## Modeling

### Functions

In [149]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [150]:
Y_train = np.array(training_data[y_variable])
Y_test = np.array(test_data[y_variable])

def RF_model(train_df, 
             test_df, 
             variable_list,
             Y_train = Y_train,  
             Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = RandomForestRegressor(n_estimators = 100, random_state = 42).fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


## Use Grid Search to find best hyperparameters

In [151]:
[int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

In [152]:
# Number of trees in random forest
n_estimators = [i for i in range(5, 120, 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in range(5, 30, 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4, 5]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 10, 15, 20, 25, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 3, 4, 5],
 'min_samples_split': [2, 3, 4, 5, 6],
 'n_estimators': [5,
                  10,
                  15,
                  20,
                  25,
                  30,
                  35,
                  40,
                  45,
                  50,
                  55,
                  60,
                  65,
                  70,
                  75,
                  80,
                  85,
                  90,
                  95,
                  100,
                  105,
                  110,
                  115]}


In [153]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 20 different combinations, and us

rf_random = RandomizedSearchCV(estimator = rf, 
                               param_distributions = random_grid, 
                               n_iter = 20, 
                               cv = 3, 
                               verbose=10, 
                               random_state=42, 
                               n_jobs = -1)

In [154]:
# Fit the random search model
rf_random.fit(data_train, Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 10, 15, 20, 25, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 3, 4, 5, 6],
                                        'n_estimators': [5, 10, 15, 20, 25, 30,
                                                         35, 40, 45, 50, 55, 60,
                                                         65, 70, 75, 80, 85, 90,
                                                         95, 100, 105, 110,
                                                         115]},
                   random_state=42, verbose=10)

In [155]:
best = rf_random.best_params_
best

{'n_estimators': 50,
 'min_samples_split': 4,
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [156]:
predictions = rf_random.predict(data_test)

In [157]:
regression_results(Y_train, rf_random.predict(data_train))

explained_variance:  0.9061
r2:  0.9061
MAE:  2.1044
MSE:  8.7104
RMSE:  2.9513


In [158]:
regression_results(Y_test, predictions)

explained_variance:  0.7451
r2:  0.7451
MAE:  3.6113
MSE:  24.1044
RMSE:  4.9096


In [159]:
#Now use these "best" parameters in a new model

rf = RandomForestRegressor(n_estimators = best['n_estimators'], min_samples_split = best['min_samples_split'],
                           min_samples_leaf = best['min_samples_leaf'], max_features= best['max_features'], 
                           max_depth = best['max_depth'], bootstrap = best['bootstrap'])



In [160]:
rf.fit(data_train, Y_train)

RandomForestRegressor(min_samples_leaf=5, min_samples_split=4, n_estimators=50)

In [161]:
regression_results(Y_train, rf.predict(data_train))

explained_variance:  0.906
r2:  0.906
MAE:  2.1069
MSE:  8.7183
RMSE:  2.9527


In [162]:
regression_results(Y_test, rf.predict(data_test))

explained_variance:  0.7442
r2:  0.7442
MAE:  3.6211
MSE:  24.1879
RMSE:  4.9181


In [163]:
print(rf.feature_importances_)
print(rf.feature_names_in_)

[1.15228145e-02 1.00349033e-02 6.98069727e-03 1.01342833e-02
 1.09100226e-02 1.23943878e-02 5.25687830e-03 6.20593083e-03
 1.85429406e-03 2.60561264e-01 4.72432846e-02 5.83779950e-02
 4.57630308e-02 3.49920881e-02 6.95206196e-03 7.09258055e-03
 2.40539312e-02 2.79793803e-02 9.62108019e-02 3.41494336e-02
 4.33081214e-03 1.87647688e-02 1.52703355e-02 1.13588407e-02
 1.77886404e-02 1.47321076e-03 1.08847670e-03 3.23217476e-04
 3.73918825e-04 7.86591102e-03 7.23549870e-05 1.25207789e-04
 2.02913242e-05 1.58908908e-05 2.61094933e-02 1.68003928e-02
 5.03916792e-05 8.23808875e-04 3.79475819e-04 4.13828377e-04
 1.06133268e-04 6.87105313e-04 7.12842056e-04 5.61337087e-04
 6.26278700e-04 1.35838755e-04 5.22158482e-03 1.71686530e-03
 1.01855665e-04 1.98053887e-03 3.87821628e-03 1.46898638e-04
 1.08330771e-04 6.63867022e-05 2.37585649e-04 3.94365592e-04
 4.04761784e-05 3.55586060e-03 8.74367706e-03 1.77966013e-05
 1.00649677e-03 8.07558689e-03 2.01904999e-05 8.75649221e-04
 4.65026179e-05 2.383285

In [164]:
for i in rf.feature_importances_:
    print(i)

0.011522814504032468
0.010034903298893668
0.006980697272891882
0.010134283347575684
0.01091002257886105
0.012394387829080242
0.005256878298054609
0.006205930830971133
0.0018542940609332084
0.2605612640251185
0.04724328462573063
0.05837799499336872
0.0457630307680319
0.034992088146665844
0.006952061963709633
0.0070925805477734015
0.024053931207752968
0.02797938031395236
0.09621080185866639
0.03414943358763536
0.004330812142398764
0.01876476884352638
0.015270335477322856
0.011358840653872793
0.017788640446025693
0.0014732107641672204
0.0010884766974640257
0.00032321747565297967
0.0003739188250042748
0.00786591102366396
7.235498701036689e-05
0.0001252077886979284
2.0291324187276077e-05
1.589089076479041e-05
0.026109493263310673
0.016800392828273527
5.0391679184139105e-05
0.0008238088754611942
0.00037947581931004233
0.0004138283770997019
0.00010613326849758389
0.0006871053130220439
0.0007128420555989339
0.0005613370871853434
0.0006262787000720568
0.00013583875469057712
0.005221584824207034

In [165]:
for i in rf.feature_names_in_:
    print(i)

pct_only_cellular
pct_internet_broadband_fiber
pct_computing_device_with_broadband
pct_internet_broadband_satellite
pct_computing_device
Ookla Median Download Speed (Mbps)
pct_internet_broadband_any_type
pct_internet
Wired_Provider_Count_100
pct_only_smartphone
pct_hisp_latino
pct_pop_income_gt_100k
pct_pop_foreign_born
ave_family_size
log_median_house_value
median_house_value
pct_white
log_median_income
pct_pop_hs+
median_income
ATT_present
employment_rate
ave_household_size
pct_pop_some_college
population_density
state_Alabama
state_Alaska
state_Arizona
state_Arkansas
state_California
state_Colorado
state_Connecticut
state_Delaware
state_District of Columbia
state_Florida
state_Georgia
state_Hawaii
state_Idaho
state_Illinois
state_Indiana
state_Iowa
state_Kansas
state_Kentucky
state_Louisiana
state_Maine
state_Maryland
state_Massachusetts
state_Michigan
state_Minnesota
state_Mississippi
state_Missouri
state_Montana
state_Nebraska
state_Nevada
state_New Hampshire
state_New Jersey
stat

In [166]:
import pickle
with open('../../../web_app2/flaskapp/static/rf_hi_model_states.pkl', 'wb') as file:
    pickle.dump(rf, file)