## Imports, Reading Data, Setting up tests

In [3]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [28]:
training_data = pd.read_csv("../../../data/current_data/train_3.csv")
test_data = pd.read_csv("../../../data/current_data/test_3.csv")

In [77]:
y_variable = 'pct_health_ins_19_64'

broadband_vars3 = ['pct_internet',  'pct_only_smartphone',
                   'pct_internet_broadband_fiber','Ookla Median Download Speed (Mbps)',
                  'pct_no_computing_device', 'Comcast_present',
 'ATT_present',
 'VSAT_present',
 'Century_Link_present',
 'Spectrum_present',
 'Crown_Castle_present',
 'Etheric_present',
 'Frontier_Communications_present',]

broadband_vars3 = ['pct_internet',  'pct_computing_device', 'pct_internet_broadband_any_type',
                   'pct_computing_device_with_broadband','Ookla Median Download Speed (Mbps)',
                   'pct_internet_broadband_satellite','Wired_Provider_Count_25']

broadband_vars_group = ['pct_only_cellular', 'pct_internet_broadband_fiber', 'pct_computing_device_with_broadband', 
                        'pct_internet_broadband_satellite', 'pct_computing_device',
                        'Ookla Median Download Speed (Mbps)','pct_internet_broadband_any_type', 'pct_internet',
                        'Wired_Provider_Count', 'pct_only_smartphone']

broadband_vars = broadband_vars3

In [78]:
covariates = ['log_median_income', 'pct_hisp_latino', 'pct_white',
              'log_median_income_over_log_median_house',  
             'pct_pop_foreign_born', 'pct_pop_disability', 'employment_rate', 'median_age_overall',
             'pct_ages_lt_19',
 'ruca_metro',
 'ruca_micro',
 'ruca_small_town',
 'ruca_rural',]

covariates = ['median_income', 'ATT_present', 'pct_hisp_latino',
       'pct_desktop_or_laptop', 'pct_pop_income_gt_100k',
       'pct_only_smartphone', 'pct_pop_bachelors+', 'Etheric_present',
       'pct_pop_foreign_born', 'poverty_rate', 'median_house_value',
       '0_25_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
       'ave_household_size', 'pct_white', 'Crown_Castle_present',
       '0_25_Ookla Median Download Speed (Mbps)', 'pct_pop_hs+', 'MaxAdDown',
       'Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)']

covariates = [ 'pct_hisp_latino', 'pct_pop_income_gt_100k',
       'pct_pop_foreign_born', 'ave_family_size', 'log_median_house_value',
       'median_house_value', 'pct_white', 'log_median_income', 'pct_pop_hs+',
       'median_income', 'ATT_present', 'employment_rate', 'ave_household_size',
       'pct_pop_some_college', 'population_density', 'pct_pop_disability', 'median_age_overall',
        'pct_ages_lt_19', 'log_median_income_over_log_median_house','pct_only_smartphone']

In [79]:
all_vars = broadband_vars.copy()
all_vars.extend(covariates)

## Modeling

### Functions

In [80]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [81]:
Y_train = np.array(training_data['pct_health_ins_19_64'])
Y_test = np.array(test_data['pct_health_ins_19_64'])

def XGBoost_model(train_df, 
                          test_df, 
                          variable_list,
                          Y_train = Y_train,  
                          Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = XGBRegressor().fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


## Tuning the model

In [82]:
params = { 'max_depth': [2, 3, 4, 5, 6,10],
           'learning_rate': [0.001, 0.01, 0.05, 0.1],
           'n_estimators': [i for i in range(5, 120, 5)],
           'colsample_bytree': [0.1, 0.3, 0.7]}

regressor_tuned = XGBRegressor()

regress = RandomizedSearchCV(estimator=regressor_tuned, 
                             param_distributions=params,
                             scoring='neg_mean_squared_error', 
                             verbose=10,
                             n_iter = 20,
                             cv = 3,
                             random_state=42)

regress.fit(training_data[all_vars], Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 1/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-36.766 total time=   1.5s
[CV 2/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 2/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-36.800 total time=   1.5s
[CV 3/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 3/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-36.759 total time=   1.5s
[CV 1/3; 2/20] START colsample_bytree=0.1, learning_rate=0.05, max_depth=10, n_estimators=115
[CV 1/3; 2/20] END colsample_bytree=0.1, learning_rate=0.05, max_depth=10, n_estimators=115;, score=-42.629 total time=   2.5s
[CV 2/3; 2/20] START colsample_bytree=0.1, learning_rate=0.05, max_

[CV 2/3; 13/20] END colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100;, score=-6271.521 total time=   1.2s
[CV 3/3; 13/20] START colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100
[CV 3/3; 13/20] END colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100;, score=-6260.808 total time=   1.2s
[CV 1/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 1/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-31.894 total time=   1.5s
[CV 2/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 2/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-32.021 total time=   1.6s
[CV 3/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 3/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-31.937 total time=  

RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
                                          validate

In [83]:
print("Best parameters:", regress.best_params_)
print("Lowest RMSE: ", (-regress.best_score_)**(1/2.0))
y_pred = regress.predict(test_data[all_vars])

regression_results(Y_test, y_pred)

Best parameters: {'n_estimators': 75, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Lowest RMSE:  5.536704445420164
explained_variance:  0.677
r2:  0.6769
MAE:  4.0762
MSE:  30.551
RMSE:  5.5273


In [84]:
best = regress.best_params_

In [85]:
xgb = XGBRegressor(n_estimators = best['n_estimators'], max_depth = best['max_depth'],
                  learning_rate = best['learning_rate'], colsample_bytree = best['colsample_bytree']
                  )


In [86]:
xgb.fit(training_data[all_vars], training_data[y_variable])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=75, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [87]:
regression_results(Y_train, xgb.predict(training_data[all_vars]))

explained_variance:  0.7284
r2:  0.7284
MAE:  3.7745
MSE:  25.2034
RMSE:  5.0203


In [88]:
regression_results(Y_test, xgb.predict(test_data[all_vars]))

explained_variance:  0.677
r2:  0.6769
MAE:  4.0762
MSE:  30.551
RMSE:  5.5273


In [89]:
print(xgb.feature_importances_)
print(xgb.n_features_in_)

[0.01255804 0.00514598 0.01229521 0.00965409 0.00886886 0.00376771
 0.00722946 0.06684173 0.075543   0.02962485 0.04593682 0.03472873
 0.02433068 0.02605685 0.07389852 0.05020751 0.09739153 0.05382211
 0.01361253 0.02182147 0.00888229 0.01169905 0.00927983 0.00671406
 0.00594201 0.0122807  0.27186638]
27


In [90]:
xgb.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=75, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)>

In [91]:
for i in all_vars:
    print(i)

pct_internet
pct_computing_device
pct_internet_broadband_any_type
pct_computing_device_with_broadband
Ookla Median Download Speed (Mbps)
pct_internet_broadband_satellite
Wired_Provider_Count_25
pct_hisp_latino
pct_pop_income_gt_100k
pct_pop_foreign_born
ave_family_size
log_median_house_value
median_house_value
pct_white
log_median_income
pct_pop_hs+
median_income
ATT_present
employment_rate
ave_household_size
pct_pop_some_college
population_density
pct_pop_disability
median_age_overall
pct_ages_lt_19
log_median_income_over_log_median_house
pct_only_smartphone


In [92]:
for i in xgb.feature_importances_:
    print(i)

0.012558043
0.0051459805
0.0122952135
0.009654087
0.008868862
0.003767705
0.00722946
0.06684173
0.075543
0.029624851
0.04593682
0.03472873
0.024330685
0.026056847
0.07389852
0.050207514
0.09739153
0.05382211
0.013612526
0.021821469
0.008882291
0.011699048
0.009279832
0.0067140614
0.005942008
0.012280703
0.27186638
