## Imports, Reading Data, Setting up tests

In [2]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [36]:
training_data = pd.read_csv("../data/train_3.csv")
test_data = pd.read_csv("../data/test_3.csv")

In [55]:
y_variable = 'pct_health_ins_19_64'

broadband_vars3 = ['pct_internet',  'pct_only_smartphone',
                   'pct_internet_broadband_fiber','Ookla Median Download Speed (Mbps)',
                  'pct_no_computing_device', 'Comcast_present',
 'ATT_present',
 'VSAT_present',
 'Century_Link_present',
 'Spectrum_present',
 'Crown_Castle_present',
 'Etheric_present',
 'Frontier_Communications_present',]

broadband_vars = broadband_vars3

In [56]:
covariates = ['log_median_income', 'pct_hisp_latino', 'pct_white',
              'log_median_income_over_log_median_house',  
             'pct_pop_foreign_born', 'pct_pop_disability', 'employment_rate', 'median_age_overall',
             'pct_ages_lt_19',
 'ruca_metro',
 'ruca_micro',
 'ruca_small_town',
 'ruca_rural',]

In [57]:
all_vars = broadband_vars.copy()
all_vars.extend(covariates)

## Modeling

### Functions

In [58]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [59]:
Y_train = np.array(training_data['pct_health_ins_19_64'])
Y_test = np.array(test_data['pct_health_ins_19_64'])

def XGBoost_model(train_df, 
                          test_df, 
                          variable_list,
                          Y_train = Y_train,  
                          Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = XGBRegressor().fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


## Tuning the model

In [60]:
params = { 'max_depth': [2, 3, 4, 5, 6,10],
           'learning_rate': [0.001, 0.01, 0.05, 0.1],
           'n_estimators': [i for i in range(5, 120, 5)],
           'colsample_bytree': [0.1, 0.3, 0.7]}

regressor_tuned = XGBRegressor()

regress = RandomizedSearchCV(estimator=regressor_tuned, 
                             param_distributions=params,
                             scoring='neg_mean_squared_error', 
                             verbose=10,
                             n_iter = 20,
                             cv = 3,
                             random_state=42)

regress.fit(training_data[all_vars], Y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 1/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-38.377 total time=   1.5s
[CV 2/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 2/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-38.611 total time=   1.5s
[CV 3/3; 1/20] START colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75
[CV 3/3; 1/20] END colsample_bytree=0.7, learning_rate=0.05, max_depth=5, n_estimators=75;, score=-38.308 total time=   1.4s
[CV 1/3; 2/20] START colsample_bytree=0.1, learning_rate=0.05, max_depth=10, n_estimators=115
[CV 1/3; 2/20] END colsample_bytree=0.1, learning_rate=0.05, max_depth=10, n_estimators=115;, score=-47.327 total time=   1.7s
[CV 2/3; 2/20] START colsample_bytree=0.1, learning_rate=0.05, max_

[CV 2/3; 13/20] END colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100;, score=-6272.887 total time=   1.2s
[CV 3/3; 13/20] START colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100
[CV 3/3; 13/20] END colsample_bytree=0.3, learning_rate=0.001, max_depth=4, n_estimators=100;, score=-6262.515 total time=   1.2s
[CV 1/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 1/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-33.837 total time=   1.5s
[CV 2/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 2/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-33.786 total time=   1.6s
[CV 3/3; 14/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60
[CV 3/3; 14/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=5, n_estimators=60;, score=-33.497 total time=  

RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          scale_pos_weight=None, subsample=None,
                                          tree_method=None,
                                          validate

In [61]:
print("Best parameters:", regress.best_params_)
print("Lowest RMSE: ", (-regress.best_score_)**(1/2.0))
y_pred = regress.predict(test_data[all_vars])

regression_results(Y_test, y_pred)

Best parameters: {'n_estimators': 75, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Lowest RMSE:  5.695801520905957
explained_variance:  0.6613
r2:  0.6613
MAE:  4.1843
MSE:  32.0273
RMSE:  5.6593


In [62]:
best = regress.best_params_

In [63]:
xgb = XGBRegressor(n_estimators = best['n_estimators'], max_depth = best['max_depth'],
                  learning_rate = best['learning_rate'], colsample_bytree = best['colsample_bytree']
                  )


In [64]:
xgb.fit(training_data[all_vars], training_data[y_variable])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=75, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [65]:
regression_results(Y_train, xgb.predict(training_data[all_vars]))

explained_variance:  0.7033
r2:  0.7033
MAE:  3.9345
MSE:  27.5308
RMSE:  5.247


In [66]:
regression_results(Y_test, xgb.predict(test_data[all_vars]))

explained_variance:  0.6613
r2:  0.6613
MAE:  4.1843
MSE:  32.0273
RMSE:  5.6593


In [67]:
print(xgb.feature_importances_)
print(xgb.n_features_in_)

[0.04634965 0.21896054 0.00612713 0.01480277 0.01486708 0.0130442
 0.0632261  0.00886104 0.00977063 0.01380458 0.03446685 0.01797052
 0.01361689 0.18699193 0.10866741 0.05570505 0.0349374  0.04068022
 0.01747584 0.01777751 0.01051042 0.01265442 0.01498181 0.00770484
 0.00461967 0.01142556]
26


In [68]:
xgb.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=75, n_jobs=16,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)>

In [69]:
for i in all_vars:
    print(i)

pct_internet
pct_only_smartphone
pct_internet_broadband_fiber
Ookla Median Download Speed (Mbps)
pct_no_computing_device
Comcast_present
ATT_present
VSAT_present
Century_Link_present
Spectrum_present
Crown_Castle_present
Etheric_present
Frontier_Communications_present
log_median_income
pct_hisp_latino
pct_white
log_median_income_over_log_median_house
pct_pop_foreign_born
pct_pop_disability
employment_rate
median_age_overall
pct_ages_lt_19
ruca_metro
ruca_micro
ruca_small_town
ruca_rural


In [70]:
for i in xgb.feature_importances_:
    print(i)

0.046349652
0.21896054
0.006127128
0.014802769
0.014867083
0.013044201
0.063226104
0.008861044
0.009770633
0.013804577
0.03446685
0.017970525
0.013616894
0.18699193
0.10866741
0.055705052
0.0349374
0.040680222
0.017475838
0.01777751
0.0105104195
0.012654421
0.014981815
0.0077048354
0.0046196715
0.011425565
