## Imports, Reading Data, Setting up tests

In [22]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [5]:
training_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_training_bin_1.csv')
test_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_test_bin_1.csv')

In [16]:
all_vars = list(training_data.columns.values[(training_data.dtypes=='float64') | (training_data.dtypes=='int')])
remove_vars = ['pct_health_ins_children','pct_health_ins_19_64','pct_health_ins_65+']

for i in remove_vars:
    all_vars.remove(i)

## Modeling

### Functions

In [7]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [8]:
Y_train = np.array(training_data['pct_health_ins_19_64'])
Y_test = np.array(test_data['pct_health_ins_19_64'])

def XGBoost_model(train_df, 
                          test_df, 
                          variable_list,
                          Y_train = Y_train,  
                          Y_test = Y_test):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = XGBRegressor().fit(train_variables, Y_train)
    predicted = model.predict(test_variables)
    
    regression_results(Y_test, predicted)
    
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


In [17]:
feature_importance_all_vars = XGBoost_model(training_data, 
                                            test_data, 
                                            all_vars) 


explained_variance:  0.7611
r2:  0.7611
MAE:  3.502
MSE:  22.6217
RMSE:  4.7562


From the all_vars model, we can pick the top variables to use in the grid search. 

In [None]:
feature_importance_all_vars.head(25).plot.barh()

In [19]:
feature_importance_all_vars.head(7).index.values

array(['median_income', 'pct_hisp_latino', 'poverty_rate', 'pct_pop_hs+',
       'tract_geoid', 'pct_white', 'pct_computer_no_internet'],
      dtype=object)

In [20]:
key_vars = ['median_income', 
            'pct_hisp_latino', 
            'poverty_rate', 
            'pct_pop_hs+',
            'tract_geoid', 
            'pct_white', 
            'pct_computer_no_internet']

## Tuning the model

In [23]:
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}

regressor_tuned = XGBRegressor()

regress = RandomizedSearchCV(estimator=regressor_tuned, 
                             param_distributions=params,
                             scoring='neg_mean_squared_error', 
                             verbose=10,
                             n_iter = 20,
                             cv = 3,
                             random_state=42)




Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500
[CV 1/3; 1/20] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-26.109 total time= 2.2min
[CV 2/3; 1/20] START colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500
[CV 2/3; 1/20] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-26.206 total time=  16.6s
[CV 3/3; 1/20] START colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500
[CV 3/3; 1/20] END colsample_bytree=0.3, learning_rate=0.1, max_depth=3, n_estimators=500;, score=-26.848 total time= 2.2min
[CV 1/3; 2/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth=6, n_estimators=500
[CV 1/3; 2/20] END colsample_bytree=0.7, learning_rate=0.1, max_depth=6, n_estimators=500;, score=-22.465 total time=  47.7s
[CV 2/3; 2/20] START colsample_bytree=0.7, learning_rate=0.1, max_depth

[CV 2/3; 13/20] END colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=1000;, score=-24.240 total time= 7.4min
[CV 3/3; 13/20] START colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=1000
[CV 3/3; 13/20] END colsample_bytree=0.3, learning_rate=0.1, max_depth=10, n_estimators=1000;, score=-24.922 total time= 9.9min
[CV 1/3; 14/20] START colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100
[CV 1/3; 14/20] END colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100;, score=-1061.666 total time= 1.1min
[CV 2/3; 14/20] START colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100
[CV 2/3; 14/20] END colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100;, score=-1067.963 total time=  56.8s
[CV 3/3; 14/20] START colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100
[CV 3/3; 14/20] END colsample_bytree=0.3, learning_rate=0.01, max_depth=10, n_estimators=100;, score=-10

NameError: name 'regressor' is not defined

In [26]:
print("Best parameters:", regress.best_params_)
print("Lowest RMSE: ", (-regress.best_score_)**(1/2.0))
y_pred = regress.predict(test_data[key_vars])

regression_results(Y_test, y_pred)

Best parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
Lowest RMSE:  4.747979854303688
explained_variance:  0.7628
r2:  0.7628
MAE:  3.4722
MSE:  22.4611
RMSE:  4.7393
