In [8]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV


In [2]:
# Directory that holds the train_3.csv / test_3.csv files. Defined once so that
# relocating the data only requires editing this single line.
DATA_DIR = '/home/ec2-user/capstone/broadband-capstone/data/current_data'

training_data = pd.read_csv(DATA_DIR + '/train_3.csv')
test_data = pd.read_csv(DATA_DIR + '/test_3.csv')

### Modeling

In [3]:
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    The metrics are computed with ``sklearn.metrics`` and printed rounded to
    four decimal places, in this order: explained variance, r2, MAE, MSE and
    RMSE (the square root of the MSE).

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, passed to the metric functions together
        with ``y_true``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Regression metrics. Local names deliberately differ from the sklearn
    # function names so they cannot be confused with them.
    explained_var = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_var,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    # RMSE is derived from the MSE rather than computed by a separate call.
    print('RMSE: ', round(np.sqrt(mse),4))

In [4]:
# Target vectors: the 'poverty_rate' column of each split, copied into NumPy arrays.
poverty_rate_train, poverty_rate_test = (
    np.array(split['poverty_rate']) for split in (training_data, test_data)
)

def poverty_XGBoost_model(train_df, 
                          test_df, 
                          variable_list,
                          poverty_rate_train = None,  
                          poverty_rate_test = None):
    """Fit a default XGBRegressor on the chosen columns and report test metrics.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame supplying the training features (columns in ``variable_list``).
    test_df : pandas.DataFrame
        Frame supplying the evaluation features (same columns).
    variable_list : list of str
        Column names used as model inputs.
    poverty_rate_train : array-like, optional
        Training target. When omitted, ``train_df['poverty_rate']`` is used.
    poverty_rate_test : array-like, optional
        Evaluation target. When omitted, ``test_df['poverty_rate']`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``variable_list`` (as the index) and a single
        column ``0`` holding ``model.feature_importances_``, sorted from
        largest to smallest.

    Notes
    -----
    Metrics are printed via ``regression_results`` as a side effect.
    """
    # Take the targets from the frames that were actually passed in, so the
    # features and targets always come from the same rows instead of from
    # arrays captured when the function was defined.
    if poverty_rate_train is None:
        poverty_rate_train = np.array(train_df['poverty_rate'])
    if poverty_rate_test is None:
        poverty_rate_test = np.array(test_df['poverty_rate'])

    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    # XGBRegressor() is constructed with no arguments, i.e. library defaults.
    model = XGBRegressor().fit(train_variables, poverty_rate_train)
    predicted = model.predict(test_variables)
    
    regression_results(poverty_rate_test, predicted)
    
    # The unnamed importance column gets the integer label 0, hence by=0.
    return pd.DataFrame(model.feature_importances_, 
                        index = variable_list).sort_values(by=0, ascending = False)


#### Only key broadband variables

In [5]:
# Columns used as model inputs when only the key broadband variables are considered.
key_vars = [
    'pct_internet',
    'pct_only_cellular',
    'pct_computing_device_no_internet',
    'pct_internet_broadband_satellite',
    'pct_computing_device_with_broadband',
]


In [9]:
# Without tuning: the helper fits an XGBRegressor built with no arguments.
poverty_XGBoost_model(train_df=training_data,
                      test_df=test_data,
                      variable_list=key_vars)

explained_variance:  0.5047
r2:  0.5045
MAE:  5.5325
MSE:  59.3627
RMSE:  7.7047


Unnamed: 0,0
pct_computing_device_with_broadband,0.544968
pct_internet,0.223466
pct_internet_broadband_satellite,0.094002
pct_only_cellular,0.090538
pct_computing_device_no_internet,0.047026


In [10]:
# With tuning: fixed hyperparameter values are passed to XGBRegressor
# (how they were selected is not shown in this cell).
tuned_params = {
    'n_estimators': 500,
    'max_depth': 3,
    'learning_rate': 0.05,
    'colsample_bytree': 0.7,
}
regressor = XGBRegressor(**tuned_params)

# Fit on the key broadband columns, then score on the held-out frame.
train_target = np.array(training_data['poverty_rate'])
regressor.fit(training_data[key_vars], train_target)

predictions = regressor.predict(test_data[key_vars])
test_target = np.array(test_data['poverty_rate'])
regression_results(test_target, predictions)


explained_variance:  0.5162
r2:  0.5161
MAE:  5.4838
MSE:  57.9784
RMSE:  7.6144
