## Imports, Reading Data, Setting up tests

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

In [2]:
training_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_training_2.csv')
test_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_test_2.csv')

In [3]:
#These are taken from the PickingVariables notebook

all_broadband_variables = ["pct_internet", 
                           "pct_computer_with_broadband",
                           "pct_internet_broadband_any_type",
                           "pct_internet_none",
                           "pct_internet_cellular",
                           "pct_internet_no_subscrp",
                           "Ookla Median Download Speed (Mbps)",
                           "Ookla Median Upload Speed (Mbps)",
                           "All_Provider_Count_100",
                           "All_Provider_Count_25",
                           "pct_internet_broadband_satellite",
                           "Fixed_Wireless_Provider_Count_25",
                           "pct_internet_broadband_fiber",
                           "Wired_Provider_Count_100",
                           "Wired_Provider_Count_25",
                           "Fixed_Wireless_Provider_Count_100",
                           "Satellite_Provider_Count_100",
                           "Fixed_Wireless_Provider_Count",
                           "MaxAdUp",
                           "All_Provider_Count",
                           "Satellite_Provider_Count_25",
                           "pct_internet_other",
                           "pct_internet_dial_up",
                           "pct_internet_dial_up",
                           "Wired_Provider_Count",
                           "Satellite_Provider_Count"]

#All variables with correlation > 0.3
high_corr = ["pct_internet", 
             "pct_computer_with_broadband",
             "pct_internet_broadband_any_type",
             "pct_internet_none",
             "pct_internet_cellular",
             "pct_internet_no_subscrp"]

#All variables with correlation > 0.1
mid_corr = ["pct_internet", 
            "pct_computer_with_broadband",
            "pct_internet_broadband_any_type",
            "pct_internet_none",
            "pct_internet_cellular",
            "pct_internet_no_subscrp",
            "Ookla Median Download Speed (Mbps)",
            "Ookla Median Upload Speed (Mbps)",
            "All_Provider_Count_100",
            "All_Provider_Count_25"]

small_covariance_med_corr = ['pct_internet',
                             'Ookla Median Download Speed (Mbps)',
                             'All_Provider_Count_100',
                             'All_Provider_Count_25']

## Modeling

### Functions

In [4]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [8]:
poverty_rate_train = np.array(training_data['poverty_rate'])
poverty_rate_test = np.array(test_data['poverty_rate'])

def poverty_rf_model(train_df, test_df, variable_list):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = RandomForestRegressor(n_estimators = 10, random_state = 42).fit(train_variables, poverty_rate_train)
    predicted = model.predict(test_variables)
    
    regression_results(poverty_rate_test, predicted)
    return True

In [9]:
poverty_rf_model(training_data, test_data, all_broadband_variables)

explained_variance:  0.4247
r2:  0.4243
MAE:  5.7696
MSE:  65.2931
RMSE:  8.0804


True

In [None]:
poverty_rf_model(training_data, test_data, high_corr)

In [None]:
poverty_rf_model(training_data, test_data, mid_corr)

In [None]:
poverty_rf_model(training_data, test_data, small_covariance_med_corr)