## Imports, Reading Data, Setting up tests

In [119]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics

In [10]:
training_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_training_2.csv')
test_data = pd.read_csv('/home/ec2-user/capstone/broadband-capstone/data/broadband_test_2.csv')

In [13]:
#These are taken from the PickingVariables notebook

all_broadband_variables = ["pct_internet", 
                           "pct_computer_with_broadband",
                           "pct_internet_broadband_any_type",
                           "pct_internet_none",
                           "pct_internet_cellular",
                           "pct_internet_no_subscrp",
                           "Ookla Median Download Speed (Mbps)",
                           "Ookla Median Upload Speed (Mbps)",
                           "All_Provider_Count_100",
                           "All_Provider_Count_25",
                           "pct_internet_broadband_satellite",
                           "Fixed_Wireless_Provider_Count_25",
                           "pct_internet_broadband_fiber",
                           "Wired_Provider_Count_100",
                           "Wired_Provider_Count_25",
                           "Fixed_Wireless_Provider_Count_100",
                           "Satellite_Provider_Count_100",
                           "Fixed_Wireless_Provider_Count",
                           "MaxAdUp",
                           "All_Provider_Count",
                           "Satellite_Provider_Count_25",
                           "pct_internet_other",
                           "pct_internet_dial_up",
                           "pct_internet_dial_up",
                           "Wired_Provider_Count",
                           "Satellite_Provider_Count"]

#All variables with correlation > 0.3
high_corr = ["pct_internet", 
             "pct_computer_with_broadband",
             "pct_internet_broadband_any_type",
             "pct_internet_none",
             "pct_internet_cellular",
             "pct_internet_no_subscrp"]

#All variables with correlation > 0.1
mid_corr = ["pct_internet", 
            "pct_computer_with_broadband",
            "pct_internet_broadband_any_type",
            "pct_internet_none",
            "pct_internet_cellular",
            "pct_internet_no_subscrp",
            "Ookla Median Download Speed (Mbps)",
            "Ookla Median Upload Speed (Mbps)",
            "All_Provider_Count_100",
            "All_Provider_Count_25"]

small_covariance_med_corr = ['pct_internet',
                             'Ookla Median Download Speed (Mbps)',
                             'All_Provider_Count_100',
                             'All_Provider_Count_25']

## Modeling

### Functions 

In [45]:
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred) 
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))    
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [118]:
poverty_rate_train = np.array(training_data['poverty_rate'])
poverty_rate_test = np.array(test_data['poverty_rate'])

def poverty_model(train_df, test_df, variable_list):
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])
    
    model = LinearRegression().fit(train_variables, poverty_rate_train)
    predicted = model.predict(test_variables)
    
    regression_results(poverty_rate_test, predicted)
    
    print("\n")
    coefficients = pd.DataFrame()
    coefficients['variable'] = variable_list
    coefficients['coef'] = model.coef_
    coefficients['abs_coef'] = np.absolute(model.coef_)

    coefficients.sort_values(by="abs_coef", ascending=False, inplace=True)
    
    for i in range(len(coefficients)):
        print (f"{coefficients['variable'].iloc[i]}: {coefficients['coef'].iloc[i]}")
    
    return True

### Trying different Sets of Variables

In [111]:
poverty_model(training_data, test_data, all_broadband_variables)

explained_variance:  0.4533
r2:  0.4532
MAE:  5.6337
MSE:  62.0229
RMSE:  7.8755


Satellite_Provider_Count_25: 3.71610709554571
Fixed_Wireless_Provider_Count_100: 1.2885797413977291
Satellite_Provider_Count: -1.2727398430083605
pct_computer_with_broadband: -0.9647865509902543
Wired_Provider_Count_100: 0.7639581379425513
All_Provider_Count_100: -0.6466586704494924
pct_internet_dial_up: -0.6298825055177137
pct_internet_dial_up: -0.6298825055177134
Satellite_Provider_Count_100: 0.6180255115329474
pct_internet_broadband_any_type: 0.4728528407355493
Fixed_Wireless_Provider_Count_25: 0.3693850860189547
pct_internet_other: 0.29924613936473793
All_Provider_Count_25: -0.28274412765840823
Wired_Provider_Count_25: -0.260934788864434
Fixed_Wireless_Provider_Count: -0.23973886630074578
pct_internet_broadband_satellite: -0.22219066278448688
pct_internet: -0.1570296647821671
pct_internet_no_subscrp: 0.10508619923175942
pct_internet_cellular: 0.08025425513943917
All_Provider_Count: 0.0558432058836455

True

In [115]:
poverty_model(training_data, test_data, high_corr)

explained_variance:  0.4335
r2:  0.4334
MAE:  5.7442
MSE:  64.2595
RMSE:  8.0162


pct_internet: -838284024778.3242
pct_internet_none: -838284024776.5057
pct_internet_no_subscrp: -838284024776.4218
pct_internet_broadband_any_type: 2.2038716265987817
pct_computer_with_broadband: -1.07354736328125
pct_internet_cellular: 0.08555968260564757


True

In [113]:
poverty_model(training_data, test_data, mid_corr)

explained_variance:  0.4433
r2:  0.4432
MAE:  5.6627
MSE:  63.1552
RMSE:  7.947


pct_internet_broadband_any_type: 1.8975544534073447
pct_internet: -1.083841766100386
pct_computer_with_broadband: -1.009853755526319
pct_internet_no_subscrp: 0.5789910638128593
pct_internet_none: 0.5048507022875212
All_Provider_Count_25: -0.32016394139616666
All_Provider_Count_100: 0.21140454581813406
pct_internet_cellular: 0.08023860850744824
Ookla Median Upload Speed (Mbps): -0.043456167425752305
Ookla Median Download Speed (Mbps): 0.023195377546676436


True

In [114]:
poverty_model(training_data, test_data, small_covariance_med_corr)

explained_variance:  0.4199
r2:  0.4199
MAE:  5.7744
MSE:  65.7997
RMSE:  8.1117


pct_internet: -0.6634353379761093
All_Provider_Count_25: -0.3763132164083473
All_Provider_Count_100: 0.28527417584146936
Ookla Median Download Speed (Mbps): 0.02217068726250546


True

The best of these models is the all_broadband_variables model. Oddly, the variables in the model with the highest impact on the model  (highest coefficients) did not have high correlations with poverty_rate itself.