In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics 
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Directory that holds the training and test CSVs; change this one constant
# to point the notebook at a different copy of the data.
DATA_DIR = '/home/ec2-user/capstone/broadband-capstone/data'

# The training and test splits are read from separate files in DATA_DIR.
training_data = pd.read_csv(f'{DATA_DIR}/broadband_training_bin_2.csv')
test_data = pd.read_csv(f'{DATA_DIR}/broadband_test_bin_2.csv')


In [3]:
# Column names selected from the training/test frames as regression features.
# The order of this list is the column order of the feature matrix.
broadband_vars = [
    # Max advertised speeds and provider counts (plain, and _25 / _100 suffixed)
    'MaxAdDown',
    'MaxAdUp',
    'Wired_Provider_Count',
    'Satellite_Provider_Count',
    'Fixed_Wireless_Provider_Count',
    'All_Provider_Count_25',
    'All_Provider_Count_100',
    'Fixed_Wireless_Provider_Count_25',
    'Wired_Provider_Count_25',
    'Satellite_Provider_Count_25',
    'Fixed_Wireless_Provider_Count_100',
    'Wired_Provider_Count_100',
    'Satellite_Provider_Count_100',

    # pct_* internet-subscription and computer-access columns
    'pct_internet',
    'pct_internet_dial_up',
    'pct_internet_broadband_any_type',
    'pct_internet_cellular',
    'pct_only_cellular',
    'pct_internet_broadband_fiber',
    'pct_internet_broadband_satellite',
    'pct_internet_only_satellite',
    'pct_internet_other',
    'pct_internet_no_subscrp',
    'pct_internet_none',
    'pct_computer',
    'pct_computer_with_dialup',
    'pct_computer_with_broadband',
    'pct_computer_no_internet',
    'pct_no_computer',

    # Form 477 advertised speeds and Ookla median speeds
    'Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
    'Form 477 All Terrestrial Broadband: Max Advertised Consumer Upload Speed (Mbps)',
    'Ookla Median Download Speed (Mbps)',
    'Ookla Median Upload Speed (Mbps)',

    # Provider-count bin columns
    '0_5_providers',
    '6_10_providers',
    '11_15_providers',
    '16_20_providers',
    '20+_providers',

    # Download-speed bin columns (MaxAdDown, Ookla, Form 477)
    '0_25_MaxAdDown',
    '25_99_MaxAdDown',
    '99_499_MaxAdDown',
    '499_inf_MaxAdDown',
    '0_25_Ookla Median Download Speed (Mbps)',
    '25_99_Ookla Median Download Speed (Mbps)',
    '99_499_Ookla Median Download Speed (Mbps)',
    '499_inf_Ookla Median Download Speed (Mbps)',
    '0_25_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
    '25_99_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
    '99_499_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
    '499_inf_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps)',
    'single_provider',

    # Primary RUCA code columns (one column per code value)
    'Primary RUCA Code - 1.0',
    'Primary RUCA Code - 2.0',
    'Primary RUCA Code - 4.0',
    'Primary RUCA Code - 10.0',
    'Primary RUCA Code - 7.0',
    'Primary RUCA Code - 5.0',
    'Primary RUCA Code - 8.0',
    'Primary RUCA Code - 3.0',
    'Primary RUCA Code - 6.0',
    'Primary RUCA Code - 9.0',
    'Primary RUCA Code - 99.0',

    # Secondary RUCA code columns (one column per code value)
    'Secondary RUCA Code - 1.0',
    'Secondary RUCA Code - 2.0',
    'Secondary RUCA Code - 4.0',
    'Secondary RUCA Code - 10.0',
    'Secondary RUCA Code - 7.0',
    'Secondary RUCA Code - 5.0',
    'Secondary RUCA Code - 1.1',
    'Secondary RUCA Code - 8.0',
    'Secondary RUCA Code - 3.0',
    'Secondary RUCA Code - 6.0',
    'Secondary RUCA Code - 9.0',
    'Secondary RUCA Code - 4.1',
    'Secondary RUCA Code - 99.0',
    'Secondary RUCA Code - 7.1',
    'Secondary RUCA Code - 10.2',
    'Secondary RUCA Code - 2.1',
    'Secondary RUCA Code - 10.1',
    'Secondary RUCA Code - 10.3',
    'Secondary RUCA Code - 7.2',
    'Secondary RUCA Code - 5.1',
    'Secondary RUCA Code - 8.1',
    'Secondary RUCA Code - 8.2',

    # SE / INFA / DDI: meaning is not described in this notebook (TODO document)
    'SE',
    'INFA',
    'DDI',

    # ruca_* grouping columns
    'ruca_metro',
    'ruca_micro',
    'ruca_small_town',
    'ruca_rural',

    # Per-provider *_present columns
    'Comcast_present',
    'ATT_present',
    'HughesNet_present',
    'GCI_Comm_Corp_present',
    'ViaSat_present',
    'VSAT_present',
    'Century_Link_present',
    'Spectrum_present',
    'Crown_Castle_present',
    'Etheric_present',
    'Frontier_Communications_present',
]

## Modeling

In [4]:
def regression_results(y_true, y_pred):
    """Print a summary of regression fit metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, scored against ``y_true``.

    Returns
    -------
    None
        The metrics are printed (rounded to 4 decimal places), not returned.
    """
    # Only the metrics that are actually reported are computed; locals are
    # named so they do not mirror the sklearn function names.
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    # RMSE is derived from the MSE rather than computed separately.
    print('RMSE: ', round(np.sqrt(mse),4))

In [5]:
# Target vectors for the full training and test frames, copied into numpy arrays.
# poverty_model picks these up as its default targets when it is defined.
poverty_rate_train = np.array(training_data.loc[:, 'poverty_rate'])
poverty_rate_test = np.array(test_data.loc[:, 'poverty_rate'])

def poverty_model(train_df,
                  test_df,
                  variable_list,
                  poverty_rate_train=poverty_rate_train,
                  poverty_rate_test=poverty_rate_test):
    """Fit a linear regression on the chosen columns and print test-set results.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames from which the ``variable_list`` columns are taken.
    variable_list : list of str
        Column names used as features, in feature-matrix order.
    poverty_rate_train, poverty_rate_test : array-like
        Targets for fitting and scoring. The defaults are the module-level
        arrays as they existed when this function was defined, so they only
        line up with the full training/test frames; pass matching arrays
        explicitly when ``train_df`` / ``test_df`` are subsets.

    Returns
    -------
    bool
        Always ``True``; the results are printed rather than returned.
    """
    # Feature matrices as plain numpy arrays
    features_train = np.array(train_df[variable_list])
    features_test = np.array(test_df[variable_list])

    # Fit on the training rows, then score predictions for the test rows
    fitted = LinearRegression().fit(features_train, poverty_rate_train)
    regression_results(poverty_rate_test, fitted.predict(features_test))

    # List every coefficient, largest absolute value first.
    # NOTE(review): no scaling is applied here, so magnitudes depend on each
    # column's units and are not directly comparable as "importance".
    print("\n")
    ranked = pd.DataFrame({
        'variable': variable_list,
        'coef': fitted.coef_,
        'abs_coef': np.absolute(fitted.coef_),
    }).sort_values(by="abs_coef", ascending=False)

    for name, weight in zip(ranked['variable'].to_numpy(), ranked['coef'].to_numpy()):
        print (f"{name}: {weight}")

    return True

In [7]:
# Fit on the full training frame and score on the full test frame,
# relying on poverty_model's default target arrays.
poverty_model(train_df=training_data, test_df=test_data, variable_list=broadband_vars)

explained_variance:  0.6029
r2:  0.6028
MAE:  5.0189
MSE:  47.5826
RMSE:  6.898


Secondary RUCA Code - 8.2: 3.86668989257614
ViaSat_present: 3.744643606900877
Satellite_Provider_Count_25: 2.560160420398773
Satellite_Provider_Count: -2.3473663055901954
Secondary RUCA Code - 8.0: -2.322643010235208
Fixed_Wireless_Provider_Count_100: 1.870358126486269
Primary RUCA Code - 4.0: 1.490011620645022
Primary RUCA Code - 1.0: 1.4806365069748706
Secondary RUCA Code - 4.0: 1.3939630594044832
0_25_Form 477 All Terrestrial Broadband: Max Advertised Consumer Download Speed (Mbps): 1.3320192458373181
Secondary RUCA Code - 10.1: 1.2402117130044843
Secondary RUCA Code - 1.0: 1.2108170445387603
Etheric_present: -1.1827436839670484
Primary RUCA Code - 9.0: -1.1736141346299311
Secondary RUCA Code - 9.0: -1.1736141346299307
20+_providers: 1.1004029147939494
Secondary RUCA Code - 7.2: 0.9292097010857072
All_Provider_Count_100: -0.9261372758898714
Primary RUCA Code - 5.0: -0.9144795312943134
Satellite_Provide

True

### California Data

In [6]:
# Keep only the rows whose `state` column equals 6
# (presumably the state code for California, per this section's heading; verify).
cali_train_data = training_data[training_data['state'] == 6]
cali_test_data = test_data[test_data['state'] == 6]

In [7]:
# Same model on the California subset. The targets are passed explicitly
# because poverty_model's default targets are the full-frame arrays.
poverty_model(
    train_df=cali_train_data,
    test_df=cali_test_data,
    variable_list=broadband_vars,
    poverty_rate_train=np.array(cali_train_data['poverty_rate']),
    poverty_rate_test=np.array(cali_test_data['poverty_rate']),
)

explained_variance:  0.6156
r2:  0.6151
MAE:  4.4129
MSE:  37.7597
RMSE:  6.1449


Secondary RUCA Code - 8.1: 8.935299375945494
499_inf_MaxAdDown: 7.064865261217412
Secondary RUCA Code - 8.0: -6.5233972108467135
Secondary RUCA Code - 10.2: 4.573145716207579
99_499_MaxAdDown: -3.897288785801129
Etheric_present: -3.3342111911048034
25_99_MaxAdDown: -3.1675764754162046
Secondary RUCA Code - 7.2: -2.78895229725338
Secondary RUCA Code - 10.3: -2.7551156722496146
Wired_Provider_Count_100: -2.124910600310719
All_Provider_Count_100: 1.9337488347620522
Primary RUCA Code - 8.0: 1.8870564544788302
Primary RUCA Code - 7.0: -1.7602850048643182
Frontier_Communications_present: -1.4321404365036976
Secondary RUCA Code - 10.0: -1.264636975489642
Secondary RUCA Code - 1.1: -1.2329784928882104
Fixed_Wireless_Provider_Count_100: -1.1996971754833372
Wired_Provider_Count_25: 1.060683486038329
Secondary RUCA Code - 7.0: 1.0286672923890436
20+_providers: 1.009388784249823
Secondary RUCA Code - 1.0: 0.99465683

True