From David: "Percent of the population with access to the internet makes sense if it is [the] population with *any* access to the internet."

For this baseline model, I'll use `pct_internet` as the only predictor of `poverty_rate`.

### Imports and Data Load

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics 

In [2]:
# Directory that holds the current train/test splits. Keeping it in one place
# means pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = '/home/ec2-user/capstone/broadband-capstone/data/current_data'

training_data = pd.read_csv(f'{DATA_DIR}/train_3.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test_3.csv')

In [3]:
# List the column names available in the training split.
training_data.columns

Index(['Unnamed: 0', 'tract_geoid', 'All_Provider_Count', 'MaxAdDown',
       'MaxAdUp', 'Wired_Provider_Count', 'Satellite_Provider_Count',
       'Fixed_Wireless_Provider_Count', 'All_Provider_Count_25',
       'All_Provider_Count_100',
       ...
       'log_median_income', 'log_median_house_value',
       'median_income_over_median_rent',
       'median_income_over_median_house_value',
       'pct_of_internet_users_with_broadband',
       'log_median_income_over_log_median_house', 'pct_desktop_or_laptop',
       'pct_smartphone', 'pct_only_smartphone', 'pct_tablet'],
      dtype='object', length=204)

### Modeling

In [13]:
def regression_results(y_true, y_pred):
    """Print standard regression scores for a set of predictions.

    Reports explained variance, R^2, MAE, MSE and RMSE (the square root of
    the MSE), each rounded to four decimal places.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, scored against ``y_true``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Only the metrics that are actually reported are computed.
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance, 4))
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mae, 4))
    print('MSE: ', round(mse, 4))
    # RMSE is derived from the MSE so both are on a consistent footing.
    print('RMSE: ', round(np.sqrt(mse), 4))

In [11]:
# Target vectors: the poverty rate column of each split as a NumPy array.
poverty_rate_train, poverty_rate_test = (
    np.array(split['poverty_rate']) for split in (training_data, test_data)
)

def poverty_model(train_df, 
                  test_df, 
                  variable_list,
                  poverty_rate_train = poverty_rate_train,  
                  poverty_rate_test = poverty_rate_test):
    """Fit a linear regression on the chosen columns and print its scores.

    The model is fit on ``train_df[variable_list]`` against
    ``poverty_rate_train``, then used to predict for
    ``test_df[variable_list]``. Test-set scores are printed via
    ``regression_results``, followed by each variable's coefficient ordered
    from largest to smallest absolute value.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that contain every column named in ``variable_list``.
    variable_list : list of str, or str
        Feature column names. A single column name given as a plain string
        is treated as a one-element list.
    poverty_rate_train, poverty_rate_test : array-like
        Targets for the train and test rows. The defaults are the
        module-level ``poverty_rate_train`` / ``poverty_rate_test`` arrays,
        bound once when this function is defined; pass matching targets
        explicitly when using frames other than the ones those arrays were
        built from.

    Returns
    -------
    bool
        Always ``True``.
    """
    # A bare string would select a Series (1-D features), which the
    # estimator cannot fit; normalise it to a one-column selection.
    if isinstance(variable_list, str):
        variable_list = [variable_list]

    # Convert the selected feature columns to NumPy arrays.
    train_variables = np.array(train_df[variable_list])
    test_variables = np.array(test_df[variable_list])

    # Fit on the training rows, predict for the test rows.
    model = LinearRegression().fit(train_variables, poverty_rate_train)
    predicted = model.predict(test_variables)

    # Score the test-set predictions.
    regression_results(poverty_rate_test, predicted)

    # Report coefficients, largest magnitude first. Magnitudes are only
    # comparable between features that are on the same scale.
    print("\n")
    coefficients = pd.DataFrame({
        'variable': variable_list,
        'coef': model.coef_,
        'abs_coef': np.absolute(model.coef_),
    }).sort_values(by="abs_coef", ascending=False)

    for variable, coef in zip(coefficients['variable'], coefficients['coef']):
        print (f"{variable}: {coef}")

    return True

In [14]:
# Baseline: single-feature model using pct_internet.
poverty_model(
    train_df=training_data,
    test_df=test_data,
    variable_list=['pct_internet'],
)

explained_variance:  0.4262
r2:  0.426
MAE:  6.0408
MSE:  68.7683
RMSE:  8.2927


pct_internet: -0.6349722279973397


True

In [20]:
# Comparison: the same single-feature model using pct_pop_hs+ instead.
poverty_model(
    train_df=training_data,
    test_df=test_data,
    variable_list=['pct_pop_hs+'],
)

explained_variance:  0.3044
r2:  0.3043
MAE:  6.65
MSE:  83.3492
RMSE:  9.1296


pct_pop_hs+: -0.37655998864058987


True