In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import  r2_score

In [2]:
## Helper function
def add_front_padding(x):
    """Return ``str(x)`` left-padded with "0" to a minimum width of five.

    Values whose string form already has five or more characters are
    returned unchanged (as a string). Padding is purely textual: zeros are
    placed in front of whatever ``str(x)`` yields, including any sign or
    decimal point.
    """
    return str(x).rjust(5, "0")

In [3]:
# Training set: index rows by census tract, keep only float64 columns, then
# drop every column that still contains at least one missing value.
data = (
    pd.read_csv("../data/broadband_training_2.csv")
    .set_index('tract_geoid')
    .loc[:, lambda frame: frame.dtypes == 'float64']
    .dropna(axis=1, how='any')
)
data.head()

Unnamed: 0_level_0,MaxAdDown,MaxAdUp,Wired_Provider_Count,Fixed_Wireless_Provider_Count,All_Provider_Count_25,All_Provider_Count_100,Fixed_Wireless_Provider_Count_25,Wired_Provider_Count_25,Satellite_Provider_Count_25,Fixed_Wireless_Provider_Count_100,...,pct_pop_60_to_64,pct_pop_65_to_69,pct_pop_70_to_74,pct_pop_75_to_79,pct_pop_80_to_84,pct_pop_gt_85,pct_pop_disability,pct_pop_households_with_kids,Ookla Median Download Speed (Mbps),Ookla Median Upload Speed (Mbps)
tract_geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24031700607,987.0,880.0,3.0,0.0,4.0,2.0,0.0,2.0,2.0,0.0,...,7.475754,6.384698,3.852371,2.693966,0.431034,0.080819,3.811961,1.997428,132.108002,88.997002
37005950200,1000.0,500.0,3.0,2.0,4.0,1.0,1.0,1.0,2.0,0.0,...,9.862571,5.524117,6.467259,6.89841,3.826462,2.425222,17.921645,1.986085,36.334999,34.022999
20027458200,1000.0,1000.0,4.0,5.0,4.0,2.0,1.0,2.0,2.0,0.0,...,10.346399,6.654512,5.423883,3.418414,3.828624,5.332726,19.925599,1.308642,67.499001,48.374001
48091310801,1000.0,1000.0,10.0,3.0,9.0,5.0,3.0,4.0,2.0,0.0,...,11.537878,9.154395,5.768939,2.56566,1.487779,0.743889,15.986602,1.401302,121.716003,18.773001
37021002203,1000.0,1000.0,9.0,2.0,5.0,3.0,1.0,2.0,2.0,0.0,...,2.844886,5.983292,2.528788,3.748024,2.370738,2.912621,17.2951,0.97748,41.883999,10.521


In [4]:
# Test set: index rows by census tract and restrict to the columns that
# survived the training-side filtering (`data.columns`), then drop any column
# that has a missing value in the test file.
# NOTE(review): because this dropna works on columns, a column kept in `data`
# can disappear here if the test file has a gap in it -- confirm the two
# frames still share every modelling column.
test_data = (
    pd.read_csv("../data/broadband_test_2.csv")
    .set_index('tract_geoid')
    .loc[:, data.columns]
    .dropna(axis=1, how='any')
)
test_data.head()

Unnamed: 0_level_0,MaxAdDown,MaxAdUp,Wired_Provider_Count,Fixed_Wireless_Provider_Count,All_Provider_Count_25,All_Provider_Count_100,Fixed_Wireless_Provider_Count_25,Wired_Provider_Count_25,Satellite_Provider_Count_25,Fixed_Wireless_Provider_Count_100,...,pct_pop_60_to_64,pct_pop_65_to_69,pct_pop_70_to_74,pct_pop_75_to_79,pct_pop_80_to_84,pct_pop_gt_85,pct_pop_disability,pct_pop_households_with_kids,Ookla Median Download Speed (Mbps),Ookla Median Upload Speed (Mbps)
tract_geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17097860906,1000.0,1000.0,3.0,2.0,5.0,3.0,1.0,2.0,2.0,0.0,...,3.898676,3.846555,2.981341,2.752007,0.323152,0.416971,6.04729,1.263061,118.959999,11.917
24021751600,987.0,35.0,2.0,3.0,6.0,2.0,3.0,1.0,2.0,0.0,...,7.145209,7.145209,3.720777,2.271979,2.041488,3.424432,15.278235,2.872629,15.215,5.194
36087013100,1000.0,1000.0,8.0,0.0,5.0,3.0,0.0,3.0,2.0,0.0,...,8.178792,9.082263,4.707561,5.11967,1.410683,4.041845,16.362192,0.58642,103.932999,36.136002
19061010201,1000.0,940.0,11.0,4.0,8.0,4.0,2.0,4.0,2.0,0.0,...,7.954194,4.688951,3.884246,2.445063,0.324977,0.402352,5.804954,2.057745,57.477001,32.389
36101960800,1000.0,50.0,5.0,0.0,4.0,2.0,0.0,2.0,2.0,0.0,...,6.267628,4.387339,4.638045,1.84895,2.601065,2.977123,18.617542,3.083404,89.845001,22.826


In [29]:
# Target column (kept as a one-element list so the selections below return
# one-column DataFrames rather than Series).
y_variables = ['employment_rate']

# Predictor columns.
# NOTE(review): 'pct_no_computer' and 'pct_computer' both receive coefficients
# of roughly 4e13 in the fitted-model output further down, which suggests the
# two columns are (near-)perfectly collinear -- confirm.
x_variables = [
    'pct_internet_none',
    'MaxAdUp',
    'pct_internet_broadband_any_type',
    'pct_computer_with_broadband',
    'pct_no_computer',
    'pct_computer',
    'pct_internet_broadband_satellite',
    'pct_internet_broadband_fiber',
    'pct_internet_dial_up',
]

y_train = data[y_variables]
y_test = test_data[y_variables]

# Standardise the predictors: the scaler is fitted on the training rows only
# and the same fitted transform is then applied to the test rows.
# After this cell X_train / X_test are the scaler's array outputs, not
# DataFrames.
sc = StandardScaler()
X_train = sc.fit_transform(data[x_variables])
X_test = sc.transform(test_data[x_variables])

In [9]:
# Baseline model: linear regression fitted on the standardised training
# predictors (fit() returns the estimator itself, so it can be chained).
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows.
y_pred = regr.predict(X_test)

In [10]:
# Error of the baseline model on the test split. MSE is computed once and
# reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 6.055522260992066
Mean Squared Error: 65.39158540202297
Root Mean Squared Error: 8.086506378036376


In [20]:
# Tabulate the fitted coefficients, one column per predictor. The predictors
# were standardised before fitting, so each value is on the per-standard-
# deviation scale of its feature.
feature_importances = pd.DataFrame(data=regr.coef_, columns=x_variables)
feature_importances

Unnamed: 0,pct_internet_none,MaxAdUp,pct_internet_broadband_any_type,pct_computer_with_broadband,pct_no_computer,pct_computer,pct_internet_broadband_satellite,pct_internet_broadband_fiber,pct_internet_dial_up
0,0.730777,0.985659,1.774277,1.151162,39840200000000.0,39840200000000.0,-1.449816,-1.05995,-0.094017


In [21]:
# Remove outliers: for each feature in `trim_plan`, sort the remaining rows by
# that feature and discard the `threshold` most extreme rows on the indicated
# side. Steps are applied one after another, so each step sees only the rows
# that survived the earlier ones. 'MaxAdUp' is not trimmed.
threshold = 500
X_train_df = pd.DataFrame(X_train, columns = x_variables, index = y_train.index)

# (column, side) pairs in application order.
#   'high' -> drop the `threshold` largest values of the column
#   'low'  -> drop the `threshold` smallest values of the column
trim_plan = [
    ('pct_internet_none', 'high'),
    ('pct_internet_broadband_any_type', 'low'),
    ('pct_computer_with_broadband', 'low'),
    ('pct_no_computer', 'high'),
    ('pct_computer', 'high'),
    ('pct_internet_broadband_satellite', 'high'),
    ('pct_internet_broadband_fiber', 'high'),
    ('pct_internet_dial_up', 'high'),
]

X_train_copy = X_train_df.copy()
for trim_column, trim_side in trim_plan:
    ranked = X_train_copy.sort_values(trim_column)
    if trim_side == 'high':
        # An explicit stop position is used instead of `iloc[:-threshold]`
        # because `iloc[:-0]` would return an empty frame when threshold is 0.
        keep_until = max(len(ranked) - threshold, 0)
        X_train_copy = ranked.iloc[:keep_until, :]
    else:
        X_train_copy = ranked.iloc[threshold:, :]

# Row counts after and before trimming.
print(len(X_train_copy.index))
print(len(X_train_df.index))

48535
52535


In [22]:
# Select the targets for the rows that survived outlier trimming, looked up
# by the trimmed frame's index labels.
kept_rows = X_train_copy.index
y_train_copy = y_train.loc[kept_rows]

In [23]:
# Refit the linear regression on the outlier-trimmed training rows.
# Note: this rebinds `regr` and `y_pred`, replacing the baseline model and
# its predictions.
regr = linear_model.LinearRegression()
regr.fit(X_train_copy, y_train_copy)

# The model was fitted on a DataFrame, so it has recorded the column names.
# Give the scaled test array the same column labels (same order as
# `x_variables`) so predict() receives matching feature names instead of an
# unlabeled array; the numeric predictions are unchanged.
X_test_df = pd.DataFrame(X_test, columns = x_variables, index = y_test.index)

# Make predictions using the testing set
y_pred = regr.predict(X_test_df)

In [24]:
# Test-split error of the model refitted on the trimmed training rows. MSE is
# computed once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 6.061477749418405
Mean Squared Error: 65.41596804314194
Root Mean Squared Error: 8.088013850330745


In [25]:
# Coefficients of the model refitted on the trimmed rows, one column per
# predictor (standardised-feature scale).
feature_importances = pd.DataFrame(data=regr.coef_, columns=x_variables)
feature_importances


Unnamed: 0,pct_internet_none,MaxAdUp,pct_internet_broadband_any_type,pct_computer_with_broadband,pct_no_computer,pct_computer,pct_internet_broadband_satellite,pct_internet_broadband_fiber,pct_internet_dial_up
0,0.799091,0.949575,2.063638,0.892342,37339090000000.0,37339090000000.0,-1.554418,-1.18192,-0.113233


In [31]:
# Third model: same trimmed training rows, but without the
# 'pct_no_computer' / 'pct_computer' columns.
y_variables = ['employment_rate']
x_variables2 = [
    'pct_internet_none',
    'MaxAdUp',
    'pct_internet_broadband_any_type',
    'pct_computer_with_broadband',
    'pct_internet_broadband_satellite',
    'pct_internet_broadband_fiber',
    'pct_internet_dial_up',
]

# Column subsets of the already-standardised data. X_test is a bare array, so
# it is labelled with the full predictor list first and then narrowed.
X_train_copy2 = X_train_copy[x_variables2]
X_test2 = pd.DataFrame(X_test, columns = x_variables, index = y_test.index)[x_variables2]

# Fit on the reduced predictor set and predict for the test rows.
regr2 = linear_model.LinearRegression().fit(X_train_copy2, y_train_copy)
y_pred2 = regr2.predict(X_test2)

In [34]:
# Test-split error of the reduced-feature model. MSE is computed once and
# reused for the RMSE line.
mae2 = metrics.mean_absolute_error(y_test, y_pred2)
mse2 = metrics.mean_squared_error(y_test, y_pred2)
print('Mean Absolute Error:', mae2)
print('Mean Squared Error:', mse2)
print('Root Mean Squared Error:', np.sqrt(mse2))

Mean Absolute Error: 6.127365980211668
Mean Squared Error: 66.60218939252461
Root Mean Squared Error: 8.161016443588666


In [35]:
# Coefficients of the reduced-feature model, one column per remaining
# predictor (standardised-feature scale).
feature_importances = pd.DataFrame(data=regr2.coef_, columns=x_variables2)
feature_importances

Unnamed: 0,pct_internet_none,MaxAdUp,pct_internet_broadband_any_type,pct_computer_with_broadband,pct_internet_broadband_satellite,pct_internet_broadband_fiber,pct_internet_dial_up
0,-1.62149,1.010651,-2.559726,5.762903,-1.46232,-1.23774,-0.212968
