In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm 

In [2]:
## Helper function
def add_front_padding(x):
    zipcode = str(x)
    while len(zipcode) < 5:
        zipcode = "0"+zipcode
    return zipcode

In [3]:
data = pd.read_csv("../data/weighted_merged_all.csv", converters={'Zip' : lambda x: add_front_padding(x)})
data = data.set_index('Zip')
data.head()

Unnamed: 0_level_0,WiredCount_2020,Fwcount_2020,AllProviderCount_2020,Wired25_3_2020,Wired100_3_2020,All25_3_2020,All100_3,TestCount,AverageMbps,FastestAverageMbps,...,pct_computer_no_internet,pct_no_computer,pct_health_ins_children,pct_health_ins_19_64,pct_health_ins_65+,total_pop2,STATE,ZIP_TYPE,RUCA1,RUCA2
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29639,3.0,0.0,8.0,3.0,3.0,5.0,3.0,163.0,93.12,223.75,...,15.151199,17.506064,88.950971,86.189438,100.0,4306.115164,SC,Zip Code Area,6.0,6.0
29620,6.0,0.0,11.0,5.0,3.0,7.0,3.0,2536.0,212.5,536.35,...,14.892564,19.287297,93.098385,81.591357,100.0,5274.299089,SC,Zip Code Area,4.0,4.0
29659,,,,,,,,,,,...,23.478883,14.745884,100.0,83.9,100.0,3464.0,SC,Zip Code Area,2.0,2.0
29638,6.0,1.0,13.0,4.0,4.0,6.0,4.0,272.0,82.79,222.35,...,10.668091,18.68026,93.406966,84.480742,100.0,3849.135001,SC,Zip Code Area,6.0,6.0
29628,4.0,0.0,8.0,3.0,2.0,5.0,2.0,100.0,51.12,126.06,...,10.39063,21.713915,97.927401,78.157411,100.0,2748.179543,SC,Zip Code Area,5.0,5.0


In [4]:
len(data.index)

32653

In [15]:
corr = data.corr()
emp_corr = corr['employment_rate']


In [23]:
emp_corr.head(30).sort_values()

median_age_female                          -0.196144
ERROR RANGE (MAE)(+/-)                     -0.190346
median_age_overall                         -0.184141
ERROR RANGE (95%)(+/-)                     -0.180675
median_age_male                            -0.165126
MSD                                        -0.137599
Lowest Priced Terrestrial Broadband Plan   -0.126217
Total_Enrolled_Households                   0.001452
TestCount                                   0.027628
ave_household_size                          0.083391
Fwcount_2020                                0.127731
Fwcount_2015                                0.159792
WiredCount_2015                             0.187844
AllProviderCount_2015                       0.203456
FastestAverageMbps                          0.247505
AverageMbps                                 0.247776
WiredCount_2020                             0.253533
total_households                            0.257124
Wired25_3_2020                              0.

In [18]:
emp_corr[emp_corr>0.3]

Wired25_3_2015                     0.314371
All25_3_2015                       0.311422
employment_rate                    1.000000
median_income                      0.530749
pct_pop_bachelors+                 0.431456
pct_pop_hs+                        0.402482
pct_internet                       0.534411
pct_internet_broadband_any_type    0.533962
pct_internet_cellular              0.524998
pct_computer                       0.517215
pct_computer_with_broadband        0.537322
Name: employment_rate, dtype: float64

In [35]:
y_variables = ['employment_rate']
x_variables = ['WiredCount_2020','Wired25_3_2020','Wired100_3_2020',
               'BROADBAND USAGE','AllProviderCount_2020','All100_3.1', 
               'All100_3','All25_3_2020', 'median_income', 
               'pct_pop_bachelors+', 'pct_internet_broadband_any_type',
              'pct_computer_with_broadband']

In [26]:
all_variables = x_variables+y_variables

In [32]:
clean_data = data[all_variables].dropna(how = 'any', axis = 0)
len(clean_data.index)

29396

In [36]:
X_train, X_test, y_train, y_test = train_test_split(
...     clean_data[x_variables], clean_data[y_variables], test_size=0.2,)

In [37]:
 len(X_train.index)


23516

In [38]:
 len(X_test.index)

5880

In [39]:
23516/29396

0.7999727854129813

In [40]:
5880/29396

0.20002721458701864

In [43]:
reg_summary = sm.OLS(y_train, X_train).fit()
reg_summary.summary()

0,1,2,3
Dep. Variable:,employment_rate,R-squared (uncentered):,0.98
Model:,OLS,Adj. R-squared (uncentered):,0.98
Method:,Least Squares,F-statistic:,95800.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,15:24:08,Log-Likelihood:,-82590.0
No. Observations:,23516,AIC:,165200.0
Df Residuals:,23504,BIC:,165300.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
WiredCount_2020,-0.0319,0.065,-0.489,0.625,-0.160,0.096
Wired25_3_2020,-0.1359,0.122,-1.112,0.266,-0.375,0.104
Wired100_3_2020,-0.2973,0.128,-2.328,0.020,-0.548,-0.047
BROADBAND USAGE,-1.3711,0.207,-6.631,0.000,-1.776,-0.966
AllProviderCount_2020,0.6531,0.033,19.939,0.000,0.589,0.717
All100_3.1,-0.0666,0.063,-1.060,0.289,-0.190,0.057
All100_3,-0.1128,0.090,-1.257,0.209,-0.289,0.063
All25_3_2020,0.4390,0.079,5.533,0.000,0.283,0.594
median_income,9.142e-05,3.7e-06,24.685,0.000,8.42e-05,9.87e-05

0,1,2,3
Omnibus:,5873.467,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60957.601
Skew:,-0.903,Prob(JB):,0.0
Kurtosis:,10.678,Cond. No.,263000.0


In [45]:
x_variables2 = ['WiredCount_2020','Wired25_3_2020','Wired100_3_2020',
               'BROADBAND USAGE','AllProviderCount_2020','All100_3.1', 
               'All100_3','All25_3_2020',
               'pct_pop_bachelors+', 'pct_internet_broadband_any_type',
              'pct_computer_with_broadband']

In [48]:
X_train2 = X_train[x_variables2].copy()


In [49]:
X_test2=X_test[x_variables2].copy()

In [50]:
reg_summary2 = sm.OLS(y_train, X_train2).fit()
reg_summary2.summary()

0,1,2,3
Dep. Variable:,employment_rate,R-squared (uncentered):,0.979
Model:,OLS,Adj. R-squared (uncentered):,0.979
Method:,Least Squares,F-statistic:,101800.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,15:26:42,Log-Likelihood:,-82891.0
No. Observations:,23516,AIC:,165800.0
Df Residuals:,23505,BIC:,165900.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
WiredCount_2020,-0.0305,0.066,-0.461,0.645,-0.160,0.099
Wired25_3_2020,-0.2454,0.124,-1.984,0.047,-0.488,-0.003
Wired100_3_2020,-0.3414,0.129,-2.640,0.008,-0.595,-0.088
BROADBAND USAGE,-1.3843,0.209,-6.610,0.000,-1.795,-0.974
AllProviderCount_2020,0.6416,0.033,19.342,0.000,0.577,0.707
All100_3.1,0.0553,0.063,0.872,0.383,-0.069,0.180
All100_3,-0.0386,0.091,-0.424,0.671,-0.217,0.140
All25_3_2020,0.4304,0.080,5.357,0.000,0.273,0.588
pct_pop_bachelors+,0.0320,0.005,6.691,0.000,0.023,0.041

0,1,2,3
Omnibus:,6122.437,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58776.335
Skew:,-0.977,Prob(JB):,0.0
Kurtosis:,10.495,Cond. No.,450.0


In [52]:
x_variables3 = ['Wired25_3_2020','Wired100_3_2020',
               'BROADBAND USAGE','AllProviderCount_2020','All100_3.1', 
               'All25_3_2020',
               'pct_pop_bachelors+', 'pct_internet_broadband_any_type',
              'pct_computer_with_broadband']

In [53]:
X_train3 = X_train[x_variables3].copy()
X_test3=X_test[x_variables3].copy()
reg_summary3 = sm.OLS(y_train, X_train3).fit()
reg_summary3.summary()

0,1,2,3
Dep. Variable:,employment_rate,R-squared (uncentered):,0.979
Model:,OLS,Adj. R-squared (uncentered):,0.979
Method:,Least Squares,F-statistic:,124500.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,15:27:59,Log-Likelihood:,-82891.0
No. Observations:,23516,AIC:,165800.0
Df Residuals:,23507,BIC:,165900.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Wired25_3_2020,-0.2618,0.104,-2.516,0.012,-0.466,-0.058
Wired100_3_2020,-0.3844,0.085,-4.526,0.000,-0.551,-0.218
BROADBAND USAGE,-1.3871,0.209,-6.625,0.000,-1.797,-0.977
AllProviderCount_2020,0.6316,0.025,24.928,0.000,0.582,0.681
All100_3.1,0.0514,0.063,0.814,0.416,-0.072,0.175
All25_3_2020,0.4329,0.071,6.074,0.000,0.293,0.573
pct_pop_bachelors+,0.0319,0.005,6.673,0.000,0.023,0.041
pct_internet_broadband_any_type,0.6931,0.079,8.791,0.000,0.539,0.848
pct_computer_with_broadband,-0.0754,0.080,-0.945,0.344,-0.232,0.081

0,1,2,3
Omnibus:,6119.683,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58771.347
Skew:,-0.976,Prob(JB):,0.0
Kurtosis:,10.495,Cond. No.,449.0


In [55]:
x_variables4 = ['Wired25_3_2020','Wired100_3_2020',
               'BROADBAND USAGE','AllProviderCount_2020', 
               'All25_3_2020',
               'pct_pop_bachelors+', 'pct_internet_broadband_any_type',
              ]

In [56]:
X_train4 = X_train[x_variables4].copy()
X_test4=X_test[x_variables4].copy()
reg_summary4 = sm.OLS(y_train, X_train4).fit()
reg_summary4.summary()

0,1,2,3
Dep. Variable:,employment_rate,R-squared (uncentered):,0.979
Model:,OLS,Adj. R-squared (uncentered):,0.979
Method:,Least Squares,F-statistic:,160000.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,15:29:08,Log-Likelihood:,-82892.0
No. Observations:,23516,AIC:,165800.0
Df Residuals:,23509,BIC:,165900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Wired25_3_2020,-0.2617,0.104,-2.515,0.012,-0.466,-0.058
Wired100_3_2020,-0.3740,0.084,-4.444,0.000,-0.539,-0.209
BROADBAND USAGE,-1.3545,0.205,-6.595,0.000,-1.757,-0.952
AllProviderCount_2020,0.6331,0.025,25.011,0.000,0.583,0.683
All25_3_2020,0.4331,0.071,6.079,0.000,0.293,0.573
pct_pop_bachelors+,0.0313,0.005,6.846,0.000,0.022,0.040
pct_internet_broadband_any_type,0.6185,0.003,196.636,0.000,0.612,0.625

0,1,2,3
Omnibus:,6134.559,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,59050.44
Skew:,-0.978,Prob(JB):,0.0
Kurtosis:,10.513,Cond. No.,323.0


In [59]:
reg_summary4_test = sm.OLS(y_test, X_test4).fit()
reg_summary4_test.summary()

0,1,2,3
Dep. Variable:,employment_rate,R-squared (uncentered):,0.981
Model:,OLS,Adj. R-squared (uncentered):,0.981
Method:,Least Squares,F-statistic:,43680.0
Date:,"Mon, 21 Feb 2022",Prob (F-statistic):,0.0
Time:,15:31:45,Log-Likelihood:,-20483.0
No. Observations:,5880,AIC:,40980.0
Df Residuals:,5873,BIC:,41030.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Wired25_3_2020,-0.3655,0.203,-1.800,0.072,-0.763,0.032
Wired100_3_2020,-0.1813,0.164,-1.108,0.268,-0.502,0.140
BROADBAND USAGE,-1.9714,0.396,-4.973,0.000,-2.748,-1.194
AllProviderCount_2020,0.5886,0.049,11.932,0.000,0.492,0.685
All25_3_2020,0.4734,0.138,3.430,0.001,0.203,0.744
pct_pop_bachelors+,0.0498,0.009,5.696,0.000,0.033,0.067
pct_internet_broadband_any_type,0.6168,0.006,100.547,0.000,0.605,0.629

0,1,2,3
Omnibus:,1621.2,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9026.021
Skew:,-1.204,Prob(JB):,0.0
Kurtosis:,8.571,Cond. No.,326.0
