## Importing Libraries & Data

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor    

In [2]:
# Read the cleaned export first, then drop three columns from it.
df = pd.read_csv('../data/exports/health_landscape_clean.csv')
df = df.drop(columns=['avgLifeExpec', 'pct_diabetes_13', 'strokeDeath_rate'])

In [3]:
def fips_to_obj(df):
    """Convert the ``FIPS`` column to zero-padded, five-character strings.

    Missing values are filled with 0 before conversion, so they come out as
    ``'00000'``. The frame is modified in place and is also returned, so the
    result can be used directly in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``FIPS`` column whose non-missing values are accepted
        by ``int()``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``FIPS`` replaced by strings.
    """
    # zfill pads all the way to width 5 in one step (the previous logic only
    # ever added a single '0'); values that already have five or more
    # characters are left unchanged.
    df['FIPS'] = [str(int(code)).zfill(5) for code in df['FIPS'].fillna(0)]
    return df

# Convert the FIPS column of the loaded frame to strings.
df = fips_to_obj(df)

## Baseline

In [4]:
# Mean of the target column, printed as the reference point for the models below.
baseline_obesity_rate = df['obesityRate_13'].mean()
print('Baseline:', baseline_obesity_rate)

Baseline: 31.040885416666697


## Replicating Study

In [5]:
# Predictor columns and target column for the linear model.
X = df[[
    'foodSwamp_14',
    'pct_lowAccessInc_15',
    'recreationFacilities_14',
    'povRate_15',
    'natAmenityIndex',
    'milkSoda_priceRatio_10',
    'pct_black',
    'pct_hispanicORlatino',
    'pct_over65',
    'totalArea',
]]
y = df['obesityRate_13']

# fit() returns the estimator itself, so lr is the fitted model.
lr = LinearRegression().fit(X, y)

# In-sample score, computed on the same rows the model was fitted on.
print('Score:', lr.score(X, y))

# Mean over 5 cross-validation folds.
# NOTE(review): LinearRegression.score reports R^2 while the folds are scored
# with explained variance, so the two printed numbers use different metrics;
# confirm this is intended.
print(
    'Cross val score:',
    cross_val_score(lr, X, y, scoring='explained_variance', cv=5).mean(),
)

Score: 0.44345542712843067
Cross val score: 0.3508528154052518


In [6]:
# Prepend an intercept column; X keeps this extra 'const' column afterwards.
X = sm.add_constant(X, prepend=True)

# Ordinary least squares of y on the augmented design matrix.
model = sm.OLS(endog=y, exog=X).fit()

# In-sample predictions on the same design matrix.
predictions = model.predict(exog=X)

# Last expression of the cell, so the summary table is displayed.
model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,obesityRate_13,R-squared:,0.443
Model:,OLS,Adj. R-squared:,0.442
Method:,Least Squares,F-statistic:,243.9
Date:,"Fri, 10 May 2019",Prob (F-statistic):,0.0
Time:,14:40:08,Log-Likelihood:,-8085.6
No. Observations:,3072,AIC:,16190.0
Df Residuals:,3061,BIC:,16260.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,31.4872,0.574,54.833,0.000,30.361,32.613
foodSwamp_14,0.1325,0.025,5.281,0.000,0.083,0.182
pct_lowAccessInc_15,0.0058,0.008,0.715,0.475,-0.010,0.022
recreationFacilities_14,-0.0254,0.002,-13.008,0.000,-0.029,-0.022
povRate_15,0.2377,0.012,20.196,0.000,0.215,0.261
natAmenityIndex,-1.4253,0.068,-20.819,0.000,-1.560,-1.291
milkSoda_priceRatio_10,1.3419,0.528,2.539,0.011,0.306,2.378
pct_black,3.9928,0.554,7.209,0.000,2.907,5.079
pct_hispanicORlatino,-5.8246,0.527,-11.056,0.000,-6.858,-4.792

0,1,2,3
Omnibus:,53.436,Durbin-Watson:,1.448
Prob(Omnibus):,0.0,Jarque-Bera (JB):,73.589
Skew:,-0.211,Prob(JB):,1.05e-16
Kurtosis:,3.63,Cond. No.,43900.0


In [7]:
# Variance inflation factor for every column of X, labelled by column name.
pd.Series(
    data=[
        variance_inflation_factor(X.values, position)
        for position in range(len(X.columns))
    ],
    index=X.columns,
)

const                      89.200261
foodSwamp_14                1.189776
pct_lowAccessInc_15         1.137420
recreationFacilities_14     1.166298
povRate_15                  1.540199
natAmenityIndex             1.376809
milkSoda_priceRatio_10      1.402644
pct_black                   1.750550
pct_hispanicORlatino        1.405352
pct_over65                  1.325864
totalArea                   1.356419
dtype: float64