# In this notebook, we use food access factors to predict health outcomes associated with food deserts (obesity and diabetes rates) in order to identify the most important features for labelling a food desert. #

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the obesity dataset. Later cells read its TractSNAP, Pop2010, LowIncomeTracts,
# LATracts1, LATracts_half, HUNVFlag and HCSOBP_2016-2018 columns.
food_obesity = pd.read_csv('./data/food_obesity.csv')

In [3]:
# Load the diabetes dataset. Later cells read its TractSNAP, Pop2010, LowIncomeTracts,
# LATracts1, LATracts_half, HUNVFlag and HCSDIAP_2016-2018 columns.
food_diabetes = pd.read_csv('./data/food_diabetes.csv')

In [4]:
# Ratio of TractSNAP to Pop2010 for each row. Despite the column name, the value
# is a fraction: no x100 scaling is applied.
# NOTE(review): rows with Pop2010 == 0 would produce inf/NaN here - confirm none exist.
food_obesity['TractSNAP_percent'] = food_obesity['TractSNAP'].div(food_obesity['Pop2010'])

In [5]:
# Ratio of TractSNAP to Pop2010 for each row. Despite the column name, the value
# is a fraction: no x100 scaling is applied.
# NOTE(review): rows with Pop2010 == 0 would produce inf/NaN here - confirm none exist.
food_diabetes['TractSNAP_percent'] = food_diabetes['TractSNAP'].div(food_diabetes['Pop2010'])

In [16]:
def linear_fit(df, features, target, random_state=3224, test_size=None):
    """Fit a linear regression on a train/test split and compare it with a mean baseline.

    Prints the training and testing R2, the baseline and regression RMSE on the
    test split, and the absolute and proportional RMSE improvement over the
    baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in `features` and `target`.
    features : list of str
        Columns used as predictors.
    target : str
        Column to predict.
    random_state : int, default 3224
        Seed passed to `train_test_split`; the default reproduces the original split.
    test_size : float or int, optional
        Passed to `train_test_split`; None keeps scikit-learn's default split size.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'feature' and 'coefficient', sorted by
        coefficient in descending order.

    Notes
    -----
    The baseline predicts the mean of the *test* targets for every test row.
    No feature scaling is applied in this function, so the returned coefficients
    are on each feature's raw scale and are not directly comparable across
    features measured in different units.
    """
    # Set up train and test data.
    X = df[features]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the model.
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # Calculate test-split RMSE for the regression and for the mean baseline.
    preds = linreg.predict(X_test)
    rmse_regression = mean_squared_error(y_test, preds)**0.5
    base_preds = np.full(len(y_test), y_test.mean())
    rmse_base = mean_squared_error(y_test, base_preds)**0.5

    # A constant test target gives a zero baseline RMSE; report NaN rather than
    # dividing by zero.
    improvement = rmse_base - rmse_regression
    proportional = improvement / rmse_base if rmse_base != 0 else float('nan')

    # Print metrics and return dataframe that shows the coefficient of each feature.
    print(f'Training R2: {linreg.score(X_train, y_train)}')
    print(f'Testing R2: {linreg.score(X_test, y_test)}')
    print(f'Baseline RMSE: {rmse_base}')
    print(f'Regression RMSE: {rmse_regression}')
    print(f'Improvement over baseline: {improvement}')
    print(f'Proportional improvement: {proportional}')

    coefficients = pd.DataFrame(
        list(zip(X_train.columns, linreg.coef_)),
        columns=['feature', 'coefficient'],
    )
    return coefficients.sort_values(by='coefficient', ascending=False)

### Predicting obesity with 1-mile food access data ###

In [17]:
# Predictor columns for the models that use the 1-mile food access flag (LATracts1).
cols_1_mile = [
    'LowIncomeTracts',
    'LATracts1',
    'HUNVFlag',
    'TractSNAP_percent',
]

In [18]:
# Model the obesity target with the 1-mile feature set; the returned coefficient
# table is displayed as the cell output.
linear_fit(df=food_obesity, features=cols_1_mile, target='HCSOBP_2016-2018')

Training R2: 0.32764373549260173
Testing R2: 0.288560034822962
Baseline RMSE: 11.022085898368802
Regression RMSE: 9.296787843894386
Improvement over baseline: 1.7252980544744165
Proportional improvement: 0.15653099335124485


Unnamed: 0,feature,coefficient
3,TractSNAP_percent,16.266496
0,LowIncomeTracts,9.766421
2,HUNVFlag,4.887343
1,LATracts1,-0.475046


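We see about a 16% reduction in RMSE relative to the baseline (test R² ≈ 0.29).  SNAP percentage and the Low Income flag have the largest coefficients and so appear to be the most useful predictors, although the features are not standardized, so coefficient sizes are only a rough guide to importance.  Food access within 1 mile appears to be a poor predictor: its coefficient is close to zero (about -0.48).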

### Predicting obesity with half-mile food access data ###

In [19]:
# Predictor columns for the models that use the half-mile food access flag (LATracts_half).
cols_half_mile = [
    'LowIncomeTracts',
    'LATracts_half',
    'HUNVFlag',
    'TractSNAP_percent',
]

In [20]:
# Model the obesity target with the half-mile feature set; the returned coefficient
# table is displayed as the cell output.
linear_fit(df=food_obesity, features=cols_half_mile, target='HCSOBP_2016-2018')

Training R2: 0.3783168689440376
Testing R2: 0.32958420271894384
Baseline RMSE: 11.022085898368802
Regression RMSE: 9.024765186122835
Improvement over baseline: 1.9973207122459673
Proportional improvement: 0.1812107735924611


Unnamed: 0,feature,coefficient
3,TractSNAP_percent,17.591663
0,LowIncomeTracts,10.07391
1,LATracts_half,6.430744
2,HUNVFlag,-0.246612


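We see about 18% improvement over baseline (test R² ≈ 0.33). SNAP percentage and the Low Income flag are again the most useful predictors.  Food access within a half mile appears to be a much better predictor than access within 1 mile: its coefficient is about 6.4, compared with about -0.5 for the 1-mile flag. The HUNVFlag coefficient also falls from about 4.9 to about -0.2 in this model, which may indicate that it overlaps with the half-mile access flag.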

In [21]:
# Repeat the 1-mile model on the diabetes dataset (target column HCSDIAP_2016-2018);
# the returned coefficient table is displayed as the cell output.
linear_fit(df=food_diabetes, features=cols_1_mile, target='HCSDIAP_2016-2018')

Training R2: 0.16989347897522222
Testing R2: 0.26334518862607126
Baseline RMSE: 3.5541007131375006
Regression RMSE: 3.0504347885606986
Improvement over baseline: 0.503665924576802
Proportional improvement: 0.14171402704347513


Unnamed: 0,feature,coefficient
0,LowIncomeTracts,2.957291
1,LATracts1,1.054966
2,HUNVFlag,0.937767
3,TractSNAP_percent,-2.248496


In [22]:
# Repeat the half-mile model on the diabetes dataset (target column HCSDIAP_2016-2018);
# the returned coefficient table is displayed as the cell output.
linear_fit(df=food_diabetes, features=cols_half_mile, target='HCSDIAP_2016-2018')

Training R2: 0.2097004066954049
Testing R2: 0.27455355228964995
Baseline RMSE: 3.5541007131375006
Regression RMSE: 3.027139322115134
Improvement over baseline: 0.5269613910223665
Proportional improvement: 0.14826855892813848


Unnamed: 0,feature,coefficient
0,LowIncomeTracts,3.102989
1,LATracts_half,2.030085
2,HUNVFlag,-0.579829
3,TractSNAP_percent,-2.511129
