In [24]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from config import db_password
from sqlalchemy import create_engine
import psycopg2

In [25]:
## Load Files

In [26]:
### File loading from csvs
# file_path_city = "Data/location_data_clean.csv"
# file_path_life = "Data/life_expectancy_clean.csv"
# file_path_obesity = "Data/obesity_clean.csv"
# city_df = pd.read_csv(file_path_city)
# life_df = pd.read_csv(file_path_life)
# obesity_df = pd.read_csv(file_path_obesity)

In [27]:
# Load the location and health tables from the local PostgreSQL database.
# A single SQLAlchemy engine is built from the connection URL and handed to
# both reads, rather than passing the raw URL string to each call.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/FinalProject"
engine = create_engine(db_string)
city_df = pd.read_sql_table('location_data', engine)
target_df = pd.read_sql_table('health_data', engine)

In [43]:
# Bring the tract id and location column names to lower case so the tract id
# carries the same name as the key column in the health table for the merge.
column_renames = {
    'Census_Tract_Number': 'census_tract_number',
    'State': 'state',
    'County': 'county',
}
city_df = city_df.rename(columns=column_renames)
city_df.dtypes

census_tract_number             int64
Walkability                   float64
Total Population                int64
% 0 Vehicle HH                float64
% 1 Vehicle HH                float64
% 2+ Vehicle HH               float64
% Low Wage Workers            float64
Population Density            float64
Employment Density            float64
Pedestrian Network Density    float64
Transit Frequency             float64
state                          object
county                         object
LILATracts_1And10               int64
LILATracts_halfAnd10            int64
LILATracts_1And20               int64
LILATracts_Vehicle              int64
dtype: object

In [44]:
# Make sure the merge key is stored as a 64-bit integer before merging
city_df = city_df.astype({'census_tract_number': np.int64})
city_df.dtypes

census_tract_number             int64
Walkability                   float64
Total Population                int64
% 0 Vehicle HH                float64
% 1 Vehicle HH                float64
% 2+ Vehicle HH               float64
% Low Wage Workers            float64
Population Density            float64
Employment Density            float64
Pedestrian Network Density    float64
Transit Frequency             float64
state                          object
county                         object
LILATracts_1And10               int64
LILATracts_halfAnd10            int64
LILATracts_1And20               int64
LILATracts_Vehicle              int64
dtype: object

In [45]:
# Display the health table: tract id, state, county, life expectancy and obesity
target_df

Unnamed: 0,census_tract_number,state,county,life_expectancy,obesity
0,4013103205,AZ,Phoenix,79.7,26.9
1,4013103206,AZ,Phoenix,83.1,24.8
2,4013103207,AZ,Phoenix,79.1,25.3
3,4013103208,AZ,Phoenix,82.2,24.9
4,4013103209,AZ,Phoenix,79.1,23.8
...,...,...,...,...,...
5463,55133203000,WI,Waukesha,78.9,33.1
5464,55133203101,WI,Waukesha,82.5,29.4
5465,55133203102,WI,Waukesha,78.1,29.5
5466,55133203103,WI,Waukesha,78.7,33.6


In [46]:
# Inner-join the location and health tables on the tract id, so only tracts
# present in both tables are kept. 'state' and 'county' exist in both tables
# and therefore come out suffixed (_x from the location table, _y from the
# health table).
features_df = city_df.merge(target_df, how='inner', on='census_tract_number')
features_df

Unnamed: 0,census_tract_number,Walkability,Total Population,% 0 Vehicle HH,% 1 Vehicle HH,% 2+ Vehicle HH,% Low Wage Workers,Population Density,Employment Density,Pedestrian Network Density,...,state_x,county_x,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle,state_y,county_y,life_expectancy,obesity
0,48439100400,14.833333,821,0.113861,0.247525,0.638614,0.236443,8.284813,1.402666,21.637878,...,Texas,Tarrant County,0,1,0,0,TX,Fort Worth,79.8,42.9
1,48439105800,13.166667,1048,0.000000,0.415842,0.584158,0.252907,10.586268,1.424298,13.318689,...,Texas,Tarrant County,0,1,0,0,TX,Fort Worth,77.5,40.7
2,48439104400,14.666667,461,0.250000,0.226190,0.523810,0.199115,6.334020,1.964783,16.608250,...,Texas,Tarrant County,0,1,0,0,TX,Fort Worth,77.1,36.4
3,48439111544,9.333333,1338,0.000000,0.054762,0.945238,0.222368,9.078248,1.160225,19.218476,...,Texas,Tarrant County,0,0,0,0,TX,Arlington,78.6,33.3
4,48439102302,9.000000,941,0.062162,0.310811,0.627027,0.250000,6.514366,1.356871,13.860563,...,Texas,Tarrant County,1,1,1,0,TX,Fort Worth,72.4,37.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,55133202901,13.833333,1169,0.011547,0.344111,0.644342,0.194257,3.587088,1.113869,7.923397,...,Wisconsin,Waukesha County,0,0,0,0,WI,Waukesha,84.4,27.5
965,55133202400,13.333333,1135,0.028283,0.365657,0.606061,0.210938,4.010525,1.438135,13.524364,...,Wisconsin,Waukesha County,0,0,0,0,WI,Waukesha,79.5,30.0
966,55133202301,7.500000,2215,0.016234,0.270563,0.713203,0.234266,2.856509,0.359804,7.499182,...,Wisconsin,Waukesha County,0,0,0,0,WI,Waukesha,83.4,31.4
967,55133202102,11.000000,746,0.026059,0.413681,0.560261,0.227545,5.112132,0.089085,10.775377,...,Wisconsin,Waukesha County,0,0,0,0,WI,Waukesha,83.3,29.8


In [47]:
# Keep the location identifiers (location-table side of the merge) in their own frame
location_columns = ['state_x', 'county_x', 'census_tract_number']
loc_df = pd.DataFrame(features_df[location_columns])

## Obesity Model

In [48]:
# Build the obesity feature matrix.
# Dropped before modelling: both health outcomes, the state/county labels from
# either side of the merge, the tract id, and '% Low Wage Workers' (excluded
# because it does not describe city structure).
# NOTE(review): 'Transit Frequency' shows -99999 values in the preview below,
# presumably a missing-data sentinel - confirm whether the model should see them as-is.
obesity_excluded_columns = [
    'life_expectancy',
    'state_x',
    'county_x',
    'state_y',
    'county_y',
    'census_tract_number',
    'obesity',
    '% Low Wage Workers',
]
obesity_X = pd.get_dummies(features_df.drop(columns=obesity_excluded_columns))

# Target: the obesity column
obesity_y = features_df['obesity']

# Keep the feature names (used when listing importances) and an array copy of the features
obesity_features_list = obesity_X.columns.tolist()
obesity_features = np.array(obesity_X)
obesity_X

Unnamed: 0,Walkability,Total Population,% 0 Vehicle HH,% 1 Vehicle HH,% 2+ Vehicle HH,Population Density,Employment Density,Pedestrian Network Density,Transit Frequency,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle
0,14.833333,821,0.113861,0.247525,0.638614,8.284813,1.402666,21.637878,21.506204,0,1,0,0
1,13.166667,1048,0.000000,0.415842,0.584158,10.586268,1.424298,13.318689,17.261274,0,1,0,0
2,14.666667,461,0.250000,0.226190,0.523810,6.334020,1.964783,16.608250,52.760606,0,1,0,0
3,9.333333,1338,0.000000,0.054762,0.945238,9.078248,1.160225,19.218476,-99999.000000,0,0,0,0
4,9.000000,941,0.062162,0.310811,0.627027,6.514366,1.356871,13.860563,-99999.000000,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,13.833333,1169,0.011547,0.344111,0.644342,3.587088,1.113869,7.923397,5.590615,0,0,0,0
965,13.333333,1135,0.028283,0.365657,0.606061,4.010525,1.438135,13.524364,14.314923,0,0,0,0
966,7.500000,2215,0.016234,0.270563,0.713203,2.856509,0.359804,7.499182,0.552989,0,0,0,0
967,11.000000,746,0.026059,0.413681,0.560261,5.112132,0.089085,10.775377,18.990268,0,0,0,0


In [49]:
from sklearn.model_selection import train_test_split

# Split the obesity features/target into train and test parts (default split
# size, fixed seed so the split is repeatable)
obesity_split = train_test_split(obesity_X, obesity_y, random_state=1)
obesity_X_train, obesity_X_test, obesity_y_train, obesity_y_test = obesity_split

### Random Forest Regressor

In [50]:
from sklearn.ensemble import RandomForestRegressor

# Configure (not yet fit) a random forest regressor: 100 trees, fixed seed
obesity_forest_params = {'n_estimators': 100, 'random_state': 1}
obesity_forest = RandomForestRegressor(**obesity_forest_params)
obesity_forest

RandomForestRegressor(random_state=1)

In [51]:
# Fit on the training split and predict the held-out split.
# The figure printed as "Accuracy" is 100 minus the mean absolute percentage
# error of the test predictions.
obesity_forest_model = obesity_forest.fit(obesity_X_train, obesity_y_train)
obesity_predictions = obesity_forest.predict(obesity_X_test)

# Absolute error per test row, then expressed as a percentage of the true value
obesity_errors = (obesity_predictions - obesity_y_test).abs()
obesity_mape = obesity_errors.div(obesity_y_test).mul(100)

obesity_accuracy = 100 - obesity_mape.mean()
print('Accuracy:', round(obesity_accuracy, 2), '%.')

Accuracy: 88.33 %.


In [52]:
# Rank the obesity model's features by importance, most important first.
# The ranking is done on the unrounded importances so near-ties keep their
# true order; values are rounded to 3 decimals afterwards, for display only.
obesity_importances = list(obesity_forest.feature_importances_)
obesity_ranked = sorted(
    zip(obesity_features_list, obesity_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
obesity_feature_importances = [
    (feature, round(importance, 3)) for feature, importance in obesity_ranked
]

# Print one line per feature, padding names to the longest one so the
# importance column lines up (a fixed width of 20 is too short for some names)
obesity_name_width = max(
    (len(feature) for feature, _ in obesity_feature_importances), default=0
)
for feature, importance in obesity_feature_importances:
    print(f'Variable: {feature:{obesity_name_width}} Importance: {importance}')

Variable: LILATracts_halfAnd10 Importance: 0.311
Variable: % 0 Vehicle HH       Importance: 0.249
Variable: Employment Density   Importance: 0.096
Variable: Transit Frequency    Importance: 0.067
Variable: Total Population     Importance: 0.06
Variable: Population Density   Importance: 0.05
Variable: Pedestrian Network Density Importance: 0.049
Variable: % 2+ Vehicle HH      Importance: 0.041
Variable: % 1 Vehicle HH       Importance: 0.038
Variable: Walkability          Importance: 0.034
Variable: LILATracts_1And10    Importance: 0.002
Variable: LILATracts_1And20    Importance: 0.002
Variable: LILATracts_Vehicle   Importance: 0.002


## Life Expectancy Model

In [53]:
# Build the life-expectancy feature matrix.
# Dropped before modelling: both health outcomes, the state/county labels from
# either side of the merge, the tract id, and '% Low Wage Workers' (excluded
# because it does not describe city structure).
# NOTE(review): 'Transit Frequency' shows -99999 values in the preview below,
# presumably a missing-data sentinel - confirm whether the model should see them as-is.
life_excluded_columns = [
    'obesity',
    'state_x',
    'county_x',
    'state_y',
    'county_y',
    'census_tract_number',
    'life_expectancy',
    '% Low Wage Workers',
]
life_X = pd.get_dummies(features_df.drop(columns=life_excluded_columns))

# Target: the life_expectancy column
life_y = features_df['life_expectancy']

# Keep the feature names (used when listing importances) and an array copy of the features
life_features_list = life_X.columns.tolist()
life_features = np.array(life_X)
life_X

Unnamed: 0,Walkability,Total Population,% 0 Vehicle HH,% 1 Vehicle HH,% 2+ Vehicle HH,Population Density,Employment Density,Pedestrian Network Density,Transit Frequency,LILATracts_1And10,LILATracts_halfAnd10,LILATracts_1And20,LILATracts_Vehicle
0,14.833333,821,0.113861,0.247525,0.638614,8.284813,1.402666,21.637878,21.506204,0,1,0,0
1,13.166667,1048,0.000000,0.415842,0.584158,10.586268,1.424298,13.318689,17.261274,0,1,0,0
2,14.666667,461,0.250000,0.226190,0.523810,6.334020,1.964783,16.608250,52.760606,0,1,0,0
3,9.333333,1338,0.000000,0.054762,0.945238,9.078248,1.160225,19.218476,-99999.000000,0,0,0,0
4,9.000000,941,0.062162,0.310811,0.627027,6.514366,1.356871,13.860563,-99999.000000,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,13.833333,1169,0.011547,0.344111,0.644342,3.587088,1.113869,7.923397,5.590615,0,0,0,0
965,13.333333,1135,0.028283,0.365657,0.606061,4.010525,1.438135,13.524364,14.314923,0,0,0,0
966,7.500000,2215,0.016234,0.270563,0.713203,2.856509,0.359804,7.499182,0.552989,0,0,0,0
967,11.000000,746,0.026059,0.413681,0.560261,5.112132,0.089085,10.775377,18.990268,0,0,0,0


In [54]:
from sklearn.model_selection import train_test_split

# Split the life-expectancy features/target into train and test parts (default
# split size, fixed seed so the split is repeatable)
life_split = train_test_split(life_X, life_y, random_state=1)
life_X_train, life_X_test, life_y_train, life_y_test = life_split

In [55]:
### Random Forest Regressor

In [56]:
from sklearn.ensemble import RandomForestRegressor

# Configure (not yet fit) a random forest regressor: 100 trees, fixed seed
life_forest_params = {'n_estimators': 100, 'random_state': 1}
life_forest = RandomForestRegressor(**life_forest_params)
life_forest

RandomForestRegressor(random_state=1)

In [57]:
# Fit on the training split and predict the held-out split.
# The figure printed as "Accuracy" is 100 minus the mean absolute percentage
# error of the test predictions.
life_forest_model = life_forest.fit(life_X_train, life_y_train)
life_predictions = life_forest.predict(life_X_test)

# Absolute error per test row, then expressed as a percentage of the true value
life_errors = (life_predictions - life_y_test).abs()
life_mape = life_errors.div(life_y_test).mul(100)

life_accuracy = 100 - life_mape.mean()
print('Accuracy:', round(life_accuracy, 2), '%.')

Accuracy: 96.67 %.


In [58]:
# Rank the life-expectancy model's features by importance, most important first.
# The ranking is done on the unrounded importances so near-ties keep their
# true order; values are rounded to 3 decimals afterwards, for display only.
life_importances = list(life_forest.feature_importances_)
life_ranked = sorted(
    zip(life_features_list, life_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
life_feature_importances = [
    (feature, round(importance, 3)) for feature, importance in life_ranked
]

# Print one line per feature, padding names to the longest one so the
# importance column lines up (a fixed width of 20 is too short for some names)
life_name_width = max(
    (len(feature) for feature, _ in life_feature_importances), default=0
)
for feature, importance in life_feature_importances:
    print(f'Variable: {feature:{life_name_width}} Importance: {importance}')

Variable: % 0 Vehicle HH       Importance: 0.315
Variable: LILATracts_halfAnd10 Importance: 0.211
Variable: Total Population     Importance: 0.107
Variable: Employment Density   Importance: 0.074
Variable: Population Density   Importance: 0.06
Variable: Pedestrian Network Density Importance: 0.056
Variable: Walkability          Importance: 0.046
Variable: Transit Frequency    Importance: 0.046
Variable: % 2+ Vehicle HH      Importance: 0.043
Variable: % 1 Vehicle HH       Importance: 0.038
Variable: LILATracts_Vehicle   Importance: 0.003
Variable: LILATracts_1And10    Importance: 0.002
Variable: LILATracts_1And20    Importance: 0.001


## Saving Data for Visualization

In [64]:
# Actual values: the observed obesity rate next to selected feature columns,
# taken from every row of the merged frame
obesity_actual_columns = [
    'obesity',
    '% 0 Vehicle HH',
    'LILATracts_halfAnd10',
    'Pedestrian Network Density',
    'Employment Density',
    'Walkability',
    'Transit Frequency',
]
obesity_actual_data = pd.DataFrame(features_df[obesity_actual_columns])

# Predicted values: selected feature columns for the test split only, with the
# model's predictions for those rows appended as the last column
obesity_predicted_columns = [
    '% 0 Vehicle HH',
    'LILATracts_halfAnd10',
    'Employment Density',
    'Pedestrian Network Density',
    'Walkability',
    'Transit Frequency',
]
obesity_predicted_data = obesity_X_test[obesity_predicted_columns].assign(
    **{'Rate of Obesity Predictions': obesity_predictions}
)

In [65]:
# Display the actual obesity values alongside the selected feature columns
obesity_actual_data

Unnamed: 0,obesity,% 0 Vehicle HH,LILATracts_halfAnd10,Pedestrian Network Density,Employment Density,Walkability,Transit Frequency
0,42.9,0.113861,1,21.637878,1.402666,14.833333,21.506204
1,40.7,0.000000,1,13.318689,1.424298,13.166667,17.261274
2,36.4,0.250000,1,16.608250,1.964783,14.666667,52.760606
3,33.3,0.000000,0,19.218476,1.160225,9.333333,-99999.000000
4,37.4,0.062162,1,13.860563,1.356871,9.000000,-99999.000000
...,...,...,...,...,...,...,...
964,27.5,0.011547,0,7.923397,1.113869,13.833333,5.590615
965,30.0,0.028283,0,13.524364,1.438135,13.333333,14.314923
966,31.4,0.016234,0,7.499182,0.359804,7.500000,0.552989
967,29.8,0.026059,0,10.775377,0.089085,11.000000,18.990268


In [66]:
# Actual values: the observed life expectancy next to selected feature columns,
# taken from every row of the merged frame
life_actual_columns = [
    'life_expectancy',
    '% 0 Vehicle HH',
    'LILATracts_halfAnd10',
    'Pedestrian Network Density',
    'Employment Density',
    'Walkability',
    'Transit Frequency',
]
life_actual_data = pd.DataFrame(features_df[life_actual_columns])

# Predicted values: selected feature columns for the test split only, with the
# model's predictions for those rows appended as the last column.
# 'Employment Density' is included so the predicted frame carries the same
# feature columns as life_actual_data and as the obesity predictions frame.
life_predicted_columns = [
    '% 0 Vehicle HH',
    'LILATracts_halfAnd10',
    'Employment Density',
    'Pedestrian Network Density',
    'Walkability',
    'Transit Frequency',
]
life_predicted_data = life_X_test[life_predicted_columns].assign(
    **{'Life Expectancy Predictions': life_predictions}
)

In [67]:
# Write the four actual/predicted frames to CSV files in the working
# directory, leaving the row index out of each file
csv_exports = {
    'obesity_actual_data.csv': obesity_actual_data,
    'obesity_predicted_data.csv': obesity_predicted_data,
    'life_actual_data.csv': life_actual_data,
    'life_predicted_data.csv': life_predicted_data,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)