In [119]:
# Requisite imports
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

# The Datasets

The first dataset was obtained from NYC's Community Health Survey (CHS): https://www1.nyc.gov/site/doh/data/data-publications/profiles.page


> Metadata for this table can be found here: https://www.dropbox.com/s/8upe14gyxrtcja5/2015_CHP_PUD.xlsx?dl=0
Some headings are out of order in the metadata. 


The second dataset was taken from NYC's Environmental & Health Data Portal which aggregates data from many sources. For more info see: http://a816-dohbesp.nyc.gov/IndicatorPublic/BuildATable.aspx

In [55]:
raw_CHS_data = pd.read_csv('https://www.dropbox.com/s/thl1b27lxqvtpwo/2015_CHPdata.csv?raw=1',
                           index_col=0)

raw_EPHT_data = pd.read_csv('https://www.dropbox.com/s/gr9zh288m73dsvo/AllEPHTData.csv?raw=1',
                            index_col=False, header=None)

## Initial Processing

First, I take a look at how the dataframes are organized so I can eventually merge them.

In [56]:
print('Size of CHS Data: {}\nSize of EPHT Data:{}'.format(raw_CHS_data.shape,
                                                         raw_EPHT_data.shape))

Size of CHS Data: (65, 191)
Size of EPHT Data:(65500, 12)


In [57]:
original_CHS_columns = raw_CHS_data.columns.values.tolist()

print('Feature Names:\n', original_CHS_columns)

raw_CHS_data.head(8)

Feature Names:
 ['Name', 'OverallPopulation_rate', 'OverallPopulation_rank', 'Racewhite_Rate', 'Racewhite_rank', 'Raceblack_rate', 'Raceblack_rank', 'Raceasian_rate', 'Raceasian_rank', 'Racehispanic_rate', 'Racehispanic_rank', 'Raceother_rate', 'Raceother_rank', 'Nonwhite_rate', 'Nonwhite_rank', 'Age0to17_rate', 'Age0to17_rank', 'Age18to24_rate', 'Age18to24_rank', 'Age25to44_rate', 'Age25to44_rank', 'Age45to64_rate', 'Age45to64_rank', 'Age65plus_rate', 'Age65plus_rank', 'Foreign_born', 'Foreign_born_rank', 'lower_95CL', 'upper_95CL', 'Ltd_Eng_Prof', 'Ltd_eng_prof_rank', 'lower_95CL.1', 'upper_95CL.1', 'Housing_Defects', 'Housing_Defects_rank', 'lower_95CL.2', 'upper_95CL.2', 'Airquality_rate', 'Airquality_rank', 'Tobaccoretail_rate', 'Tobaccoretail_rank', 'Supermarketarea_rate', 'Supermarketarea_rank', 'Edudidnotcompletehs_rate', 'Edudidnotcompletehs_rank', 'Eduhsdegreeorsomecollege_rate', 'Eduhsdegreeorsomecollege_rank', 'Educollegedegreeandhigher_rate', 'Educollegedegreeandhigher_ran

Unnamed: 0_level_0,Name,OverallPopulation_rate,OverallPopulation_rank,Racewhite_Rate,Racewhite_rank,Raceblack_rate,Raceblack_rank,Raceasian_rate,Raceasian_rank,Racehispanic_rate,...,reliability note.15,Imr_rank,lower_95CL.31,upper_95CL.31,Premature_Mort,Premature_mort_rank,lower_95CL.32,upper_95CL.32,Life_expectancy_rate,Life_expectancy_rank
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,NYC,8405837,,33,,23,,14,,29,...,,,4.5,4.9,198.4,,196.8,200.0,,
1,Manhattan,1626159,,48,,13,,12,,26,...,,,2.9,3.9,152.7,,150.0,155.4,,
2,Bronx,1418733,,11,,30,,4,,55,...,,,5.1,6.3,238.9,,235.2,242.6,,
3,Brooklyn,2592149,,36,,31,,11,,20,...,,,3.6,4.2,194.5,,191.8,197.2,,
4,Queens,2296175,,27,,18,,25,,28,...,,,4.2,5.2,140.8,,138.6,143.0,,
5,Staten Island,472621,,63,,10,,8,,18,...,,,3.6,5.8,184.7,,179.3,190.1,,
101,Financial District,62829,57.0,66,9.0,4,44.0,20,13.0,8,...,*,58.0,0.2,2.8,75.6,59.0,64.3,86.9,85.4,1.0
102,Greenwich Village and Soho,91961,53.0,75,3.0,2,55.0,14,21.0,6,...,*,55.0,0.2,3.8,93.3,57.0,84.0,102.6,84.3,4.0


In [58]:
raw_EPHT_data.head(8)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,76966,462,Access to Alcohol,Number of Retail Outlets,Borough,1,Bronx,2009,1760,,,
1,77014,464,Access to Alcohol,Number of Service Outlets,Borough,1,Bronx,2009,931,,,
2,77062,463,Access to Alcohol,Retail Outlet Density,Borough,1,Bronx,2009,127,,,
3,77110,465,Access to Alcohol,Service Outlet Density,Borough,1,Bronx,2009,67,,,
4,76967,462,Access to Alcohol,Number of Retail Outlets,Borough,2,Brooklyn,2009,3206,,,
5,77015,464,Access to Alcohol,Number of Service Outlets,Borough,2,Brooklyn,2009,2169,,,
6,77063,463,Access to Alcohol,Retail Outlet Density,Borough,2,Brooklyn,2009,125,,,
7,77111,465,Access to Alcohol,Service Outlet Density,Borough,2,Brooklyn,2009,85,,,


### Organization Insights

The first five rows in the CHS dataframe are borough summaries, but I am interested in Small Area Analysis. As a result I will omit these summaries. The remaining data of the Community Health Survey divides each borough into several Community Districts (CDs). 

Because the Environmental & Health Data Portal takes data from many sources, the aspects are organized in many different ways. As a result, each row represents a single statistic--with the columns enumerating the details regarding what was being measured and the corresponding location. This will be dealth with later.
For a first pass, I will take only those attributes organized by CD to ensure consistency when merging the datasets.

In [103]:
CD_EPHT_data = raw_EPHT_data[raw_EPHT_data.iloc[:,4]=='CD']

CD_CHS_data = raw_CHS_data.iloc[6:,:]

print('Environmental & Health Data:\n{}\n\n\nCommunity Health Survey Data:\n{}'.format(
    CD_EPHT_data.head(), CD_CHS_data.head()))

Environmental & Health Data:
           0    1                      2                3   4    5   \
50310  119438  373  Elemental Carbon (EC)  10th Percentile  CD  101   
50311  119497  374  Elemental Carbon (EC)  90th Percentile  CD  101   
50312  119379  370  Elemental Carbon (EC)             Mean  CD  101   
50313  119439  373  Elemental Carbon (EC)  10th Percentile  CD  102   
50314  119498  374  Elemental Carbon (EC)  90th Percentile  CD  102   

                      6         7                         8    9   10  11  
50310  Battery Park City   Tribeca  Annual Average 2009-2010  1.3 NaN NaN  
50311  Battery Park City   Tribeca  Annual Average 2009-2010  1.8 NaN NaN  
50312  Battery Park City   Tribeca  Annual Average 2009-2010  1.5 NaN NaN  
50313  Greenwich Village      SOHO  Annual Average 2009-2010  1.5 NaN NaN  
50314  Greenwich Village      SOHO  Annual Average 2009-2010    2 NaN NaN  


Community Health Survey Data:
                              Name  OverallPopulation_ra

### Inspecting NaN Values

In [104]:
# Because the CHS table has so many features, I will print out only those columns
# which contain NaNs. I will print out the result of all columns for the EPHT data.
CHS_NaN_columns = []

for column in original_CHS_columns:
    if CD_CHS_data.loc[:, column].isna().any():
        CHS_NaN_columns.append(column)

print('Columns in CHS data with NaN values:\n', CHS_NaN_columns)


print('\nColumns in Environment & Health data with any NaN values:\n',
      CD_EPHT_data.isna().any())

Columns in CHS data with NaN values:
 ['Avertabledeaths_rate', 'reliability note', 'reliability note.1', 'reliability note.2', 'reliability note.3', 'reliability note.4', 'reliability note.5', 'reliability note.6', 'reliability note.7', 'reliability note.8', 'reliability note.9', 'reliability note.10', 'reliability note.11', 'reliability note.12', 'reliability note.13', 'reliability note.14', 'reliability note.15']

Columns in Environment & Health data with any NaN values:
 0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10     True
11     True
dtype: bool


#### Dealing with NaNs in CHS Data

For ease of data analysis, reliability notes of CHS data will be discarded.

In [105]:
CHS_data_to_merge = CD_CHS_data.drop(columns=CHS_NaN_columns[1:])

print('Number of NaNs in Avertabledeaths_rate:', 
      CHS_data_to_merge.Avertabledeaths_rate.isna().sum())

Number of NaNs in Avertabledeaths_rate: 5


In [106]:
# In the CHS data, each rate is followed by a rank. I investigate here to see what
# those values are for the NaN Avertabledeaths_rate
CHS_data_to_merge[CHS_data_to_merge.Avertabledeaths_rate.isna()].Avertabledeaths_rank

ID
101    -
102    -
106    -
107    -
108    -
Name: Avertabledeaths_rank, dtype: object

This gives us no additional information, so I will leave these for now.

#### Dealing with NaNs in the EPHT Data

I saw that the last three columns in the dataframe all contain NaNs. Upon further inspection of the source, any one of columns 8-11 of the EPHT dataset contains the actual relevant numerical data for any single row.

Columns 2 and 3 together describe the statistic measured, along with the column immediately preceding that with the actual numerical data.
Column 4 was used to describe the area level of analysis (e.g."Borough" or "CD" for Community District). In the dataset to merge, we only kept those with the label "CD", so this column is unnecessary.
Column 5 contains the CD index needed to match to the CHS data.
Following column 5, anywhere from 1 to 4 columns are used to enumerate the neighborhoods encompassed by the CD.

In [107]:
def get_values(row):
    '''
    This function determines the position of the numerical datum in a 
    specified row. It then assigns that value to the variable name `stat`.
    The position preceding the number is joined with positions 2 and 3 of 
    the row to create the `name` for the statistic.
    '''
    
    if str(row[8]).replace('.', '').isnumeric():
        stat = float(row[8])
        name = ' '.join([row[2], row[3], row[7]])
    
    elif str(row[9]).replace('.', '').isnumeric():
        stat = float(row[9])
        name = ' '.join([row[2], row[3], row[8]])
    
    elif str(row[10]).replace('.', '').isnumeric():
        stat = float(row[10])
        name = ' '.join([row[2], row[3], row[9]])
    
    else:
        stat = float(row[11])
        name = ' '.join([row[2], row[3], row[10]])

    return (name, stat)

In [108]:
CD_EPHT_data[['label', 'stat']] = CD_EPHT_data.apply(
    lambda row: pd.Series(get_values(row)), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


### Using a Pivot Table to Transform the EPHT Data

In [110]:
EPHT_data_to_merge = CD_EPHT_data.pivot_table(index=5, columns='label', 
                                              values='stat')

### Merging the Tables

In [112]:
NYC_data = CHS_data_to_merge.merge(EPHT_data_to_merge, how='outer', 
                                   left_index=True, right_index=True)

In [163]:
to_standardize = NYC_data.drop(list(NYC_data.filter(regex='rank|CL|Percentile')),
                               axis=1)

print(to_standardize.shape)

(59, 61)


In [164]:
data_standardized = to_standardize.copy()

for column in to_standardize.columns[1:]:
    data_standardized[column] = to_standardize[column].apply(lambda x: (
        x - to_standardize[column].mean()) / to_standardize[column].std())

In [165]:
no_HSdegree_corr = data_standardized.corr()['Edudidnotcompletehs_rate'].sort_values(ascending=False)
HSdegree_somecollege_corr = data_standardized.corr()['Eduhsdegreeorsomecollege_rate'].sort_values(ascending=False)
collegedegree_plus_corr = data_standardized.corr()['Educollegedegreeandhigher_rate'].sort_values(ascending=False)

In [166]:
no_HSdegree_corr

Edudidnotcompletehs_rate                                         1.000000
Teen_Births                                                      0.873792
Poverty                                                          0.839136
Racehispanic_rate                                                0.826898
Nonwhite_rate                                                    0.758332
Rent_burden                                                      0.755929
Sugary_Drink                                                     0.748830
Insurance                                                        0.733628
Unemployment                                                     0.732155
Avoidable_Asthma                                                 0.729286
Age0to17_rate                                                    0.703223
Diabetes                                                         0.693400
Avoidable_Diabetes                                               0.688222
Assault_Hosps                         

In [167]:
collegedegree_plus_corr

Educollegedegreeandhigher_rate                                   1.000000
Racewhite_Rate                                                   0.769933
Exercise                                                         0.752994
Fruit_Veg                                                        0.725069
Age25to44_rate                                                   0.724142
Self_rep_health                                                  0.701858
Life_expectancy_rate                                             0.614890
Airquality_rate                                                  0.552788
Fine Particulate Matter (PM2.5) Mean Summer 2009                 0.542127
Fine Particulate Matter (PM2.5) Mean Annual Average 2009-2010    0.442375
Age65plus_rate                                                   0.377024
Fine Particulate Matter (PM2.5) Mean Winter 2008-09              0.363847
Elemental Carbon (EC) Mean Annual Average 2009-2010              0.356789
Elemental Carbon (EC) Mean Summer 2009

In [168]:
HSdegree_somecollege_corr

Eduhsdegreeorsomecollege_rate                                    1.000000
Obesity                                                          0.802747
Age0to17_rate                                                    0.751949
Diabetes                                                         0.723469
Sugary_Drink                                                     0.717101
Rent_burden                                                      0.660815
Late_No_Prenatal                                                 0.608083
Stroke_Hosp                                                      0.586894
Preterm_Births                                                   0.565172
Nonwhite_rate                                                    0.550642
IMR                                                              0.543731
Unemployment                                                     0.543359
Schoolabsent_rate                                                0.531187
Raceblack_rate                        