In [1]:
# Necessary imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from copy import deepcopy

In [62]:
boroughs_ = {
    'Manhattan': ['1','MN', 100, 12],
    'Brooklyn': ['3', 'BK', 300, 18],
    'Bronx': ['2', 'BX', 200, 12],
    'Queens': ['4', 'QN', 400, 14],
    'Staten Island': ['5', 'SI', 500, 3]
}

borough_labels = {'Manhattan': [],
                  'Brooklyn': [],
                  'Bronx': [],
                  'Queens': [],
                  'Staten Island': []}

for borough, abbr in boroughs_.items():
    for n in range(1, abbr[2]+1):
        if n < 10:
            n = f'0{n}'
        borough_labels[borough].append(f'{abbr[1]}{n}')

In [40]:
def get_EPHT_data(row):
    '''
    This function determines the position of the numerical datum in a 
    specified row. It then assigns that value to the variable name `stat`.
    The position preceding the number is joined with positions 2 and 3 of 
    the row to create the `name` for the statistic.
    '''
    value_col = (10 - row.isnull().sum())
    value = float(row[value_col])
    name = ' '.join([row[2], row[3], row[value_col-1]])

    return (name, value)

In [49]:
df = pd.read_csv(('data/NYCDataSourcesRound2/AllEPHTData1.csv'), header=None,
                 low_memory=False, usecols=[2,3,4,5,6,7,8,9,10],
                 na_values=['Estimate is based on small numbers so should be interpreted with caution',
                            'Number excludes outlier values that are suspected to be overreporting errors. '])

UHF = df[df.iloc[:,2] == 'UHF42']
CD = df[df.iloc[:,2] == 'CD']

dfs = [CD, UHF]

for df in dfs:
    df[['label', 'value']] = df.apply(
        lambda row: pd.Series(get_EPHT_data(row)), axis=1)
    df[5].astype(str, copy=False)

CD_data = CD.pivot_table(index=5, columns='label', values='value')
UHF_data = UHF.pivot_table(index=5, columns='label', values='value')

CD_data.to_csv(datadir+'/EPHT_CD.csv')
UHF_data.to_csv(datadir+'/EPTH_UHF.csv')

In [95]:
datadir = 'data/raw_csvs'
allfiles = os.listdir(data_dir)
csvfiles = sorted([file for file in allfiles if file.endswith('.csv')])
csvfiles

['2015_CHP_all_data.csv',
 '2015_Cause_of_death_data.csv',
 '2018_CHP_all_data.csv',
 '2018_Cause_of_premature_death_data.csv',
 'AbuseNeglectReport2015To2018.csv',
 'ChildWelfare.csv',
 'EPHT_CD.csv',
 'EPTH_UHF.csv',
 'Family_Violence_Related_Snapshots__New_York_City_Community_Board_Districts.csv',
 'Intimate_Partner_Violence_Related_Snapshots__New_York_City_Community_Board_Districts.csv',
 'NTA_Data.csv',
 'NYCgov_Poverty_Measure_Data__2005_.csv',
 'NYCgov_Poverty_Measure_Data__2006_.csv',
 'NYCgov_Poverty_Measure_Data__2007_.csv',
 'NYCgov_Poverty_Measure_Data__2008_.csv',
 'NYCgov_Poverty_Measure_Data__2009_.csv',
 'NYCgov_Poverty_Measure_Data__2010_.csv',
 'NYCgov_Poverty_Measure_Data__2011_.csv',
 'NYCgov_Poverty_Measure_Data__2012_.csv',
 'NYCgov_Poverty_Measure_Data__2013_.csv',
 'NYCgov_Poverty_Measure_Data__2014_.csv',
 'NYCgov_Poverty_Measure_Data__2015_.csv',
 'NYCgov_Poverty_Measure_Data__2016_.csv',
 'cd1999.csv',
 'cd2000.csv',
 'cd2001.csv',
 'cd2002.csv',
 'cd2003.csv

In [107]:
no_use = re.compile('NYCgov_Poverty_Measure_Data__|chs|youthriskbehavior|cd\d+')
use_csv = [csv for csv in csvfiles if not no_use.match(csv)]

for file in use_csv:
    if file == '2018_CHP_all_data.csv':
        df = pd.read_csv(os.path.join(datadir, file), index_col=False,
                         header=1)
    else:
        df = pd.read_csv(os.path.join(datadir, file), index_col=False)
    print(file)
    print(df.columns.values)
    print(df.index.values)
    print(df.shape)
    print('*****************************************************'
          '*****************************************************')

2015_CHP_all_data.csv
['ID' 'Name' 'OverallPopulation_rate' 'OverallPopulation_rank'
 'Racewhite_Rate' 'Racewhite_rank' 'Raceblack_rate' 'Raceblack_rank'
 'Raceasian_rate' 'Raceasian_rank' 'Racehispanic_rate' 'Racehispanic_rank'
 'Raceother_rate' 'Raceother_rank' 'Nonwhite_rate' 'Nonwhite_rank'
 'Age0to17_rate' 'Age0to17_rank' 'Age18to24_rate' 'Age18to24_rank'
 'Age25to44_rate' 'Age25to44_rank' 'Age45to64_rate' 'Age45to64_rank'
 'Age65plus_rate' 'Age65plus_rank' 'Foreign_born' 'Foreign_born_rank'
 'lower_95CL' 'upper_95CL' 'Ltd_Eng_Prof' 'Ltd_eng_prof_rank'
 'lower_95CL.1' 'upper_95CL.1' 'Housing_Defects' 'Housing_Defects_rank'
 'lower_95CL.2' 'upper_95CL.2' 'Airquality_rate' 'Airquality_rank'
 'Tobaccoretail_rate' 'Tobaccoretail_rank' 'Supermarketarea_rate'
 'Supermarketarea_rank' 'Edudidnotcompletehs_rate'
 'Edudidnotcompletehs_rank' 'Eduhsdegreeorsomecollege_rate'
 'Eduhsdegreeorsomecollege_rank' 'Educollegedegreeandhigher_rate'
 'Educollegedegreeandhigher_rank' 'Poverty' 'Poverty_r

EPTH_UHF.csv
['5' 'Access to Alcohol Number of Retail Outlets 2009'
 'Access to Alcohol Number of Service Outlets 2009' ...
 'Youth Access to Alcohol Number of Service Outlets 2009'
 'Youth Access to Alcohol Retail Outlet Density 2009'
 'Youth Access to Alcohol Service Outlet Density 2009']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41]
(42, 1884)
**********************************************************************************************************
Family_Violence_Related_Snapshots__New_York_City_Community_Board_Districts.csv
['Report_Year' 'Comm_Dist_ Boro' 'Comm_District' 'FAM_DIR'
 'FAM_Fel_Assault ' 'DV_Fel_Assault ' 'FAM_Rape ' 'DV_Rape ']
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  

cdall_04.csv
['cd' 'birthtot' 'age1tot' 'age2tot' 'age3tot' 'age4tot' 'age5tot'
 'age6tot' 'age7tot' 'age8tot' 'age9tot' 'eth_bl_tot' 'eth_wh_tot'
 'eth_hi_tot' 'eth_ap_tot' 'nat1tot' 'nat2tot' 'ancs1tot' 'ancs2tot'
 'ancs3tot' 'ancs4tot' 'ancs5tot' 'ancs6tot' 'ancs7tot' 'ancs8tot'
 'ancs9tot' 'ancs10tot' 'ancs11tot' 'ancs12tot' 'ancs13tot' 'ancs14tot'
 'ancs15tot' 'ancs_oth_tot' 'bpl1tot' 'bpl2tot' 'bpl3tot' 'bpl4tot'
 'bpl5tot' 'bpl6tot' 'bpl7tot' 'bpl8tot' 'bpl9tot' 'bpl10tot' 'bpl11tot'
 'bpl12tot' 'bpl13tot' 'bpl14tot' 'bpl15tot' 'bpl_oth_tot' 'educ1tot'
 'educ2tot' 'educ3tot' 'par1tot' 'par2tot' 'prenat1tot' 'prenat2tot'
 'prenat3tot' 'prenat4tot' 'pay1tot' 'pay2tot' 'pay3tot' 'mar1tot'
 'mar2tot' 'sex1tot' 'sex2tot' 'weight1tot' 'weight2tot' 'weight3tot'
 'weight4tot' 'bwt1tot' 'bwt2tot' 'bwt3tot' 'bwt4tot' 'bwt5tot' 'bwt6tot'
 'bwt7tot' 'bwt8tot' 'bwt9tot' 'ga1tot' 'ga2tot' 'ga3tot' 'ga4tot'
 'ga5tot' 'apg1tot' 'apg2tot' 'apg3tot' 'apg4tot' 'apg5tot' 'plur1tot'
 'plur2tot' 'mth

cdall_13.csv
['cd' 'birthtot' 'age1tot' 'age2tot' 'age3tot' 'age4tot' 'age5tot'
 'age6tot' 'age7tot' 'age8tot' 'age9tot' 'eth_bl_tot' 'eth_wh_tot'
 'eth_hi_tot' 'eth_ap_tot' 'nat1tot' 'nat2tot' 'ancs1tot' 'ancs2tot'
 'ancs3tot' 'ancs4tot' 'ancs5tot' 'ancs6tot' 'ancs7tot' 'ancs8tot'
 'ancs9tot' 'ancs10tot' 'ancs11tot' 'ancs12tot' 'ancs13tot' 'ancs14tot'
 'ancs_oth_tot' 'bpl1tot' 'bpl2tot' 'bpl3tot' 'bpl4tot' 'bpl5tot'
 'bpl6tot' 'bpl7tot' 'bpl8tot' 'bpl9tot' 'bpl10tot' 'bpl11tot' 'bpl12tot'
 'bpl13tot' 'bpl14tot' 'bpl15tot' 'bpl_oth_tot' 'edlev1tot' 'edlev2tot'
 'edlev3tot' 'edlev4tot' 'edlev5tot' 'par1tot' 'par2tot' 'cover1tot'
 'cover2tot' 'cover3tot' 'mar1tot' 'mar2tot' 'sex1tot' 'sex2tot' 'bmi1tot'
 'bmi2tot' 'bmi3tot' 'bmi4tot' 'bwt1tot' 'bwt2tot' 'bwt3tot' 'bwt4tot'
 'bwt5tot' 'bwt6tot' 'bwt7tot' 'bwt8tot' 'bwt9tot' 'ga1tot' 'ga2tot'
 'ga3tot' 'ga4tot' 'ga5tot' 'apg1tot' 'apg2tot' 'apg3tot' 'apg4tot'
 'apg5tot' 'plur1tot' 'plur2tot' 'mth1tot' 'mth2tot' 'mth3tot' 'mth4tot'
 'pob1to

d2ghealth.csv
['Unnamed: 0' 'Males Under 5  2012-2016' 'Males 5–14  2012-2016'
 'Males 15–24  2012-2016' 'Males 25–34  2012-2016'
 'Males 35–44  2012-2016' 'Males 45–54  2012-2016'
 'Males 55–64  2012-2016' 'Males 65–74  2012-2016'
 'Males 75–84  2012-2016' 'Males 85+  2012-2016'
 'Females Under 5  2012-2016' 'Females 5–14  2012-2016'
 'Females 15–24  2012-2016' 'Females 25–34  2012-2016'
 'Females 35–44  2012-2016' 'Females 45–54  2012-2016'
 'Females 55–64  2012-2016' 'Females 65–74  2012-2016'
 'Females 75–84  2012-2016' 'Females 85+  2012-2016'
 'Child Population (under 5) 2012-2016'
 'Youth Population (under 18) 2012-2016'
 'Prime-Age Adult Population (ages 25–54) 2012-2016'
 'Elderly Population (65+) 2012-2016' 'Foreign Born  2012-2016'
 'Asian or Pacific Islander  2012-2016' 'Latino  2012-2016'
 'White  2012-2016' 'Black  2012-2016' 'Native American  2012-2016'
 'Other Race  2012-2016'
 'Speak a Language Other Than English at Home 2012-2016'
 'Speak Asian Language at Home 2012-2

### Files with Informative Indices & Column Names:
- **filename.csv (borough identifier)**
- 2015_CHP_all_data.csv  (numbers)
- 2015_Cause_of_death_data.csv  (numbers)
- 2018_CHP_all_data.csv  (numbers)
- 2018_Cause_of_premature_death_data.csv (numbers)
- AbuseNeglectReport2015To2018.csv  (numbers)
- ChildWelfare.csv  (letters)
- NTA_Data.csv  (**neighborhood names**)
- cd{year}.csv  (**neighborhood names**) (and some columns to look up) -- year = 1999-2014
- cdall_{year}  (numbers) (and some columns to look up) -- year = 2000-2014
- d2g1.csv  (numbers)
- d2g2.csv  (numbers)
- d2ghealth.csv  (numbers)


#### ???
- NYCgov_Poverty_Measure_Data__{year}.csv
- chs{year}_public.csv  (good columns, individual survey results--must look to see if there is a CD related col)  --years= 2002-2017
- youthriskbehavior{year}.csv  (records of survey responses--must look to see if questions are readily available and somewhere there may be a location identifier)


In [58]:
header_names = ['Year', 'Variable']

In [55]:
file1 = use_csv.pop()
file1

'proficiency_4thgrade.csv'

In [71]:
df1 = pd.read_csv(os.path.join(datadir, file1), header=None)
header = [(value[-4:], value[:-4]) for value in df1.iloc[0, 1:].values]
cols = pd.MultiIndex.from_tuples(header, names=header_names)

df1.drop(index=0, inplace=True)
CDs = df1.iloc[:,0]

def get_CD_num(comm_district):
    district = comm_district.split()
    for borough, abbrs in boroughs_.items():
        if district[0] in abbrs:
            num = abbrs[0] + district[1]
            return int(num)

index = CDs.apply(get_CD_num)
df1.drop(columns=0, inplace=True)
df1.index = index
df1.columns = cols
df1

Year,2013,2014,2015,2016,2017,2018,2013,2014,2015,2016,2017,2018
Variable,pct_prof_math,pct_prof_math,pct_prof_math,pct_prof_math,pct_prof_math,pct_prof_math,pct_prof_ela,pct_prof_ela,pct_prof_ela,pct_prof_ela,pct_prof_ela,pct_prof_ela
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
301,0.33589462129528,0.4405997693194929,0.4215349369988549,0.436426116838488,0.4110218140068889,0.492864983534577,0.302222222222222,0.3541666666666671,0.3672055427251729,0.486549707602339,0.425629290617849,0.51697699890471
302,0.37037037037037,0.413078149920255,0.386281588447653,0.476744186046512,0.4512,0.553945249597424,0.3128038897893029,0.336973478939158,0.400359066427289,0.504187604690117,0.530546623794212,0.586816720257235
303,0.2358402971216339,0.210166177908113,0.215733015494636,0.245795601552393,0.2310654685494219,0.3732394366197179,0.165885660731022,0.20990099009901,0.228091236494598,0.2905317769130999,0.312987012987013,0.443037974683544
304,0.1988105352591329,0.230050933786078,0.23841059602649,0.279880478087649,0.277599142550911,0.315347721822542,0.144463667820069,0.181184668989547,0.1581485053037609,0.258291457286432,0.280353200883002,0.36996336996337
305,0.213206491326245,0.253815715093273,0.2249858677218769,0.2449888641425389,0.244968372627947,0.334925373134328,0.162870159453303,0.195776255707763,0.1894858463315999,0.29233409610984,0.2989934872705739,0.3560559951308579
306,0.610294117647059,0.646341463414634,0.648816936488169,0.695497630331754,0.6332599118942729,0.7116228070175441,0.558854718981972,0.604004449388209,0.6275,0.70878459687124,0.6976483762597979,0.773033707865169
307,0.4038748137108789,0.4603278688524589,0.4676524953789279,0.5008923259964311,0.4704192992533029,0.522670025188917,0.2909647779479329,0.327052489905787,0.328471781864299,0.4258694325808421,0.440658049353702,0.50258064516129
308,0.259023354564756,0.288546255506608,0.300653594771242,0.356,0.297951582867784,0.39832285115304,0.2296137339055789,0.3008849557522119,0.3340611353711789,0.3947368421052629,0.395480225988701,0.545263157894737
309,0.248025276461295,0.272545090180361,0.260956175298805,0.285714285714286,0.250580046403712,0.371657754010695,0.152487961476726,0.230142566191446,0.220647773279352,0.35,0.368171021377672,0.4123989218328839
310,0.574429223744292,0.6163753449862011,0.631199278629396,0.5968627450980389,0.6261151662611519,0.693944353518822,0.39906976744186,0.460093896713615,0.464154411764706,0.528,0.6076475477971739,0.654712260216847


In [68]:
file2 = use_csv.pop()
file2

'infantmortality.csv'

In [74]:
df2 = pd.read_csv(os.path.join(datadir, file2), header=None)
header = [(value.split()[0], value.split()[1]) for value in df2.iloc[0, 2:]]
cols = pd.MultiIndex.from_tuples(header, names=header_names)

df2.drop(index=0, inplace=True)

index = [int(num.split('.')[0]) for num in df2.iloc[:,0]]
df2.index = index

df2.drop(columns=[0,1], inplace=True)
df2.columns = cols
df2

Year,2013-2015,2013-2015,2014-2016,2014-2016,2015-2017,2015-2017
Variable,InfantMortalityRate,NeonatalMortalityRate,InfantMortalityRate,NeonatalMortalityRate,InfantMortalityRate,NeonatalMortalityRate
102,0.8,0.8,0.9,0.9,1.7,1.7
103,2.1,1.5,3.0,1.6,3.2,2.4
104,5.1,3.4,4.0,3.3,2.3,1.3
105,5.2,2.9,2.3,1.2,1.8,1.2
106,1.0,0.8,2.1,1.6,1.8,1.6
107,2.8,1.9,2.6,1.7,2.3,1.3
108,0.8,0.4,0.8,0.4,1.8,0.9
109,4.1,3.5,4.5,3.3,5.0,3.8
110,6.7,4.1,7.2,4.6,6.7,3.8
111,5.7,4.2,5.9,4.2,5.2,3.0


In [75]:
file3 = use_csv.pop()
file3

'd2ghealth.csv'

In [80]:
# Read in df
df3 = pd.read_csv(datadir+'/'+file3, header=None)

# Define header multiindex
header = [(value.split()[-1], ' '.join(value.split()[:-1])) for value in df3.iloc[0,1:]]
cols = pd.MultiIndex.from_tuples(header, names=header_names)

# Drop row 0
df3.drop(index=0, inplace=True)

# Format CD number
df3.iloc[:,0] = df3.iloc[:,0].apply(lambda x: int(x.replace(' ', '')))
df3.set_index(0, inplace=True)
# Drop col 0

# Set index and columns
df3.columns = cols
df3

Year,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,...,2011-2015,2011-2015,2011-2015,2011-2015,2011-2015,2011-2015,2011-2015,2011-2015,2011-2015,District
Variable,Males Under 5,Males 5–14,Males 15–24,Males 25–34,Males 35–44,Males 45–54,Males 55–64,Males 65–74,Males 75–84,Males 85+,...,"Colorectal Cancer Rate, Males (per 100,000)","Colorectal Cancer Rate, Females (per 100,000)","Ovarian Cancer Rate (per 100,000)",Melanoma Cancer (average annual cases),"Melanoma Cancer Rate (per 100,000)","Melanoma Cancer Rate, males (per 100,000)","Melanoma Cancer Rate, Females (per 100,000)",Tobacco-Related Cancers (average annual cases),Leukemias (average annual cases),Community
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
101102,,,,,,,,,,,...,37.1,29.6,13.7,36.8,23.7,29.6,19.2,33.4,73.60000000000001,101 102
102,1.53688638614688,2.36709438329791,4.64738417708136,13.6617069343512,8.67155591662308,6.75517767118867,5.55771948763035,3.97075353060975,1.89634641708492,0.806837530743292,...,,,,,,,,,,
103,1.5400608629679,3.37796790079644,6.60718761416776,11.6612472172744,7.19447252535191,6.42764013644311,5.48490751053384,3.36604962875469,2.00087828418437,0.90088983619317,...,42.9,28.1,12.6,18,9.3,13.8,5.7,72.8,73,103
104,1.49076495408166,2.20974858990607,4.48644461407627,13.8291098432932,11.3874822700599,8.08629790623839,6.78274484664292,3.09040337630118,1.51305491070649,0.299593123817908,...,,,,,,,,,,
104105,,,,,,,,,,,...,49.1,28.5,17.6,35.2,21.5,28.2,15.2,36.2,79,104 105
105,1.75103480156843,1.61134872061774,4.86580281386882,14.3247417819182,8.37360073187849,6.08358862534805,5.48211510156297,4.03991962902982,2.0483197204643,0.618978416612491,...,,,,,,,,,,
106,1.89429589782567,1.80995717983371,4.11576539025744,12.8655189404841,6.05554883089686,5.42985700644035,5.42763073184151,3.83376080020671,2.5886898313722,0.894920388103957,...,37.2,29.3,14.3,58.6,31.1,39.4,24.8,39.599999999999994,97.4,106
107,2.90148905923583,3.58557751665647,2.92985430558832,8.26212518955894,7.00869924407372,7.1509190645068,5.82359029232407,4.43450559515246,2.02335693318507,1.14811583539904,...,33.5,25.4,12.7,62.2,24.4,31,19.4,50.4,132.6,107
108,2.96365112337739,3.5341212955221,2.82486375132322,8.72558010537931,7.0236323643125,5.71731875527023,5.0143697279829,4.49662073288474,2.40895224351156,1.08641673383957,...,34,31.4,14.3,97.2,33.6,45,25,55.4,161.6,108
109,2.08513721263098,4.2789489494796,10.4027142125073,10.780866215476,6.21565266561821,5.81364527928469,4.0863388171264,1.95967556678624,1.31734728136983,0.573412733473521,...,42.2,31.2,11,8.2,6.2,10.6,3.7,29.6,54.2,109


In [81]:
file4 = use_csv.pop()
file4

'd2g2.csv'

In [83]:
df4 = pd.read_csv(datadir+'/'+file4, header=None)
# Define header multiindex
header = [(value.split()[-1], ' '.join(value.split()[:-1])) for value in df4.iloc[0,1:]]
cols = pd.MultiIndex.from_tuples(header, names=header_names)

# Drop row 0
df4.drop(index=0, inplace=True)

# Format CD number
df4.iloc[:,0] = df4.iloc[:,0].apply(lambda x: int(x.replace(' ', '')))
df4.set_index(0, inplace=True)
# Drop col 0

# Set index and columns
df4.columns = cols
df4

Year,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2010-2014,2005-2007,2010-2014,2010-2014,2014,2014,2014,2014,District
Variable,Black (% of total population),Black Population (#),Females 15-24 (#),Females 25-34 (#),Females 35-44 (#),Females 45-54 (#),Females 5-14 (#),Females 55-64 (#),Females 65-74 (#),Females 75-84 (#),...,Median Personal Earnings: Change Since 2005-2007 (% change) 2005-2007 to,Median Personal Earnings,Median Personal Earnings: 2005–2007 (2013 $),Median Home Value ($ for owner-occupied units),Veteran Income (median),Veteran Poverty (# of veterans with incomes below poverty),Unemployed Veterans (#),Veteran Poverty (% of veterans with incomes below poverty),Unemployed Veterans (% of veteran population),Community
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
101,3.3854324618253,2222.45327162,2110.378561191,4333.742788367,10068.897953647,6037.84153499,3161.24656228,3006.951025559,1542.967369635,688.772165527,...,,,,,,,,,,
101102,,,,,,,,,,,...,17.375397848712275,80606,68673.67564018385,1000001,48667,207,49,10.3811434302909,4.06976744186047,101 102
102,1.83588969470867,1649.6869032179,1801.0338667607,6024.7643821705,12815.2966945955,5733.9327922875,4595.9113025446,5524.1120627666,4060.3585378829,1928.4963996191,...,,,,,,,,,,
103,6.66737823371608,10661.859823535,5162.587568184,12663.492822356,16414.805336652,10942.225663094,11159.842125342,10146.936906338,6661.67408985,4442.731433011,...,11.465044170356304,37536,33675.13131976361,612200,20717,256,64,7.14485068378454,2.99205236091632,103
104,6.10783236447841,6570.907993477,2311.775280093,5856.874062956,14435.551770365,7781.598852008,5872.002955723,5912.745948971,3792.131316424,2255.424511353,...,,,,,,,,,,
104105,,,,,,,,,,,...,12.966111839627908,69094,61163.475377544324,863100,21833,579,29,31.2129380053908,2.265625,104 105
105,3.83105495428544,2142.198146788,972.00396315,5230.425707796,8786.979927886,4150.213447802,2585.745702482,2827.024389337,1946.585287452,909.08692024,...,,,,,,,,,,
106,3.43454559232472,4537.80185455,2469.996038326,11178.574299231,19826.020093831,8569.78656585,7164.254301537,7938.975623233,7439.414721674,4289.913084597,...,-2.843126140602873,74926,77118.57846355878,734200,93199,561,6,21.3795731707317,0.396301188903567,106
107,6.37668936679184,13389.092005894,7947.22472051,8645.12594331,22514.448215076,17347.401144962,15958.997041802,14165.254051436,11213.86868472,6034.575490266,...,-8.824477833075385,67436,73962.833880499,971200,60255,298,71,14.4590004852014,6.69180018850141,107
108,2.8306789071118,6210,8277,8641,30531,17889,13310,15843,12704,7926,...,-8.176265003445076,75903,82661.6342744583,1000001,80916,197,294,7.68930523028884,24.2174629324547,108


In [84]:
file5 = use_csv.pop()
file5

'd2g1.csv'

In [89]:
# Read in df
df5 = pd.read_csv(datadir+'/'+file5, header=None)

# Define header multiindex
header = [(value.split()[-1], ' '.join(value.split()[:-1])) for value in df5.iloc[0,1:]]
cols = pd.MultiIndex.from_tuples(header, names=header_names)

# Drop row 0
df5.drop(index=0, inplace=True)

# Format CD number
df5.iloc[:,0] = df5.iloc[:,0].apply(lambda x: int(x.replace(' ', '')))
df5.set_index(0, inplace=True)
# Drop col 0

# Set index and columns
df5.columns = cols
df5

Year,2012-2016,2012-201,2015,2016,2010,2016,2016-2017,2018,2018,2016,...,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,2012-2016,District
Variable,dis_y_numb_puma,dis_y_perc_puma,air_qual_cd,citu_land_cd,diversion_cd,lead_complaints_cd,noise_per_1000_cd,park_recreation_land_cd,vacant_parking_land_cd,waste_cd Fiscal Year,...,retail_trade_industries_cd,transportation_warehousing_utilities_industries_cd,private_wage_and_salary_workers_class_cd,government_workers_class_cd,self_employed_class_cd,unpaid_family_workers_class_cd,labor_force_participation_cd,working_poor_cd,poverty_all_ages_federal_change_cd 2000 to,Community
0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
101,,,,,,,,,,,...,5.49,1.09,88.5,5.59,5.88,0.03,77.38,3.61,-18.73,101
101102,777,4.9,,,,,,,,,...,,,,,,,,,,
102,,,,,,,,,,,...,7.68,1.24,85.37,4.49,10.01,0.12,72.38,2.75,-26.82,102
103,2325,11.54,,,,,,,,,...,8.35,3.15,82.94,8.44,8.18,0.44,59.41,9.75,-8.7,103
104,,,,,,,,,,,...,7.93,1.75,86.66,5.73,7.6,0.01,77.59,5.03,-15.53,104
104105,1100,7.93,,,,,,,,,...,,,,,,,,,,
105,,,,,,,,,,,...,5.88,1.19,87.48,4.28,8.16,0.09,72,3.73,3.59,105
106,1209,7.45,,,,,,,,,...,5.95,1.47,85.84,7.35,6.63,0.18,73.17,2.89,14.54,106
107,1375,11.29,,,,,,,,,...,5,1.15,81.52,7.58,10.72,0.18,69.6,3.8,-0.78,107
108,870,6.96,,,,,,,,,...,6.11,1.35,85.18,6.47,8.15,0.2,71.31,2.59,1.94,108


In [100]:
years = [str(year) for year in range(2000,2015)]
cd_all_dfs = []
for year in years:
    df = pd.read_csv(datadir+f'/cdall_{year[-2:]}.csv', header=None)
    header = [(year, value) for value in df.iloc[0,1:]]
    cols = pd.MultiIndex.from_tuples(header, names=header_names)
    df.drop(index=0, inplace=True)
    index = [int(num.split('.')[0]) for num in df.iloc[:,0]]
    df.drop(columns=0, inplace=True)
    df.columns = cols
    df.index=index
    cd_all_dfs.append(df)

In [108]:
use_csv_left = use_csv[:-20]

In [109]:
use_csv_left

['2015_CHP_all_data.csv',
 '2015_Cause_of_death_data.csv',
 '2018_CHP_all_data.csv',
 '2018_Cause_of_premature_death_data.csv',
 'AbuseNeglectReport2015To2018.csv',
 'ChildWelfare.csv',
 'EPHT_CD.csv',
 'EPTH_UHF.csv',
 'Family_Violence_Related_Snapshots__New_York_City_Community_Board_Districts.csv',
 'Intimate_Partner_Violence_Related_Snapshots__New_York_City_Community_Board_Districts.csv',
 'NTA_Data.csv']

In [110]:
dfa = pd.read_csv(datadir+'/NTA_Data.csv', header=None)
dfa

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,186,187,188,189,190,191,192,193,194,195
0,NTA_Name,NTA_Code,TotalPop,Female,PopU18,Pop18t24,Pop25t44,Pop45t64,Pop65pl,Hispanic,...,NitricOxide,NitrogenDioxide,Ozone,SulfurDioxide,Daycares,AlcoholRetailers,TobaccoRetailers,FelonyCrime,PropertyCrime,ViolentCrime
1,New York City,NYC,8354889,52.4,21.4,10.1,31.4,24.6,12.5,28.8,...,19.5,18.9,31.4,0.6,11234,25.7,10.9,20.3,10.7,4.4
2,Brooklyn Heights-Cobble Hill,BK09,23532,51,12.8,9.4,42.2,22.5,13.1,8.9,...,32,23.7,28.4,0.5,21,33.1,7.2,11.6,7.4,1.7
3,Sheepshead Bay-Gerritsen Beach-Manhattan Beach,BK17,63459,53.6,19.1,7.4,25.3,29.7,18.6,7.1,...,16.5,16.9,33,0.4,71,21.3,9.6,16.5,9.5,3
4,Brighton Beach,BK19,33146,53.1,17.6,7.4,24.5,28.4,22.1,12.4,...,18.7,18,32.5,0.5,39,18.7,8.1,16.1,9.5,4.1
5,Seagate-Coney Island,BK21,29229,53.1,22.5,12.2,21.2,24.9,19.3,24.9,...,14.9,15.8,33.7,0.4,27,23.9,10.6,32.2,15.5,6.8
6,West Brighton,BK23,14977,58.8,7.6,5.3,18.8,31.9,36.3,0.5,...,17.1,17,32.7,0.4,6,8.7,4,14.3,10.3,1.6
7,Homecrest,BK25,43565,51.7,26.1,8.3,27.5,23.2,14.9,7.3,...,20,19.4,31.6,0.5,39,19.7,10.1,12.9,7.3,3
8,Gravesend,BK26,28944,55,21.1,8.3,25.8,24.2,20.6,13.9,...,16.3,17.2,32.8,0.4,19,13.8,7.9,17,9.8,3.2
9,Bath Beach,BK27,32002,49.1,20.2,7.4,31.7,24.7,15.9,15.2,...,17.7,18.8,31.8,0.4,23,15,10,9.7,4.6,2.3
