In [1]:
## Import libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Markdown
from shapely.geometry import Point, Polygon

%matplotlib inline

In [2]:
## Define the working directory. (This should be executed just once)
os.chdir(os.path.join('..'))
working_path = os.getcwd()

print('The working directory will be {}'.format(working_path))

# Define the path where the data sets are located
datasets_path = os.path.join(working_path, 'Datasets')

print('The datasets folder is {}'.format(datasets_path))

The working directory will be C:\Users\Hector\Documents\DS4A_datathon
The datasets folder is C:\Users\Hector\Documents\DS4A_datathon\Datasets


In [3]:
demographics_path = os.path.join(datasets_path, 'demographics.csv')

In [4]:
raw_df = pd.read_csv(demographics_path)
raw_df.head()

Unnamed: 0,nta_name,borough,nta_code,population,under_5_years,5-9_years,10-14_years,15-19_years,20-24_years,25-29_years,...,15000_to_24999,25000_to_34999,35000_to_49999,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more,median_income,mean_income
0,Allerton-Pelham Gardens,Bronx,BX31,28903,1679,1706,1763,2039,1964,1703,...,797,773,1160,1764,1155,1562,765,427,61638,78489
1,Annadale-Huguenot-Prince's Bay-Eltingville,Staten Island,SI01,27770,1397,1698,1817,1880,1720,1594,...,571,405,1008,1523,1346,2075,1086,1151,88288,109187
2,Arden Heights,Staten Island,SI48,25238,1507,1540,1596,1752,1614,1561,...,337,516,707,1421,1611,2021,1047,740,89570,101627
3,Astoria,Queens,QN70,78793,3480,3037,3060,3392,6630,11586,...,3673,2816,4725,6463,4557,4698,1627,1197,54882,70094
4,Auburndale,Queens,QN48,19996,917,966,1063,1168,1214,1307,...,445,632,690,1417,1060,1237,589,433,70772,84402


In [5]:
raw_df.columns

Index(['nta_name', 'borough', 'nta_code', 'population', 'under_5_years',
       '5-9_years', '10-14_years', '15-19_years', '20-24_years', '25-29_years',
       '30-34_years', '35-39_years', '40-44_years', '45-49_years',
       '50-54_years', '55-59_years', '60-64_years', 'over_65_years',
       'median_age', 'people_per_acre', 'households', 'less_than_10,000',
       '10000_to_14999', '15000_to_24999', '25000_to_34999', '35000_to_49999',
       '50000_to_74999', '75000_to_99999', '100000_to_149999',
       '150000_to_199999', '200000_or_more', 'median_income', 'mean_income'],
      dtype='object')

In [6]:
raw_df['area_squarekm'] = raw_df['population']/(raw_df['people_per_acre'] * 247.105)

In [7]:
raw_df['total_income'] = raw_df['mean_income'] * raw_df['population']

In [8]:
important_columns = ['borough', 'population', 'total_income', 'area_squarekm', 'under_5_years', 
                     '5-9_years', '10-14_years', '15-19_years', '20-24_years', '25-29_years',
                     '30-34_years', '35-39_years', '40-44_years', '45-49_years', '50-54_years', 
                     '55-59_years', '60-64_years', 'over_65_years', 'households', 'less_than_10,000', 
                     '10000_to_14999', '15000_to_24999', '25000_to_34999', '35000_to_49999', 
                     '50000_to_74999', '75000_to_99999', '100000_to_149999','150000_to_199999', '200000_or_more',]

In [9]:
age_columns = ['under_5_years', '5-9_years', '10-14_years', '15-19_years', 
               '20-24_years', '25-29_years', '30-34_years', '35-39_years', 
               '40-44_years', '45-49_years', '50-54_years', '55-59_years', 
               '60-64_years', 'over_65_years']

In [10]:
raw_df['acre_area'] = raw_df['population']/raw_df['people_per_acre']
raw_df[['nta_name', 'borough', 'population', 'acre_area', 'area_squarekm']]

Unnamed: 0,nta_name,borough,population,acre_area,area_squarekm
0,Allerton-Pelham Gardens,Bronx,28903,726.206030,2.938856
1,Annadale-Huguenot-Prince's Bay-Eltingville,Staten Island,27770,3305.952381,13.378735
2,Arden Heights,Staten Island,25238,1157.706422,4.685079
3,Astoria,Queens,78793,902.554410,3.652514
4,Auburndale,Queens,19996,784.156863,3.173375
...,...,...,...,...,...
183,Windsor Terrace,Brooklyn,20988,322.396313,1.304694
184,Woodhaven,Queens,56674,853.524096,3.454095
185,Woodlawn-Wakefield,Bronx,42483,901.974522,3.650167
186,Woodside,Queens,45099,648.906475,2.626035


In [11]:
clean_df = raw_df[important_columns].groupby('borough').sum().reset_index()
clean_df

Unnamed: 0,borough,population,total_income,area_squarekm,under_5_years,5-9_years,10-14_years,15-19_years,20-24_years,25-29_years,...,"less_than_10,000",10000_to_14999,15000_to_24999,25000_to_34999,35000_to_49999,50000_to_74999,75000_to_99999,100000_to_149999,150000_to_199999,200000_or_more
0,Bronx,1372111,66794139347,87.51523,102995,98538,99048,114344,110579,103829,...,75525,45782,68197,53967,63301,72104,42686,38418,11715,7980
1,Brooklyn,2503518,168500006492,159.648092,177058,159285,156494,170623,195733,222691,...,105868,64435,105583,90596,116024,146853,98214,107195,45358,45069
2,Manhattan,1584024,198983103829,51.272737,76555,61321,58226,77441,141450,186552,...,71440,39207,62636,51331,64489,94112,73515,99567,56607,132185
3,Queens,2230286,163702982674,231.429413,132453,123751,123397,139078,160848,184896,...,55668,36527,79695,72079,99299,140095,101834,114328,47029,33343
4,Staten Island,468730,42324993228,149.03811,28339,30015,30797,32929,31458,29988,...,11241,6381,13051,11721,15934,25193,22729,31191,15978,11660


In [14]:
path_demographics = os.path.join(datasets_path, 'demographics_proccesed.csv')
clean_df.to_csv(path_demographics)