In [1]:
import os
import pandas as pd
import zipfile
import geopandas as  gpd
from fiona.crs import from_epsg
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import seaborn
import datetime
import pylab as pl
%pylab inline

Populating the interactive namespace from numpy and matplotlib




In [2]:
def get_data():
    '''
    Download data for Homework 11, Assigment 2 and move to HW11 PUI Data Folder, including:
    1.  Census Data for Businesses by Zipcode
    2.  NYC Zipcodes Shapefile 
    '''
    
    # Download NYC Zipcode Shapefile from GitHub HW11 Repository
    
    url_base = "https://raw.githubusercontent.com/fedhere/PUI2016_fb55/master/HW11_fb55"
    !curl -O "{url_base}/nyc-zip-code-tabulation-areas-polygons.geojson"
    
    #  Commenting out this portion of the function now that data is downloaded
    '''
    # Download Census Data for Business by Zip Code (Code provided by professor)

    !for ((y=93; y<=99; y+=1)); do wget ftp://ftp.census.gov/Econ2001_And_Earlier/CBP_CSV/zbp$y\totals.zip; done 
    !for ((y=0; y<=1; y+=1)); do wget ftp://ftp.census.gov/Econ2001_And_Earlier/CBP_CSV/zbp0$y\totals.zip; done
    !for ((y=2; y<=9; y+=1)); do wget ftp://ftp.census.gov/econ200$y\/CBP_CSV/zbp0$y\totals.zip; done
    !for ((y=10; y<=15; y+=1)); do wget ftp://ftp.census.gov/econ20$y\/CBP_CSV/zbp$y\totals.zip; done
    '''
    
get_data()

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  591k  100  591k    0     0  1315k      0 --:--:-- --:--:-- --:--:-- 4696k


In [3]:
# Locate the course data directory through the PUIDATA environment variable
puidata = os.getenv('PUIDATA')
# Read the zip code polygons stored in the HW11 folder under PUIDATA
shapefile_path = puidata + "/HW11/nyc-zip-code-tabulation-areas-polygons.geojson"
nycshape = gpd.read_file(shapefile_path)
nycshape.head()

Unnamed: 0,@id,BLDGpostalCode,CTY_FIPS,OBJECTID,PO_NAME,STATE,ST_FIPS,Shape_Area,Shape_Leng,borough,geometry,postalCode
0,http://nyc.pediacities.com/Resource/PostalCode...,0,81,1,Jackson Heights,NY,36,20163280.0,20624.692317,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,11372
1,http://nyc.pediacities.com/Resource/PostalCode...,0,81,2,Glen Oaks,NY,36,22606530.0,23002.816039,Queens,POLYGON ((-73.71068374074007 40.75004039450917...,11004
2,http://nyc.pediacities.com/Resource/PostalCode...,0,81,3,New Hyde Park,NY,36,6269333.0,15749.161511,Queens,POLYGON ((-73.70098278625547 40.73889569923034...,11040
3,http://nyc.pediacities.com/Resource/PostalCode...,0,81,4,Bellerose,NY,36,49418360.0,35932.810639,Queens,POLYGON ((-73.72270447144122 40.75373371438336...,11426
4,http://nyc.pediacities.com/Resource/PostalCode...,0,81,5,Fresh Meadows,NY,36,69385870.0,38693.565676,Queens,POLYGON ((-73.81088634744755 40.72717187575918...,11365


In [4]:
# Expose postalCode under the name "zipcode" and keep only the columns that
# are used further down.
keep_cols = ['zipcode', 'borough', 'geometry', 'PO_NAME']
nycshape = nycshape.rename(columns={'postalCode': "zipcode"})[keep_cols]
# The columns are read in as 'object'; zipcode is converted to a numeric dtype.
nycshape['zipcode'] = pd.to_numeric(nycshape['zipcode'])
nycshape.head()

Unnamed: 0,zipcode,borough,geometry,PO_NAME
0,11372,Queens,POLYGON ((-73.86942457284175 40.74915687096787...,Jackson Heights
1,11004,Queens,POLYGON ((-73.71068374074007 40.75004039450917...,Glen Oaks
2,11040,Queens,POLYGON ((-73.70098278625547 40.73889569923034...,New Hyde Park
3,11426,Queens,POLYGON ((-73.72270447144122 40.75373371438336...,Bellerose
4,11365,Queens,POLYGON ((-73.81088634744755 40.72717187575918...,Fresh Meadows


In [5]:
# Duplicate check: compare the number of rows with the number of distinct
# zip codes. A gap means some zip codes appear on more than one row.
n_rows = len(nycshape)
n_zipcodes = nycshape['zipcode'].nunique(dropna=False)
print (n_rows)
print (n_zipcodes)

262
247


In [6]:
# Build one DataFrame from the yearly census business archives
# (zbpYYtotals.zip, read from the current working directory).
years = ['94', '95', '96', '97', '98', '99', '00', '01', '02', '03', '04', '05', '06', '07',
        '08', '09', '10', '11', '12', '13', '14']

yearly_frames = []
for year in years:
    fname = 'zbp' + year + 'totals.zip'
    # Context managers close the archive and its member once the CSV is parsed.
    with zipfile.ZipFile(fname) as zf:
        with zf.open(fname.replace('.zip', '.txt')) as member:
            each_year = pd.read_csv(member)
    each_year['year'] = year
    # Header capitalization differs between files, so both spellings are
    # mapped. The establishment count gets a per-year name (estYY) and the
    # zip code column becomes the "zipcode" key.
    each_year = each_year.rename(columns={
        "est": "est" + year,
        "EST": "est" + year,
        "ZIP": "zipcode",
        "zip": "zipcode",
    })
    yearly_frames.append(each_year)

# Concatenate once instead of re-copying a growing frame on every iteration.
CombinedValues = pd.concat(yearly_frames, axis=0)

CombinedValues.head()

Unnamed: 0,AP,EMP,EMPFLAG,NAME,QP1,ap,ap_nf,city,cty_name,emp,...,est96,est97,est98,est99,name,qp1,qp1_nf,stabbr,year,zipcode
0,,,,,,155158.0,,,,6198.0,...,,,,,"Agawam, MA",33601.0,,,94,1001
1,,,,,,127367.0,,,,6073.0,...,,,,,"Amherst, MA",28924.0,,,94,1002
2,,,,,,1604.0,,,,68.0,...,,,,,"Amherst, MA",367.0,,,94,1003
3,,,,,,4302.0,,,,210.0,...,,,,,"Amherst, MA",844.0,,,94,1004
4,,,,,,13521.0,,,,678.0,...,,,,,"Barre, MA",3010.0,,,94,1005


In [7]:
# Keep only the per-year establishment counts (estYY) and the zipcode key.
# Columns are chosen by name and copied into a new frame, so this cell gives
# the same result when re-run and leaves CombinedValues itself unmodified.
est_columns = [col for col in CombinedValues.columns if col.startswith('est')]
CombinedValues_reduced = CombinedValues[est_columns + ['zipcode']].copy()

CombinedValues_reduced.head()

Unnamed: 0,est00,est01,est02,est03,est04,est05,est06,est07,est08,est09,...,est12,est13,est14,est94,est95,est96,est97,est98,est99,zipcode
0,,,,,,,,,,,...,,,,439.0,,,,,,1001
1,,,,,,,,,,,...,,,,450.0,,,,,,1002
2,,,,,,,,,,,...,,,,10.0,,,,,,1003
3,,,,,,,,,,,...,,,,47.0,,,,,,1004
4,,,,,,,,,,,...,,,,92.0,,,,,,1005


In [8]:
# Make sure the zipcode key is numeric, like nycshape.zipcode, ahead of the merge.
# pd.to_numeric converts the whole column in one call rather than being
# invoked once per element through .apply.
CombinedValues_reduced["zipcode"] = pd.to_numeric(CombinedValues_reduced["zipcode"])
CombinedValues_reduced.head()

Unnamed: 0,est00,est01,est02,est03,est04,est05,est06,est07,est08,est09,...,est12,est13,est14,est94,est95,est96,est97,est98,est99,zipcode
0,,,,,,,,,,,...,,,,439.0,,,,,,1001
1,,,,,,,,,,,...,,,,450.0,,,,,,1002
2,,,,,,,,,,,...,,,,10.0,,,,,,1003
3,,,,,,,,,,,...,,,,47.0,,,,,,1004
4,,,,,,,,,,,...,,,,92.0,,,,,,1005


In [None]:
# Attach the census columns to the zip code polygons by joining on "zipcode"
# (inner join, which is what merge does by default).
# NOTE(review): CombinedValues_reduced appears to hold one row per zipcode
# and year, so each polygon would be repeated once per year - confirm that
# this is intended before using the result.
combinedtest = nycshape.merge(right=CombinedValues_reduced, how="inner", on="zipcode")
combinedtest.head()