In [481]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from pprint import pprint
from sqlalchemy import create_engine
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

# Render floats in pandas output with thousands separators (e.g. 1,234.5)
pd.set_option("display.float_format", "{:,}".format)

# Prep Data and Create DB

### Import and Clean Data

In [482]:
# Load the raw parcel extract, order it by zipcode, renumber the rows, and only
# then discard incomplete rows. dropna deliberately comes last: it leaves gaps
# in the renumbered index, and those index labels are what the hand-picked
# bad_indices list further down is matched against.
pluto_df = (
    pd.read_csv("pluto.csv")
    .sort_values(by="zipcode")
    .reset_index(drop=True)
    .dropna()
)

In [483]:
# Export every parcel with lotarea above 1,000,000 sf so outliers can be reviewed.
# The review is manual because the discrepancies are too inconsistent to automate:
# some zips have a legit 30,000,000sf park that isn't Central Park, or a
# 15,000,000sf parcel of vacant land...
oversized_parcels = pluto_df.loc[pluto_df["lotarea"] > 1000000]
oversized_parcels.sort_values(by="zipcode").to_csv("big_parcels.csv")

In [484]:
# Index labels of bad parcels: parcels that do not correspond to the geojson
# and throw off KNN. These are row labels of pluto_df, not positions.
bad_indices = [
    27309, 42060, 45278, 78158, 87500, 94465,
    99456, 109342, 114283, 446822, 467890, 582314,
    630487, 631393, 651855, 831941, 833776,
]

In [485]:
# The row labels are not contiguous here (dropna ran after reset_index when the
# data was loaded), so bad parcels are matched on index label, not on position.
pluto_df = pluto_df[~pluto_df.index.isin(bad_indices)]

# Drop parcels with 0 lotarea
pluto_df = pluto_df[pluto_df["lotarea"] > 0]

# Clean: renumber the surviving rows 0..n-1
pluto_df = pluto_df.reset_index(drop=True)

In [486]:
# Display the cleaned parcel table
pluto_df

Unnamed: 0,zipcode,borough,borocode,landuse,bldgarea,lotarea
0,10001.0,MN,1.0,5.0,611625.0,23601.0
1,10001.0,MN,1.0,4.0,13489.0,2469.0
2,10001.0,MN,1.0,5.0,34000.0,17773.0
3,10001.0,MN,1.0,5.0,2008.0,1980.0
4,10001.0,MN,1.0,6.0,58764.0,4937.0
5,10001.0,MN,1.0,6.0,113000.0,7407.0
6,10001.0,MN,1.0,3.0,13000.0,2364.0
7,10001.0,MN,1.0,10.0,34213.0,6300.0
8,10001.0,MN,1.0,10.0,0.0,8636.0
9,10001.0,MN,1.0,4.0,5849.0,2467.0


### Sum Data by Zipcode and Normalize

DESCRIPTION OUT OF DATE

Map bldgarea and lotarea to individual landuses. This gets confusing, but landuses associated with buildings get bldgarea and open space uses like parks get lot area. Effectively, this means that building uses and open space uses are in different units. The most elegant solution would be to just normalize each column individually for KNN, but this creates confusing visualizations b/c a low density zipcode that may be 50% parks but is in the 80th percentile will appear overwhelmingly green.


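So, after mapping bldg/lot areas to uses, building uses are divided by the zipcode's total building lot area (giving a weighted FAR), and open space uses are converted to their share of the zipcode's total lot area and then scaled by the zipcode's extrapolated FAR, so that both groups end up in comparable units.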

In [487]:
# One hot encoding for landuse: one indicator column per landuse code
dummy_df = pd.get_dummies(pluto_df["landuse"])

# Join the indicators onto a copy of the parcels without the raw landuse column.
# pluto_df itself is left untouched so this cell can be re-run on its own
# (dropping the column in place made a second run fail on the missing column).
data_df = pluto_df.drop(columns="landuse").join(dummy_df)
data_df.head()

Unnamed: 0,zipcode,borough,borocode,bldgarea,lotarea,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
0,10001.0,MN,1.0,611625.0,23601.0,0,0,0,0,1,0,0,0,0,0,0
1,10001.0,MN,1.0,13489.0,2469.0,0,0,0,1,0,0,0,0,0,0,0
2,10001.0,MN,1.0,34000.0,17773.0,0,0,0,0,1,0,0,0,0,0,0
3,10001.0,MN,1.0,2008.0,1980.0,0,0,0,0,1,0,0,0,0,0,0
4,10001.0,MN,1.0,58764.0,4937.0,0,0,0,0,0,1,0,0,0,0,0


In [488]:
# Add a placeholder column (a copy of zipcode) whose entries are counted during
# the groupby. The resulting count is used later to cull mini zipcodes in
# Manhattan with only a couple of buildings.
data_df = data_df.assign(bldg_count=data_df["zipcode"])

In [489]:
# Start bldg_lotarea at zero; it accumulates the lot area of parcels with
# building landuses. This is needed to create a weighted average FAR after
# summing by zipcode.
data_df = data_df.assign(bldg_lotarea=0)

In [490]:
# Assign parcel areas to each land use.
# Codes below 8 are treated as building uses and receive bldgarea;
# codes 8 and up are treated as open space uses and receive lotarea.
cols = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]
building_uses = [use for use in cols if use < 8]
open_space_uses = [use for use in cols if not use < 8]

for use in building_uses:
    indicator = data_df[use]
    # Accumulate lot areas of building-use parcels before the indicator column
    # is overwritten with building areas
    data_df['bldg_lotarea'] = data_df['bldg_lotarea'] + indicator * data_df['lotarea']
    data_df[use] = indicator * data_df['bldgarea']

for use in open_space_uses:
    data_df[use] = data_df[use] * data_df['lotarea']

data_df = data_df.dropna()

In [491]:
# Preview the first ten parcels after areas were assigned to landuse columns
data_df.head(10)

Unnamed: 0,zipcode,borough,borocode,bldgarea,lotarea,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,bldg_count,bldg_lotarea
0,10001.0,MN,1.0,611625.0,23601.0,0.0,0.0,0.0,0.0,611625.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,23601.0
1,10001.0,MN,1.0,13489.0,2469.0,0.0,0.0,0.0,13489.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,2469.0
2,10001.0,MN,1.0,34000.0,17773.0,0.0,0.0,0.0,0.0,34000.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,17773.0
3,10001.0,MN,1.0,2008.0,1980.0,0.0,0.0,0.0,0.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,1980.0
4,10001.0,MN,1.0,58764.0,4937.0,0.0,0.0,0.0,0.0,0.0,58764.0,0.0,0.0,0.0,0.0,0.0,10001.0,4937.0
5,10001.0,MN,1.0,113000.0,7407.0,0.0,0.0,0.0,0.0,0.0,113000.0,0.0,0.0,0.0,0.0,0.0,10001.0,7407.0
6,10001.0,MN,1.0,13000.0,2364.0,0.0,0.0,13000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,2364.0
7,10001.0,MN,1.0,34213.0,6300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6300.0,0.0,10001.0,0.0
8,10001.0,MN,1.0,0.0,8636.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8636.0,0.0,10001.0,0.0
9,10001.0,MN,1.0,5849.0,2467.0,0.0,0.0,0.0,5849.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10001.0,2467.0


In [492]:
# Groupby to create summary data: one row per zipcode

# The text borough column is dropped before aggregating
data_df = data_df.drop(columns="borough")

# Aggregation spec. Key order determines the output column order, and later
# cells slice landuse_df by column position, so the order here matters.
landuse_codes = [float(code) for code in range(1, 12)]
aggregations = {'borocode': 'median', 'lotarea': 'sum'}
aggregations.update({code: 'sum' for code in landuse_codes})
aggregations['bldg_count'] = 'count'
aggregations['bldg_lotarea'] = 'sum'

landuse_df = data_df.groupby("zipcode").agg(aggregations).reset_index()

In [493]:
# Transform building landuses (codes 1-7) into weighted FAR: summed building
# area divided by the summed lot area of building-use parcels in the zipcode
for use in range(1, 8):
    landuse_df[use] = landuse_df[use] / landuse_df['bldg_lotarea']

In [494]:
# Normalize open space landuses
# Columns are selected by label rather than by position so this cell does not
# silently pick the wrong columns if the aggregation column order ever changes.
building_uses = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
open_space_uses = [8.0, 9.0, 10.0, 11.0]

# Transform open space landuses into percentages of total 2D area
for use in open_space_uses:
    landuse_df[use] = landuse_df[use] / landuse_df['lotarea']

# Find total built FAR
landuse_df['sum_FAR'] = landuse_df[building_uses].sum(axis=1)

# Find total % of zipcode dedicated to open space landuses, and then % dedicated to buildings
landuse_df['open_space_percent'] = landuse_df[open_space_uses].sum(axis=1)
landuse_df['buildings_percent'] = 1 - landuse_df['open_space_percent']

# Extrapolate FAR for entire zipcode as if open space uses were buildings
# NOTE: a zipcode whose buildings_percent is 0 yields a division by zero here
landuse_df['max_FAR'] = landuse_df['sum_FAR'] / landuse_df['buildings_percent']

# Apportion FAR to each open space landuse
for use in open_space_uses:
    landuse_df[use] = landuse_df[use] * landuse_df['max_FAR']

# Clean: remove the helper columns, then any rows left with missing values
helper_cols = ['sum_FAR',
               'open_space_percent',
               'buildings_percent',
               'max_FAR',
               'bldg_lotarea',
               'lotarea']
landuse_df = landuse_df.drop(columns=helper_cols).dropna()

In [495]:
# Remove minuscule zipcodes from Manhattan only (lots of 1-10 building mini-zipcodes).
# A zipcode is kept when it has at least 10 counted parcels or is outside Manhattan.
has_enough_bldgs = landuse_df['bldg_count'] >= 10
outside_manhattan = landuse_df['borocode'] != 1.0
landuse_df = landuse_df[has_enough_bldgs | outside_manhattan].drop(columns='bldg_count')

In [496]:
# Relabel borocode so it is easy to remember. The text borough column had to be
# dropped before the groupby, so names are restored from the numeric code here.
boro_dict = {"Manhattan": 1.0,
             "Bronx": 2.0,
             "Brooklyn": 3.0,
             "Queens": 4.0,
             "Staten Island": 5.0}

# Inverse lookup (numeric code -> borough name), built once instead of on every call
_BORO_NAME_BY_CODE = {code: name for name, code in boro_dict.items()}

def relabel(num):
    """Return the borough name for a numeric borocode.

    Parameters
    ----------
    num : number
        Borough code as stored in the data (1.0 .. 5.0; the int 1 also matches 1.0).

    Returns
    -------
    str
        The matching key of ``boro_dict``.

    Raises
    ------
    ValueError
        If ``num`` is not one of the known codes, e.g. a fractional value
        produced by taking the median borocode of a zipcode.
    """
    try:
        return _BORO_NAME_BY_CODE[num]
    except (KeyError, TypeError):
        known = sorted(_BORO_NAME_BY_CODE)
        raise ValueError(f"{num!r} is not a known borocode; expected one of {known}") from None

# Replace each numeric borocode with its borough name
landuse_df['borocode'] = landuse_df['borocode'].map(relabel)

In [497]:
# Read in and attach neighborhood names (important for frontend display).
# The inner join keeps only zipcodes that appear in both tables.
neighborhoods_df = pd.read_csv("neighborhoods.csv")
landuse_df = pd.merge(landuse_df, neighborhoods_df, how="inner", on="zipcode")

In [498]:
# Rename columns: readable names for the landuse code columns,
# listed in code order (1.0 through 11.0)
_landuse_labels = [
    "1 & 2 Family Residential",
    "Multi-Family Walk-up",
    "Multi-Family Elevator",
    "Mixed Residential & Commercial",
    "Commercial & Office",
    "Industrial & Manufacturing",
    "Transportation & Utility",
    "Public Facilities & Institutions",
    "Open Space & Outdoor Recreation",
    "Parking",
    "Vacant Land",
]
cols_dict = {float(code): label for code, label in enumerate(_landuse_labels, start=1)}

# Swap the numeric landuse column labels for their readable names
landuse_df = landuse_df.rename(columns=cols_dict)

In [499]:
# Reorder columns: identifiers first, then the landuse columns in code order.
# Later cells address these columns by position, so the order is significant.
id_cols = ['zipcode', 'borocode', 'neighborhood']
landuse_cols = ['1 & 2 Family Residential',
                'Multi-Family Walk-up',
                'Multi-Family Elevator',
                'Mixed Residential & Commercial',
                'Commercial & Office',
                'Industrial & Manufacturing',
                'Transportation & Utility',
                'Public Facilities & Institutions',
                'Open Space & Outdoor Recreation',
                'Parking',
                'Vacant Land']
cols = id_cols + landuse_cols

landuse_df = landuse_df.loc[:, cols]

Weighting KNN based on FAR for each landuse will result in most high density Manhattan neighborhoods choosing the highest density outer borough neighborhood (Brooklyn Heights) and most low density outer borough neighborhoods choosing the lowest density Manhattan neighborhood (Roosevelt Island).

Treating landuse as % and adding in total FAR as a column should help balance the results, while still providing a legible visualization. Just keep in mind that KNN is using Euclidean distance, so a difference of 10 FAR will be difficult to overcome with % landuse differences < 1.0. However, all we're trying to overcome is the difference between the standout zipcodes for Roosevelt Island and Brooklyn Heights, which aren't THAT much higher or lower density than other zipcodes.

In [500]:
# Create summary FAR attribute: running total of the seven building landuse
# columns, which sit at column positions 3 through 9
far_total = landuse_df.iloc[:, 3]
for position in range(4, 10):
    far_total = far_total + landuse_df.iloc[:, position]
landuse_df['FAR'] = far_total

In [501]:
# Convert landuse attributes to %s of the zipcode's summary FAR.
# All eleven landuse columns (positions 3-13) are divided, including the four
# open space columns that are not part of the FAR total itself.
far = landuse_df['FAR']
for position in range(3, 14):
    landuse_df.iloc[:, position] = landuse_df.iloc[:, position] / far

In [502]:
# Display the per-zipcode landuse shares together with the raw summary FAR
landuse_df

Unnamed: 0,zipcode,borocode,neighborhood,1 & 2 Family Residential,Multi-Family Walk-up,Multi-Family Elevator,Mixed Residential & Commercial,Commercial & Office,Industrial & Manufacturing,Transportation & Utility,Public Facilities & Institutions,Open Space & Outdoor Recreation,Parking,Vacant Land,FAR
0,10001.0,Manhattan,NOMAD / Chelsea,0.0004124198483636065,0.007268694267911448,0.10247569231334129,0.17028275070858506,0.6504896660393278,0.054557736348873564,0.014513040473597302,0.2645583063882568,0.05821909119061961,0.0588062465076829,0.11664649044360696,9.440470873539512
1,10002.0,Manhattan,Lower East Side,0.00127991433885903,0.048534911599726664,0.416105542441979,0.4095571143839591,0.0801341004289442,0.029613527565556956,0.014774889240975015,0.15970219986036446,0.3913646681164199,0.01652362701738413,0.04572916847440839,3.5543276113023796
2,10003.0,Manhattan,NOHO / Union Square,0.011656678405180846,0.08564123028410003,0.28828989124127596,0.3548085780234243,0.24336546723695232,0.016159922675259432,7.823213380711986e-05,0.23190679835933228,0.057764600377020074,0.012018955392736027,0.006401749394849561,5.994167508895084
3,10004.0,Manhattan,Bowling Green / Govenor's Island,0.0,0.0,0.027271661061546392,0.1577911949257813,0.7943236195635951,0.0032150199569565283,0.017398504492120735,3.2825442330422194,0.4251244229001687,0.004471261026632602,0.006077666878892691,10.42897352039037
4,10005.0,Manhattan,Wall Street,0.0,0.0,0.09781066369391206,0.22407800334415182,0.651202846223548,0.0,0.026908486738388197,0.00906402180506162,0.0,0.0,0.001503094745124346,13.250371594069332
5,10006.0,Manhattan,Lower Manhattan,0.0,0.0,0.21795382699214652,0.1276967946400301,0.6543493783678234,0.0,0.0,0.1894167864201802,0.030468951203046594,0.3821922588687446,0.022878381735087452,16.82582226089554
6,10007.0,Manhattan,City Hall,0.00019504055319090414,0.0022928062946319357,0.4265098374627052,0.1203552654783946,0.44737334999757117,0.003162502538152575,0.00011119767535373736,0.19861372677619282,0.03315487797133304,0.006198046436365366,0.0047630181330734584,16.81338395280098
7,10009.0,Manhattan,East Village,0.0016343010737397317,0.15018874819690892,0.25959268873887603,0.5423433705720027,0.03610666083681375,0.004360247824619002,0.005773982757039812,0.0784175686809856,0.08161669647696504,0.003523507844231976,0.015192335535341274,3.049451054179819
8,10010.0,Manhattan,Flat Iron / Gramercy,0.0009124956872669622,0.017254272367809402,0.17994081445160487,0.37202071982379287,0.42004678563797465,0.008135661905984939,0.0016892501255664366,0.23011753268537116,0.11649560137322777,0.0116335322845698,0.0030910858880743323,8.175598886074736
9,10011.0,Manhattan,Chelsea,0.018441421209351234,0.08000944739323217,0.26509508284353994,0.32462095030379906,0.27399233343251084,0.029523748674908343,0.008317016142658215,0.0871267300909633,0.06306735442853678,0.0057949508332075204,0.04225804561988678,5.008238825282489


### Ensure matching zipcodes between DB and geojson

In [503]:
# geo_path = "../nearest_neighborhood/static/geojson/zipcodes.geojson"

# with open(geo_path) as f:
#     d = json.load(f)
#     pprint(d)

In [504]:
# # Create new df from zipcode GeoJSON

# geo_path = "../nearest_neighborhood/static/geojson/zipcodes.geojson"

# zips = []
# areas = []
# neighborhood = []
# ids = []

# with open(geo_path) as f:
#     d = json.load(f)
#     #pprint(d)
#     for obj in d['features']:
#         zips.append(obj['properties']['postalCode'])
#         areas.append(obj['properties']['Shape_Area'])
#         neighborhood.append(obj['properties']['PO_NAME'])
#         ids.append(obj['id'])
        
# data = {"zipcode": zips, "id": ids, "area": areas, "neighborhood": neighborhood}
# geo_df = pd.DataFrame.from_dict(data)

# geo_df.sort_values("zipcode", inplace=True)
# geo_df.reset_index(drop=True, inplace=True)

In [505]:
# print(geo_df.to_string())

In [506]:
# csv_path = "../nearest_neighborhood/static/js/available_zipcodes.csv"
# landuse_df.zipcode.to_csv(csv_path, index=False)

In [507]:
# Zipcodes without a corresponding geoJSON feature; they need to be removed
# before the table is written to the database
drop_zips = [
    11249,
    11241,
    11109,
]

In [508]:
def check_zip(zipcode):
    """Return True when ``zipcode`` should be kept, i.e. it is not listed in ``drop_zips``."""
    return zipcode not in drop_zips

In [509]:
# Keep only zipcodes that are not listed in drop_zips
landuse_df = landuse_df[~landuse_df['zipcode'].isin(drop_zips)].copy()

In [510]:
# Scale FAR to between 0 and 1 (smallest FAR -> 0, largest -> 1)
far_frame = landuse_df[['FAR']]
scaler = MinMaxScaler().fit(far_frame)
landuse_df['FAR'] = scaler.transform(far_frame)

In [511]:
# Display the table again now that FAR has been rescaled to the 0-1 range
landuse_df

Unnamed: 0,zipcode,borocode,neighborhood,1 & 2 Family Residential,Multi-Family Walk-up,Multi-Family Elevator,Mixed Residential & Commercial,Commercial & Office,Industrial & Manufacturing,Transportation & Utility,Public Facilities & Institutions,Open Space & Outdoor Recreation,Parking,Vacant Land,FAR
0,10001.0,Manhattan,NOMAD / Chelsea,0.0004124198483636065,0.007268694267911448,0.10247569231334129,0.17028275070858506,0.6504896660393278,0.054557736348873564,0.014513040473597302,0.2645583063882568,0.05821909119061961,0.0588062465076829,0.11664649044360696,0.5589443137486669
1,10002.0,Manhattan,Lower East Side,0.00127991433885903,0.048534911599726664,0.416105542441979,0.4095571143839591,0.0801341004289442,0.029613527565556956,0.014774889240975015,0.15970219986036446,0.3913646681164199,0.01652362701738413,0.04572916847440839,0.20742184450714918
2,10003.0,Manhattan,NOHO / Union Square,0.011656678405180846,0.08564123028410003,0.28828989124127596,0.3548085780234243,0.24336546723695232,0.016159922675259432,7.823213380711986e-05,0.23190679835933228,0.057764600377020074,0.012018955392736027,0.006401749394849561,0.3531299095585239
3,10004.0,Manhattan,Bowling Green / Govenor's Island,0.0,0.0,0.027271661061546392,0.1577911949257813,0.7943236195635951,0.0032150199569565283,0.017398504492120735,3.2825442330422194,0.4251244229001687,0.004471261026632602,0.006077666878892691,0.6179780266217585
4,10005.0,Manhattan,Wall Street,0.0,0.0,0.09781066369391206,0.22407800334415182,0.651202846223548,0.0,0.026908486738388197,0.00906402180506162,0.0,0.0,0.001503094745124346,0.7864728751817212
5,10006.0,Manhattan,Lower Manhattan,0.0,0.0,0.21795382699214652,0.1276967946400301,0.6543493783678234,0.0,0.0,0.1894167864201802,0.030468951203046594,0.3821922588687446,0.022878381735087452,0.9999999999999999
6,10007.0,Manhattan,City Hall,0.00019504055319090414,0.0022928062946319357,0.4265098374627052,0.1203552654783946,0.44737334999757117,0.003162502538152575,0.00011119767535373736,0.19861372677619282,0.03315487797133304,0.006198046436365366,0.0047630181330734584,0.9992571800277718
7,10009.0,Manhattan,East Village,0.0016343010737397317,0.15018874819690892,0.25959268873887603,0.5423433705720027,0.03610666083681375,0.004360247824619002,0.005773982757039812,0.0784175686809856,0.08161669647696504,0.003523507844231976,0.015192335535341274,0.17727044551547294
8,10010.0,Manhattan,Flat Iron / Gramercy,0.0009124956872669622,0.017254272367809402,0.17994081445160487,0.37202071982379287,0.42004678563797465,0.008135661905984939,0.0016892501255664366,0.23011753268537116,0.11649560137322777,0.0116335322845698,0.0030910858880743323,0.483405730248568
9,10011.0,Manhattan,Chelsea,0.018441421209351234,0.08000944739323217,0.26509508284353994,0.32462095030379906,0.27399233343251084,0.029523748674908343,0.008317016142658215,0.0871267300909633,0.06306735442853678,0.0057949508332075204,0.04225804561988678,0.2942499146418157


In [512]:
# Display the zipcodes ordered from lowest to highest scaled FAR (landuse_df itself is not modified)
landuse_df.sort_values(by='FAR')

Unnamed: 0,zipcode,borocode,neighborhood,1 & 2 Family Residential,Multi-Family Walk-up,Multi-Family Elevator,Mixed Residential & Commercial,Commercial & Office,Industrial & Manufacturing,Transportation & Utility,Public Facilities & Institutions,Open Space & Outdoor Recreation,Parking,Vacant Land,FAR
174,11430.0,Queens,JFK Airport,0.0,0.0,0.0,0.0,0.00833134644915413,0.002427194886862092,0.9892414586639838,0.00013534018842136343,0.0,0.00014588454904796867,0.0,0.0
184,11697.0,Queens,Breezy Point,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02002192840957489,0.49355022573047097,0.0,0.015206527906006245,0.0037436293008819383
53,10309.0,Staten Island,Rossville / Pleasant Plains,0.7725117056953368,0.03364116418253906,0.0,0.00834975836585921,0.10189570181326546,0.06166201596137149,0.02193965398162781,0.18137370061915084,0.17993940946120493,0.008608745273644573,0.15664391010012405,0.010975170623497517
47,10303.0,Staten Island,Arlington / Mariner's Harbor,0.49476562348254877,0.09180856707803292,0.08980996868669029,0.016080917932367506,0.10240911627584377,0.17025310742384492,0.03487269912067178,0.019296410766069232,0.004308739869486247,0.003432410010862741,0.15167764801740968,0.01264763713263867
70,10464.0,Bronx,Pelham Bay Park,0.6379494473578821,0.13071878373332774,0.05421489989015599,0.10159385632456908,0.05275291059537354,0.021074330042982148,0.0016957720557094155,0.04906088480457966,7.65881980974984,0.011352681699862317,0.13832633704621594,0.012892988214001984
85,11040.0,Queens,Hillside Manor / Hyde Park,0.7408489737430239,0.1756111443488128,0.0,0.02096157378193926,0.060123365417312995,0.0,0.0024549427089110786,0.048212189470932115,0.0,0.0,7.718680059726161e-05,0.016729657631402568
83,11004.0,Queens,Glen Oaks,0.5765969237577012,0.3296869685148865,0.007313407477175972,0.014302000546584245,0.07182223812118387,0.0,0.0002784615824682242,0.12064916291323355,0.01555846407300223,0.002363638624766592,0.0015728972804520168,0.017034838174553923
56,10314.0,Staten Island,Bulls Head / New Springville,0.6571326382754872,0.10141476564095815,0.03020897479191912,0.013628408833429322,0.13816146391569656,0.05423940993341745,0.005214338609092366,0.26459765392867657,0.14520568483408208,0.002938292576072181,0.11363842837265677,0.0170585476290857
157,11411.0,Queens,Cambria Heights,0.9367224502855676,0.01200606313531325,0.0,0.009602865180907515,0.04076393764385975,0.0,0.0009046837543519124,0.048069298054220264,0.0,0.0011444356005077942,0.00011543093222234623,0.017508492068510736
141,11363.0,Queens,Little Neck,0.7982249409653749,0.05499466784030605,0.07897626765748915,0.029435849040634444,0.037692867601628426,0.0,0.0006754068945671217,0.041143204038334394,0.214151658231576,0.012109075357353893,0.08977261092952282,0.01832890938340411


### Create Database file for Flask app

In [513]:
# Path of the SQLite file, relative to the working directory
database_path = "../zipcodeDB.sqlite"

In [514]:
# Create the SQLAlchemy engine for the SQLite file and open a connection.
# The sqlite file will be created if it doesn't already exist.
sqlite_url = "sqlite:///" + database_path
engine = create_engine(sqlite_url)
conn = engine.connect()

In [515]:
# Push data to database, replacing zip_table if it already exists;
# the DataFrame index is not written
landuse_df.to_sql("zip_table", conn, if_exists="replace", index=False)

# KNN Algorithm for Backend

## Run Initial Query And Find Nearest Neighbor

In [516]:
# Specify target zipcode: exactly one assignment below should be active;
# the commented-out lines are alternative test cases

#zipcode = 11101 #Long Island City, QNS
#zipcode = 11104 #Sunnyside, QNS
#zipcode = 11355 #Flushing, QNS
#zipcode = 11201 #Brooklyn Heights, BK
#zipcode = 10304 #Todt Hill, SI
zipcode = 10016 #Murray Hill
#zipcode = 11422 #Rosedale
#zipcode = 10017 #Midtown East
#zipcode = 10034 #Inwood
#zipcode = 10028 #Battery Park City -- NOTE(review): 10028 is usually listed as the Upper East Side; Battery Park City is 10280/10282 -- confirm
#zipcode = 10465 #Throg's Neck
#zipcode = 10036 #Midtown / Hell's Kitchen

In [517]:
database_path = "../zipcodeDB.sqlite"

# Create the engine and connection used by the backend section
# (the sqlite file will be created if it doesn't already exist)
engine = create_engine("sqlite:///" + database_path)
conn = engine.connect()

# Read every record back to test that the table was written
select_all_sql = "SELECT * FROM zip_table"
data_df = pd.read_sql(select_all_sql, conn)

In [518]:
# Preview the first rows read back from zip_table
data_df.head()

Unnamed: 0,zipcode,borocode,neighborhood,1 & 2 Family Residential,Multi-Family Walk-up,Multi-Family Elevator,Mixed Residential & Commercial,Commercial & Office,Industrial & Manufacturing,Transportation & Utility,Public Facilities & Institutions,Open Space & Outdoor Recreation,Parking,Vacant Land,FAR
0,10001.0,Manhattan,NOMAD / Chelsea,0.0004124198483636,0.0072686942679114,0.1024756923133412,0.170282750708585,0.6504896660393278,0.0545577363488735,0.0145130404735973,0.2645583063882568,0.0582190911906196,0.0588062465076829,0.1166464904436069,0.5589443137486669
1,10002.0,Manhattan,Lower East Side,0.001279914338859,0.0485349115997266,0.416105542441979,0.4095571143839591,0.0801341004289442,0.0296135275655569,0.014774889240975,0.1597021998603644,0.3913646681164199,0.0165236270173841,0.0457291684744083,0.2074218445071491
2,10003.0,Manhattan,NOHO / Union Square,0.0116566784051808,0.0856412302841,0.2882898912412759,0.3548085780234243,0.2433654672369523,0.0161599226752594,7.823213380711986e-05,0.2319067983593322,0.05776460037702,0.012018955392736,0.0064017493948495,0.3531299095585239
3,10004.0,Manhattan,Bowling Green / Govenor's Island,0.0,0.0,0.0272716610615463,0.1577911949257813,0.7943236195635951,0.0032150199569565,0.0173985044921207,3.2825442330422194,0.4251244229001687,0.0044712610266326,0.0060776668788926,0.6179780266217585
4,10005.0,Manhattan,Wall Street,0.0,0.0,0.097810663693912,0.2240780033441518,0.651202846223548,0.0,0.0269084867383881,0.0090640218050616,0.0,0.0,0.0015030947451243,0.7864728751817212


In [519]:
# Emphasize parks: quadruple the open space column. The factor of 4 is divided
# back out wherever the value is copied into dictObj.
# Re-running this cell without re-reading data_df multiplies by 4 again.
park_col = 'Open Space & Outdoor Recreation'
data_df[park_col] = data_df[park_col].mul(4)

In [520]:
# Split into Manhattan and outer-borough frames so that a selected
# outer-borough zipcode can find its nearest neighbor zip in Manhattan
is_manhattan = data_df["borocode"] == "Manhattan"
manhattan_df = data_df[is_manhattan]
outer_borough_df = data_df[~is_manhattan]

In [521]:
# Create dictObj
# This layout is required for the D3 chart: one entry for the Manhattan zipcode
# and one for the outer-borough zipcode, each carrying the same eleven landuse
# slots (A-K) with a display color, a label, and a GFA value filled in later.
_LANDUSE_SLOTS = [
    ("A", "#FFDD80", "1 & 2 Family Residential"),
    ("B", "#ff9100", "Multi-Family Walk-up"),
    ("C", "#bf360c", "Multi-Family Elevator"),
    ("D", "#ff5252", "Mixed Residential & Commercial"),
    ("E", "#c51162", "Commercial & Office"),
    ("F", "#7b1fa2", "Industrial & Manufacturing"),
    ("G", "#ba68c8", "Transportation & Utility"),
    ("H", "#0d47a1", "Public Facilities & Institutions"),
    ("I", "#00bfa5", "Open Space & Outdoor Recreation"),
    ("J", "#607d8b", "Parking"),
    ("K", "#263238", "Vacant Land"),
]


def _blank_values():
    """Return a fresh slot dict (keys A-K) with every GFA unset."""
    return {key: {"color": color, "GFA": None, "label": label}
            for key, color, label in _LANDUSE_SLOTS}


def _blank_entry(borough):
    """Return an empty chart entry for one side, with its own independent slot dict."""
    return {"borough": borough,
            "zipcode": None,
            "FAR": None,
            "neighborhood": None,
            "values": _blank_values()}


dictObj = {"manhattan": _blank_entry("Manhattan"),
           "outer": _blank_entry(None)}


In [522]:
# Look up the target zipcode's row once and read the attributes needed below
target_rows = data_df[data_df.zipcode == zipcode]
if target_rows.empty:
    # Fail with a readable message instead of a bare "list index out of range"
    raise LookupError(f"zipcode {zipcode} is not present in zip_table")

target_borough = target_rows["borocode"].tolist()[0]
target_neighborhood = target_rows["neighborhood"].tolist()[0]
# FAR as stored in the table, i.e. the min-max scaled value, not the raw FAR
target_FAR = target_rows["FAR"].tolist()[0]

In [523]:
# Show the borough of the target zipcode
target_borough

'Manhattan'

In [524]:
# Show the neighborhood name of the target zipcode
target_neighborhood

'Murray Hill / Kips Bay'

In [525]:
# Show the target zipcode's FAR as stored in the table (scaled to 0-1)
target_FAR

0.5644702134886085

### Initial Query

In [526]:
# Fill in the dictObj entry for the selected zipcode. A Manhattan target goes
# in the "manhattan" entry and is looked up in manhattan_df; any other borough
# goes in the "outer" entry and is looked up in outer_borough_df.
if target_borough == 'Manhattan':
    target_side, source_df = "manhattan", manhattan_df
else:
    target_side, source_df = "outer", outer_borough_df

# Pull out the target zipcode's row (target_df is reused by the KNN cell)
target_df = source_df[source_df.zipcode == zipcode]

# Define zipcode, neighborhood, borough and FAR
target_entry = dictObj[target_side]
target_entry["zipcode"] = int(zipcode)  # stored as a plain int on both sides
target_entry["neighborhood"] = target_neighborhood
target_entry["borough"] = target_borough  # for Manhattan this re-sets the existing "Manhattan"
target_entry["FAR"] = target_FAR

# Copy each landuse value from the column named by the slot's label
for slot in target_entry["values"].values():
    slot["GFA"] = target_df[slot["label"]].tolist()[0]

# Reset park values: undo the x4 emphasis applied to the open space column
target_entry["values"]["I"]["GFA"] = target_entry["values"]["I"]["GFA"] / 4


### KNN

In [527]:
# Nearest-neighbour lookup: find the zipcode in the *other* group whose feature
# columns are closest to the target zipcode's, and write it to that group's
# section of dictObj.
if target_borough == 'Manhattan':
    # i.e. the nearest neighbor will NOT be in Manhattan
    candidate_df, result_key = outer_borough_df, "outer"
else:
    # i.e. the nearest neighbor IS in Manhattan
    candidate_df, result_key = manhattan_df, "manhattan"

# Target row(s) first (position 0), candidate zipcodes after.
# This frame keeps every column so the neighbour's zipcode can be read back by
# position once the model has been queried.
recreated_data = pd.concat([target_df, candidate_df])

# Drop columns that shouldn't be part of KNN distance calculation and reset index
train_data = (
    recreated_data
    .drop(columns=["zipcode", "borocode", "neighborhood"])
    .reset_index(drop=True)
)

# Train Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(train_data)
distances, indices = nbrs.kneighbors(train_data)

# Find nearest neighbor.
# The target (position 0) is part of the fitted data, so it normally comes back
# as its own closest match at distance 0. If a candidate row has exactly the same
# feature values, both sit at distance 0 and the target is not guaranteed to be
# listed first, so take the first returned position that is not the target.
# With n_neighbors=2 the two positions are distinct, so one always qualifies.
nearest_neighbor_index = next(pos for pos in indices[0] if pos != 0)
result = train_data.iloc[nearest_neighbor_index, :]

# Define zipcode, neighborhood, and borough
result_zipcode = recreated_data.iloc[nearest_neighbor_index, :].zipcode
result_row = data_df[data_df.zipcode == result_zipcode]
result_neighborhood = result_row.neighborhood.tolist()[0]
result_borough = result_row.borocode.tolist()[0]
result_FAR = result_row.FAR.tolist()[0]

result_section = dictObj[result_key]
# Stored as int, matching how the initial query writes the "outer" zipcode
result_section["zipcode"] = int(result_zipcode)
result_section["neighborhood"] = result_neighborhood
result_section["borough"] = result_borough
result_section["FAR"] = result_FAR

# Each entry's "label" is the name of the feature column holding its value
for entry in result_section["values"].values():
    entry["GFA"] = result[entry["label"]]

# Reset park values
# NOTE(review): the reason for dividing the "I" (park) value by 4 is not stated
# in this notebook section - confirm and document the factor.
result_section["values"]["I"]["GFA"] = result_section["values"]["I"]["GFA"] / 4

In [528]:
# Inspect dictObj after the target and nearest-neighbour sections have been filled in
pprint(dictObj)

{'manhattan': {'FAR': 0.5644702134886085,
               'borough': 'Manhattan',
               'neighborhood': 'Murray Hill / Kips Bay',
               'values': {'A': {'GFA': 0.004749091612728957,
                                'color': '#FFDD80',
                                'label': '1 & 2 Family Residential'},
                          'B': {'GFA': 0.02021618988283336,
                                'color': '#ff9100',
                                'label': 'Multi-Family Walk-up'},
                          'C': {'GFA': 0.20107708958148293,
                                'color': '#bf360c',
                                'label': 'Multi-Family Elevator'},
                          'D': {'GFA': 0.33372601860875495,
                                'color': '#ff5252',
                                'label': 'Mixed Residential & Commercial'},
                          'E': {'GFA': 0.4303886444449339,
                                'color': '#c51162',
                       

In [529]:
# Distances for the target row (position 0 of train_data): the first value is its
# zero distance to itself, the second is the distance to its nearest neighbour
distances[0]

array([0.        , 0.58768911])