In [185]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Render floats with thousands separators (e.g. 1,234,567.5) in pandas output
pd.set_option("display.float_format", "{:,}".format)

## Import and Clean Data

In [186]:
# Import and clean data: load pluto.csv, order rows by zipcode, and drop any
# row that has a missing value.
#
# The index is reset *after* dropna so the final frame has a gap-free
# 0..n-1 index (resetting first and dropping afterwards leaves holes where
# rows were removed). Row order is the same as sort-then-drop.
pluto_df = (
    pd.read_csv("pluto.csv")
    .sort_values(by="zipcode")
    .dropna()
    .reset_index(drop=True)
)
pluto_df.head()

Unnamed: 0,zipcode,borough,borocode,landuse,bldgarea
0,10001.0,MN,1.0,5.0,611625.0
1,10001.0,MN,1.0,4.0,13489.0
2,10001.0,MN,1.0,5.0,34000.0
3,10001.0,MN,1.0,11.0,0.0
4,10001.0,MN,1.0,5.0,2008.0


In [187]:
# One-hot encode landuse: one indicator column per distinct landuse value.
dummy_df = pd.get_dummies(pluto_df.landuse)

# Build the result from a copy without the original landuse column instead of
# dropping it from pluto_df in place; pluto_df stays intact, so this cell can
# be re-run without failing on a missing "landuse" column.
data_df = pluto_df.drop(columns="landuse").join(dummy_df)
data_df.head()

Unnamed: 0,zipcode,borough,borocode,bldgarea,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
0,10001.0,MN,1.0,611625.0,0,0,0,0,1,0,0,0,0,0,0
1,10001.0,MN,1.0,13489.0,0,0,0,1,0,0,0,0,0,0,0
2,10001.0,MN,1.0,34000.0,0,0,0,0,1,0,0,0,0,0,0
3,10001.0,MN,1.0,0.0,0,0,0,0,0,0,0,0,0,0,1
4,10001.0,MN,1.0,2008.0,0,0,0,0,1,0,0,0,0,0,0


In [188]:
# Associate bldgarea with landuse values: scale every landuse indicator column
# by the row's building area, so each column holds the area attributed to that
# landuse (0 where the indicator is 0).
cols = [float(code) for code in range(1, 12)]
data_df[cols] = data_df[cols].multiply(data_df["bldgarea"], axis="index")

# bldgarea is now carried by the landuse columns, so the raw column is removed.
data_df = data_df.drop(columns="bldgarea")
data_df.head()

Unnamed: 0,zipcode,borough,borocode,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
0,10001.0,MN,1.0,0.0,0.0,0.0,0.0,611625.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10001.0,MN,1.0,0.0,0.0,0.0,13489.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001.0,MN,1.0,0.0,0.0,0.0,0.0,34000.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10001.0,MN,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10001.0,MN,1.0,0.0,0.0,0.0,0.0,2008.0,0.0,0.0,0.0,0.0,0.0,0.0


In [189]:
# Create summary data: one row per zipcode with the summed building area of
# each landuse column and a single borocode value.

# The string "borough" column is not part of the summary. Dropping it with
# errors="ignore" (and rebinding instead of inplace) keeps this cell re-runnable:
# a second run no longer raises KeyError because the column is already gone.
data_df = data_df.drop(columns="borough", errors="ignore")

# Aggregation spec: borocode -> median, every landuse column (1.0 .. 11.0) -> sum.
# Insertion order matches the original column order of the result.
landuse_cols = [float(code) for code in range(1, 12)]
agg_spec = {"borocode": "median"}
agg_spec.update({col: "sum" for col in landuse_cols})

# NOTE(review): borocode is a categorical code; a median can fall between two
# codes if a zipcode has rows in more than one borough -- confirm that cannot
# happen in this data, otherwise the later relabel step will reject the value.
landuse_df = data_df.groupby("zipcode").agg(agg_spec).reset_index()
landuse_df.head()

Unnamed: 0,zipcode,borocode,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
0,10001.0,1.0,33606.0,592289.0,8350224.0,13875477.0,53005101.0,4445633.0,1182594.0,10895555.0,1152624.0,616150.0,0.0
1,10002.0,1.0,51117.0,1938379.0,16618352.0,16356822.0,3200967.0,1182700.0,590077.0,4618645.0,87067.0,252263.0,0.0
2,10003.0,1.0,515395.0,3786590.0,12746613.0,15687708.0,10760299.0,714504.0,3459.0,9501989.0,126922.0,159151.0,0.0
3,10004.0,1.0,0.0,0.0,725023.0,4194913.0,21117265.0,85472.0,462543.0,3722844.0,945425.0,87775.0,0.0
4,10005.0,1.0,0.0,0.0,2480663.0,5683041.0,16515733.0,0.0,682450.0,43568.0,0.0,0.0,0.0


In [190]:
# Relabel borocode so it is easy to remember. The string "borough" column was
# dropped before the groupby, so the abbreviation is rebuilt here from the
# numeric code.

boro_dict = {"MN": 1.0,
             "BX": 2.0,
             "BK": 3.0,
             "QN": 4.0,
             "SI": 5.0}

# Inverse lookup (numeric code -> abbreviation), built once rather than
# rebuilding key/value lists and scanning them on every call.
_boro_by_code = {code: boro for boro, code in boro_dict.items()}


def relabel(num):
    """Return the borough abbreviation whose code in ``boro_dict`` equals ``num``.

    Parameters
    ----------
    num : number
        Numeric borocode (1.0 .. 5.0 in ``boro_dict``).

    Returns
    -------
    str
        The matching key of ``boro_dict`` (e.g. ``"MN"`` for ``1.0``).

    Raises
    ------
    ValueError
        If ``num`` is not one of the codes in ``boro_dict``.
    """
    try:
        return _boro_by_code[num]
    except (KeyError, TypeError):
        # Keep ValueError as the failure type for unknown codes.
        raise ValueError("{!r} is not a known borocode".format(num)) from None

# Swap each numeric borocode for its borough abbreviation, element by element.
landuse_df["borocode"] = landuse_df["borocode"].map(relabel)

In [191]:
# Display names for the landuse columns, listed in code order: the first label
# belongs to column 1.0, the last to column 11.0.
_landuse_labels = [
    "1,2 Family Resi",
    "Multi-Fam Walk-up",
    "Multi-Fam Elevator",
    "Mixed Resi/Comm",
    "Comm & Office",
    "Industrial",
    "Transport",
    "Public",
    "Open Space",
    "Parking",
    "Vacant",
]

# Keys are floats because the landuse columns are labelled 1.0, 2.0, ...
cols_dict = {float(code): label
             for code, label in enumerate(_landuse_labels, start=1)}

In [192]:
# Replace the numeric landuse column labels with their readable names.
landuse_df = landuse_df.rename(columns=cols_dict)
landuse_df

Unnamed: 0,zipcode,borocode,"1,2 Family Resi",Multi-Fam Walk-up,Multi-Fam Elevator,Mixed Resi/Comm,Comm & Office,Industrial,Transport,Public,Open Space,Parking,Vacant
0,10001.0,MN,33606.0,592289.0,8350224.0,13875477.0,53005101.0,4445633.0,1182594.0,10895555.0,1152624.0,616150.0,0.0
1,10002.0,MN,51117.0,1938379.0,16618352.0,16356822.0,3200967.0,1182700.0,590077.0,4618645.0,87067.0,252263.0,0.0
2,10003.0,MN,515395.0,3786590.0,12746613.0,15687708.0,10760299.0,714504.0,3459.0,9501989.0,126922.0,159151.0,0.0
3,10004.0,MN,0.0,0.0,725023.0,4194913.0,21117265.0,85472.0,462543.0,3722844.0,945425.0,87775.0,0.0
4,10005.0,MN,0.0,0.0,2480663.0,5683041.0,16515733.0,0.0,682450.0,43568.0,0.0,0.0,0.0
5,10006.0,MN,0.0,0.0,2406484.0,1409933.0,7224839.0,0.0,0.0,237019.0,0.0,305026.0,0.0
6,10007.0,MN,11515.0,135365.0,25180716.0,7113948.0,26421102.0,186711.0,6565.0,1964664.0,0.0,0.0,0.0
7,10009.0,MN,49533.0,4551976.0,7867831.0,16437543.0,1094334.0,132152.0,175000.0,1778954.0,52703.0,3358.0,0.0
8,10010.0,MN,35514.0,671530.0,7188028.0,14478911.0,16348068.0,316637.0,65745.0,5288022.0,20206.0,84013.0,0.0
9,10011.0,MN,1029569.0,4471226.0,14800035.0,18123314.0,15296761.0,1648286.0,464332.0,4294146.0,12323.0,78016.0,0.0


## Find Nearest Neighbor

In [193]:
# Separate Manhattan rows from everything else, so a chosen outer-borough
# zipcode can be matched against Manhattan zipcodes only.
is_manhattan = landuse_df["borocode"].eq("MN")
manhattan_df = landuse_df[is_manhattan]
outer_boro_df = landuse_df[~is_manhattan]

In [194]:
# Specify target zipcode: pick one of the candidate neighborhoods by name.
candidate_zips = {
    "Long Island City, QNS": 11101,
    "Sunnyside, QNS": 11104,
    "Flushing, QNS": 11355,
    "Brooklyn Heights, BK": 11201,
    "Todt Hill, SI": 10304,
}
zipcode = candidate_zips["Todt Hill, SI"]

In [195]:
# Pull out the target zipcode's row from outer_boro_df
query_zip = outer_boro_df[outer_boro_df.zipcode == zipcode]

# Fail loudly when the zipcode is not an outer-borough zipcode in the data.
# Without this check train_data would contain only Manhattan rows, and row 0
# (treated downstream as the query) would silently be a Manhattan zipcode.
if query_zip.empty:
    raise ValueError(
        "zipcode {} not found in outer_boro_df".format(zipcode)
    )

# Stack the query row on top of the Manhattan rows (query ends up at position 0),
# drop the string borocode column, and renumber the rows 0..n-1.
train_data = (
    pd.concat([query_zip, manhattan_df])
    .drop(columns="borocode")
    .reset_index(drop=True)
)

In [196]:
# Train Nearest Neighbors model

# zipcode is an identifier, not a land-use measurement, so it is kept out of
# the distance computation. Only a column is removed: row positions match
# train_data, so the returned indices still address rows of train_data.
features = train_data.drop(columns="zipcode")

# n_neighbors=2: each row's two closest rows in the fitted data are returned.
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(features)
distances, indices = nbrs.kneighbors(features)

In [197]:
# Row 0 is the query zipcode (it was placed first in train_data). Column 1 is
# its second-closest fitted row -- presumably column 0 is the query itself,
# since the query row is part of the fitted data.
nearest_neighbor_index = indices[0, 1]

In [198]:
# Show the full row of the matched Manhattan zipcode
train_data.iloc[nearest_neighbor_index]

zipcode                 10,030.0
1,2 Family Resi        774,088.0
Multi-Fam Walk-up    4,628,963.0
Multi-Fam Elevator   3,004,654.0
Mixed Resi/Comm      4,303,357.0
Comm & Office          110,686.0
Industrial                   0.0
Transport                    0.0
Public               1,341,992.0
Open Space               1,410.0
Parking                 56,356.0
Vacant                       0.0
Name: 28, dtype: float64

In [199]:
# Show the query zipcode's own row, for comparison with the neighbor shown above
query_zip

Unnamed: 0,zipcode,borocode,"1,2 Family Resi",Multi-Fam Walk-up,Multi-Fam Elevator,Mixed Resi/Comm,Comm & Office,Industrial,Transport,Public,Open Space,Parking,Vacant
72,10304.0,SI,14242428.0,1899750.0,3222559.0,1417842.0,1085077.0,435189.0,70512.0,2456708.0,65787.0,29529.0,0.0
