In [190]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# import statsmodels.api as sm
%matplotlib inline 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

import requests
import kdtree as KDTree


In [191]:
df = pd.read_csv('houses_lat_long_v4.csv')

In [192]:
df = df.dropna(subset=['LAT_LONG'])

In [193]:
df.shape

(87158, 31)

In [194]:
# First, look for null values so we can dropna
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

LAND_SF          19
YR_REMOD       7329
TOTAL_RMS        33
BTH_STYLE2    23961
BTH_STYLE3    58492
dtype: int64

In [195]:
# Get the coords
def split_coords(col):
    x = col.split(",")
    return x[0], x[1]

df['LAT'], df['LONG'] = zip(*df['LAT_LONG'].apply(split_coords))

In [196]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,ST_NAME_SUF,ZIPCODE,OWN_OCC,AV_LAND,AV_BLDG,AV_TOTAL,GROSS_TAX,...,KITCH_STYLE,HEAT_TYP,AC,FPLACE,INT_CND,INT_FIN,VIEW,LAT_LONG,LAT,LONG
0,502555000,2,BEAVER,ST,2108,Y,1436300,1242500,2678800,2823455,...,M,W,N,0.0,G,N,A,"-71.07207,42.355618",-71.07207,42.355618
1,100001000,104 A 104,PUTNAM,ST,2128,Y,129900,411300,541200,570425,...,S,W,N,0.0,A,N,A,"-71.03256,42.379173",-71.03256,42.379173
2,100002000,197,LEXINGTON,ST,2128,N,121000,475100,596100,628289,...,M,F,C,0.0,A,N,A,"-71.03255,42.37957",-71.03255,42.37957
3,100003000,199,LEXINGTON,ST,2128,N,121400,421100,542500,571795,...,S,S,N,0.0,A,N,A,"-71.03248,42.379593",-71.03248,42.379593
4,100004000,201,LEXINGTON,ST,2128,N,121900,396400,518300,546288,...,S,W,N,0.0,A,N,A,"-71.03242,42.379616",-71.03242,42.379616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101662,1000319288,80-82,FENWOOD,RD,2115,Y,0,561500,561500,591821,...,M,F,C,0.0,G,N,A,"-71.10889,42.335903",-71.10889,42.335903
101663,1000319290,80-82,FENWOOD,RD,2115,Y,0,195300,195300,205847,...,M,F,C,0.0,G,N,A,"-71.10889,42.335903",-71.10889,42.335903
103067,1100253022,126,THORNTON,ST,2119,N,0,563900,563900,594351,...,M,F,C,0.0,G,N,A,"-71.09162,42.322926",-71.09162,42.322926
103068,1100253024,126,THORNTON,ST,2119,Y,0,614800,614800,647999,...,M,F,C,0.0,G,N,A,"-71.09162,42.322926",-71.09162,42.322926


In [197]:
df = df.fillna(0)

In [198]:
# features: bed, bath, sqft, lon/lat, garage, age (or year built), and lotsize

features = ['BDRMS', 'FULL_BTH', 'HALF_BTH', 'LIVING_AREA', 'YR_BUILT', 'LAND_SF', 'LAT', 'LONG']
X = df[features] #Features
y = df['AV_TOTAL'] #Target
X

Unnamed: 0,BDRMS,FULL_BTH,HALF_BTH,LIVING_AREA,YR_BUILT,LAND_SF,LAT,LONG
0,2.0,1.0,0.0,1478.0,1900.0,1225.0,-71.07207,42.355618
1,6.0,3.0,0.0,2202.0,1900.0,1150.0,-71.03256,42.379173
2,3.0,3.0,0.0,2307.0,1920.0,1150.0,-71.03255,42.37957
3,5.0,3.0,0.0,2268.0,1905.0,1150.0,-71.03248,42.379593
4,5.0,3.0,0.0,2028.0,1900.0,1150.0,-71.03242,42.379616
...,...,...,...,...,...,...,...,...
101662,1.0,1.0,0.0,620.0,2016.0,620.0,-71.10889,42.335903
101663,1.0,1.0,0.0,831.0,2016.0,831.0,-71.10889,42.335903
103067,2.0,1.0,0.0,1813.0,2017.0,1813.0,-71.09162,42.322926
103068,3.0,2.0,0.0,1982.0,2017.0,1982.0,-71.09162,42.322926


In [199]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [200]:
X

Unnamed: 0,BDRMS,FULL_BTH,HALF_BTH,LIVING_AREA,YR_BUILT,LAND_SF,LAT,LONG
0,2.0,1.0,0.0,1478.0,1900.0,1225.0,-71.07207,42.355618
1,6.0,3.0,0.0,2202.0,1900.0,1150.0,-71.03256,42.379173
2,3.0,3.0,0.0,2307.0,1920.0,1150.0,-71.03255,42.37957
3,5.0,3.0,0.0,2268.0,1905.0,1150.0,-71.03248,42.379593
4,5.0,3.0,0.0,2028.0,1900.0,1150.0,-71.03242,42.379616
...,...,...,...,...,...,...,...,...
101662,1.0,1.0,0.0,620.0,2016.0,620.0,-71.10889,42.335903
101663,1.0,1.0,0.0,831.0,2016.0,831.0,-71.10889,42.335903
103067,2.0,1.0,0.0,1813.0,2017.0,1813.0,-71.09162,42.322926
103068,3.0,2.0,0.0,1982.0,2017.0,1982.0,-71.09162,42.322926


In [201]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [202]:
knn.predict(X_test)

array([1732300,  229700,  360100, ...,  743042, 1263600,  517900])

In [203]:
# X = X_train.iloc[:]

In [209]:
n_neighbors = 6
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='ball_tree').fit(X)
# distances, indices = nbrs.kneighbors([[4.0,1.0,1.0,1879.0,1929.0,3304.0,71.053566,42.28941]])
# print(indices)
# print(distances)

In [210]:
# x_slice = X.iloc[0:50]

In [211]:
# Get the neighbors
def add_neighbors(row):
    x = []
    distances, indices = nbrs.kneighbors([row])
    for i in range(n_neighbors):
        x.append(indices[0][i])
    return [x[1]], [x[1], x[2]], [x[1], x[2], x[3]], [x[1], x[2], x[3], x[4]], [x[1], x[2], x[3], x[4], x[5]]

X[["nbrs_1", "nbrs_2", "nbrs_3", "nbrs_4", "nbrs_5"]] = X.apply(add_neighbors, axis=1, result_type="expand")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


BDRMS                                        2
FULL_BTH                                     1
HALF_BTH                                     0
LIVING_AREA                               1478
YR_BUILT                                  1900
LAND_SF                                   1225
LAT                                  -71.07207
LONG                                 42.355618
nbrs_1                                  [4065]
nbrs_2                           [4065, 23716]
nbrs_3                     [4065, 23716, 5254]
nbrs_4               [4065, 23716, 5254, 7657]
nbrs_5         [4065, 23716, 5254, 7657, 3693]
Name: 0, dtype: object

BDRMS                                     2
FULL_BTH                                  1
HALF_BTH                                  1
LIVING_AREA                            1518
YR_BUILT                               1900
LAND_SF                                2500
LAT                               -71.01814
LONG                               42.38507
nbrs_1                                [459]
nbrs_2                           [459, 117]
nbrs_3                     [459, 117, 1743]
nbrs_4                [459, 117, 1743, 422]
nbrs_5         [459, 117, 1743, 422, 10053]
Name: 444, dtype: object