In [123]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# import statsmodels.api as sm
%matplotlib inline 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

import requests
import kdtree as KDTree


In [124]:
df = pd.read_csv('houses_lat_long_v4.csv')

In [125]:
df = df.dropna(subset=['LAT_LONG'])

In [126]:
df.shape

(87158, 31)

In [127]:
# First, look for null values so we can dropna
null_columns=df.columns[df.isnull().any()]
df[null_columns].isnull().sum()

LAND_SF          19
YR_REMOD       7329
TOTAL_RMS        33
BTH_STYLE2    23961
BTH_STYLE3    58492
dtype: int64

In [128]:
# Get the coords
def split_coords(col):
    x = col.split(",")
    return x[0], x[1]

df['LAT'], df['LONG'] = zip(*df['LAT_LONG'].apply(some_func))

In [130]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,ST_NAME_SUF,ZIPCODE,OWN_OCC,AV_LAND,AV_BLDG,AV_TOTAL,GROSS_TAX,...,KITCH_STYLE,HEAT_TYP,AC,FPLACE,INT_CND,INT_FIN,VIEW,LAT_LONG,LAT,LONG
0,502555000,2,BEAVER,ST,2108,Y,1436300,1242500,2678800,2823455,...,M,W,N,0.0,G,N,A,"-71.07207,42.355618",-71.07207,42.355618
1,100001000,104 A 104,PUTNAM,ST,2128,Y,129900,411300,541200,570425,...,S,W,N,0.0,A,N,A,"-71.03256,42.379173",-71.03256,42.379173
2,100002000,197,LEXINGTON,ST,2128,N,121000,475100,596100,628289,...,M,F,C,0.0,A,N,A,"-71.03255,42.37957",-71.03255,42.37957
3,100003000,199,LEXINGTON,ST,2128,N,121400,421100,542500,571795,...,S,S,N,0.0,A,N,A,"-71.03248,42.379593",-71.03248,42.379593
4,100004000,201,LEXINGTON,ST,2128,N,121900,396400,518300,546288,...,S,W,N,0.0,A,N,A,"-71.03242,42.379616",-71.03242,42.379616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101662,1000319288,80-82,FENWOOD,RD,2115,Y,0,561500,561500,591821,...,M,F,C,0.0,G,N,A,"-71.10889,42.335903",-71.10889,42.335903
101663,1000319290,80-82,FENWOOD,RD,2115,Y,0,195300,195300,205847,...,M,F,C,0.0,G,N,A,"-71.10889,42.335903",-71.10889,42.335903
103067,1100253022,126,THORNTON,ST,2119,N,0,563900,563900,594351,...,M,F,C,0.0,G,N,A,"-71.09162,42.322926",-71.09162,42.322926
103068,1100253024,126,THORNTON,ST,2119,Y,0,614800,614800,647999,...,M,F,C,0.0,G,N,A,"-71.09162,42.322926",-71.09162,42.322926


In [131]:
df = df.fillna(0)

In [132]:
# features: bed, bath, sqft, lon/lat, garage, age (or year built), and lotsize

features = ['BDRMS', 'FULL_BTH', 'HALF_BTH', 'LIVING_AREA', 'YR_BUILT', 'LAND_SF', 'LAT', 'LONG']
X = df[features] #Features
y = df['AV_TOTAL'] #Target
X

Unnamed: 0,BDRMS,FULL_BTH,HALF_BTH,LIVING_AREA,YR_BUILT,LAND_SF,LAT,LONG
0,2.0,1.0,0.0,1478.0,1900.0,1225.0,-71.07207,42.355618
1,6.0,3.0,0.0,2202.0,1900.0,1150.0,-71.03256,42.379173
2,3.0,3.0,0.0,2307.0,1920.0,1150.0,-71.03255,42.37957
3,5.0,3.0,0.0,2268.0,1905.0,1150.0,-71.03248,42.379593
4,5.0,3.0,0.0,2028.0,1900.0,1150.0,-71.03242,42.379616
...,...,...,...,...,...,...,...,...
101662,1.0,1.0,0.0,620.0,2016.0,620.0,-71.10889,42.335903
101663,1.0,1.0,0.0,831.0,2016.0,831.0,-71.10889,42.335903
103067,2.0,1.0,0.0,1813.0,2017.0,1813.0,-71.09162,42.322926
103068,3.0,2.0,0.0,1982.0,2017.0,1982.0,-71.09162,42.322926


In [133]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [134]:
knn = KNeighborsClassifier(n_neighbors=3)

knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [135]:
knn.predict(X_test)

array([1732300,  229700,  360100, ...,  743042, 1263600,  517900])

In [136]:
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors([[-5,-5]])
print(indices)
print(distances)

[[2 1 0]]
[[3.60555128 5.         5.65685425]]


In [137]:
for d in distances:
    weight = 1.0/d
print(weight)

[0.2773501 0.2       0.1767767]
