In [34]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

In [35]:
import pandas as pd
import numpy as np
import scipy 

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [36]:
housing = pd.DataFrame(housing).to_numpy()
housing = np.delete(housing, np.s_[2:10], 1)
housing

array([[-122.23, 37.88],
       [-122.22, 37.86],
       [-122.24, 37.85],
       ...,
       [-121.22, 39.43],
       [-121.32, 39.43],
       [-121.24, 39.37]], dtype=object)

In [37]:
housing = housing**4  # we do this because the coordinates before were too small to make good calculations

import sklearn

tree = sklearn.neighbors.KDTree(housing)                               # we assume hotel is a matrix Mx2, with all the positions of the hotels
dist, ind = tree.query(housing[:len(housing)], k=3)               
print(ind)                                         # indexes to N+1 closest hotels
print(dist) 

#it's possible that between the housings there are  places with the same coordinates, that's why sometimes
#the house itself it's not the closer house. It would be good to work on the data.

[[    0   122   118]
 [    1   125   126]
 [  494     2   124]
 ...
 [20637 20636 10007]
 [20638 20630 20627]
 [20639 10024  1146]]
[[    0.          6514.71164511  8682.842688  ]
 [    0.          2169.85018991  4337.9812328 ]
 [    0.             0.             0.        ]
 ...
 [    0.         72751.02523011 77517.5222976 ]
 [    0.         34147.1663232  72728.0622624 ]
 [    0.         36405.30817905 69078.76499664]]
