# Overview

This notebook focuses on obtaining geo data (reverse geocoding from latitude and longitude) and visualizing it.
Output: the `df_geo` dataframe, saved in the `./data/Data_frame_geoloc.pickle` file.

# Geo parsing the data using longitude, latitude

In [1]:
# Computations, dataframes, additional libraries
import math
import time
import numpy as np
import pandas as pd

# Save data
import pickle

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# import plotly_express as px

#Heat Map
from geoplotlib.utils import read_csv as read_csv2
import geopandas as gpd
import geopy
import geoplotlib
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

from shapely.geometry import Point, Polygon
# from mpl_toolkits.basemap import Basemap
# import descartes
# import geoplot as gplt
# import geoplot.crs as gcrs
# import matplotlib.pyplot as plt




In [2]:
# Geo columns from original dataset
columns_geo = ["lat", "id", "long"]

# Setup locator (the user agent string identifies this client to the Nominatim service)
locator = Nominatim(user_agent="xz@gmail.com")

# Load data
data = pd.read_csv("./data/kc_house_data.csv")

# Make test dataframe (first five records only)
test_df = data[columns_geo].iloc[0:5]

# Make sample dataframe for exploratory data analysis and finding features to fit the model.
# frac = the fraction of the original dataframe (0.1 corresponds to 10%).
# random_state is fixed so that a re-run draws the same sample and the exploration is reproducible.
sample_df = data[columns_geo].sample(frac=0.03, random_state=42)

# Raw reverse-geocoding responses collected while exploring the possible address fields
raw_address_list = []

In [3]:
# Smoke test of the locator: reverse-geocode the first test record and show the raw response
location = locator.reverse([test_df.loc[0, "lat"], test_df.loc[0, "long"]])
location.raw

{'place_id': 159583259,
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'way',
 'osm_id': 236673600,
 'lat': '47.5112302',
 'lon': '-122.25676111324441',
 'display_name': '10012, 61st Avenue South, Rainier Beach, Seattle, King County, Washington, 98178, United States',
 'address': {'house_number': '10012',
  'road': '61st Avenue South',
  'neighbourhood': 'Rainier Beach',
  'city': 'Seattle',
  'county': 'King County',
  'state': 'Washington',
  'postcode': '98178',
  'country': 'United States',
  'country_code': 'us'},
 'boundingbox': ['47.511189', '47.5112943', '-122.2568571', '-122.2566651']}

In [4]:
# Parsing the data for our dataframe from OSM takes a long time because of the rate limit: 1 request per second.
# For 21000 records it will take more than 6 hours. We need functions that save the data periodically during parsing and let us continue from where we stopped.
# Function to check whether the record already exists

def nan_equal(a, b):
    """Return True when ``np.testing.assert_equal(a, b)`` passes, False otherwise.

    The comparison is delegated to NumPy's testing helper, which (unlike ``==``)
    treats two NaN values as equal.
    """
    matched = True
    try:
        np.testing.assert_equal(a, b)
    except AssertionError:
        # The helper signals a mismatch by raising; translate that into a boolean.
        matched = False
    return matched

In [5]:
# Explore function

def geoloc_explore(record, raw_address_list, i=None):
    """Reverse-geocode one record and collect the raw response for exploration.

    Parameters
    ----------
    record : mapping
        Must provide ``"lat"`` and ``"long"`` entries (e.g. one dataframe row).
    raw_address_list : list
        Raw responses gathered so far; the new response is appended in place.
    i : int, optional
        Progress counter used for the progress message and the periodic save.
        When omitted, the number of responses collected so far is used, so the
        function no longer depends on a loop variable leaking from the caller.

    Returns
    -------
    list
        The same ``raw_address_list`` object that was passed in.
    """
    location = locator.reverse([record["lat"], record["long"]])
    time.sleep(1)             # 1 second delay due to OSM parsing limitations

    if location is None:
        # The geocoder found nothing for this point: skip it instead of
        # crashing on ``None.raw`` and losing the progress made so far.
        print(f"No result for ({record['lat']}, {record['long']}), skipped")
        return raw_address_list

    raw_address_list.append(location.raw)
    if i is None:
        i = len(raw_address_list)

    if i % 50 == 0:
        print(f"record {i}")  # Check the progress
    if i % 150 == 0:
        with open('./data/Geo_raw_file.pickle', 'wb') as f:   # Save the data
            pickle.dump(raw_address_list, f, pickle.HIGHEST_PROTOCOL)
        print("Pickled")
    return raw_address_list

In [6]:
# Display the raw responses collected so far
raw_address_list

[]

In [7]:
# Optional load of sample dataset from pickle
# Load already processed data (uncomment to proceed)
# with open('./data/Geo_raw_file.pickle', 'rb') as f:
#     raw_address_list = pickle.load(f)

# Explore Sample dataset for unique features (uncomment to proceed)

# for i in range(len(sample_df)):
#         raw_address_list = geoloc_explore(sample_df.iloc[i], raw_address_list)

# # Final pickle of acquired data
# with open('./data/Geo_raw_file.pickle', 'wb') as f:
#             pickle.dump(raw_address_list, f, pickle.HIGHEST_PROTOCOL)
# print("Data is pickled")

In [8]:
# Restore the raw responses that were collected and pickled in an earlier run
# (only load pickle files you created yourself)
with open('./data/Geo_raw_file.pickle', 'rb') as pickle_file:
    raw_address_list = pickle.load(pickle_file)

In [9]:
# Count how many responses contain each address field (field name -> number of occurrences)
feature_list = {}
for address in raw_address_list:
    for feature in address["address"]:
        feature_list[feature] = feature_list.get(feature, 0) + 1

In [10]:
# Explore frequency dictionary: how often each address field occurs in the collected responses
feature_list

{'house_number': 622,
 'road': 643,
 'suburb': 180,
 'city': 277,
 'county': 653,
 'state': 653,
 'postcode': 650,
 'country': 653,
 'country_code': 653,
 'residential': 38,
 'town': 340,
 'neighbourhood': 211,
 'hamlet': 113,
 'village': 13,
 'building': 2,
 'amenity': 10,
 'commercial': 2,
 'highway': 1,
 'retail': 1,
 'leisure': 1}

In [11]:
# Look closer at how individual fields are used, to find patterns for the later data exploration.
# Set features_search to each key of feature_list in turn to inspect the matching responses.
features_search = "town"
features_search_list = [
    address
    for address in raw_address_list
    if features_search in address["address"]
]
features_search_list

[{'place_id': 116436957,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 59871394,
  'lat': '47.647889946204785',
  'lon': '-122.12505971529322',
  'display_name': 'Northeast 42nd Court, Ridegemont East, Redmond, King County, Washington, 98052-5491, United States',
  'address': {'road': 'Northeast 42nd Court',
   'residential': 'Ridegemont East',
   'town': 'Redmond',
   'county': 'King County',
   'state': 'Washington',
   'postcode': '98052-5491',
   'country': 'United States',
   'country_code': 'us'},
  'boundingbox': ['47.647867', '47.6479456', '-122.1261318', '-122.1243291']},
 {'place_id': 116436957,
  'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
  'osm_type': 'way',
  'osm_id': 59871394,
  'lat': '47.647889946204785',
  'lon': '-122.12505971529322',
  'display_name': 'Northeast 42nd Court, Ridegemont East, Redmond, King County, Washington, 98052-5491, United States',


In [12]:
# Additional check for "type of cities involved"
# features_search_list = []
# features_search = ["city", "town", "village"]
# for address in raw_address_list:
#     address_features = list(address["address"].keys())
#     if (features_search[0] not in address_features) and (features_search[1] not in address_features) and (features_search[2] not in address_features)  :
#         features_search_list.append(address)

We can see that there are 3 types of locations: towns, cities, and villages. The suburb-level name is not always stored under the same key: some records use `suburb`, others use `hamlet`, etc.
All this information should be used to create the correct dataframe later.

In [13]:
# Create function to parse data
def geoloc(record):
    """Reverse-geocode a single record and return the raw response.

    ``record`` must provide ``"lat"`` and ``"long"`` entries. The coordinates
    are printed, the module-level ``locator`` is queried, and the function
    then waits one second before returning the ``raw`` attribute of the result.
    """
    coordinates = [record["lat"], record["long"]]
    print(*coordinates)
    result = locator.reverse(coordinates)
    time.sleep(1)  # pause between consecutive requests
    return result.raw

In [14]:
# Based on previous analysis we created new features list for our dataframe
New_features_list = ["To_drop_place_ID", "To_drop_road", "Type_place", "city", "county", "state", "suburb"]

# Create new geo dataframe
df_geo = data[columns_geo].copy()

# Add new features, initially empty.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
df_geo[New_features_list] = np.nan

# Check new DataFrame

print(f"The number of records {len(df_geo)}")
display(df_geo.head())
display(df_geo.tail())


The number of records 21597


Unnamed: 0,lat,id,long,To_drop_place_ID,To_drop_road,Type_place,city,county,state,suburb
0,47.5112,7129300520,-122.257,,,,,,,
1,47.721,6414100192,-122.319,,,,,,,
2,47.7379,5631500400,-122.233,,,,,,,
3,47.5208,2487200875,-122.393,,,,,,,
4,47.6168,1954400510,-122.045,,,,,,,


Unnamed: 0,lat,id,long,To_drop_place_ID,To_drop_road,Type_place,city,county,state,suburb
21592,47.6993,263000018,-122.346,,,,,,,
21593,47.5107,6600060120,-122.362,,,,,,,
21594,47.5944,1523300141,-122.299,,,,,,,
21595,47.5345,291310100,-122.069,,,,,,,
21596,47.5941,1523300157,-122.299,,,,,,,


In [15]:
# Replace the freshly created frame with the already processed one saved by an earlier run
with open('./data/Data_frame_geoloc.pickle', 'rb') as pickle_file:
    df_geo = pickle.load(pickle_file)

In [16]:
# Parsing algorithm based on previous data exploration.
# Records whose "state" is already "Washington" are skipped, so the loop can be
# interrupted and resumed from the last pickled checkpoint.
# NOTE(review): the cached frame displayed after this cell has a `lon` column, while
# geoloc() reads `long` - confirm the column names before parsing new records.

# Address keys that name the place, in priority order; the matched key is stored in "Type_place"
PLACE_KEYS = ("city", "town", "village")
# Address keys that may hold the suburb-level name, in priority order
SUBURB_KEYS = ("suburb", "hamlet")
# Columns that receive text. They may still be float NaN columns, so cast them to
# object before writing strings into them.
TEXT_COLUMNS = ["To_drop_road", "Type_place", "city", "county", "state", "suburb"]
df_geo[TEXT_COLUMNS] = df_geo[TEXT_COLUMNS].astype(object)

for i in range(len(df_geo)):
    row = df_geo.index[i]  # label of the i-th row, so that reads (.iloc) and writes (.at) hit the same record

    if nan_equal(df_geo.at[row, "state"], "Washington"):  # Check if record already exist
        if i % 100 == 0:
            print(f"Record {i} exist")
        continue

    print(f"New_record{i}")
    # Stored under its own name so the `data` dataframe loaded earlier is not overwritten
    geo_raw = geoloc(df_geo.iloc[i])
    address = geo_raw.get("address") or {}

    # Write with .at on the frame itself: chained assignment (df[col][i] = ...) may
    # modify a temporary copy and leave df_geo unchanged.
    df_geo.at[row, "To_drop_place_ID"] = geo_raw.get("place_id")
    df_geo.at[row, "To_drop_road"] = address.get("road")
    df_geo.at[row, "county"] = address.get("county")
    df_geo.at[row, "state"] = address.get("state")

    # First matching place key wins: city, then town, then village
    place_key = next((key for key in PLACE_KEYS if key in address), None)
    if place_key is None:
        df_geo.at[row, "Type_place"] = np.nan
        df_geo.at[row, "city"] = np.nan
    else:
        df_geo.at[row, "Type_place"] = place_key
        df_geo.at[row, "city"] = address[place_key]

    # The suburb is left untouched when neither key is present
    suburb_key = next((key for key in SUBURB_KEYS if key in address), None)
    if suburb_key is not None:
        df_geo.at[row, "suburb"] = address[suburb_key]

    if i % 100 == 0:
        with open('./data/Data_frame_geoloc.pickle', 'wb') as df_geo_data:   # Save data, each 100 iterations
            pickle.dump(df_geo, df_geo_data, pickle.HIGHEST_PROTOCOL)
        print("Pickled", i)
                
    

Record 0 exist
Record 100 exist
Record 200 exist
Record 300 exist
Record 400 exist
Record 500 exist
Record 600 exist
Record 700 exist
Record 800 exist
Record 900 exist
Record 1000 exist
Record 1100 exist
Record 1200 exist
Record 1300 exist
Record 1400 exist
Record 1500 exist
Record 1600 exist
Record 1700 exist
Record 1800 exist
Record 1900 exist
Record 2000 exist
Record 2100 exist
Record 2200 exist
Record 2300 exist
Record 2400 exist
Record 2500 exist
Record 2600 exist
Record 2700 exist
Record 2800 exist
Record 2900 exist
Record 3000 exist
Record 3100 exist
Record 3200 exist
Record 3300 exist
Record 3400 exist
Record 3500 exist
Record 3600 exist
Record 3700 exist
Record 3800 exist
Record 3900 exist
Record 4000 exist
Record 4100 exist
Record 4200 exist
Record 4300 exist
Record 4400 exist
Record 4500 exist
Record 4600 exist
Record 4700 exist
Record 4800 exist
Record 4900 exist
Record 5000 exist
Record 5100 exist
Record 5200 exist
Record 5300 exist
Record 5400 exist
Record 5500 exist
Reco

In [17]:
# Inspect the dataframe after parsing: record count, then the first and last rows
print(f"The number of records {len(df_geo)}")
display(df_geo.head(), df_geo.tail())

The number of records 21597


Unnamed: 0,id,lat,price,yr_built,sqft_living,sqft_lot,lon,To_drop_place_ID,To_drop_road,Type_place,city,county,state,suburb
0,7129300520,47.5112,221900.0,1955,1180,5650,-122.257,159583259.0,61st Avenue South,city,Seattle,King County,Washington,
1,6414100192,47.721,538000.0,1951,2570,7242,-122.319,159668720.0,Northeast 127th Street,city,Seattle,King County,Washington,Northgate
2,5631500400,47.7379,180000.0,1933,770,10000,-122.233,74808506.0,81st Avenue Northeast,town,Kenmore,King County,Washington,Moorlands
3,2487200875,47.5208,604000.0,1965,1960,5000,-122.393,156392831.0,Fauntleroy Way Southwest,city,Seattle,King County,Washington,Fauntleroy
4,1954400510,47.6168,510000.0,1987,1680,8080,-122.045,293729110.0,221st Avenue Northeast,town,Sammamish,King County,Washington,


Unnamed: 0,id,lat,price,yr_built,sqft_living,sqft_lot,lon,To_drop_place_ID,To_drop_road,Type_place,city,county,state,suburb
21592,263000018,47.6993,360000.0,2009,1530,1131,-122.346,20331941.0,North 97th Street,city,Seattle,King County,Washington,Greenwood
21593,6600060120,47.5107,400000.0,2014,2310,5813,-122.362,231509570.0,Southwest 103rd Place,city,Seattle,King County,Washington,White Center
21594,1523300141,47.5944,402101.0,2009,1020,1350,-122.299,157689516.0,26th Avenue South,city,Seattle,King County,Washington,Leschi
21595,291310100,47.5345,400000.0,2004,1600,2388,-122.069,293824524.0,Northwest Boulder Way Drive,town,Issaquah,King County,Washington,
21596,1523300157,47.5941,325000.0,2008,1020,1076,-122.299,157811911.0,26th Avenue South,city,Seattle,King County,Washington,Leschi


In [18]:
# Persist the parsed dataframe so later runs can load it instead of parsing again
with open('./data/Data_frame_geoloc.pickle', 'wb') as pickle_file:
    pickle.dump(df_geo, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
print("Pickled")

Pickled


In [19]:
# Load already processed data (uncomment to proceed)
# with open('./data/Data_frame_geoloc.pickle', 'rb') as df_geo_data:
#     df_geo = pickle.load(df_geo_data)

# Output

In [None]:
# All data is in file ./data/Data_frame_geoloc.pickle