# Overview

# Business Understanding

# Data Understanding

## Geo parsing the data using longitude, latitude

In [128]:
import numpy as np
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
import pickle
import geoplotlib
import pandas as pd
from geoplotlib.utils import read_csv as read_csv2
from shapely.geometry import Point


In [98]:
# Geo columns from the original dataset
columns_geo = ["lat", "id", "long"]

# Set up the Nominatim (OSM) locator used for reverse geocoding
locator = Nominatim(user_agent="xz@gmail.com")

# Load data
data = pd.read_csv("./data/kc_house_data.csv")

# Small test dataframe: the first five records, geo columns only
test_df = data[columns_geo].iloc[0:5]

# Sample dataframe for exploratory data analysis and for finding features to fit the model.
# frac is the fraction of the original dataframe to draw (0.03 corresponds to 3%).
# random_state pins the draw so that re-running the notebook samples the same rows.
sample_df = data[columns_geo].sample(frac=0.03, random_state=42)


# Accumulator for the raw address responses returned by the locator
raw_address_list = []

In [99]:
# Smoke-test the locator: reverse-geocode the first record of the test dataframe
first_lat = test_df.loc[0, "lat"]
first_lon = test_df.loc[0, "long"]
location = locator.reverse([first_lat, first_lon])
location.raw

(21597, 14)

In [144]:
# Parsing the address data for our dataframe from OSM takes a long time because of the rate limit: 1 request per second.
# For 21000 records it will take more than 6 hours, so we need functions that save the data while parsing is in progress and let us continue from where we stopped.
# Function to check whether a record already exists

def nan_equal(a,b):
    """Return True when a and b are equal according to np.testing.assert_equal, else False.

    assert_equal signals a mismatch by raising AssertionError; that signal is
    translated into a plain boolean here.
    """
    try:
        np.testing.assert_equal(a, b)
    except AssertionError:
        is_equal = False
    else:
        is_equal = True
    return is_equal

In [100]:
# Explore function

def geoloc_explore(record, raw_address_list, geolocator=None, delay=1,
                   pickle_path='./data/Geo_raw_file.pickle'):
    """Reverse-geocode one record and append the raw response to raw_address_list.

    Progress is derived from the length of raw_address_list, so the function
    does not depend on a loop variable defined in the calling cell.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide "lat" and "long" entries.
    raw_address_list : list
        Accumulator of raw responses; it is mutated in place and returned.
    geolocator : object, optional
        Object exposing ``reverse([lat, lon])``. When omitted, the
        notebook-level ``locator`` is used.
    delay : float, optional
        Seconds to sleep after the request (default 1, the OSM rate limit).
    pickle_path : str, optional
        File that the accumulated list is periodically pickled to.

    Returns
    -------
    list
        The same ``raw_address_list`` object that was passed in.
    """
    if geolocator is None:
        geolocator = locator  # fall back to the locator created at notebook level

    lat = record["lat"]
    lon = record["long"]
    location = geolocator.reverse([lat, lon])
    time.sleep(delay)             # delay between requests due to OSM parsing limitations

    # reverse() returns None when no address is found; skip the record
    # instead of failing on `None.raw` in the middle of a long run.
    if location is None:
        print(f"No address found for ({lat}, {lon}), record skipped")
        return raw_address_list

    raw_address_list.append(location.raw)
    index = len(raw_address_list) - 1     # zero-based position of the record just stored

    if index % 50 == 0:
        print(f"record {index}")  # Check the progress
    if index % 150 == 0:
        with open(pickle_path, 'wb') as f:   # Save the data collected so far
            pickle.dump(raw_address_list, f, pickle.HIGHEST_PROTOCOL)
        print("Pickled")
    return raw_address_list

In [101]:
# Show the accumulator's current contents (it is initialised as an empty list in the setup cell)
raw_address_list

In [102]:
# Optional: load an already processed sample dataset from pickle
# instead of querying OSM again (uncomment to proceed)
# with open('./data/Geo_raw_file.pickle', 'rb') as f:
#     raw_address_list = pickle.load(f)

# Explore the sample dataset for unique features.
# The loop index stays a notebook-level `i` on purpose: other cells may read it as a global.
sample_size = len(sample_df)
for i in range(sample_size):
    raw_address_list = geoloc_explore(sample_df.iloc[i], raw_address_list)

# Final pickle of the acquired data
with open('./data/Geo_raw_file.pickle', 'wb') as pickle_file:
    pickle.dump(raw_address_list, pickle_file, pickle.HIGHEST_PROTOCOL)
print("Data is pickled")

The number of records with missing cities 779
Missing values are in 3.61 % of data


In [103]:
# Preview only the first few raw responses; displaying the whole list floods the notebook output
raw_address_list[:5]

(20814, 12)

In [104]:
# Count the missing values in the "Type_place" column
missing_type_place = df_geo["Type_place"].isna().sum()
print(f"The number of records with missing values {missing_type_place}")

The number of records with missing values 0


In [89]:
# Look at how a single address field is used, to find patterns that can help later during data exploration.
# Set features_search to each value from feature_list in turn to see which records carry that field.
features_search = "town"
features_search_list = [
    address
    for address in raw_address_list
    if features_search in address["address"]
]
features_search_list

In [149]:
# Additional check for "type of cities involved"
# features_search_list = []
# features_search = ["city", "town", "village"]
# for address in raw_address_list:
#     address_features = list(address["address"].keys())
#     if (features_search[0] not in address_features) and (features_search[1] not in address_features) and (features_search[2] not in address_features)  :
#         features_search_list.append(address)

We can see that there are 3 types of locations: towns, cities and villages. Some of them use different names for suburbs: suburb, hamlet, etc.
All of this information should be used later to create a correct dataframe.

In [None]:
# Create function to parse data
def geoloc(record, geolocator=None, delay=1):
    """Reverse-geocode one record and return the raw response.

    Parameters
    ----------
    record : mapping or pandas.Series
        Must provide "lat" and "long" entries.
    geolocator : object, optional
        Object exposing ``reverse([lat, lon])``. When omitted, the
        notebook-level ``locator`` is used.
    delay : float, optional
        Seconds to sleep after the request (default 1).

    Returns
    -------
    dict or None
        The ``raw`` attribute of the located address, or None when the
        geocoder finds nothing for the coordinates.
    """
    if geolocator is None:
        geolocator = locator  # fall back to the locator created at notebook level

    lat = record["lat"]
    lon = record["long"]
    print(lat, lon)
    location = geolocator.reverse([lat, lon])
    time.sleep(delay)

    # reverse() returns None when no address is found; report that to the
    # caller instead of failing on `None.raw`.
    if location is None:
        return None
    return location.raw

In [139]:
# Based on the previous analysis, these are the new features for our dataframe
New_features_list = ["To_drop_place_ID", "To_drop_road", "Type_place", "city", "county" , "state" , "suburb" ]

# Create new geo dataframe
df_geo = data[columns_geo].copy()

# Add the new features as empty (NaN) columns.
# np.nan is used rather than the np.NAN alias, which was removed in NumPy 2.0.
df_geo[New_features_list] = np.nan

# Check new DataFrame

print(f"The number of records {len(df_geo)}")
display(df_geo.head())
display(df_geo.tail())


In [151]:
# Age of the property relative to the reference year
REFERENCE_YEAR = 2022
df['age'] = REFERENCE_YEAR - df['yr_built']

OneHotEncoder(handle_unknown='ignore')

In [159]:
def renovated(year):
    """
    Tell whether a property has been renovated, given its renovation year.

    Returns True only for a year greater than zero; zero, negative values
    and NaN all give False.
    """
    return bool(year > 0.0)
    
# Derive the renovation flag from the renovation-year column, one value at a time
renovation_years = df['yr_renovated']
df['renovated'] = renovation_years.map(renovated)

In [160]:
# Data Modeling

# Regression Results

# Conclusion