In [77]:
# Load packages
import numpy as np
import pandas as pd
from pathlib import Path
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm.notebook import tqdm
from pathlib import Path

In [78]:
# Name of the dataset (without file extension) and the folder it is stored in
file_name = 'property-sales_new-york-city_2022'
file_path = '../data/raw/'

In [79]:
# Read the raw property sales records from the parquet file
raw_parquet_file = f'{file_path}{file_name}.parquet'
df = pd.read_parquet(raw_parquet_file)

In [80]:
# Set up the Nominatim geocoder and wrap its geocode call in a rate limiter
# that waits at least one second between consecutive requests
geolocator = Nominatim(user_agent='property-sales-locator')
geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1,
)

In [81]:
# Rows without a zip_code are discarded because the zip code is part of the
# geocoding query; the row count is reported before and after the filter
print(f'Number of rows before: {len(df)}')
df = df.dropna(subset=['zip_code'])
print(f'Number of rows after: {len(df)}')

Number of rows before: 93427
Number of rows after: 93420


In [82]:
# Keep only the part of each address that precedes the first comma
df['address'] = df['address'].str.split(',').str.get(0)

In [83]:
# Remove rows whose address contains the placeholder 'N/A' or is missing
# altogether, since such rows have no street to geocode
print(f'Number of rows before: {len(df)}')
# regex=False: 'N/A' is matched as plain text rather than as a pattern.
# na=True: a missing address counts as unavailable instead of leaving a NaN
# in the mask, which `~` and boolean indexing cannot handle.
address_unavailable = df['address'].str.contains('N/A', regex=False, na=True)
# .copy() makes the filtered frame independent of the original so that later
# column assignments do not trigger SettingWithCopyWarning
df = df[~address_unavailable].copy()
print(f'Number of rows after: {len(df)}')

Number of rows before: 93420
Number of rows after: 92435


In [84]:
# Trim leading and trailing whitespace from every address
df['address'] = df['address'].str.strip()

In [85]:
# Split data into smaller chunks to prevent loss of progress
n_chunks = 100
# Split the row positions instead of the DataFrame itself: applying
# np.array_split directly to a DataFrame relies on DataFrame.swapaxes, which
# recent pandas versions flag as deprecated. np.array_split distributes the
# positions exactly as it would distribute the rows, so chunk sizes and order
# are unchanged and previously saved chunk files still line up.
chunk_positions = np.array_split(np.arange(len(df)), n_chunks)
data_chunks = [df.iloc[positions] for positions in chunk_positions]

In [86]:
# Initialize progress bar for pandas; presumably this is what provides the
# `progress_apply` method used in the geocoding loop
tqdm.pandas()

In [87]:
# Geocode all property sales records, one chunk at a time. Each finished chunk
# is written to its own parquet file, and chunks whose file already exists are
# skipped, so an interrupted run can be resumed.
chunk_dir = Path(f'{file_path}{file_name}_geocoded_chunks')
# Create the checkpoint folder up front; otherwise the first save would fail
# only after a whole chunk had already been geocoded
chunk_dir.mkdir(parents=True, exist_ok=True)

for i, data_chunk in enumerate(data_chunks):
    chunk_file = chunk_dir / f'{file_name}_geocoded_{i + 1}-{n_chunks}.parquet'

    if chunk_file.is_file():
        print(f'Already processed chunk {i + 1}/{n_chunks}')
        continue

    print(f'Processing chunk {i + 1}/{n_chunks}')

    # One structured query per row; an entry is None when no match was found
    locations = data_chunk.progress_apply(
        lambda row: geocode({'street': row['address'], 'postalcode': row['zip_code'], 'country': 'US'}),
        axis=1,
    )

    # Build a new frame instead of writing columns into a slice of df.
    # getattr falls back to NaN for entries without coordinates (e.g. None).
    geocoded_chunk = data_chunk.assign(
        location_lat=[getattr(location, 'latitude', np.nan) for location in locations],
        location_long=[getattr(location, 'longitude', np.nan) for location in locations],
    )

    # Save progress: write to a temporary file and rename it afterwards, so an
    # interrupted write never leaves a truncated file that would be mistaken
    # for a finished chunk on the next run
    tmp_file = chunk_file.with_name(chunk_file.name + '.tmp')
    geocoded_chunk.to_parquet(tmp_file)
    tmp_file.replace(chunk_file)

Already processed chunk 1/100
Already processed chunk 2/100
Already processed chunk 3/100
Already processed chunk 4/100
Already processed chunk 5/100
Already processed chunk 6/100
Already processed chunk 7/100
Already processed chunk 8/100
Already processed chunk 9/100
Already processed chunk 10/100
Already processed chunk 11/100
Already processed chunk 12/100
Already processed chunk 13/100
Already processed chunk 14/100
Already processed chunk 15/100
Already processed chunk 16/100
Already processed chunk 17/100
Already processed chunk 18/100
Already processed chunk 19/100
Already processed chunk 20/100
Already processed chunk 21/100
Already processed chunk 22/100
Already processed chunk 23/100
Already processed chunk 24/100
Already processed chunk 25/100
Already processed chunk 26/100
Already processed chunk 27/100
Already processed chunk 28/100
Already processed chunk 29/100
Already processed chunk 30/100
Already processed chunk 31/100
Already processed chunk 32/100
Already processed

In [88]:
# Read every saved chunk file back in, in chunk order
geocoded_dir = f'{file_path}{file_name}_geocoded_chunks'
data_chunks_geocoded = [
    pd.read_parquet(f'{geocoded_dir}/{file_name}_geocoded_{chunk_no}-{n_chunks}.parquet')
    for chunk_no in range(1, len(data_chunks) + 1)
]

In [89]:
# Read the original dataset again, replacing the filtered and cleaned frame
# that was used for geocoding
df = pd.read_parquet(file_path + file_name + '.parquet')

In [90]:
# Append latitude and longitude columns, initially filled with NaN
df = df.assign(location_lat=np.nan, location_long=np.nan)

In [91]:
# Copy the coordinates of every geocoded chunk into the full dataset, matching
# rows by index label; rows that appear in no chunk keep their NaN values
coordinate_columns = ['location_lat', 'location_long']
for geocoded in data_chunks_geocoded:
    df.loc[geocoded.index, coordinate_columns] = geocoded[coordinate_columns]

In [92]:
# Write the geocoded dataset next to the raw file, as both CSV and parquet
geocoded_base = f'{file_path}{file_name}_geocoded'
df.to_csv(f'{geocoded_base}.csv')
df.to_parquet(f'{geocoded_base}.parquet')