**Find Mininmum Distance from a rental property to any point of interest**

In [22]:
# Import necessary libraries
from pyspark.sql import SparkSession, functions as F
import geopandas as gpd
from shapely.geometry import Point

# Create Spark session
spark = (
    SparkSession.builder.appName('Weekly Rental Listings Preprocessing')
    .config("spark.sql.repl.eagerEval.enabled", True)  # display full dataframe in console
    .config("spark.sql.parquet.cacheMetadata", "true") # cache parquet metadata
    .config("spark.sql.session.timeZone", "Etc/UTC") # set timezone to UTC
    # memory configurations - hopefully will reduce crashing
    .config("spark.driver.memory", "4g") # set driver memory
    .config("spark.executor.memory", "4g") # set executor memory
    .getOrCreate()
)

In [23]:
# Load CSV data into Spark DataFrame
sdf = spark.read.option("escape", '"').csv('../data/raw/domain/rental_listings_2025_09.csv', header=True, inferSchema=True)

# Show the first rows full_address column data 

sdf.select("full_address").limit(10)

full_address
14 Federation Lan...
106/609 Victoria ...
Abbotsford VIC 3067
4/2 Princes Stree...
"45 Park St, Abbot..."
302/14 Trenerry C...
DG04/18 Grosvenor...
Y102/125 Turner S...
204/11 Flockhart ...
207 Langridge Str...


In [24]:
sdf.limit(10)
sdf

age_0_to_19,age_20_to_39,age_40_to_59,age_60_plus,agency_name,agent_name,appointment_only,avg_days_on_market,bathrooms,bedrooms,car_spaces,description,family_percentage,features_list,first_listed_date,full_address,image_urls,inspection_text,land_area,last_sold_date,latitude,listing_status,listing_tag,listing_url,long_term_resident,longitude,median_rent_price,median_sold_price,number_of_photos,number_sold,owner_percentage,postcode,property_features,property_id,property_type,rental_price,renter_percentage,schools,single_percentage,state_abbreviation,street,street_number,structured_features,suburb,unit_number,updated_date,url
0.142523363,0.5,0.226635516,0.130841121,RT Edgar - Northside,Lily Passarelli,False,60,,3,2,Just freshly pain...,0.5304348,Split System Air ...,2025-08-26T16:14:...,14 Federation Lan...,https://rimh2.dom...,,,,-37.796893,live,,https://www.domai...,0.4220963,144.9961565,900,1350000,11,34,0.5895954,3067,"3, ,2, ,2,",17732837,Townhouse,$880 per week,0.410404623,[('Clifton Hill P...,0.4695652,vic,Federation Lane,14,"[{'name': 'Bath',...",Abbotsford,,2025-09-03 10:12:...,https://www.domai...
0.08416759,0.6407119,0.200222462,0.0748980343,Fletchers Project...,Joyee Jiang,False,69,,2,2,Perfectly positio...,0.496447623,,2025-09-02T17:45:...,106/609 Victoria ...,https://rimh2.dom...,,,,-37.8110653,live,,https://www.domai...,0.44155845,145.0076834,660,580000,14,187,0.2623574,3067,"2, ,2, ,1,",17744154,Apartment / Unit ...,$750 per week,0.7376426,[('Yarra Primary ...,0.5035524,vic,Victoria Street,609,"[{'name': 'Gas', ...",Abbotsford,106,2025-09-02 17:45:...,https://www.domai...
0.123569794,0.3340961,0.23569794,0.306636155,RentBetter,,True,85,,1,1,PROPERTY ID: 3999...,0.40625,"Air conditioning,...",2025-09-06T22:53:...,Abbotsford VIC 3067,https://rimh2.dom...,,,,-37.80210950000001,live,,https://www.domai...,0.491228074,145.0019064,510,393000,15,141,0.4144737,3067,"1, ,1, ,1,",17750349,Apartment / Unit ...,$630 per week,0.5855263,[('Sophia Mundi S...,0.59375,vic,,,[{'name': 'Intern...,Abbotsford,,2025-09-06 22:53:...,https://www.domai...
0.174757287,0.5080906,0.220064729,0.09708738,Miles Real Estate...,Carly Tomat,False,32,,2,1,FIRST INSPECTIONS...,0.44921875,"Air conditioning,...",2025-09-01T10:37:...,4/2 Princes Stree...,https://rimh2.dom...,,,,-37.8092053,recentlyUpdated,,https://www.domai...,0.558935344,144.999856,725,1100000,7,42,0.495575249,3067,"2, ,1, ,1,",17739910,Townhouse,$650 per week,0.504424751,[('Abbotsford Pri...,0.55078125,vic,Princes Street,2,[{'name': 'Intern...,Abbotsford,4,2025-09-11 17:00:...,https://www.domai...
0.170111284,0.400635928,0.275039762,0.154213041,Jellis Craig Rich...,Nathan Roberts,False,32,,2,1,Discover this spa...,0.43849206,"Air conditioning,...",2025-09-08T12:31:...,"45 Park St, Abbot...",https://rimh2.dom...,,,,-37.8080424,new,,https://www.domai...,0.4714829,144.9939399,725,1100000,11,42,0.607142866,3067,"2, ,1, ,−,",17751219,House,$750 per week,0.392857134,[('Abbotsford Pri...,0.56150794,vic,Park St,45,"[{'name': 'Gas', ...",Abbotsford,,2025-09-11 09:25:...,https://www.domai...
0.06559406,0.6311881,0.225247532,0.0779702961,Dingle Partners,Shane Dangen,True,69,,2,1,Be first to inspe...,0.5532234,Furnished,2025-07-28T14:38:...,302/14 Trenerry C...,https://rimh2.dom...,,,,-37.7996262,live,,https://www.domai...,0.200892851,145.0020093,660,580000,14,187,0.274939179,3067,"2, ,1, ,1,",17688466,Apartment / Unit ...,$825 per week,0.7250608,[('Sophia Mundi S...,0.4467766,vic,Trenerry Crescent,14,[{'name': 'Furnis...,Abbotsford,302,2025-08-22 09:21:...,https://www.domai...
0.08416759,0.6407119,0.200222462,0.0748980343,Ray White Southbank,Eva Christodoulou,True,69,,2,1,Brilliantly locat...,0.496447623,"Air conditioning,...",2025-08-27T14:48:...,DG04/18 Grosvenor...,https://rimh2.dom...,,,,-37.8095856,recentlyUpdated,,https://www.domai...,0.44155845,145.0068159,660,580000,14,187,0.2623574,3067,"2, ,1, ,1,",17734920,Apartment / Unit ...,$600 per week,0.7376426,[('Yarra Primary ...,0.5035524,vic,Grosvenor Street,DG04/18,[{'name': 'Air co...,Abbotsford,,2025-09-11 11:36:...,https://www.domai...
0.06559406,0.6311881,0.225247532,0.0779702961,The Hopkins Group,James Hickey,True,85,,1,1,Be first to inspe...,0.5532234,"Air Conditioning,...",2025-09-01T10:45:...,Y102/125 Turner S...,https://rimh2.dom...,,,,-37.7990015,live,,https://www.domai...,0.200892851,145.0008476,510,393000,13,141,0.274939179,3067,"1, ,1, ,1,",17739942,Apartment / Unit ...,$520 per week,0.7250608,[('Sophia Mundi S...,0.4467766,vic,Turner Street,125,[{'name': 'Air co...,Abbotsford,Y102,2025-09-01 10:45:...,https://www.domai...
0.08416759,0.6407119,0.200222462,0.0748980343,Areal Property - ...,Hilary Ho,True,69,,2,1,**Scheduling your...,0.496447623,,2025-09-11T14:50:...,204/11 Flockhart ...,,,,,-37.8109899,new,,https://www.domai...,0.44155845,145.0067059,660,580000,0,187,0.2623574,3067,"2, ,1, ,1,",17757992,Apartment / Unit ...,$580 per week,0.7376426,[('Yarra Primary ...,0.5035524,vic,Flockhart Street,11,,Abbotsford,204,2025-09-11 14:50:...,https://www.domai...
0.170111284,0.400635928,0.275039762,0.154213041,Kay & Burton Ston...,Dylan Archer,False,0,,4,4,Set across three ...,0.43849206,,2025-08-18T08:47:...,207 Langridge Str...,,,,,-37.8074901,live,,https://www.domai...,0.4714829,144.9937356,0,0,0,0,0.607142866,3067,"4, ,4, ,1,",17718788,Apartment / Unit ...,$1225 per week,0.392857134,[('Abbotsford Pri...,0.56150794,vic,Langridge Street,207,,Abbotsford,,2025-08-18 08:47:...,https://www.domai...


In [25]:
# Load the shapefile (use the .shp file as entry point)
sf = gpd.read_file("../data/geo/shpfile/LOCALITY_POLYGON.shp")
sf = sf.to_crs(epsg=4326)  # Convert to lat/lon (WGS84)
sf = sf[["LOCALITY", "geometry"]]
sf["LOCALITY"] = sf["LOCALITY"].str.strip()  # Remove whitespace    
sf["suburb"] = sf["LOCALITY"].str.lower().str.strip()
sf.head()

Unnamed: 0,LOCALITY,geometry,suburb
0,MOLLONGGHIP,"POLYGON ((144.06544 -37.48382, 144.06438 -37.4...",mollongghip
1,NORTH BLACKWOOD,"POLYGON ((144.38037 -37.42376, 144.38126 -37.4...",north blackwood
2,BASALT,"POLYGON ((144.1184 -37.31148, 144.1184 -37.311...",basalt
3,LLANELLY,"POLYGON ((143.81737 -36.75048, 143.81679 -36.7...",llanelly
4,MURRABIT WEST,"POLYGON ((143.87075 -35.49319, 143.86939 -35.4...",murrabit west


In [26]:
# Function to get suburb name from lon/lat
def get_suburb(lon, lat, sf):
    pt = Point(lon, lat) # e.g., Point(144.9631, -37.8136)
    for _, row in sf.iterrows():
        poly = row['geometry']
        if poly.covers(pt):
            return row['suburb'] # return lower case suburb name
    return None

# try out the function
sub = get_suburb(144.9961565, -37.796893, sf)
print(sub)

abbotsford


In [27]:
school_locations_sf = spark.read.csv('../data/landing/schools/school_locations_2025.csv', header=True, inferSchema=True)
school_locations_sf.show(5)
#find number of distinct school types and entry count for each type
school_locations_sf.groupBy("School_Type").count().show()

+----------------+-----------+---------+--------------------+-----------+-------------+--------------------+--------------+--------------+-------------+----------------+---------------------+---------------------+--------------+------------+---------------+-------------+--------------------+--------------------+------+----------------+---------+----------+----------+
|Education_Sector|Entity_Type|School_No|         School_Name|School_Type|School_Status|      Address_Line_1|Address_Line_2|  Address_Town|Address_State|Address_Postcode|Postal_Address_Line_1|Postal_Address_Line_2|   Postal_Town|Postal_State|Postal_Postcode|Full_Phone_No|              Region|                Area|LGA_ID|        LGA_Name| LGA_TYPE|         X|         Y|
+----------------+-----------+---------+--------------------+-----------+-------------+--------------------+--------------+--------------+-------------+----------------+---------------------+---------------------+--------------+------------+---------------+---

In [28]:
def iter_rows(df, columns=None, stream=True):
    """
    Unified row iterator for Pandas/GeoPandas and PySpark DataFrames.
    Yields dicts. Optionally select a subset of `columns`.
    - stream=True uses toLocalIterator() for Spark (memory-safe).
    """
    mod = type(df).__module__

    # --- PySpark DataFrame ---
    if mod.startswith("pyspark.sql"):
        it = df.toLocalIterator() if stream else df.collect()
        for row in it:
            d = row.asDict(recursive=True)
            yield {k: d[k] for k in (columns or d.keys())}

    # --- Pandas / GeoPandas DataFrame ---
    else:
        cols = list(columns) if columns else list(df.columns)
        # itertuples is faster than iterrows; yields namedtuples
        for tup in df.itertuples(index=False, name=None):
            # name=None => plain tuples
            yield dict(zip(df.columns, tup)) if columns is None else \
                {c: v for c, v in zip(df.columns, tup) if c in cols}

In [29]:
# Function that finds the closest POI to a given rental listing
def find_closest_poi_sf(rental_lon, rental_lat, poi_df):
    rental_sub = get_suburb(rental_lon, rental_lat, sf)
    rental_point = Point(rental_lon, rental_lat)
    min_distance = float('inf') # Initialize with infinity
    closest_poi = None

    if rental_sub is None:
        return None, None  # Rental listing not in any suburb
    for poi in iter_rows(poi_df, columns=['School_Name', 'X', 'Y']):
        poi_sub = get_suburb(poi['X'], poi['Y'], sf)
        if poi_sub != rental_sub:
            continue  # Skip POIs not in the same suburb
        poi_point = Point(poi['X'], poi['Y'])
        distance = rental_point.distance(poi_point)
    
        if distance < min_distance:
            min_distance = distance
            closest_poi = poi
            
    return closest_poi, min_distance

# Example usage
rental_lon = 144.9961565
rental_lat = -37.796893
closest_poi, distance = find_closest_poi_sf(rental_lon, rental_lat, school_locations_sf)
if closest_poi:
    print(f"Closest POI: {closest_poi['School_Name']} at distance {distance}")

Closest POI: Sophia Mundi Steiner School at distance 0.011317426087662855


In [30]:
# Read in geojson file with geopandas
ptv_stops_gdf = gpd.read_file("../data/landing/ptv/public_transport_stops.geojson")
ptv_stops_gdf = ptv_stops_gdf.to_crs(epsg=4326)  # Convert to lat/lon (WGS84)

ptv_stops_gdf

Unnamed: 0,STOP_ID,STOP_NAME,MODE,geometry
0,17204,Wallan Station,REGIONAL TRAIN,POINT (145.00537 -37.41686)
1,19980,Melton Station,REGIONAL TRAIN,POINT (144.57222 -37.70336)
2,19981,Rockbank Station,REGIONAL TRAIN,POINT (144.65071 -37.72919)
3,19982,Deer Park Station,REGIONAL TRAIN,POINT (144.77083 -37.77727)
4,19998,Sunbury Station,REGIONAL TRAIN,POINT (144.72803 -37.57915)
...,...,...,...,...
29197,6586,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.5743 -37.70666)
29198,6642,Northcott St/Exford Rd,REGIONAL BUS,POINT (144.57443 -37.70668)
29199,6644,Exford Rd/Staughton St,REGIONAL BUS,POINT (144.56982 -37.7024)
29200,7009,Harrison St/Marengo Crst,REGIONAL BUS,POINT (143.66323 -38.77712)


In [None]:
import openrouteservice as ors
import numpy as np
from sklearn.neighbors import BallTree
import requests
import time

rentals = sdf.select("full_address", "longitude", "latitude").limit(100).toPandas()
rentals= gpd.GeoDataFrame(
    rentals,
    geometry=[Point(xy) for xy in zip(rentals.longitude, rentals.latitude)],
    crs="EPSG:4326"
)

rentals

Unnamed: 0,full_address,longitude,latitude,geometry
0,"14 Federation Lane, Abbotsford VIC 3067",144.9961565,-37.796893,POINT (144.99616 -37.79689)
1,"106/609 Victoria Street, Abbotsford VIC 3067",145.0076834,-37.8110653,POINT (145.00768 -37.81107)
2,Abbotsford VIC 3067,145.0019064,-37.80210950000001,POINT (145.00191 -37.80211)
3,"4/2 Princes Street, Abbotsford VIC 3067",144.999856,-37.8092053,POINT (144.99986 -37.80921)
4,"45 Park St, Abbotsford VIC 3067",144.9939399,-37.8080424,POINT (144.99394 -37.80804)
...,...,...,...,...
95,"5/2-4 Watt St, Airport West VIC 3042",144.8799126,-37.7314592,POINT (144.87991 -37.73146)
96,"9 Thomas St, Airport West VIC 3042",144.876504,-37.7206421,POINT (144.8765 -37.72064)
97,"2/128 Bowes Avenue, Airport West VIC 3042",144.8863906,-37.7238332,POINT (144.88639 -37.72383)
98,"34A Walters Ave, Airport West VIC 3042",144.8864192,-37.73188030000001,POINT (144.88642 -37.73188)


**PTV Stops Matrix Pulls**

In [None]:
# read csv in as a geopandas dataframe
cleaned_listings = pd.read_csv("../data/curated/rent_features/cleaned_listings.csv", low_memory=False)
cleaned_listings["geometry"] = cleaned_listings["coordinates"].apply(wkt.loads)
cleaned_listings_gdf = gpd.GeoDataFrame(cleaned_listings, geometry="geometry", crs="EPSG:4326")
gdf_tmp = cleaned_listings_gdf


In [None]:
# Do 500 API calls at a time and then save the results to a CSV and continue from the last index
import pandas as pd
poi_rad = np.radians(np.c_[ptv_stops_gdf.geometry.y, ptv_stops_gdf.geometry.x])
tree = BallTree(poi_rad, metric="haversine")

def shortlist(point, k=6, max_km=2.0):
    pt = np.radians([[point.y, point.x]])
    dist, idx = tree.query(pt, k=k)
    dist_km = dist[0] * 6371.0088
    mask = dist_km <= max_km
    return ptv_stops_gdf.iloc[idx[0][mask]]

# --- routing helper (OpenRouteService foot example) ---
API_KEY = "YOUR_API_KEY"  # replace with your ORS API key
BASE_URL = "https://api.openrouteservice.org/v2/matrix/driving-car"

def routed_distance(source, targets, max_attempts=5, base_wait=1.0):
    coords = [[source.x, source.y]] + [[pt.x, pt.y] for pt in targets]
    body = {"locations": coords, "sources": [0], "metrics": ["distance", "duration"]}
    for attempt in range(1, max_attempts + 1):
        resp = requests.post(BASE_URL, json=body,
                            headers={"Authorization": API_KEY},
                            timeout=30)

        if resp.status_code == 429:
            wait = base_wait * (2 ** (attempt - 1))  # 1, 2, 4, 8, ...
            time.sleep(wait)
            continue

        resp.raise_for_status()
        data = resp.json()
        return data["distances"][0][1:], data["durations"][0][1:]

    raise RuntimeError(f"ORS matrix still returning 429 after {max_attempts} retries")
# --- main loop ---
rows = []
for idx, row in gdf_tmp.head(500).iterrows():
    candidates = shortlist(row.geometry, k=10, max_km=3)
    if candidates.empty:
        continue

    distances, durations = routed_distance(row.geometry, candidates.geometry, max_attempts=5, base_wait=1.0)
    idx = int(np.argmin(durations))  # use distance if you prefer metres
    chosen = candidates.iloc[idx]

    cleaned_listings_gdf.loc[idx, "StationID"] = chosen["STOP_ID"]
    cleaned_listings_gdf.loc[idx, "min_route_dist_m"] = distances[best]
    cleaned_listings_gdf.loc[idx, "min_route_dur_s"] = durations[best]


# skip to the next 500
gdf_tmp = cleaned_listings_gdf.iloc[500:]

NameError: name 'ptv_stops_sf' is not defined

In [None]:
import pandas as pd
closest = pd.DataFrame(rows)
closest

Unnamed: 0,rental_id,poi_id,route_dist_m,route_dur_s
0,"14 Federation Lane, Abbotsford VIC 3067",Lulie St,307.36,51.77
1,"106/609 Victoria Street, Abbotsford VIC 3067",Leslie St/Victoria St #23,2.42,0.17
2,Abbotsford VIC 3067,Clarke St/Johnston St,189.53,33.30
3,"4/2 Princes Street, Abbotsford VIC 3067",Church St/Victoria St #21,281.79,37.71
4,"45 Park St, Abbotsford VIC 3067",Victoria St,293.38,56.78
...,...,...,...,...
95,"5/2-4 Watt St, Airport West VIC 3042",Grange Rd/Fullarton Rd,323.38,47.71
96,"9 Thomas St, Airport West VIC 3042",Thomas St/Etzel St,129.94,29.31
97,"2/128 Bowes Avenue, Airport West VIC 3042",King St/Matthews Ave,510.76,62.36
98,"34A Walters Ave, Airport West VIC 3042",Creswell Ave/Fullarton Rd,282.59,55.59


**Run from here @jack**

In [90]:
import pandas as pd
from shapely import wkt
from geopy.distance import geodesic

# read csv in as a geopandas dataframe
cleaned_listings = pd.read_csv("../data/curated/rent_features/cleaned_listings.csv", low_memory=False)
cleaned_listings["geometry"] = cleaned_listings["coordinates"].apply(wkt.loads)
cleaned_listings_gdf = gpd.GeoDataFrame(cleaned_listings, geometry="geometry", crs="EPSG:4326")
cleaned_listings_gdf_split = cleaned_listings_gdf.head(100)
cleaned_listings_gdf_split

Unnamed: 0,property_id,bedrooms,bathrooms,car_spaces,property_type,land_area,property_features,suburb,postcode,year,...,schools,single_percentage,state_abbreviation,structured_features,unit_number,updated_date,url,coordinates,weekly_rent,geometry
0,17732837,3,,2.0,townhouse,,3 2 2,abbotsford,3067,2025,...,'clifton hill primary school' 'government' 'pr...,0.469565,vic,'name' 'bath' 'category' 'indoor' 'source' 'su...,,2025-09-03t10121796,httpswwwdomaincomau14-federation-lane-abbotsfo...,POINT (-37.796893 144.9961565),880.0,POINT (-37.79689 144.99616)
1,17744154,2,,2.0,apartment unit flat,,2 2 1,abbotsford,3067,2025,...,'yarra primary school' 'government' 'primary' ...,0.503552,vic,'name' 'gas' 'category' 'indoor' 'source' 'sug...,106,2025-09-02t17452316,httpswwwdomaincomau106-609-victoria-street-abb...,POINT (-37.8110653 145.0076834),750.0,POINT (-37.81107 145.00768)
2,17750349,1,,1.0,apartment unit flat,,1 1 1,abbotsford,3067,2025,...,'sophia mundi steiner school' 'private' 'combi...,0.593750,vic,'name' 'internal laundry' 'category' 'indoor' ...,,2025-09-06t225353917,httpswwwdomaincomauabbotsford-vic-3067-17750349,POINT (-37.80210950000001 145.0019064),630.0,POINT (-37.80211 145.00191)
3,17739910,2,,1.0,townhouse,,2 1 1,abbotsford,3067,2025,...,'abbotsford primary school' 'government' 'prim...,0.550781,vic,'name' 'internal laundry' 'category' 'indoor' ...,4,2025-09-11t170052007,httpswwwdomaincomau4-2-princes-street-abbotsfo...,POINT (-37.8092053 144.999856),650.0,POINT (-37.80921 144.99986)
4,17751219,2,,1.0,house,,2 1,abbotsford,3067,2025,...,'abbotsford primary school' 'government' 'prim...,0.561508,vic,'name' 'gas' 'category' 'indoor' 'source' 'sug...,,2025-09-11t092517693,httpswwwdomaincomau45-park-st-abbotsford-vic-3...,POINT (-37.8080424 144.9939399),750.0,POINT (-37.80804 144.99394)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,17745979,3,,2.0,house,,3 2 1,airport west,3042,2025,...,'niddrie primary school' 'government' 'primary...,0.511022,vic,'name' 'heating' 'category' 'indoor' 'source' ...,,2025-09-11t13552365,httpswwwdomaincomau34a-walters-ave-airport-wes...,POINT (-37.73188030000001 144.8864192),650.0,POINT (-37.73188 144.88642)
96,17748058,2,,1.0,apartment unit flat,,2 1 3,airport west,3042,2025,...,'penleigh essendon grammar school' 'private' ...,0.405093,vic,'name' 'internal laundry' 'category' 'indoor' ...,1,2025-09-11t17422768,httpswwwdomaincomau1-298-parer-road-airport-we...,POINT (-37.7245868 144.8693738),440.0,POINT (-37.72459 144.86937)
97,17748150,3,,1.0,house,,3 1 4,airport west,3042,2025,...,st christopher's school 'catholic' 'primary' 3...,0.351906,vic,'name' 'heating' 'category' 'indoor' 'source' ...,,2025-09-11t122157177,httpswwwdomaincomau11-hillside-grove-airport-w...,POINT (-37.726887 144.8767379),550.0,POINT (-37.72689 144.87674)
98,17213252,3,,2.0,townhouse,,3 2 2,airport west,3042,2025,...,st christopher's school 'catholic' 'primary' 5...,0.461053,vic,'name' 'internal laundry' 'category' 'indoor' ...,2,2025-09-09t131838637,httpswwwdomaincomau2-106-bowes-avenue-airport-...,POINT (-37.7256072 144.8861927),710.0,POINT (-37.72561 144.88619)


In [None]:
import time, requests

def overpass_post(query, retries=5, pause=10):
    url = "https://overpass-api.de/api/interpreter"
    for _ in range(retries):
        resp = requests.post(url, data=query)
        if resp.status_code in (429, 502, 504):
            retry_after = int(resp.headers.get("Retry-After", pause))
            time.sleep(retry_after)
            continue
        resp.raise_for_status()
        return resp
    raise RuntimeError("Overpass throttled after retries")

# Show counts of each amenity type and min distance for each rental property and create a new df with this info
rows = []
for _, rental in cleaned_listings_gdf.iterrows():
    lat1, lon1 = rental.geometry.x, rental.geometry.y
    dist = 2000                  # metres
    tags = "theatre|cafe|nightclub|kindergarten|doctors|fuel|bank|library|cinema|restaurant|atm|bar|fast_food|pharmacy|veterinary|taxi|brothel|university|police|events_venue|college|car_rental|clinic|community_centre|courier|food_court|social_facility|parking_space|hospital|waste_disposal|parcel_locker|charging_station|coworking_space|meeting_point|motorcycle_parking|childcare|social_centre|music_venue|healthcare|waste_transfer_station|casino|fire_station|student_accommodation|retail|prison|nursing_home|events_centre|exhibition_centre|conference_centre|biergarten|bus_station"
    query = f"""
    [out:json][timeout:25];
    nwr["amenity"~"{tags}"](around:{dist},{lat1},{lon1});
    out center;
    """

    url = "https://overpass-api.de/api/interpreter"
    resp = overpass_post(query)
    data = resp.json()

    
    for element in data["elements"]:
        
        props = element.get("tags", {})
        name = props.get("name", "Unnamed")
        lat = element.get("lat") or element["center"]["lat"]
        lon = element.get("lon") or element["center"]["lon"]
        distance_m = geodesic((lat1, lon1), (lat, lon)).meters
        rows.append({"PropertyID": rental["property_id"],
                    "name": name, 
                    "amenity": props.get("amenity"),
                    "geometry": Point(lon, lat),
                    "distance_m": distance_m})  # Calculate distance from the reference point

pois_gdf = gpd.GeoDataFrame(rows, crs="EPSG:4326")
pois_gdf.head()

In [None]:
dist = 2000                  # metres
tags = "theatre|cafe|nightclub|kindergarten|doctors|fuel|bank|library|cinema|restaurant|atm|bar|fast_food|pharmacy|veterinary|taxi|brothel|university|police|events_venue|college|car_rental|clinic|community_centre|courier|food_court|social_facility|parking_space|hospital|waste_disposal|parcel_locker|charging_station|coworking_space|meeting_point|motorcycle_parking|childcare|social_centre|music_venue|healthcare|waste_transfer_station|casino|fire_station|student_accommodation|retail|prison|nursing_home|events_centre|exhibition_centre|conference_centre|biergarten|bus_station"

from requests.exceptions import RequestException

def overpass_post(query, retries=5, pause=10):
    url = "https://overpass.openstreetmap.ru/api/interpreter"
    backoff = pause
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.post(url, data=query, timeout=23)
            if resp.status_code in (429, 502, 504):
                retry_after = int(resp.headers.get("Retry-After", backoff))
                time.sleep(retry_after)
                backoff = min(backoff * 2, 60)
                continue
            resp.raise_for_status()
            return resp
        except RequestException as err:
            last_error = err
            time.sleep(backoff)
            backoff = min(backoff * 2, 60)
    raise RuntimeError(f"Overpass request failed after {retries} retries: {last_error}")

In [136]:
def collect_pois(cleaned_listings_gdf, dist=2000, tags=tags, retries=5, processed_ids=None):
    processed_ids = set() if processed_ids is None else set(processed_ids)

    rows = []
    last_error = None

    for _, rental in cleaned_listings_gdf.iterrows():
        pid = rental["property_id"]
        if pid in processed_ids:
            continue

        lat1, lon1 = rental.geometry.x, rental.geometry.y
        query = f"""
        [out:json][timeout:25];
        nwr["amenity"~"{tags}"](around:{dist},{lat1},{lon1});
        out center;
        """
        try:
            resp = overpass_post(query, retries=retries)
        except RuntimeError as err:
            last_error = err
            break

        for element in resp.json().get("elements", []):
            props = element.get("tags", {})
            lat = element.get("lat") or element["center"]["lat"]
            lon = element.get("lon") or element["center"]["lon"]
            rows.append({
                "PropertyID": pid,
                "name": props.get("name", "Unnamed"),
                "amenity": props.get("amenity"),
                "geometry": Point(lon, lat),
                "distance_m": geodesic((lat1, lon1), (lat, lon)).meters,
            })

        processed_ids.add(pid)

    pois_gdf = gpd.GeoDataFrame(rows, crs="EPSG:4326")
    if last_error:
        print(f"Stopped early: {last_error}")
    return pois_gdf, processed_ids


**DO NOT RUN THIS @JACK**

**First Run when processed_ids is empty**

In [None]:
pois_gdf, processed_ids = collect_pois(cleaned_listings_gdf, dist=2000, tags=tags, retries=5)
pois_gdf.head()

Stopped early: Overpass throttled after retries


Unnamed: 0,PropertyID,name,amenity,geometry,distance_m
0,17732837,Rubber Duck Cafe,cafe,POINT (144.99063 -37.78908),993.933704
1,17732837,7-Eleven Fuel,fuel,POINT (144.97531 -37.79773),1838.475429
2,17732837,Boulevard Restaurant,restaurant,POINT (145.01076 -37.80454),1541.099192
3,17732837,Metro Petroleum,fuel,POINT (144.97538 -37.79325),1874.448398
4,17732837,Vaud d'vile Drag Cabaret Restaurant,bar,POINT (144.97685 -37.7983),1707.5201


In [101]:
# Show counts of each amenity type for a given PropertyID and min distance for each specific amenity type
agg = pois_gdf.groupby(["PropertyID", "amenity"]).agg(count=("name", "count"), min_distance_m=("distance_m", "min")).reset_index()
count_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="count")
    .add_prefix("count_")
    .fillna(0)
)

dist_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="min_distance_m")
    .add_prefix("min_dist_")
    .fillna(0)
)

property_summary = count_wide.join(dist_wide).reset_index()
property_summary


amenity,PropertyID,count_atm,count_bank,count_bar,count_biergarten,count_brothel,count_bus_station,count_bus_station;shelter,count_cafe,count_cafe;bar,...,min_dist_social_facility,min_dist_student_accommodation,min_dist_taxi,min_dist_theatre,min_dist_tool_library,min_dist_toy_library,min_dist_university,min_dist_veterinary,min_dist_waste_disposal,min_dist_waste_transfer_station
0,5470976,2.0,5.0,9.0,0.0,0.0,0.0,0.0,41.0,0.0,...,1358.782002,0.000000,1321.611949,1387.003182,0.000000,0.0,0.000000,566.733206,1467.554757,2008.773587
1,8074858,0.0,0.0,6.0,0.0,0.0,0.0,0.0,13.0,0.0,...,935.877569,0.000000,0.000000,1626.794111,0.000000,0.0,484.827760,0.000000,0.000000,0.000000
2,8253224,3.0,6.0,7.0,0.0,0.0,1.0,0.0,22.0,0.0,...,726.902504,0.000000,0.000000,417.430931,0.000000,0.0,142.865012,0.000000,0.000000,0.000000
3,8430052,8.0,3.0,1.0,0.0,0.0,0.0,0.0,38.0,0.0,...,1066.050945,0.000000,317.686770,0.000000,0.000000,0.0,129.557669,943.677245,1552.321112,0.000000
4,8431355,62.0,46.0,169.0,0.0,1.0,1.0,0.0,516.0,0.0,...,436.062378,804.487999,445.875778,507.685229,0.000000,0.0,427.287971,704.056355,590.614414,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,17758698,4.0,7.0,11.0,0.0,0.0,0.0,0.0,77.0,0.0,...,946.387572,0.000000,0.000000,1603.364131,0.000000,0.0,0.000000,277.085211,0.000000,0.000000
6593,17758700,5.0,9.0,2.0,0.0,0.0,2.0,0.0,32.0,0.0,...,1366.645361,0.000000,1351.254851,1388.164912,0.000000,0.0,1290.353417,428.532188,0.000000,0.000000
6594,17758717,14.0,3.0,41.0,0.0,0.0,0.0,0.0,115.0,0.0,...,296.256232,0.000000,0.000000,0.000000,1506.250226,0.0,692.671774,352.204723,0.000000,1803.072196
6595,17758719,3.0,11.0,15.0,0.0,1.0,1.0,0.0,61.0,0.0,...,632.465555,0.000000,1216.320543,1256.120228,0.000000,0.0,1267.076747,1622.980291,995.663541,0.000000


In [None]:
# Save the property_summary to a CSV file
property_summary.to_csv('../data/curated/rent_features/property_summary.csv', index=False)

**RUN AGAIN FROM HERE**

**Subsequent Runs**

In [111]:
# take PropertyID from property summary
property_summary = pd.read_csv('../data/curated/rent_features/property_summary.csv', low_memory=False)
processed_ids = set(property_summary["PropertyID"].unique())
len(processed_ids)

6597

In [112]:
pois_gdf, processed_ids = collect_pois(cleaned_listings_gdf, dist=2000, tags=tags, retries=5)

ConnectionError: ('Connection aborted.', TimeoutError(60, 'Operation timed out'))

In [113]:
# Show counts of each amenity type for a given PropertyID and min distance for each specific amenity type
agg = pois_gdf.groupby(["PropertyID", "amenity"]).agg(count=("name", "count"), min_distance_m=("distance_m", "min")).reset_index()
count_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="count")
    .add_prefix("count_")
    .fillna(0)
)

dist_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="min_distance_m")
    .add_prefix("min_dist_")
    .fillna(0)
)

property_summary = count_wide.join(dist_wide).reset_index()
property_summary


amenity,PropertyID,count_atm,count_bank,count_bar,count_biergarten,count_brothel,count_bus_station,count_bus_station;shelter,count_cafe,count_cafe;bar,...,min_dist_social_facility,min_dist_student_accommodation,min_dist_taxi,min_dist_theatre,min_dist_tool_library,min_dist_toy_library,min_dist_university,min_dist_veterinary,min_dist_waste_disposal,min_dist_waste_transfer_station
0,5470976,2.0,5.0,9.0,0.0,0.0,0.0,0.0,41.0,0.0,...,1358.782002,0.000000,1321.611949,1387.003182,0.000000,0.0,0.000000,566.733206,1467.554757,2008.773587
1,8074858,0.0,0.0,6.0,0.0,0.0,0.0,0.0,13.0,0.0,...,935.877569,0.000000,0.000000,1626.794111,0.000000,0.0,484.827760,0.000000,0.000000,0.000000
2,8253224,3.0,6.0,7.0,0.0,0.0,1.0,0.0,22.0,0.0,...,726.902504,0.000000,0.000000,417.430931,0.000000,0.0,142.865012,0.000000,0.000000,0.000000
3,8430052,8.0,3.0,1.0,0.0,0.0,0.0,0.0,38.0,0.0,...,1066.050945,0.000000,317.686770,0.000000,0.000000,0.0,129.557669,943.677245,1552.321112,0.000000
4,8431355,62.0,46.0,169.0,0.0,1.0,1.0,0.0,516.0,0.0,...,436.062378,804.487999,445.875778,507.685229,0.000000,0.0,427.287971,704.056355,590.614414,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,17758698,4.0,7.0,11.0,0.0,0.0,0.0,0.0,77.0,0.0,...,946.387572,0.000000,0.000000,1603.364131,0.000000,0.0,0.000000,277.085211,0.000000,0.000000
6593,17758700,5.0,9.0,2.0,0.0,0.0,2.0,0.0,32.0,0.0,...,1366.645361,0.000000,1351.254851,1388.164912,0.000000,0.0,1290.353417,428.532188,0.000000,0.000000
6594,17758717,14.0,3.0,41.0,0.0,0.0,0.0,0.0,115.0,0.0,...,296.256232,0.000000,0.000000,0.000000,1506.250226,0.0,692.671774,352.204723,0.000000,1803.072196
6595,17758719,3.0,11.0,15.0,0.0,1.0,1.0,0.0,61.0,0.0,...,632.465555,0.000000,1216.320543,1256.120228,0.000000,0.0,1267.076747,1622.980291,995.663541,0.000000


In [114]:
# Deal with existing summary file if it exists
from pathlib import Path

summary_path = Path("../data/curated/rent_features/property_summary.csv")

if summary_path.exists():
    existing_summary = pd.read_csv(summary_path)
    property_summary = (
        pd.concat([existing_summary, property_summary], ignore_index=True)
        .drop_duplicates(subset=["PropertyID"])
    )

property_summary.to_csv(summary_path, index=False)

In [None]:
test_gdf, _ = collect_pois(cleaned_listings_gdf.tail(1), dist=2000, tags=tags, retries=5, processed_ids=processed_ids)
test_gdf