First subsample (stratified) in case we don't have enough time to make enough requests.

In [88]:
# Stratified sampling script for rental listings data
# Performs stratified sampling by suburb, year, and quarter to reduce data to 50%

import pandas as pd
import numpy as np

# Load the data
print("Loading data...")
df = pd.read_csv('../data/curated/rent_features/cleaned_listings.csv')
print(f"Original dataset shape: {df.shape}")
print(f"Original dataset size: {len(df):,} records")

# Display data distribution before sampling
print("\nData distribution before sampling:")
print("Year distribution:")
print(df['year'].value_counts().sort_index())
print("\nQuarter distribution:")
print(df['quarter'].value_counts().sort_index())
print(f"\nUnique suburbs: {df['suburb'].nunique()}")
print(f"Unique year-quarter combinations: {df[['year', 'quarter']].drop_duplicates().shape[0]}")

# Create stratification groups
print("\nCreating stratification groups...")
df['strata'] = df['suburb'].astype(str) + '_' + df['year'].astype(str) + '_' + df['quarter'].astype(str)
strata_counts = df['strata'].value_counts()
print(f"Number of strata: {len(strata_counts)}")
print(f"Strata size distribution:")
print(strata_counts.describe())

# Perform stratified sampling
print("\nPerforming stratified sampling...")
sampled_dfs = []

for stratum in df['strata'].unique():
    stratum_data = df[df['strata'] == stratum]
    
    # Calculate sample size (50% of stratum)
    sample_size = max(1, len(stratum_data) // 2)  # Ensure at least 1 record per stratum
    
    # If stratum has only 1 record, keep it
    if len(stratum_data) == 1:
        sampled_dfs.append(stratum_data)
    else:
        # Random sampling within stratum
        sampled_stratum = stratum_data.sample(n=sample_size, random_state=42)
        sampled_dfs.append(sampled_stratum)

# Combine all sampled strata
sampled_df = pd.concat(sampled_dfs, ignore_index=True)

# Remove the temporary strata column
sampled_df = sampled_df.drop('strata', axis=1)

print(f"\nSampled dataset shape: {sampled_df.shape}")
print(f"Sampled dataset size: {len(sampled_df):,} records")
print(f"Reduction: {((len(df) - len(sampled_df)) / len(df) * 100):.1f}%")

# Display data distribution after sampling
print("\nData distribution after sampling:")
print("Year distribution:")
print(sampled_df['year'].value_counts().sort_index())
print("\nQuarter distribution:")
print(sampled_df['quarter'].value_counts().sort_index())
print(f"\nUnique suburbs: {sampled_df['suburb'].nunique()}")

# Verify stratification is maintained
print("\nVerifying stratification:")
strata_verification = sampled_df.groupby(['suburb', 'year', 'quarter']).size()
print(f"Strata with data after sampling: {len(strata_verification)}")
print("Sample of strata sizes:")
print(strata_verification.head(10))


# Create a new column for isochrones and fill with NaN, we expect 6 columns (3 for driving, 3 for walking with fixed time of 5, 10, 15 minutes respectively)
sampled_df[['driving_5min', 'driving_10min', 'driving_15min', 'walking_5min', 'walking_10min', 'walking_15min']] = np.nan

# Save the sampled data
output_path = '../data/curated/rent_features/cleaned_listings_sampled.csv'
sampled_df.to_csv(output_path, index=False)
print(f"\nSampled data saved to: {output_path}")

# Summary statistics
print("\nSummary:")
print(f"Original records: {len(df):,}")
print(f"Sampled records: {len(sampled_df):,}")
print(f"Sampling ratio: {len(sampled_df)/len(df):.2%}")
print(f"Records removed: {len(df) - len(sampled_df):,}")


Loading data...


  df = pd.read_csv('../data/curated/rent_features/cleaned_listings.csv')


Original dataset shape: (25687, 40)
Original dataset size: 25,687 records

Data distribution before sampling:
Year distribution:
year
2022     2620
2023      340
2024     4065
2025    18662
Name: count, dtype: int64

Quarter distribution:
quarter
3      3357
6      5937
9     15208
12     1185
Name: count, dtype: int64

Unique suburbs: 988
Unique year-quarter combinations: 15

Creating stratification groups...
Number of strata: 2721
Strata size distribution:
count    2721.000000
mean        9.440279
std        24.438703
min         1.000000
25%         1.000000
50%         3.000000
75%        11.000000
max       846.000000
Name: count, dtype: float64

Performing stratified sampling...

Sampled dataset shape: (12882, 40)
Sampled dataset size: 12,882 records
Reduction: 49.9%

Data distribution after sampling:
Year distribution:
year
2022    1295
2023     175
2024    2090
2025    9322
Name: count, dtype: int64

Quarter distribution:
quarter
3     1694
6     2977
9     7596
12     615
Name

Because the ORS isochrone is limited to 500 requests. We will rerun this notebook and update the `curated/rent_features/cleaned_listings.csv` as we impute. 

In [126]:
# First read in the subsampled data
df = pd.read_csv('../data/curated/rent_features/cleaned_listings_sampled.csv')



df[['property_id', 'coordinates','driving_5min', 'driving_10min', 'driving_15min', 'walking_5min', 'walking_10min', 'walking_15min']].to_csv('../data/raw/missing_isochrones.csv', index=False)


# Running API request below

In [127]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import geopandas as gpd
import os 


# Import the GeoUtils class from utils/geo.py
from utils.geo import GeoUtils
from dotenv import load_dotenv

load_dotenv()
APIKEY1 = os.getenv('ORS_API_KEY1')
APIKEY2 = os.getenv('ORS_API_KEY2')
APIKEY3 = os.getenv('ORS_API_KEY3')
APIKEY4 = os.getenv('ORS_API_KEY4')
APIKEY5 = os.getenv('ORS_API_KEY5')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [131]:
# read in the missing isochrones using geopandas
gdf = gpd.read_file('../data/raw/missing_isochrones.csv') 

# convert property_id to int
gdf['property_id'] = gdf['property_id'].astype(int)

# get the first 500 
gdf_tmp = gdf.head(500) 

In [132]:
# Convert coordinates to Point objects from shapely
from shapely.geometry import Point
from shapely import wkt

# Check the format of the coordinates column
sample_coords = gdf_tmp['coordinates'].iloc[0]
print(f"Sample coordinate format: {sample_coords}")

# Convert WKT POINT strings to Point objects
if isinstance(sample_coords, str) and sample_coords.startswith('POINT'):
    # Handle WKT POINT format: "POINT (lon lat)"
    gdf_tmp['geometry'] = gdf_tmp['coordinates'].apply(
        lambda coord: wkt.loads(coord)
    )
    print("Successfully converted WKT POINT strings to Point objects")
else:
    print("Unexpected coordinate format. Expected WKT POINT format.")

# Verify the conversion worked
print(f"\nCreated geometry column with {gdf_tmp['geometry'].notna().sum()} valid points")
print("Sample geometry objects:")
print(gdf_tmp['geometry'].head())
print(f"\nGeometry type: {type(gdf_tmp['geometry'].iloc[0])}")


Sample coordinate format: POINT (-37.8058235 144.9940691)
Successfully converted WKT POINT strings to Point objects

Created geometry column with 500 valid points
Sample geometry objects:
0    POINT (-37.8058235 144.9940691)
1     POINT (-37.8111571 145.008908)
2    POINT (-37.8041915 144.9956615)
3     POINT (-37.8111571 145.008908)
4    POINT (-37.8109899 145.0067059)
Name: geometry, dtype: object

Geometry type: <class 'shapely.geometry.point.Point'>


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf_tmp['geometry'] = gdf_tmp['coordinates'].apply(


In [135]:
# Initialize GeoUtils 
geoutils = GeoUtils(ors_api_key=APIKEY1)

# Call API to get isochrones
driving_result = gdf_tmp['geometry'].apply(lambda x: geoutils.get_isochrone_with_delay(profile="driving-car", range_values=[300, 600, 900], coordinate=x))
gdf_tmp.loc[:, ['driving_5min', 'driving_10min', 'driving_15min'] ] = pd.DataFrame(driving_result.tolist(), columns=['driving_5min', 'driving_10min', 'driving_15min']).values

OpenRouteService client initialized successfully.
Getting driving-car isochrone for coordinate: (144.9940691, -37.8058235)
{'type': 'FeatureCollection', 'bbox': [144.93386, -37.859372, 145.118402, -37.747524], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [144.99389447507681, -37.80580526792242]}, 'geometry': {'coordinates': [[[144.976522, -37.810288], [144.978183, -37.811026], [144.984651, -37.815904], [144.985887, -37.818562], [144.987568, -37.821397], [144.989112, -37.823206], [144.989171, -37.823213], [144.994326, -37.821505], [145.000247, -37.82041], [145.008361, -37.81811], [145.010052, -37.817995], [145.011985, -37.816807], [145.014808, -37.813881], [145.014919, -37.812973], [145.01523, -37.810306], [145.012953, -37.802426], [145.010214, -37.800089], [145.00282, -37.795083], [145.002688, -37.794636], [145.002304, -37.792999], [144.995731, -37.790734], [144.995667, -37.790726], [144.988999, -37.790029], [144.988801, -37.790008], [144.



{'type': 'FeatureCollection', 'bbox': [144.930151, -37.861752, 145.101758, -37.756532], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [144.9940825497732, -37.808055937393306]}, 'geometry': {'coordinates': [[[144.97256, -37.807922], [144.973631, -37.809957], [144.974817, -37.811314], [144.981315, -37.817388], [144.987148, -37.824122], [144.989751, -37.825909], [144.989801, -37.825882], [144.995527, -37.823596], [144.998933, -37.823277], [145.000321, -37.822427], [145.008146, -37.821223], [145.008791, -37.821297], [145.01265, -37.818359], [145.019838, -37.814493], [145.021069, -37.812857], [145.021072, -37.812207], [145.021061, -37.812145], [145.020307, -37.810924], [145.019521, -37.81082], [145.016292, -37.810432], [145.012911, -37.810234], [145.007639, -37.807788], [145.005042, -37.800754], [145.004844, -37.79992], [145.004807, -37.799848], [145.001784, -37.797748], [144.994194, -37.79523], [144.992018, -37.795246], [144.988431, -37.797571]



{'type': 'FeatureCollection', 'bbox': [144.936753, -37.85128, 145.115972, -37.747562], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [145.00202256680123, -37.79975692487193]}, 'geometry': {'coordinates': [[[144.981586, -37.800868], [144.981967, -37.801756], [144.983561, -37.803797], [144.989614, -37.809844], [144.991536, -37.811707], [144.992982, -37.812921], [144.993227, -37.812946], [144.993985, -37.813016], [145.000319, -37.813113], [145.002075, -37.812511], [145.002764, -37.811158], [145.003463, -37.80994], [145.01049, -37.808331], [145.012763, -37.811022], [145.013162, -37.811065], [145.021072, -37.808515], [145.021951, -37.80734], [145.021981, -37.807276], [145.022054, -37.807088], [145.023787, -37.802629], [145.023481, -37.802074], [145.02279, -37.801788], [145.014251, -37.799905], [145.013852, -37.799654], [145.012954, -37.79915], [145.012304, -37.798867], [145.011891, -37.798805], [145.010726, -37.798649], [145.009846, -37.798793],



{'type': 'FeatureCollection', 'bbox': [144.572954, -37.972161, 144.909232, -37.756509], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [144.77810530284873, -37.86740979334115]}, 'geometry': {'coordinates': [[[144.707821, -37.883171], [144.707724, -37.883536], [144.711203, -37.884461], [144.715759, -37.881868], [144.716499, -37.881482], [144.719919, -37.879789], [144.72106, -37.879396], [144.726439, -37.878042], [144.733482, -37.879628], [144.735397, -37.881108], [144.735852, -37.881358], [144.736701, -37.880999], [144.745224, -37.881338], [144.746814, -37.882856], [144.747527, -37.882938], [144.747686, -37.882954], [144.756045, -37.885983], [144.756047, -37.885984], [144.758308, -37.884646], [144.758509, -37.884498], [144.76015, -37.883303], [144.76412, -37.879498], [144.770492, -37.881137], [144.772183, -37.882692], [144.773817, -37.885916], [144.775485, -37.887413], [144.776007, -37.887467], [144.784357, -37.889199], [144.78646, -37.889195

In [148]:
geoutils = GeoUtils(ors_api_key=APIKEY2)

walking_result = gdf_tmp['geometry'].apply(lambda x: geoutils.get_isochrone_with_delay(profile="foot-walking", range_values=[300, 600, 900], coordinate=x))
gdf_tmp.loc[:, ['walking_5min', 'walking_10min', 'walking_15min'] ] = pd.DataFrame(walking_result.tolist(), columns=['walking_5min', 'walking_10min', 'walking_15min']).values

OpenRouteService client initialized successfully.
Getting foot-walking isochrone for coordinate: (144.9940691, -37.8058235)
{'type': 'FeatureCollection', 'bbox': [144.981465, -37.81559, 145.007754, -37.795778], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [144.99394897334315, -37.805811080656916]}, 'geometry': {'coordinates': [[[144.990015, -37.804929], [144.990847, -37.807159], [144.991267, -37.807705], [144.991691, -37.808006], [144.993658, -37.809203], [144.994013, -37.809261], [144.994092, -37.809232], [144.995946, -37.808103], [144.996421, -37.807779], [144.996542, -37.807649], [144.99815, -37.805904], [144.998101, -37.805609], [144.99799, -37.805345], [144.996702, -37.804016], [144.995194, -37.802606], [144.994838, -37.802557], [144.993811, -37.802456], [144.99379, -37.802454], [144.991881, -37.803531], [144.991825, -37.803568], [144.990285, -37.804548], [144.990184, -37.80461], [144.990099, -37.804692], [144.990029, -37.804845], [14



{'type': 'FeatureCollection', 'bbox': [144.987075, -37.808467, 145.01199, -37.790744], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [145.00022460993213, -37.799291094776486]}, 'geometry': {'coordinates': [[[144.995656, -37.799313], [144.995726, -37.799399], [144.996154, -37.799838], [144.996695, -37.800384], [144.997166, -37.800817], [144.998057, -37.801587], [144.998446, -37.801901], [145.000388, -37.80227], [145.000743, -37.802327], [145.001206, -37.802192], [145.003717, -37.801031], [145.003975, -37.80078], [145.00371, -37.800447], [145.003536, -37.800275], [145.002588, -37.799576], [144.999183, -37.797178], [144.997921, -37.79644], [144.997564, -37.796394], [144.996977, -37.797159], [144.995692, -37.798955], [144.995656, -37.799313]]], 'type': 'Polygon'}}, {'type': 'Feature', 'properties': {'group_index': 0, 'value': 600.0, 'center': [145.00022460993213, -37.799291094776486]}, 'geometry': {'coordinates': [[[144.991882, -37.799466], [14



{'type': 'FeatureCollection', 'bbox': [144.989534, -37.80661, 145.010017, -37.790626], 'features': [{'type': 'Feature', 'properties': {'group_index': 0, 'value': 300.0, 'center': [145.00208129078683, -37.7995310675634]}, 'geometry': {'coordinates': [[[144.997877, -37.799448], [144.999129, -37.800538], [144.999568, -37.800869], [145.004044, -37.802574], [145.004351, -37.802626], [145.005146, -37.802123], [145.005219, -37.802069], [145.005297, -37.801928], [145.005281, -37.801753], [145.005126, -37.801579], [145.003536, -37.800275], [145.003511, -37.800256], [145.002588, -37.799576], [144.999027, -37.79683], [144.998916, -37.796952], [144.998206, -37.798411], [144.99793, -37.799092], [144.997877, -37.799448]]], 'type': 'Polygon'}}, {'type': 'Feature', 'properties': {'group_index': 0, 'value': 600.0, 'center': [145.00208129078683, -37.7995310675634]}, 'geometry': {'coordinates': [[[144.994202, -37.799696], [144.994167, -37.800054], [144.995116, -37.80094], [144.996963, -37.802698], [144.9

In [149]:
# We read in the cleaned_listings_sampled.csv file that needs to be imputed with the API result
df = pd.read_csv('../data/curated/rent_features/cleaned_listings_sampled.csv')

print("Number of missing coordinates before imputation: ", df['driving_5min'].isnull().sum())


# Select only the columns we need from gdf_tmp for the merge
isochrone_cols = ['property_id', 'driving_5min', 'driving_10min', 'driving_15min', 'walking_5min', 'walking_10min', 'walking_15min']
gdf_subset = gdf_tmp[isochrone_cols].copy()

# Perform left join to update df with isochrone data
df_updated = df.merge(gdf_subset, on='property_id', how='left', suffixes=('', '_new'))

# Update the original columns with the new data where it's not null
for col in ['driving_5min', 'driving_10min', 'driving_15min', 'walking_5min', 'walking_10min', 'walking_15min']:
    # Only update where the original value is null and the new value is not null
    mask = df_updated[col].isnull() & df_updated[f'{col}_new'].notnull()
    df_updated.loc[mask, col] = df_updated.loc[mask, f'{col}_new']

# Drop the temporary columns
df_updated = df_updated.drop(columns=[f'{col}_new' for col in ['driving_5min', 'driving_10min', 'driving_15min', 'walking_5min', 'walking_10min', 'walking_15min']])

print("Number of missing coordinates after imputation: ", df_updated['driving_5min'].isnull().sum())

# Update df with the merged results
df = df_updated

# # save the imputed results
# df.to_csv('../data/curated/rent_features/cleaned_listings.csv', index=False)



Number of missing coordinates before imputation:  12882


 <POLYGON ((144.984 -37.809, 144.984 -37.811, 144.988 -37.816, 144.99 -37.818...>
 <POLYGON ((144.979 -37.806, 144.979 -37.806, 144.979 -37.806, 144.979 -37.80...>
 <POLYGON ((144.984 -37.809, 144.984 -37.811, 144.988 -37.816, 144.99 -37.818...>
 <POLYGON ((144.98 -37.809, 144.981 -37.811, 144.984 -37.814, 144.988 -37.819...>
 <POLYGON ((144.973 -37.808, 144.974 -37.81, 144.975 -37.811, 144.978 -37.813...>
 <POLYGON ((144.984 -37.809, 144.984 -37.811, 144.988 -37.817, 144.991 -37.82...>
 <POLYGON ((144.977 -37.798, 144.979 -37.8, 144.981 -37.806, 144.984 -37.811,...>
 <POLYGON ((144.98 -37.809, 144.981 -37.811, 144.985 -37.814, 144.988 -37.818...>
 <POLYGON ((144.979 -37.799, 144.982 -37.803, 144.985 -37.805, 144.99 -37.812...>
 <POLYGON ((144.973 -37.808, 144.974 -37.81, 144.975 -37.811, 144.981 -37.817...>
 <POLYGON ((144.979 -37.799, 144.982 -37.803, 144.983 -37.803, 144.985 -37.80...>
 <POLYGON ((144.982 -37.801, 144.982 -37.802, 144.984 -37.804, 144.99 -37.81,...>
 <POLYGON ((144.

Number of missing coordinates after imputation:  12382


 <POLYGON ((145.004 -37.811, 145.004 -37.811, 145.004 -37.811, 145.007 -37.81...>
 <POLYGON ((144.992 -37.805, 144.992 -37.805, 144.992 -37.805, 144.992 -37.80...>
 <POLYGON ((145.004 -37.811, 145.004 -37.811, 145.004 -37.811, 145.007 -37.81...>
 <POLYGON ((145.002 -37.811, 145.002 -37.811, 145.002 -37.811, 145.004 -37.81...>
 <POLYGON ((144.989 -37.807, 144.991 -37.81, 144.991 -37.81, 144.994 -37.811,...>
 <POLYGON ((145.005 -37.811, 145.005 -37.811, 145.005 -37.811, 145.006 -37.81...>
 <POLYGON ((144.992 -37.801, 144.992 -37.802, 144.994 -37.804, 144.994 -37.80...>
 <POLYGON ((145.003 -37.811, 145.003 -37.811, 145.004 -37.813, 145.005 -37.81...>
 <POLYGON ((144.993 -37.796, 144.993 -37.796, 144.993 -37.798, 144.993 -37.79...>
 <POLYGON ((144.99 -37.807, 144.991 -37.81, 144.991 -37.81, 144.993 -37.811, ...>
 <POLYGON ((144.996 -37.799, 144.996 -37.799, 144.996 -37.8, 144.997 -37.8, 1...>
 <POLYGON ((144.998 -37.799, 144.999 -37.801, 145 -37.801, 145.004 -37.803, 1...>
 <POLYGON ((145.

In [151]:
df.to_csv('../data/curated/rent_features/cleaned_listings_sampled.csv', index=False)