In [13]:
# Import necessary libraries
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
import pandas as pd
from shapely import wkt
from geopy.distance import geodesic
import requests
import time
from requests.exceptions import RequestException

**Read in as ascending=TRUE**

In [14]:
# read csv in as a geopandas dataframe
cleaned_listings = pd.read_csv("../data/curated/rent_features/cleaned_listings_sampled.csv", low_memory=False)
cleaned_listings["geometry"] = cleaned_listings["coordinates"].apply(wkt.loads)
cleaned_listings_gdf = gpd.GeoDataFrame(cleaned_listings, geometry="geometry", crs="EPSG:4326")
# sort on ascending property_id
cleaned_listings_gdf = cleaned_listings_gdf.sort_values("property_id", ascending=True)

In [15]:
dist = 2000                  # metres
tags = "theatre|cafe|nightclub|kindergarten|doctors|fuel|bank|library|cinema|restaurant|atm|bar|fast_food|pharmacy|veterinary|taxi|brothel|university|police|events_venue|college|car_rental|clinic|community_centre|courier|food_court|social_facility|parking_space|hospital|waste_disposal|parcel_locker|charging_station|coworking_space|meeting_point|motorcycle_parking|childcare|social_centre|music_venue|healthcare|waste_transfer_station|casino|fire_station|student_accommodation|retail|prison|nursing_home|events_centre|exhibition_centre|conference_centre|biergarten|bus_station"

def overpass_post(query, session=None, retries=5, pause=10):
    url = "https://overpass-api.de/api/interpreter"
    backoff = pause
    last_error = None
    
    for attempt in range(1, retries + 1):
        try:
            resp = session.post(url, data=query, timeout=23)
            if resp.status_code in (429, 502, 504):
                retry_after = int(resp.headers.get("Retry-After", backoff))
                time.sleep(retry_after)
                backoff = min(backoff * 2, 60)
                continue
            resp.raise_for_status()
            return resp
        except RequestException as err:
            last_error = err
            time.sleep(backoff)
            backoff = min(backoff * 2, 60)
    raise RuntimeError(f"Overpass request failed after {retries} retries: {last_error}")

In [16]:
def collect_pois(cleaned_listings_gdf, tags, dist=2000, retries=5, processed_ids=None):
    processed_ids = set() if processed_ids is None else set(processed_ids)

    rows = []
    last_error = None

    with requests.Session() as session:
        session.headers.update({"User-Agent": "project-poi-fetcher/1.0 (email@example.com)"})
        for _, rental in cleaned_listings_gdf.iterrows():
            pid = rental["property_id"]
            if pid in processed_ids:
                continue

            lat1 = rental.geometry.y
            lon1 = rental.geometry.x
            
            query = f"""
            [out:json][timeout:25];
            nwr["amenity"~"{tags}"](around:{dist},{lat1},{lon1});
            out center;
            """
            try:
                resp = overpass_post(query, session=session, retries=retries)
            except RuntimeError as err:
                last_error = err
                break

            for element in resp.json().get("elements", []):
                props = element.get("tags", {})
                lat = element.get("lat") or element["center"]["lat"]
                lon = element.get("lon") or element["center"]["lon"]
                rows.append({
                    "PropertyID": pid,
                    "name": props.get("name", "Unnamed"),
                    "amenity": props.get("amenity"),
                    "geometry": Point(lon, lat),
                    "distance_m": geodesic((lat1, lon1), (lat, lon)).meters,
                })

            processed_ids.add(pid)

    if not rows: # if no rows were found, create an empty GeoDataFrame with the correct columns
        cols = ["PropertyID", "name", "amenity", "geometry", "distance_m"]
        pois_gdf = gpd.GeoDataFrame(columns=cols, geometry="geometry", crs="EPSG:4326")
    else:
        pois_gdf = gpd.GeoDataFrame(rows, geometry="geometry", crs="EPSG:4326")
    
    if last_error:
        print(f"Stopped early: {last_error}")
        
    return pois_gdf, processed_ids


**DO NOT RUN THIS @JACK**

**First Run when processed_ids is empty**

In [17]:
pois_gdf, processed_ids = collect_pois(cleaned_listings_gdf, tags=tags, dist=3000, retries=5)
pois_gdf.head()

Stopped early: Overpass request failed after 5 retries: 400 Client Error: Bad Request for url: https://overpass-api.de/api/interpreter


Unnamed: 0,PropertyID,name,amenity,geometry,distance_m
0,5215310,Racecourse Hotel,restaurant,POINT (145.04549 -37.8759),2285.699982
1,5215310,McDonald's,fast_food,POINT (145.0025 -37.84822),2603.431679
2,5215310,Bistro Thierry,restaurant,POINT (145.00367 -37.84799),2548.383496
3,5215310,Chemist Warehouse,pharmacy,POINT (145.0039 -37.84801),2532.112119
4,5215310,High Street Early Learning Centre,kindergarten,POINT (144.99963 -37.85248),2516.29989


In [18]:
# Show counts of each amenity type for a given PropertyID and min distance for each specific amenity type
agg = pois_gdf.groupby(["PropertyID", "amenity"]).agg(count=("name", "count"), min_distance_m=("distance_m", "min")).reset_index()
count_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="count")
    .add_prefix("count_")
    .fillna(0)
)

dist_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="min_distance_m")
    .add_prefix("min_dist_")
    .fillna(0)
)

property_summary = count_wide.join(dist_wide).reset_index()
property_summary


amenity,PropertyID,count_atm,count_bank,count_bar,count_cafe,count_charging_station,count_childcare,count_cinema,count_clinic,count_community_centre,...,min_dist_parking_space,min_dist_pharmacy,min_dist_police,min_dist_restaurant,min_dist_social_facility,min_dist_taxi,min_dist_theatre,min_dist_university,min_dist_veterinary,min_dist_waste_transfer_station
0,5215310,12,14,11,72,8,5,2,12,2,...,455.993611,1226.895333,1062.324174,436.192073,358.292735,2171.878865,2795.292713,2333.917746,2710.876187,2405.789616


In [19]:
# Save the property_summary to a CSV file
property_summary.to_csv('../data/curated/rent_features/property_summary.csv', index=False)

**RUN AGAIN FROM HERE**

**Subsequent Runs**

**Code for descending=TRUE**

In [None]:
# reverse the order of cleaned_listings_gdf to descending
cleaned_listings_gdf = cleaned_listings_gdf.sort_values("property_id", ascending=False)

In [None]:
pois_gdf, processed_ids = collect_pois(cleaned_listings_gdf, tags=tags, dist=3000, retries=5)

In [None]:
# Show counts of each amenity type for a given PropertyID and min distance for each specific amenity type
agg = pois_gdf.groupby(["PropertyID", "amenity"]).agg(count=("name", "count"), min_distance_m=("distance_m", "min")).reset_index()
count_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="count")
    .add_prefix("count_")
    .fillna(0)
)

dist_wide = (
    agg.pivot(index="PropertyID", columns="amenity", values="min_distance_m")
    .add_prefix("min_dist_")
    .fillna(0)
)

property_summary = count_wide.join(dist_wide).reset_index()
property_summary


amenity,PropertyID,count_atm,count_bank,count_bar,count_biergarten,count_brothel,count_bus_station,count_bus_station;shelter,count_cafe,count_cafe;bar,...,min_dist_social_facility,min_dist_student_accommodation,min_dist_taxi,min_dist_theatre,min_dist_tool_library,min_dist_toy_library,min_dist_university,min_dist_veterinary,min_dist_waste_disposal,min_dist_waste_transfer_station
0,5470976,2.0,5.0,9.0,0.0,0.0,0.0,0.0,41.0,0.0,...,1358.782002,0.000000,1321.611949,1387.003182,0.000000,0.0,0.000000,566.733206,1467.554757,2008.773587
1,8074858,0.0,0.0,6.0,0.0,0.0,0.0,0.0,13.0,0.0,...,935.877569,0.000000,0.000000,1626.794111,0.000000,0.0,484.827760,0.000000,0.000000,0.000000
2,8253224,3.0,6.0,7.0,0.0,0.0,1.0,0.0,22.0,0.0,...,726.902504,0.000000,0.000000,417.430931,0.000000,0.0,142.865012,0.000000,0.000000,0.000000
3,8430052,8.0,3.0,1.0,0.0,0.0,0.0,0.0,38.0,0.0,...,1066.050945,0.000000,317.686770,0.000000,0.000000,0.0,129.557669,943.677245,1552.321112,0.000000
4,8431355,62.0,46.0,169.0,0.0,1.0,1.0,0.0,516.0,0.0,...,436.062378,804.487999,445.875778,507.685229,0.000000,0.0,427.287971,704.056355,590.614414,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6592,17758698,4.0,7.0,11.0,0.0,0.0,0.0,0.0,77.0,0.0,...,946.387572,0.000000,0.000000,1603.364131,0.000000,0.0,0.000000,277.085211,0.000000,0.000000
6593,17758700,5.0,9.0,2.0,0.0,0.0,2.0,0.0,32.0,0.0,...,1366.645361,0.000000,1351.254851,1388.164912,0.000000,0.0,1290.353417,428.532188,0.000000,0.000000
6594,17758717,14.0,3.0,41.0,0.0,0.0,0.0,0.0,115.0,0.0,...,296.256232,0.000000,0.000000,0.000000,1506.250226,0.0,692.671774,352.204723,0.000000,1803.072196
6595,17758719,3.0,11.0,15.0,0.0,1.0,1.0,0.0,61.0,0.0,...,632.465555,0.000000,1216.320543,1256.120228,0.000000,0.0,1267.076747,1622.980291,995.663541,0.000000


In [None]:
# Save the property_summary to a CSV file
property_summary.to_csv('../data/curated/rent_features/property_summary_reversed.csv', index=False)

**Ignore this @jack**

In [None]:
# Deal with existing summary file if it exists
from pathlib import Path

summary_path = Path("../data/curated/rent_features/property_summary.csv")

if summary_path.exists():
    existing_summary = pd.read_csv(summary_path)
    property_summary = (
        pd.concat([existing_summary, property_summary], ignore_index=True)
        .drop_duplicates(subset=["PropertyID"])
    )

property_summary.to_csv(summary_path, index=False)