In [6]:
import requests
import random
import geopandas as gpd
from shapely.geometry import Polygon
from datetime import datetime
import os
import re
import json
import urllib3
import pandas as pd
import hashlib


# Suppress urllib3's InsecureRequestWarning. The HTTP calls in this notebook are
# made with verify=False (TLS certificate checking off), which is what this
# warning category reports; the warning is hidden here, the underlying risk is not.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


In [2]:
# Folder for the HCAD sample outputs; make sure it exists before anything is written
output_dir = "OUTPUT/HCAD_SAMPLES"
os.makedirs(name=output_dir, exist_ok=True)

In [3]:
# Base URL for parcel endpoint (the "query" operation of layer 0 of the public_query MapServer).
# NOTE: run_random_sample assigns its own local `url` with the same value, so the
# sampler does not read this module-level name.
url = "https://arcweb.hcad.org/server/rest/services/public/public_query/MapServer/0/query"

In [4]:
# Metro service area polygon (as GeoSeries or single unified geometry).
# The path is assembled with os.path.join: the previous literal "../REF\Metro_..."
# contained the invalid escape sequence "\M" and a Windows-only separator, so it
# only resolved on Windows and triggers a syntax warning on recent Python versions.
metro_path = os.path.join("..", "REF", "Metro_MTA_Tax_Area.geojson")
metro_gdf = gpd.read_file(metro_path)

# Dissolve every feature into one geometry so each parcel needs a single intersects() test.
metro_union = metro_gdf.union_all()

# Collect Samples

In [7]:
def run_random_sample(metro_union, target_count=1000, batch_size=500, max_attempts=20, output_dir="OUTPUT",
                      request_timeout=120, seed=None):
    """Randomly sample HCAD parcels that intersect ``metro_union`` and save them as GeoJSON.

    The function downloads the full list of parcel object IDs, then requests
    random batches of parcels until at least ``target_count`` intersecting
    parcels were collected, ``max_attempts`` batches were requested, or no
    unrequested IDs remain. A batch is always processed completely, so the
    result can contain more than ``target_count`` rows.

    Only the first ring of each returned geometry is turned into a polygon;
    further rings (holes / additional parts) are ignored.

    Parameters
    ----------
    metro_union : shapely geometry
        Geometry every sampled parcel must intersect.
    target_count : int
        Stop requesting batches once this many parcels were accepted.
    batch_size : int
        Number of object IDs requested per batch.
    max_attempts : int
        Maximum number of batches to request.
    output_dir : str
        Folder the GeoJSON file is written to (created if missing).
    request_timeout : float
        Timeout in seconds applied to every HTTP request.
    seed : int or None
        Seed for the ID draw. ``None`` uses the global ``random`` module state.

    Returns
    -------
    (geopandas.GeoDataFrame, dict)
        The accepted parcels (EPSG:4326, plus a ``unique_id`` column) and the
        metadata dictionary that is also embedded in the saved file.

    Raises
    ------
    ValueError
        If the endpoint returns no object IDs or no parcel was accepted.
    requests.RequestException
        On timeouts, connection problems or HTTP error statuses.
    """
    url = "https://arcweb.hcad.org/server/rest/services/public/public_query/MapServer/0/query"

    # NOTE(security): verify=False disables TLS certificate validation for both
    # requests below. It is kept because the original code relied on it; remove
    # it (or pass a CA bundle) once the endpoint's certificate chain validates.

    # 1. Get all parcel object IDs
    id_params = {
        "where": "1=1",
        "returnIdsOnly": "true",
        "f": "json"
    }
    id_response = requests.get(url, params=id_params, verify=False, timeout=request_timeout)
    id_response.raise_for_status()
    all_ids = id_response.json().get("objectIds") or []

    if not all_ids:
        raise ValueError("No ObjectIDs returned from HCAD endpoint.")

    # 2. Fix the draw order once. Every ID is requested at most one time, so
    #    parcels that were already rejected are never downloaded again.
    rng = random if seed is None else random.Random(seed)
    unique_ids = sorted(set(all_ids))  # sorted -> a given seed always yields the same draw
    max_needed = max(batch_size, 0) * max(max_attempts, 0)
    draw_order = rng.sample(unique_ids, min(len(unique_ids), max_needed))

    final_records = []
    final_geometries = []
    accepted_ids = set()
    skipped_geometries = 0

    def fetch_and_filter(sample_ids):
        """Download one batch of parcels and keep those intersecting ``metro_union``."""
        nonlocal skipped_geometries

        query_params = {
            "objectIds": ",".join(map(str, sample_ids)),
            "outFields": "*",
            "returnGeometry": "true",
            "f": "json",
            "outSR": "4326"
        }

        response = requests.get(url, params=query_params, verify=False, timeout=request_timeout)
        response.raise_for_status()
        payload = response.json()
        if "error" in payload:
            # Best effort: report the problem and let the caller try the next batch.
            print(f"HCAD query returned an error for this batch: {payload['error']}")

        for feature in payload.get("features") or []:
            attributes = feature.get("attributes")
            if attributes is None:
                continue

            oid = attributes.get("OBJECTID")
            # Only real IDs take part in de-duplication; a missing OBJECTID must
            # not cause later ID-less features to be treated as duplicates.
            if oid is not None and oid in accepted_ids:
                continue

            # "geometry" may be present but null, hence `or {}` instead of a .get default.
            rings = (feature.get("geometry") or {}).get("rings") or []
            if not isinstance(rings, list) or not rings or len(rings[0]) <= 2:
                continue

            try:
                polygon = Polygon(rings[0])
                hits_metro = polygon.intersects(metro_union)
            except Exception:
                # Unusable geometry: skip the parcel but keep count so it is reported.
                skipped_geometries += 1
                continue

            if hits_metro:
                final_records.append(attributes)
                final_geometries.append(polygon)
                if oid is not None:
                    accepted_ids.add(oid)

    attempts = 0
    while len(final_records) < target_count and attempts < max_attempts:
        start = attempts * batch_size
        batch_ids = draw_order[start:start + batch_size]
        if not batch_ids:
            # Every available ID has been requested; further attempts cannot add records.
            break
        fetch_and_filter(batch_ids)
        attempts += 1
        print(f"Collected {len(final_records)} valid records (attempt {attempts})")

    if skipped_geometries:
        print(f"Skipped {skipped_geometries} features with unusable geometry")

    if not final_geometries:
        raise ValueError("No valid geometries found in sampling process.")

    final_gdf = gpd.GeoDataFrame(final_records, geometry=final_geometries, crs="EPSG:4326")

    # Add a stable unique ID for each record using a hash of the HCAD_NUM
    final_gdf["unique_id"] = final_gdf.apply(lambda row: hashlib.md5(str(row.get("HCAD_NUM")).encode()).hexdigest(), axis=1)

    # Metadata. The clock is read once so the timestamp, samplesetID and file
    # name always describe the same instant.
    now = datetime.now()
    timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
    timestamp_str = now.strftime("%Y%m%d_%H%M%S")
    sampleset_id = f"sampleset_{timestamp_str}"
    metadata = {
        "timestamp": timestamp,
        "sampling_batch_size": batch_size,
        "total_properties_sampled": len(final_gdf),
        "sampling_attempts": attempts,
        "samplesetID": sampleset_id
    }

    # Save with metadata
    filename = f"metro_intersecting_parcels_{timestamp_str}.geojson"
    filepath = os.path.join(output_dir, filename)
    os.makedirs(output_dir, exist_ok=True)

    geojson_obj = json.loads(final_gdf.to_json())
    geojson_obj["metadata"] = metadata

    with open(filepath, "w") as f:
        json.dump(geojson_obj, f)

    return final_gdf, metadata

In [8]:
# Intended number of independent sampling runs
independent_sample_size = 5

# Collects one (GeoDataFrame, metadata) tuple per sampling run
all_samples = list()

In [9]:
# Run the configured number of independent samples.
# - independent_sample_size replaces the hard-coded 5 so the setting above is honoured.
# - output_dir is passed explicitly: the function's default is "OUTPUT", while the
#   combine step further down reads its input from "OUTPUT/HCAD_SAMPLES".
# - The list is rebuilt (not appended to) so re-running this cell does not
#   accumulate the results of earlier executions.
all_samples = [
    run_random_sample(metro_union, output_dir=output_dir)
    for _ in range(independent_sample_size)
]

Collected 394 valid records (attempt 1)
Collected 807 valid records (attempt 2)
Collected 1210 valid records (attempt 3)
Collected 403 valid records (attempt 1)
Collected 802 valid records (attempt 2)
Collected 1220 valid records (attempt 3)
Collected 403 valid records (attempt 1)
Collected 806 valid records (attempt 2)
Collected 1217 valid records (attempt 3)
Collected 422 valid records (attempt 1)
Collected 829 valid records (attempt 2)
Collected 1242 valid records (attempt 3)
Collected 408 valid records (attempt 1)
Collected 804 valid records (attempt 2)
Collected 1198 valid records (attempt 3)


# Combine All Samples

In [10]:
# Folder that is scanned for saved sample files (same literal path as `output_dir`
# defined near the top of the notebook).
sample_dir = "OUTPUT/HCAD_SAMPLES"

In [11]:
# List all GeoJSONs that match pattern.
# os.listdir returns names in arbitrary order; sorting makes the load order (and
# therefore the row order / parcel_index of the combined frame) reproducible.
# Note: every matching file in the folder is picked up, including files left
# over from earlier sessions.
sample_files = sorted(
    name
    for name in os.listdir(sample_dir)
    if name.startswith("metro_intersecting_parcels_") and name.endswith(".geojson")
)

# Initialize container
all_gdfs = []

In [13]:
# Start from an empty container inside this cell so that re-running it cannot
# append every sample file a second time.
all_gdfs = []

for file in sample_files:
    filepath = os.path.join(sample_dir, file)

    with open(filepath, "r", encoding="utf-8") as f:
        geojson_data = json.load(f)

    # "metadata" is the extra top-level member written next to the features.
    metadata = geojson_data.get("metadata", {})
    # Declare the CRS at construction time instead of mutating the frame afterwards.
    gdf = gpd.GeoDataFrame.from_features(geojson_data["features"], crs="EPSG:4326")

    # Attach metadata as columns (each value is broadcast to every row of this sample)
    for key, value in metadata.items():
        gdf[key] = value

    # Extract sample_id from the timestamp in the filename; fall back to the filename itself
    match = re.search(r"_(\d{8}_\d{6})", file)
    sample_id = match.group(1) if match else file
    gdf["sample_id"] = sample_id

    all_gdfs.append(gdf)

In [14]:
# Concatenate
# Fail with an actionable message instead of pandas' generic error when no
# sample file was found/loaded.
if not all_gdfs:
    raise ValueError(f"No sample files were loaded from {sample_dir!r}; nothing to combine.")

combined_gdf = gpd.GeoDataFrame(pd.concat(all_gdfs, ignore_index=True), crs="EPSG:4326")

In [15]:
# Add index column.
# Written so the cell can be re-run: an existing parcel_index is dropped first,
# whereas reset_index(inplace=True) + rename would add a second column with the
# same name on every further execution.
combined_gdf = combined_gdf.drop(columns=["parcel_index"], errors="ignore").reset_index(drop=True)
combined_gdf.insert(0, "parcel_index", combined_gdf.index)

# Save for next steps (make sure the target folder exists first)
os.makedirs("OUTPUT", exist_ok=True)
combined_gdf.to_file("OUTPUT/combined_random_samples.geojson", driver="GeoJSON")
combined_gdf.to_parquet("OUTPUT/combined_random_samples.parquet")


In [None]:
# Show a small preview rather than the whole combined frame
combined_gdf.head()

# All Samples Map

In [16]:
import folium
from folium import GeoJson

In [None]:
m = folium.Map(location=[29.76, -95.37], zoom_start=11, tiles="cartodbpositron")

# Add all parcels as ONE GeoJson layer. Creating a separate layer per row (one
# folium.GeoJson object for each parcel) draws the same polygons but makes the
# map much heavier to build and render.
all_parcels_geo = combined_gdf.geometry.__geo_interface__
folium.GeoJson(all_parcels_geo).add_to(m)

m