## Preprocessing for School Location Database


In [None]:
import os
import pandas as pd
from shapely.geometry import Point
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs).
# In this notebook the session is only used later on to read the curated
# listings parquet dataset.
# NOTE(review): the app name looks like a leftover from a tutorial template —
# confirm whether it should describe this preprocessing job instead.
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True)  # presumably so DataFrames render when evaluated in a notebook cell
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")  # fix the session time zone to UTC
    .config("spark.driver.memory", "6g")  # driver heap size requested for this session
    .getOrCreate()
)

# Directory containing the school CSV files
schools_dir = '../data/landing/schools'

# List all CSV files in the directory.
# os.listdir() returns entries in arbitrary order, but the de-duplication step
# further down keeps the FIRST row it sees for a school, and each row carries
# the year of the file it came from. Sorting by the year token in the filename
# (the text after the last underscore) makes the run deterministic and ensures
# the earliest year is the one that is kept.
csv_files = sorted(
    (f for f in os.listdir(schools_dir) if f.endswith('.csv')),
    key=lambda f: (f.split('_')[-1].split('.')[0], f),
)

# Load only the header row of each CSV (nrows=0) and collect its column names
schemas = {}
for file in csv_files:
    try:
        df = pd.read_csv(os.path.join(schools_dir, file), nrows=0, encoding='utf-8')
    except UnicodeDecodeError:
        # some files are not valid UTF-8; retry with latin1
        df = pd.read_csv(os.path.join(schools_dir, file), nrows=0, encoding='latin1')
    schemas[file] = set(df.columns)

# Display the columns for each file
for file, cols in schemas.items():
    print(f"{file}: {sorted(cols)}")

# Check if all schemas match (compared as sets, so column order is ignored)
all_schemas = list(schemas.values())
schemas_match = all(s == all_schemas[0] for s in all_schemas)
print(f"\nSchemas match: {schemas_match}")

In [None]:
# make pandas show all rows when printing dataframe
pd.set_option('display.max_rows', None)

In [None]:
# Define the standardized schema based on common columns across all years
standard_columns = [
    'Address_Line_1', 'Address_Line_2', 'Address_Postcode', 'Address_State', 'Address_Town',
    'Education_Sector', 'Entity_Type', 'Full_Phone_No', 'LGA_ID', 'LGA_Name',
    'Postal_Address_Line_1', 'Postal_Address_Line_2', 'Postal_Postcode', 'Postal_State', 'Postal_Town',
    'School_Name', 'School_No', 'School_Type', 'X', 'Y',
    # Additional columns that exist in some years
    'Area', 'LGA_TYPE', 'Region', 'School_Status'
]

print("Standardized schema:")
for col in standard_columns:
    print(f"  - {col}")


In [None]:
# Function to standardize column names and add missing columns
def standardize_school_dataframe(df, year, columns=None):
    """
    Standardize a school dataframe to have consistent columns.

    The input frame is not modified. Known column-name variants are renamed,
    any target column that is absent is added filled with None, columns not in
    the target schema are dropped, and the result is ordered like the schema
    with an extra 'establishment_year' column appended.

    Parameters:
    df: pandas DataFrame holding one year's school locations
    year: year label stored verbatim in 'establishment_year'
          (the notebook passes the string extracted from the filename)
    columns: optional list of target column names; defaults to the
             module-level ``standard_columns``

    Returns:
    pandas DataFrame with the target columns followed by 'establishment_year'
    """
    target_columns = list(standard_columns if columns is None else columns)

    # Handle column name variations
    column_mapping = {
        'AREA_Name': 'Area',  # 2024 has AREA_Name instead of Area
        'Region_Name': 'Region'  # 2024 has Region_Name instead of Region
    }

    # rename() returns a new DataFrame, so the caller's frame is left untouched
    # without needing a separate copy() first
    df_std = df.rename(columns=column_mapping)

    # Add missing columns, filled with None
    for col in target_columns:
        if col not in df_std.columns:
            df_std[col] = None

    # Keep only the target columns, in schema order
    df_std = df_std[target_columns]

    # Tag every row with the year of the file it came from
    df_std['establishment_year'] = year

    return df_std

# Smoke-test the standardization on the first five rows of every file
print("Testing standardization function...")
for file in csv_files:
    # year = text between the last underscore and the following dot
    year = file.rsplit('_', 1)[-1].partition('.')[0]
    print(f"\nProcessing {file} (year: {year})")

    # Read a small sample, falling back to latin1 when the file is not UTF-8
    sample_path = os.path.join(schools_dir, file)
    try:
        df_sample = pd.read_csv(sample_path, nrows=5, encoding='utf-8')
    except UnicodeDecodeError:
        df_sample = pd.read_csv(sample_path, nrows=5, encoding='latin1')

    df_std = standardize_school_dataframe(df_sample, year)

    # Report how the column set changed
    print(f"Original columns: {len(df_sample.columns)}")
    print(f"Standardized columns: {len(df_std.columns)}")
    print(f"Missing columns added: {[col for col in standard_columns if col not in df_sample.columns]}")


In [None]:
# Read every yearly extract in full, bring it onto the standard schema,
# and stack the results into one frame
print("Loading and standardizing all school datasets...")
standardized_dfs = []

for file in csv_files:
    # year = text between the last underscore and the following dot
    year = file.rsplit('_', 1)[-1].partition('.')[0]
    print(f"\nProcessing {file} (year: {year})")

    # Load the full dataset, falling back to latin1 when the file is not UTF-8
    csv_path = os.path.join(schools_dir, file)
    try:
        df = pd.read_csv(csv_path, encoding='utf-8')
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding='latin1')

    print(f"  Loaded {len(df)} schools")

    df_std = standardize_school_dataframe(df, year)
    standardized_dfs.append(df_std)

    print(f"  Standardized to {len(df_std.columns)} columns")

# Stack the per-year frames; the original row labels are discarded
print(f"\nCombining {len(standardized_dfs)} datasets...")
combined_schools = pd.concat(standardized_dfs, ignore_index=True)

print(f"Combined dataset shape: {combined_schools.shape}")
print(f"Total schools: {len(combined_schools)}")
print(f"\nEstablishment year distribution:")
year_counts = combined_schools['establishment_year'].value_counts().sort_index()
print(year_counts)


In [None]:
# keep only School_Name, Education_Sector, School_Type, School_Status, establishment_year, X, Y
combined_schools = combined_schools[['School_Name', 'Education_Sector', 'School_Type', 'School_Status', 'establishment_year', 'X', 'Y']]

In [None]:
# convert the X, Y columns to a Point object from shapely 
combined_schools['coordinates'] = combined_schools.apply(lambda row: Point(row['X'], row['Y']), axis=1)

In [None]:
# round the X, Y columns to 1 decimal place (the duplicate check below compares coordinates at 1 dp)
combined_schools['X'] = combined_schools['X'].round(1)
combined_schools['Y'] = combined_schools['Y'].round(1)


In [None]:
combined_schools['establishment_year'].value_counts()

In [None]:
combined_schools['School_Type'].value_counts()


In [None]:
# Remove duplicate schools: a row is dropped when an EARLIER row has a
# (near-)identical name and exactly the same X and Y after rounding to 1 dp.
# The first occurrence is always kept, so the result depends on row order.
from difflib import get_close_matches
to_drop = set()  # index labels of rows judged to be duplicates
name_to_indices = {}  # school name -> index labels of every row seen so far with that name
for idx, row in combined_schools.iterrows():
    name = row['School_Name']
    x = row['X']
    y = row['Y']

    # Find the single best match among names already seen; with cutoff=0.99
    # only an almost exact string match qualifies
    close_matches = get_close_matches(name, name_to_indices.keys(), n=1, cutoff=0.99)
    
    if close_matches:
        matched_name = close_matches[0]
        # Compare against every earlier row that carries the matched name
        for matched_idx in name_to_indices[matched_name]:
            matched_row = combined_schools.loc[matched_idx]
            # NaN never equals NaN, so rows without coordinates are never treated as duplicates
            if matched_row['X'] == x and matched_row['Y'] == y:
                # If X and Y also match, mark the current index for dropping
                to_drop.add(idx)
                break
    
    # Add the current index to the mapping (dropped rows are recorded too)
    if name not in name_to_indices:
        name_to_indices[name] = []
    name_to_indices[name].append(idx)  
print(f"Dropping {len(to_drop)} duplicate schools based on fuzzy name and coordinates match.")
combined_schools = combined_schools.drop(index=to_drop).reset_index(drop=True)
print(f"Dataset shape after removing duplicates: {combined_schools.shape}")

In [None]:
# sort by school_name
combined_schools = combined_schools.sort_values(by='School_Name')

In [None]:
combined_schools['School_Type'].value_counts()

In [None]:
# remove 'School_Type' where it is 'Language', 'Camp' 
combined_schools = combined_schools[~combined_schools['School_Type'].isin(['Language', 'Camp'])]

combined_schools['School_Type'].value_counts()

In [None]:
combined_schools['School_Status'].value_counts()

In [None]:
# remove 'School_Status' where it is 'Closed'
combined_schools = combined_schools[combined_schools['School_Status'] != 'C']

In [None]:
# drop School_Status column
combined_schools = combined_schools.drop(columns=['School_Status'])



In [None]:
# convert column names to lower casing
combined_schools.columns = combined_schools.columns.str.lower()

combined_schools.head(5)

In [None]:
# check for duplicate school_name
duplicate_schools = combined_schools[combined_schools.duplicated(subset=['school_name'])]

duplicate_schools


In [None]:
# group duplicate_schools by school_name, education_sector, school_type, establishment_year and count the number of occurences
duplicate_schools_grouped = duplicate_schools.groupby(['school_name', 'education_sector', 'school_type', 'establishment_year']).size().reset_index(name='count')

# get the school_name from duplicate_schools_grouped where count is greater than 1
duplicate_school_names = duplicate_schools_grouped[duplicate_schools_grouped['count'] > 1]['school_name']

duplicate_school_names

In [None]:
combined_schools_unique = combined_schools[~combined_schools['school_name'].isin(duplicate_school_names)]

combined_schools_unique[combined_schools_unique['school_name'].duplicated()]

In [None]:
coburg_special_developmental_school = combined_schools_unique[combined_schools_unique['school_name']=="Coburg Special Developmental School"]


In [None]:
coburg_special_developmental_school

In [None]:
combined_schools_duplicates = combined_schools[combined_schools['school_name'].isin(duplicate_school_names)]

In [None]:
# Save the combined dataset
output_path = '../data/processed/schools/schools.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

combined_schools.to_csv(output_path, index=False)


**School Rank Code**

In [None]:
# Scrape this website for the top 100 VCE schools in Victoria. Secondary schools that are in the
# top 100 get their rank, secondary schools outside the top 100 get 101, and schools that are
# not secondary get None.
# https://bettereducation.com.au/Results/vce.aspx

import io
import pandas as pd
import requests

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-AU,en;q=0.9",
    "Referer": "https://bettereducation.com.au/Results/vce.aspx",
    "Cache-Control": "no-cache",
}
url = "https://bettereducation.com.au/Results/vce.aspx"
resp = requests.get(url, headers=headers, timeout=30)

resp.raise_for_status()

tables = pd.read_html(io.StringIO(resp.text))  # no match -> gets all tables

In [None]:
school_rank_df = tables[1].copy()
# make the Better Education Rank columns go from 1 to 100 including every number in between
school_rank_df["Better Education Rank"] = range(1, 101)
school_rank_df

In [None]:
from difflib import get_close_matches
import re

# Match the schools.csv with the scraped data and add a new column called vic_secondary_rank
# If the school is in the top 100, set the rank to the rank from the scraped data
# If the school type is secondary or primary/secondary and the school is not in the top 100, set the rank to 101
# If the school type is not secondary or primary/secondary, set the rank to None
school_rank_df.columns = [c.lower().strip() for c in school_rank_df.columns]
school_rank_df = school_rank_df.rename(columns={"better education rank": "vic_secondary_rank", "school": "school_name"})
school_rank_df["school_name"] = school_rank_df["school_name"].str.strip()
school_rank_df = school_rank_df[pd.to_numeric(school_rank_df["vic_secondary_rank"], errors="coerce").notna()]
school_rank_df["vic_secondary_rank"] = school_rank_df["vic_secondary_rank"].astype(int) 

schools = pd.read_csv("../data/processed/schools/schools.csv")
def normalize(name: str) -> str:
    """Return a canonical form of a school name for matching.

    The name is lower-cased, typographic apostrophes and en dashes are
    replaced with their ASCII equivalents, everything from the first comma
    onwards is discarded, runs of whitespace are collapsed to one space and
    surrounding whitespace is removed.

    Parameters:
    name: raw school name; any non-string value (e.g. NaN) is accepted

    Returns:
    the normalized name, or "" when ``name`` is not a string
    """
    if not isinstance(name, str):
        return ""
    cleaned = (
        name.lower()
            .replace("’", "'")
            .replace("–", "-")
    )
    # drop everything after the first comma (suburb/campus info)
    cleaned = cleaned.split(",", 1)[0]
    cleaned = re.sub(r"\s+", " ", cleaned)
    # trim last: stripping before the comma split left a trailing space on
    # names written as "Name , Suburb", which then failed the exact lookup
    return cleaned.strip()

# ranking table
school_rank_df["name_norm"] = school_rank_df["school_name"].map(normalize)
lookup_norm = dict(zip(school_rank_df["name_norm"], school_rank_df["vic_secondary_rank"]))

# schools.csv
schools["school_name_norm"] = schools["school_name"].map(normalize)

def lookup_fuzzy(name, candidates, cutoff=0.8):
    """Return the candidate most similar to ``name``, or None.

    Similarity is judged by difflib.get_close_matches; a candidate must
    score at least ``cutoff`` to be returned.
    """
    best = get_close_matches(name, candidates, n=1, cutoff=cutoff)
    return next(iter(best), None)

candidate_names = list(lookup_norm.keys())

def assign_rank(row):
    """Return the VCE rank to store for one school row.

    Lookup order:
    1. exact match of the normalized name in ``lookup_norm``;
    2. otherwise the closest ranked name with similarity >= 0.85.
    A rank found this way is returned regardless of the school type.
    With no match, 'secondary' and 'pri/sec' schools get 101 and every other
    type (including a missing type) gets None.

    Parameters:
    row: a row of the schools frame with 'school_name_norm' and 'school_type'

    Returns:
    the matched rank, 101, or None
    """
    name = row["school_name_norm"]
    rank = lookup_norm.get(name)
    if rank is None:
        # no exact hit: fall back to the closest ranked name
        match = lookup_fuzzy(name, candidate_names, cutoff=0.85)
        if match:
            rank = lookup_norm[match]
    if rank:
        return rank

    # The type is only needed for unranked schools. A missing type is read from
    # CSV as NaN (a float), which has no .strip(); treat it as "not secondary".
    raw_type = row["school_type"]
    school_type = raw_type.strip().lower() if isinstance(raw_type, str) else ""
    if school_type not in {"secondary", "pri/sec"}:
        return None
    return 101

schools["vic_secondary_rank"] = schools.apply(assign_rank, axis=1)
schools = schools.drop(columns=["school_name_norm"])
schools.to_csv("../data/processed/schools/schools_ranked.csv", index=False)

In [None]:
set(range(1, 101)) - set(schools["vic_secondary_rank"].unique())

In [None]:
schools[schools["school_name"].str.contains("Haileybury College", case=False, na=False)]

In [None]:
import numpy as np
eps = 0.0001  # small offset that keeps the goodness of the worst rank (101) above zero


# make a school goodness column based on the vic_secondary_rank
def school_goodness(rank):
    """Map a rank in 1..101 to a goodness score that decreases with the log of the rank.

    goodness = 1 - log(rank) / log(101) + eps, rounded to 4 decimal places,
    so rank 1 gives 1.0001 and rank 101 gives 0.0001.

    Previously eps sat inside the subtracted term, which made rank 101 score
    -0.0001. A negative goodness divided by a distance-decay term is closer to
    zero the farther away the school is, so the best-score selection preferred
    the farthest unranked school.

    Parameters:
    rank: numeric rank, or a missing value (None / NaN)

    Returns:
    the rounded goodness, or the string "N/A" when ``rank`` is missing
    """
    if pd.isna(rank):
        return "N/A"
    goodness = 1 - np.log(rank) / np.log(101) + eps  # normalized log rank, shifted by eps
    return round(goodness, 4)
    
schools["school_goodness"] = schools["vic_secondary_rank"].map(school_goodness)
schools.to_csv("../data/processed/schools/schools_ranked.csv", index=False)
schools

**Best School per Isochrone**

In [None]:
import sys
from pathlib import Path

# Add project root to Python path
# Get the current notebook's directory and go up to project root
current_dir = Path().resolve()
if current_dir.name == 'notebooks':
    project_root = current_dir.parent
elif current_dir.name == 'project2':
    project_root = current_dir
else:
    # If we're in the parent directory, look for project2
    project_root = current_dir / 'project2'
sys.path.insert(0, str(project_root))
print(f'Project root: {project_root}')

import sys
from pathlib import Path

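# Add project root to Python path
# NOTE(review): this repeats the setup above and unconditionally resets project_root to the
# parent of the working directory, overriding the directory-name based detection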
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root))

import pandas as pd
import geopandas as gpd
from pyproj import Geod
from shapely import wkt, ops
from utils.preprocess import PreprocessUtils

def safe_wkt(value):
    """Parse ``value`` as WKT, returning None instead of raising.

    Missing values, blank text and anything that fails to parse all yield
    None. Non-string values are converted with str() before parsing.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    if text == "":
        return None
    try:
        geometry = wkt.loads(text)
    except Exception:
        # best effort: unparseable text is treated as missing
        geometry = None
    return geometry


def to_geom(val):
    """Convert a cell to a geometry.

    Strings are parsed as WKT, except blank text and the literals
    'nan' / 'none' (case-insensitive), which give None. Missing values give
    None. Any other value is returned unchanged (treated as already being a
    geometry). Unlike safe_wkt, a malformed WKT string raises.
    """
    if not isinstance(val, str):
        return None if pd.isna(val) else val
    cleaned = val.strip()
    if cleaned.lower() in {"", "nan", "none"}:
        return None
    return wkt.loads(cleaned)

def clean_geom_cell(val):
    """Replace placeholder cells with pd.NA.

    Blank strings, the text 'nan' / 'none' (case-insensitive) and missing
    values become pd.NA; everything else is returned unchanged.
    """
    is_placeholder_text = isinstance(val, str) and val.strip().lower() in {"", "nan", "none"}
    if is_placeholder_text or pd.isna(val):
        return pd.NA
    return val

def swap_axes(geom):
    """Return ``geom`` with the x and y of every coordinate exchanged.

    Any z value passed to the transform function is discarded.
    """
    def _flip(x, y, z=None):
        return (y, x)

    return ops.transform(_flip, geom)

In [None]:
# ---- Load + parse ----
spark_df_loaded = spark.read.parquet("../data/curated/rent_features/cleaned_listings_sampled_parquet")
df_loaded = spark_df_loaded.toPandas()

# Convert WKT back to geometry
from shapely import wkt
df_loaded['coordinates'] = df_loaded['coordinates_wkt'].apply(wkt.loads)
df_loaded = df_loaded.drop(columns=['coordinates_wkt'])

# Convert back to GeoDataFrame
listings_gdf = gpd.GeoDataFrame(df_loaded, geometry='coordinates')

iso_columns = [c for c in listings_gdf.columns if c.endswith("min_imputed") or c.endswith("min")]

# elementwise across the selected columns
listings_gdf[iso_columns] = listings_gdf[iso_columns].map(clean_geom_cell)

listings_gdf["listing_point"] = (
    listings_gdf["coordinates"]
        .apply(safe_wkt)                                  # raw WKT -> Point/None
        .apply(lambda g: swap_axes(g) if g is not None else None)
)
# reset this as the geometry of the gdf
# set the GeoDataFrame geometry to the new column
listings_gdf = listings_gdf.set_geometry("listing_point")

# preserve CRS if you already know it (e.g., EPSG:4326)
listings_gdf = listings_gdf.set_crs("EPSG:4326", inplace=False)

**Nearest neighbour imputation**

In [None]:
preprocessor = PreprocessUtils()

listings_gdf[['driving_5min_imputed', 'driving_10min_imputed', 'driving_15min_imputed']] = preprocessor.impute_by_nearest_point(listings_gdf, ['driving_5min', 'driving_10min', 'driving_15min'])
listings_gdf[['walking_5min_imputed', 'walking_10min_imputed', 'walking_15min_imputed']] = preprocessor.impute_by_nearest_point(listings_gdf, ['walking_5min', 'walking_10min', 'walking_15min'])

In [None]:
for col in iso_columns:
    listings_gdf[col] = listings_gdf[col].map(safe_wkt)   

for col in iso_columns:
    listings_gdf[f"geom_{col}"] = listings_gdf[col].apply(to_geom)

listings_gdf["year"] = (
    listings_gdf["year"]
    .astype("Int64")
)

schools["establishment_year"] = (
    pd.to_numeric(schools["establishment_year"], errors="coerce")
      .round()
      .astype("Int64")   # null-friendly
)
schools["geometry"] = schools["coordinates"].apply(safe_wkt)
schools["coordinates"] = schools["coordinates"].apply(safe_wkt)
schools_gdf = gpd.GeoDataFrame(schools, geometry="geometry", crs="EPSG:4326")

geod = Geod(ellps="WGS84")
beta = 0.2  # distance-decay rate; equivalent to lambda = 1 / beta


def score_row(goodness, dist_km):
    """Discount a school's goodness by its distance: goodness / (1 + beta * dist_km).

    Works on scalars as well as on aligned pandas Series.
    """
    decay = 1 + beta * dist_km
    return goodness / decay

In [None]:
iso_columns

In [None]:
# Pair every "<base>_imputed" isochrone column with its original "<base>" column
pairs = {}
for col in iso_columns:
    if not col.endswith("_imputed"):
        continue
    base = col.replace("_imputed", "")
    if base in listings_gdf.columns:
        pairs[base] = (base, col)

# Report, per pair, how many rows have neither the original nor the imputed value
missing_pairs = pd.Series(False, index=listings_gdf.index)

for base, (orig, imp) in pairs.items():
    both_null = listings_gdf[[orig, imp]].isna().all(axis=1)
    if not both_null.any():
        continue
    print(f"{base}: {both_null.sum()} rows missing both original and imputed")
    missing_pairs |= both_null

rows_missing_any_pair = listings_gdf.loc[missing_pairs]
print("Rows missing every column in at least one pair:", len(rows_missing_any_pair))

# Adding a flag feature for listings with both original and imputed missing

pairs = []
for base in iso_columns:
    imputed_name = f"{base}_imputed"
    if base in listings_gdf.columns and imputed_name in listings_gdf.columns:
        pairs.append((base, imputed_name))

missing_iso = pd.Series(False, index=listings_gdf.index)
for base, imp in pairs:
    pair_missing = listings_gdf[[base, imp]].isna().all(axis=1)
    missing_iso |= pair_missing

# 1 when at least one pair is missing entirely for the listing, else 0
listings_gdf["missing_iso_feature"] = missing_iso.astype("int8")

In [None]:
# Original (non-imputed) geometry columns and isochrone columns
best_cols = [c for c in listings_gdf.columns if c.startswith("geom_") and "_imputed" not in c]
iso_columns_imp = [c for c in iso_columns if "_imputed" not in c]

# Where an original cell is empty, take the value from the matching
# "<column>_imputed" column (the nearest-neighbour imputation), if that column exists.
# Geometry columns are processed first, then the isochrone columns.
for col in [*best_cols, *iso_columns_imp]:
    imp = f"{col}_imputed"
    if imp not in listings_gdf.columns:
        continue
    listings_gdf[col] = listings_gdf[col].fillna(listings_gdf[imp]).infer_objects(copy=False)


# The imputed columns are no longer needed once merged; collect them for removal
to_drop1 = [c for c in listings_gdf.columns if c.endswith("min_imputed")]


In [None]:
listings_gdf = listings_gdf.drop(columns=to_drop1)

In [None]:
listings_gdf.head()

In [None]:
# Sanity check: find listings whose trailing columns are all empty.
# The number of columns is held in one place so that the slice and the printed
# message cannot disagree (the message used to say 30 while 15 were inspected).
n_trailing = 15
subset = listings_gdf.iloc[:, -n_trailing:]   # grab the trailing n_trailing columns
mask = subset.isna().all(axis=1)              # True where all of them are null/NaN
count_empty = mask.sum()                      # how many such rows

empty_rows = listings_gdf.loc[mask]           # inspect the actual rows
print(f"Rows with all of the last {n_trailing} columns empty:", count_empty)
empty_rows.head()                             # or empty_rows[subset.columns] to focus on that slice

In [None]:
iso_columns2 = [c for c in listings_gdf.columns if c.endswith("min") and "geom_" not in c]

iso_columns2

In [None]:
# For every isochrone column, find the schools covered by each listing's
# isochrone polygon, score them by goodness discounted by distance, and record
# the best-scoring school plus the number of schools per listing.
all_results = []
iso_columns2 = [c for c in listings_gdf.columns if c.endswith("min") and "geom_" not in c]

for col in iso_columns2:
    poly_col = f"geom_{col}"
    # activate polygon geometry and drop rows without polygons
    iso_poly = listings_gdf.set_geometry(poly_col)
    iso_poly = iso_poly[iso_poly[poly_col].notna()]
    iso_poly = iso_poly.set_crs("EPSG:4326", allow_override=True)  # define if missing
    # (optionally ensure validity)
    # iso_poly[poly_col] = iso_poly[poly_col].buffer(0)

    # spatial join: schools inside polygon (keep listing_point column intact).
    # how="left" keeps listings with no school; their school columns are null.
    # The code below relies on the school-side 'coordinates' column arriving
    # as 'coordinates_school' via rsuffix.
    joined = gpd.sjoin(
        iso_poly,
        schools_gdf,
        how="left",
        predicate="covers",
        rsuffix="school",
    )

    # only keep schools whose establishment_year is not after the listing's
    # 'year' (or is unknown); rows without any school also pass this filter
    mask = joined["establishment_year"].isna() | (
        joined["establishment_year"] <= joined["year"]
    )
    joined = joined[mask]

    if not joined.empty:
        # geodesic distance (WGS84) from listing point to school, in metres
        # NOTE(review): assumes every remaining row has a non-null listing_point
        lon1 = joined["listing_point"].apply(lambda g: g.x).values
        lat1 = joined["listing_point"].apply(lambda g: g.y).values
        lon2 = gpd.GeoSeries(joined["coordinates_school"]).x
        lat2 = gpd.GeoSeries(joined["coordinates_school"]).y
        _, _, dists_m = geod.inv(lon1, lat1, lon2, lat2)
        joined["dist_km"] = dists_m / 1000.0
    
        # coerce to numbers; non-numeric goodness values (e.g. "N/A") become NaN
        joined["school_goodness"] = pd.to_numeric(joined["school_goodness"], errors="coerce")
        joined["dist_km"] = pd.to_numeric(joined["dist_km"], errors="coerce")

        # score only the rows where both inputs are known
        valid = joined["school_goodness"].notna() & joined["dist_km"].notna()
        joined["score"] = pd.NA
        joined.loc[valid, "score"] = score_row(joined.loc[valid, "school_goodness"], joined.loc[valid, "dist_km"])

        # a count of how many schools within a given isochrone
        count = (joined.groupby("property_id")["school_name"]
                .count()                                 # counts non-null school names in each group
                .rename(f"n_schools_{col}")             # e.g., n_schools_driving_5min
                .to_frame())

        # highest-scoring school per listing (rows without a score are ignored)
        best_inside = (
            joined.dropna(subset=["score"])
            .sort_values("score", ascending=False)
            .groupby("property_id")
            .head(1)
            .set_index("property_id")
        )

        # build out indexed by property_id, with the result columns defaulting to None
        out = (iso_poly[["property_id"]]
            .drop_duplicates()
            .set_index("property_id"))
        for c in [f"best_school_name_{col}", f"best_school_coord_{col}", f"best_score_{col}", f"best_dist_km_{col}"]:
            out[c] = None

        if not best_inside.empty:
            idx = best_inside.index
            out.loc[idx, f"best_school_name_{col}"]  = best_inside["school_name"].to_numpy()
            out.loc[idx, f"best_school_coord_{col}"] = best_inside["coordinates_school"].to_numpy()
            out.loc[idx, f"best_score_{col}"]        = best_inside["score"].to_numpy()
            out.loc[idx, f"best_dist_km_{col}"]      = best_inside["dist_km"].to_numpy()



        # collect this column's results; they are joined back to listings_gdf after the loop
        all_results.append(out)
        all_results.append(count)

# NOTE(review): pd.concat raises if all_results is empty (no isochrone column produced rows)
final_out = pd.concat(all_results, axis=1)  # columns are already unique per {col}
# drop any result columns left over from a previous run before joining the new ones
overlap = final_out.columns.intersection(listings_gdf.columns)
listings_gdf = (
    listings_gdf
    .drop(columns=overlap, errors="ignore")
    .join(final_out, on="property_id")
)


In [None]:
listings_gdf.head()

**Filling in Walking and Driving best columns with nearest neighbour column logic**

In [None]:
listings_gdf.columns

In [None]:
# per-isochrone "best school" fields; the full column name is "<field>_<mode>_<duration>"
fields = ["best_school_name", "best_school_coord", "best_score", "best_dist_km"]


def all_fields_present(df, token):
    """Return a boolean Series: True where every best-school field for ``token`` is non-null.

    ``token`` has the form "<mode>_<duration>", e.g. "walking_5min". Only the
    field columns that actually exist in ``df`` are checked; when none exist
    the result is False for every row.
    """
    mode, dur = token.split("_", 1)
    wanted = (f"{f}_{mode}_{dur}" for f in fields)
    present = [name for name in wanted if name in df.columns]
    if present:
        return df[present].notna().all(axis=1)
    return pd.Series(False, index=df.index)

# availability for every token we might touch.
# This is computed once, BEFORE any filling, so a target filled from a fallback
# is never itself used as a source for another target.
tokens = [
    "walking_5min", "walking_10min", "walking_15min",
    "driving_5min", "driving_10min", "driving_15min",
]
avail = {tok: all_fields_present(listings_gdf, tok) for tok in tokens}

# per-target fallback order
fallback = {
    "walking_5min":  ["walking_10min", "walking_15min", "driving_5min", "driving_10min", "driving_15min"],
    "walking_10min": ["walking_15min", "driving_5min", "driving_10min", "driving_15min"],
    "walking_15min": ["driving_5min", "driving_10min", "driving_15min"],
    "driving_5min":  ["driving_10min", "driving_15min"],
    "driving_10min": ["driving_5min", "driving_15min"],
    "driving_15min": ["driving_10min", "driving_5min"],
}

# Iterate over the fallback table itself rather than a separate hard-coded
# list: the previous list left out "walking_15min", so that target was never
# back-filled even though a fallback order is defined for it.
for target, candidates in fallback.items():
    mode_tgt, dur_tgt = target.split("_", 1)
    target_cols = [f"{f}_{mode_tgt}_{dur_tgt}" for f in fields if f"{f}_{mode_tgt}_{dur_tgt}" in listings_gdf.columns]
    if not target_cols:
        continue

    # rows where the target has no best-school information at all
    missing = listings_gdf[target_cols].isna().all(axis=1)
    filled = pd.Series(False, index=listings_gdf.index)

    for candidate in candidates:
        # rows still unfilled for which this candidate has a complete record
        rows = missing & avail[candidate] & ~filled
        if not rows.any():
            continue

        mode_src, dur_src = candidate.split("_", 1)
        for f in fields:
            src_col = f"{f}_{mode_src}_{dur_src}"
            tgt_col = f"{f}_{mode_tgt}_{dur_tgt}"
            if src_col in listings_gdf.columns and tgt_col in listings_gdf.columns:
                listings_gdf.loc[rows, tgt_col] = listings_gdf.loc[rows, src_col].values

        # copy the school count too, but only where the target count is empty or zero
        tgt_cnt = f"n_schools_{mode_tgt}_{dur_tgt}"
        src_cnt = f"n_schools_{mode_src}_{dur_src}"
        if tgt_cnt in listings_gdf.columns and src_cnt in listings_gdf.columns:
            need = rows & listings_gdf[tgt_cnt].fillna(0).eq(0)
            listings_gdf.loc[need, tgt_cnt] = listings_gdf.loc[need, src_cnt].values

        filled |= rows  # stop once we copy from the first available candidate



In [None]:
pd.set_option('future.no_silent_downcasting', True)

In [None]:
# Default values for listings that still have no best school after the fallback step

# scores: a tiny positive constant
score_cols = [c for c in listings_gdf.columns if c.startswith("best_score_")]
listings_gdf[score_cols] = listings_gdf[score_cols].fillna(3e-8) # Justification can be asked by venura(formula with reasonably max args)

# distances: one more than the largest observed distance in the column
# (or 1.0 when the column holds no value at all)
dist_cols = [c for c in listings_gdf.columns if c.startswith("best_dist_km_")]
for col in dist_cols:
    max_val = listings_gdf[col].max(skipna=True)
    fill_value = max_val + 1 if pd.notna(max_val) else 1.0
    listings_gdf[col] = listings_gdf[col].fillna(fill_value)

# counts: no schools
count_cols = [c for c in listings_gdf.columns if c.startswith("n_schools_")]
listings_gdf[count_cols] = listings_gdf[count_cols].fillna(0)

In [None]:
mask = listings_gdf.iloc[:, -18:].isna().any(axis=1)

# should be zero as all missing values are imputed
len(listings_gdf.loc[mask])

In [None]:
# convert back to a csv file
listings_gdf.to_csv("../data/curated/rent_features/cleaned_listings_isochrones_added_with_best_schools.csv", index=False)

In [None]:
listings_gdf.head()