<a href="https://colab.research.google.com/github/hamydang16/EY_Data-Challenge-2025/blob/main/EY_Data_Challenge__2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Urban Heat Island Challenge

## Load Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install rioxarray
!pip install stackstac
!pip install pystac_client
!pip install planetary_computer
!pip install odc-stac
!pip install rasterstats
!pip install geopy
!pip install reverse_geocode
!pip install osmnx

In [3]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd
import reverse_geocode
import osmnx as ox

# Geospatial operations
import rasterio as rio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Machine Learning
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
import xgboost as xgb
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm.auto import tqdm
from shapely.geometry import Point, mapping, box
from geopy.distance import geodesic
from rasterstats import zonal_stats
from multiprocessing import Pool
import time
from functools import partial

In [4]:
pd.set_option('display.max_colwidth', None)

## Training data

In [5]:
#csv path
csv_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Training_data_uhi_index_2025-02-18.csv'

#Load the training data from csv file and display the first few rows to inspect the data
ground_df = pd.read_csv(csv_path)
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index
0,-73.909167,40.813107,24-07-2021 15:53,1.030289
1,-73.909187,40.813045,24-07-2021 15:53,1.030289
2,-73.909215,40.812978,24-07-2021 15:53,1.023798
3,-73.909242,40.812908,24-07-2021 15:53,1.023798
4,-73.909257,40.812845,24-07-2021 15:53,1.021634


## Response Variables

### Buidling data

Building data is taken from Microsoft Building Footprints for the state of New York. Data is subset to only include Bronx and New York county. [Link](https://github.com/microsoft/USBuildingFootprints)

In [6]:
#Import building data
buildings = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NewYork.geojson')

In [7]:
#Get centroid of building
buildings['centroid'] = buildings['geometry'].centroid

In [8]:
#Reverse geocoding into buildings by creating a new column to signify region by reverse geocoding from the centroid column
buildings['region'] = buildings.apply(lambda x: reverse_geocode.search([(x['centroid'].y, x['centroid'].x)]), axis=1)

In [9]:
#get only the county
buildings['county'] = buildings['region'].apply(lambda x: x[0].get('county'))

In [10]:
#filter out the county for only bronx and manhattan
buildings = buildings[buildings['county'].isin(['Bronx County', 'New York County'])]
#drop centroid, region and county
buildings = buildings.drop(columns=['centroid', 'region', 'county'])

In [11]:
buildings.head()

Unnamed: 0,release,capture_dates_range,geometry
51809,1,,"POLYGON ((-73.78183 40.83828, -73.78177 40.83832, -73.78186 40.8384, -73.78196 40.83833, -73.78189 40.83828, -73.78186 40.8383, -73.78183 40.83828))"
52172,1,,"POLYGON ((-73.78385 40.84886, -73.78375 40.84887, -73.78377 40.84896, -73.78387 40.84894, -73.78385 40.84886))"
52348,1,,"POLYGON ((-73.78477 40.84717, -73.78483 40.8473, -73.78497 40.84727, -73.78491 40.84713, -73.78477 40.84717))"
52358,1,,"POLYGON ((-73.78493 40.84023, -73.78497 40.84033, -73.78507 40.84031, -73.78503 40.84021, -73.78493 40.84023))"
52363,1,,"POLYGON ((-73.785 40.83903, -73.7849 40.83905, -73.78494 40.83916, -73.78503 40.83915, -73.785 40.83903))"


Using OSMNx library to find average heights of buildings within 100 meters of coordinate

DON'T RUN THIS CODE BLOCK

In [None]:
#### TEST ###### include height, total building area, building density
# Function to create geometry points - vectorized instead of apply
def create_geometry_points(df):
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

# Convert DataFrame to GeoDataFrame in a more efficient way
start = time.time()
geo_df = create_geometry_points(ground_df)
print(f"GeoDataFrame creation took {time.time() - start:.2f} seconds")

# Function to get buildings data with batching
def get_buildings_data_batch(coords_batch, dist=100):
    results = []

    # Process each coordinate in the batch
    for lat, lon in coords_batch:
        try:
            # Create a small buffer to check for previously downloaded buildings
            point = (lat, lon)

            # Use features_from_point which is faster than geometries_from_point
            buildings = ox.features_from_point(point, tags={'building': True}, dist=dist)

            # Check for height data - consider multiple height columns
            height_cols = ['height', 'building:height']
            heights = []

            # Extract heights exactly as in the original code
            for col in height_cols:
                if col in buildings.columns:
                    col_heights = buildings[col].dropna()
                    # Convert to numeric, coerce errors to NaN
                    numeric_heights = pd.to_numeric(col_heights, errors='coerce')
                    heights.extend(numeric_heights.dropna().tolist())

            # Calculate area for each building to compute building density
            area_data = {}

            if len(buildings) > 0:
                # Reproject to a projected CRS for accurate area calculation
                buildings_proj = buildings.to_crs(epsg=3857)  # Web Mercator projection

                # Calculate area for each building
                building_areas = buildings_proj.geometry.area

                # Calculate total area covered by buildings (in square meters)
                total_building_area = building_areas.sum()

                # Calculate area of the buffer circle (in square meters)
                buffer_area = np.pi * (dist ** 2)

                # Store results
                area_data = {
                    'num_buildings': len(buildings),
                    'total_building_area': total_building_area,
                    'individual_building_areas': building_areas.tolist(),
                    'buffer_area': buffer_area,
                    'building_density': total_building_area / buffer_area if buffer_area > 0 else 0,
                    'building_count_density': len(buildings) / (buffer_area / 10000) if buffer_area > 0 else 0,  # buildings per hectare
                }
            else:
                area_data = {
                    'num_buildings': 0,
                    'total_building_area': 0,
                    'individual_building_areas': [],
                    'buffer_area': np.pi * (dist ** 2),
                    'building_density': 0,
                    'building_count_density': 0,
                }

            results.append((heights, area_data))
        except Exception as e:
            # Return empty data if there's an error
            results.append(([], {
                'num_buildings': 0,
                'total_building_area': 0,
                'individual_building_areas': [],
                'buffer_area': np.pi * (dist ** 2),
                'building_density': 0,
                'building_count_density': 0,
            }))

    return results

# Function to batch process coordinates
def process_in_batches(coordinates, batch_size=10, n_processes=4):
    # Split coordinates into batches for more efficient processing
    n_coords = len(coordinates)
    batches = [coordinates[i:i+batch_size] for i in range(0, n_coords, batch_size)]

    print(f"Processing {n_coords} points in {len(batches)} batches using {n_processes} processes")

    # Process batches in parallel
    with Pool(processes=n_processes) as pool:
        results = list(tqdm(
            pool.imap(get_buildings_data_batch, batches),
            total=len(batches),
            desc="Processing batches"
        ))

    # Flatten the results
    flattened_heights = []
    flattened_area_data = []

    for batch_result in results:
        for heights, area_data in batch_result:
            flattened_heights.append(heights)
            flattened_area_data.append(area_data)

    return flattened_heights, flattened_area_data

# Set the number of processes and batch size based on your system
n_processes = min(8, os.cpu_count())  # Use more cores if available
batch_size = 20  # Increase batch size for fewer network calls

# Create a list of coordinates
coordinates = geo_df[['Latitude', 'Longitude']].values.tolist()

# Time the processing
start_time = time.time()

# Process the coordinates in batches
building_heights, building_area_data = process_in_batches(
    coordinates,
    batch_size=batch_size,
    n_processes=n_processes
)

print(f"Processing completed in {time.time() - start_time:.2f} seconds")

# Add the building heights to the GeoDataFrame
geo_df['building_heights'] = building_heights

# Extract and add area data to the GeoDataFrame
geo_df['num_buildings'] = [data['num_buildings'] for data in building_area_data]
geo_df['total_building_area'] = [data['total_building_area'] for data in building_area_data]
geo_df['individual_building_areas'] = [data['individual_building_areas'] for data in building_area_data]
geo_df['buffer_area'] = [data['buffer_area'] for data in building_area_data]
geo_df['building_density'] = [data['building_density'] for data in building_area_data]
geo_df['building_count_density'] = [data['building_count_density'] for data in building_area_data]

# Calculate height statistics exactly as in the original code
geo_df['num_buildings_with_height'] = geo_df['building_heights'].apply(len)
geo_df['mean_height'] = geo_df['building_heights'].apply(
    lambda heights: np.mean(heights) if heights else np.nan
)
geo_df['max_height'] = geo_df['building_heights'].apply(
    lambda heights: max(heights) if heights else np.nan
)

# Calculate additional area statistics
geo_df['mean_building_area'] = geo_df.apply(
    lambda row: np.mean(row['individual_building_areas']) if row['individual_building_areas'] else np.nan,
    axis=1
)
geo_df['max_building_area'] = geo_df.apply(
    lambda row: max(row['individual_building_areas']) if row['individual_building_areas'] else np.nan,
    axis=1
)

# Display summary of key metrics (non-NaN values)
print(f"Points with buildings: {(geo_df['num_buildings'] > 0).sum()} of {len(geo_df)} ({(geo_df['num_buildings'] > 0).sum()/len(geo_df)*100:.1f}%)")
print(f"Points with height data: {(~geo_df['mean_height'].isna()).sum()} of {len(geo_df)} ({(~geo_df['mean_height'].isna()).sum()/len(geo_df)*100:.1f}%)")

# Display the updated GeoDataFrame
geo_df.head()

GeoDataFrame creation took 0.12 seconds
Processing 11229 points in 562 batches using 2 processes


Processing batches:   0%|          | 0/562 [00:00<?, ?it/s]

KeyboardInterrupt: 

**RUN THIS CODE BLOCK INSTEAD**

In [15]:
#### TEST ###### to include floor area ratio
#FAR = defined as the total floor area of a building (or buildings) divided by the area of the lot on which they are built.
#FAR provides a measure of the intensity of land use by comparing how much floor space is built relative to the plot size
# Function to create geometry points - vectorized instead of apply
def create_geometry_points(df):
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

# Convert DataFrame to GeoDataFrame in a more efficient way
start = time.time()
geo_df = create_geometry_points(ground_df)
print(f"GeoDataFrame creation took {time.time() - start:.2f} seconds")

# Function to estimate number of floors from height
def estimate_floors(height):
    # Assuming average floor height of 6 meters
    # Return at least 1 floor even for small buildings
    if height is None or pd.isna(height):
        return None
    return max(1, int(height / 6))

# Function to get buildings data with batching
def get_buildings_data_batch(coords_batch, dist=50):
    results = []

    # Process each coordinate in the batch
    for lat, lon in coords_batch:
        try:
            # Create a small buffer to check for previously downloaded buildings
            point = (lat, lon)

            # Use features_from_point which is faster than geometries_from_point
            buildings = ox.features_from_point(point, tags={'building': True}, dist=dist)

            # Check for height data - consider multiple height columns
            height_cols = ['height', 'building:height']
            heights = []
            building_floor_areas = []
            total_floor_area = 0

            # Reproject to a projected CRS for accurate area calculation
            if len(buildings) > 0:
                buildings_proj = buildings.to_crs(epsg=3857)  # Web Mercator projection

                # Calculate area for each building
                building_areas = buildings_proj.geometry.area

                # Calculate total area covered by buildings (in square meters)
                total_building_area = building_areas.sum()

                # Extract heights and calculate floor areas
                for i, building in buildings.iterrows():
                    building_height = None

                    # Try to get height from available columns
                    for col in height_cols:
                        if col in buildings.columns and pd.notna(building.get(col)):
                            try:
                                building_height = float(building[col])
                                heights.append(building_height)
                                break
                            except (ValueError, TypeError):
                                pass

                    # Get the building footprint area
                    building_area = building_areas.loc[i] if i in building_areas.index else 0

                    # Estimate number of floors and calculate floor area
                    if building_height is not None:
                        floors = estimate_floors(building_height)
                        floor_area = building_area * floors
                        building_floor_areas.append(floor_area)
                        total_floor_area += floor_area
                    else:
                        # Assume 1 floor if no height data
                        building_floor_areas.append(building_area)
                        total_floor_area += building_area

                # Calculate buffer area and FAR
                buffer_area = np.pi * (dist ** 2)
                far = total_floor_area / buffer_area if buffer_area > 0 else 0

                # Store results
                area_data = {
                    'num_buildings': len(buildings),
                    'total_building_area': total_building_area,
                    'total_floor_area': total_floor_area,
                    'individual_building_areas': building_areas.tolist(),
                    'individual_floor_areas': building_floor_areas,
                    'buffer_area': buffer_area,
                    'building_density': total_building_area / buffer_area if buffer_area > 0 else 0,
                    'building_count_density': len(buildings) / (buffer_area / 10000) if buffer_area > 0 else 0,  # buildings per hectare
                    'floor_area_ratio': far
                }
            else:
                area_data = {
                    'num_buildings': 0,
                    'total_building_area': 0,
                    'total_floor_area': 0,
                    'individual_building_areas': [],
                    'individual_floor_areas': [],
                    'buffer_area': np.pi * (dist ** 2),
                    'building_density': 0,
                    'building_count_density': 0,
                    'floor_area_ratio': 0
                }

            results.append((heights, area_data))
        except Exception as e:
            # Return empty data if there's an error
            results.append(([], {
                'num_buildings': 0,
                'total_building_area': 0,
                'total_floor_area': 0,
                'individual_building_areas': [],
                'individual_floor_areas': [],
                'buffer_area': np.pi * (dist ** 2),
                'building_density': 0,
                'building_count_density': 0,
                'floor_area_ratio': 0
            }))

    return results

# Function to batch process coordinates
def process_in_batches(coordinates, batch_size=10, n_processes=4):
    # Split coordinates into batches for more efficient processing
    n_coords = len(coordinates)
    batches = [coordinates[i:i+batch_size] for i in range(0, n_coords, batch_size)]

    print(f"Processing {n_coords} points in {len(batches)} batches using {n_processes} processes")

    # Process batches in parallel
    with Pool(processes=n_processes) as pool:
        results = list(tqdm(
            pool.imap(get_buildings_data_batch, batches),
            total=len(batches),
            desc="Processing batches"
        ))

    # Flatten the results
    flattened_heights = []
    flattened_area_data = []

    for batch_result in results:
        for heights, area_data in batch_result:
            flattened_heights.append(heights)
            flattened_area_data.append(area_data)

    return flattened_heights, flattened_area_data

# Set the number of processes and batch size based on your system
n_processes = min(8, os.cpu_count())  # Use more cores if available
batch_size = 20  # Increase batch size for fewer network calls

# Create a list of coordinates
coordinates = geo_df[['Latitude', 'Longitude']].values.tolist()

# Time the processing
start_time = time.time()

# Process the coordinates in batches
building_heights, building_area_data = process_in_batches(
    coordinates,
    batch_size=batch_size,
    n_processes=n_processes
)

print(f"Processing completed in {time.time() - start_time:.2f} seconds")

# Add the building heights to the GeoDataFrame
geo_df['building_heights'] = building_heights

# Extract and add area data to the GeoDataFrame
geo_df['num_buildings'] = [data['num_buildings'] for data in building_area_data]
geo_df['total_building_area'] = [data['total_building_area'] for data in building_area_data]
geo_df['total_floor_area'] = [data['total_floor_area'] for data in building_area_data]
geo_df['individual_building_areas'] = [data['individual_building_areas'] for data in building_area_data]
geo_df['individual_floor_areas'] = [data.get('individual_floor_areas', []) for data in building_area_data]
geo_df['buffer_area'] = [data['buffer_area'] for data in building_area_data]
geo_df['building_density'] = [data['building_density'] for data in building_area_data]
geo_df['building_count_density'] = [data['building_count_density'] for data in building_area_data]
geo_df['floor_area_ratio'] = [data['floor_area_ratio'] for data in building_area_data]

# Calculate height statistics exactly as in the original code
geo_df['num_buildings_with_height'] = geo_df['building_heights'].apply(len)
geo_df['mean_height'] = geo_df['building_heights'].apply(
    lambda heights: np.mean(heights) if heights else np.nan
)
geo_df['max_height'] = geo_df['building_heights'].apply(
    lambda heights: max(heights) if heights else np.nan
)

# Calculate additional area statistics
geo_df['mean_building_area'] = geo_df.apply(
    lambda row: np.mean(row['individual_building_areas']) if row['individual_building_areas'] else np.nan,
    axis=1
)
geo_df['max_building_area'] = geo_df.apply(
    lambda row: max(row['individual_building_areas']) if row['individual_building_areas'] else np.nan,
    axis=1
)

# Calculate mean floor area
geo_df['mean_floor_area'] = geo_df.apply(
    lambda row: np.mean(row['individual_floor_areas']) if row['individual_floor_areas'] else np.nan,
    axis=1
)

# Display the updated GeoDataFrame
geo_df.head()

GeoDataFrame creation took 0.11 seconds
Processing 11229 points in 562 batches using 2 processes


Processing batches:   0%|          | 0/562 [00:00<?, ?it/s]

Process ForkPoolWorker-8:
Process ForkPoolWorker-7:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Traceback (most recent call last):


KeyboardInterrupt: 

In [None]:
# Function to create geometry points - vectorized instead of apply
def create_geometry_points(df):
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    return gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

# Convert DataFrame to GeoDataFrame in a more efficient way
start = time.time()
geo_df = create_geometry_points(ground_df)
print(f"GeoDataFrame creation took {time.time() - start:.2f} seconds")

# Function to get building heights with batching
def get_building_heights_batch(coords_batch, dist=100):
    results = []

    # Process each coordinate in the batch
    for lat, lon in coords_batch:
        try:
            # Create a small buffer to check for previously downloaded buildings
            point = (lat, lon)

            # Use features_from_point which is faster than geometries_from_point
            buildings = ox.features_from_point(point, tags={'building': True}, dist=dist)

            # Check for height data - consider multiple height columns
            height_cols = ['height', 'building:height']
            heights = []

            for col in height_cols:
                if col in buildings.columns:
                    col_heights = buildings[col].dropna()
                    # Convert to numeric, coerce errors to NaN
                    numeric_heights = pd.to_numeric(col_heights, errors='coerce')
                    heights.extend(numeric_heights.dropna().tolist())

            results.append(heights)
        except Exception as e:
            # Return empty list if there's an error
            results.append([])

    return results

# Function to batch process coordinates
def process_in_batches(coordinates, batch_size=10, n_processes=4):
    # Split coordinates into batches for more efficient processing
    n_coords = len(coordinates)
    batches = [coordinates[i:i+batch_size] for i in range(0, n_coords, batch_size)]

    print(f"Processing {n_coords} points in {len(batches)} batches using {n_processes} processes")

    # Process batches in parallel
    with Pool(processes=n_processes) as pool:
        results = list(tqdm(
            pool.imap(get_building_heights_batch, batches),
            total=len(batches),
            desc="Processing batches"
        ))

    # Flatten the results
    flattened_results = []
    for batch_result in results:
        flattened_results.extend(batch_result)

    return flattened_results

# Set the number of processes and batch size based on your system
n_processes = min(8, os.cpu_count())  # Use more cores if available
batch_size = 20  # Increase batch size for fewer network calls

# Create a list of coordinates
coordinates = geo_df[['Latitude', 'Longitude']].values.tolist()

# Time the processing
start_time = time.time()

# Process the coordinates in batches
building_heights = process_in_batches(
    coordinates,
    batch_size=batch_size,
    n_processes=n_processes
)

print(f"Processing completed in {time.time() - start_time:.2f} seconds")

# Add the building heights to the GeoDataFrame
geo_df['building_heights'] = building_heights

# Calculate some useful statistics
geo_df['num_buildings'] = geo_df['building_heights'].apply(len)
geo_df['mean_height'] = geo_df['building_heights'].apply(
    lambda heights: np.mean(heights) if heights else np.nan
)

# Display the updated GeoDataFrame
geo_df.head()

GeoDataFrame creation took 0.13 seconds
Processing 11229 points in 562 batches using 2 processes


Processing batches:   0%|          | 0/562 [00:00<?, ?it/s]

Process ForkPoolWorker-2:


KeyboardInterrupt: 

In [None]:
#Turn back into a dataframe
ground_df = geo_df.drop(columns='geometry')

In [None]:
ground_df.columns

Index(['Longitude', 'Latitude', 'datetime', 'UHI Index', 'total_building_area',
       'building_density', 'mean_height', 'Relative Humidity [percent]',
       'Avg Wind Speed [m/s]', 'Wind Direction [degrees]',
       'Solar Flux [W/m^2]', 'floor_area_ratio', 'region', 'building_heights',
       'num_buildings', 'total_floor_area', 'individual_building_areas',
       'individual_floor_areas', 'buffer_area', 'building_count_density',
       'num_buildings_with_height', 'max_height', 'mean_building_area',
       'max_building_area', 'mean_floor_area'],
      dtype='object')

In [None]:
#Drop building heights and number of buildings
ground_df = ground_df.drop(columns=['building_heights', 'num_buildings',
       'individual_building_areas', 'buffer_area', 'building_count_density',
       'num_buildings_with_height', 'max_height', 'mean_building_area',
       'max_building_area'])

In [None]:
# @title
#DONT RUN
# Load the building footprints dataset
buildings = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NewYork.geojson')

# Convert points to a GeoDataFrame efficiently
geometry = [Point(lon, lat) for lon, lat in zip(ground_df['Longitude'], ground_df['Latitude'])]
gdf = gpd.GeoDataFrame(ground_df, geometry=geometry, crs="EPSG:4326")

# Make sure CRS matches
if buildings.crs != gdf.crs:
    buildings = buildings.to_crs(gdf.crs)

# Define what "near" means - this distance is in the units of your CRS
# If using EPSG:4326 (WGS84 lat/long), distance is in degrees
# For better distance calculations, consider projecting to a local projected CRS
# Example: for NYC, EPSG:2263 (NY State Plane Long Island) would be appropriate
near_distance = 0.0001  # Roughly 10-11 meters in WGS84 at NYC latitude
                        # Adjust based on your needs

# For more accurate distance calculations, project to a meter-based CRS
# Uncomment and modify these lines if you need accurate distances:
# buildings = buildings.to_crs(epsg=2263)  # NY State Plane
# gdf = gdf.to_crs(epsg=2263)
# near_distance = 10  # 10 meters

# Buffer the buildings to identify "near" areas
print("Buffering buildings...")
buffered_buildings = buildings.copy()
buffered_buildings['geometry'] = buildings.geometry.buffer(near_distance)

# Initialize result dataframe
result_df = ground_df.copy()
result_df['near_building'] = False

# Process in efficient batches
batch_size = 10000
for i in range(0, len(gdf), batch_size):
    print(f"Processing batch {i//batch_size + 1}...")
    batch = gdf.iloc[i:i+batch_size]

    # Use spatial index to find candidate matches
    possible_matches_index = list(buffered_buildings.sindex.intersection(batch.total_bounds))
    possible_matches = buffered_buildings.iloc[possible_matches_index]

    # Perform spatial join between points and buffered buildings
    joined = gpd.sjoin(batch, possible_matches, predicate='within', how='left')
    near_indices = joined.dropna(subset=['index_right']).index.unique()

    # Update results
    result_df.loc[near_indices, 'near_building'] = True

# Alternative approach if you prefer to calculate actual distances
# This would create a 'distance_to_nearest' column
# Uncomment if you need actual distances (will be slower)
"""
print("Calculating distances to nearest buildings...")
result_df['distance_to_nearest'] = np.inf

# Process in batches
for i in range(0, len(gdf), batch_size):
    batch = gdf.iloc[i:i+batch_size]

    for idx, point in batch.iterrows():
        # Use spatial index to narrow down candidates
        point_buffer = point.geometry.buffer(near_distance * 5)  # Larger search radius
        candidates_idx = list(buildings.sindex.intersection(point_buffer.bounds))

        if candidates_idx:
            candidates = buildings.iloc[candidates_idx]
            distances = candidates.geometry.distance(point.geometry)
            min_distance = distances.min()
            result_df.loc[idx, 'distance_to_nearest'] = min_distance
            result_df.loc[idx, 'near_building'] = min_distance <= near_distance
"""

# Display results
result_df.head()

In [None]:
# @title
# Load the building footprints dataset
#buildings = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NewYork.geojson')

# Define "near" distance (adjust as needed)
near_distance = 0.0001  # ~10m in WGS84 at NYC latitude

# ---------------------------------------------------------------------
# OPTIMIZATION 1: Create a grid-based spatial index for fast lookups
# ---------------------------------------------------------------------
print("Creating grid index...")

# Get bounds of all buildings and expand by near_distance
minx, miny, maxx, maxy = buildings.total_bounds
minx -= near_distance
miny -= near_distance
maxx += near_distance
maxy += near_distance

# Create a grid index (adjust grid size based on your data density)
# Smaller grid = more memory but faster lookups
grid_size = near_distance * 10  # Grid cells 10x the near distance

# Calculate number of cells in each dimension
nx = int((maxx - minx) / grid_size) + 1
ny = int((maxy - miny) / grid_size) + 1

print(f"Grid dimensions: {nx} x {ny} cells")

# Create empty grid
grid = {}

# Function to get grid cell for a point
def get_cell(x, y):
    i = int((x - minx) / grid_size)
    j = int((y - miny) / grid_size)
    return i, j

# Create a bounding box for each building and assign to grid cells
for idx, building in buildings.iterrows():
    # Get building bounds and expand by near_distance
    bx_min, by_min, bx_max, by_max = building.geometry.bounds
    bx_min -= near_distance
    by_min -= near_distance
    bx_max += near_distance
    by_max += near_distance

    # Find all grid cells this building overlaps
    min_i, min_j = get_cell(bx_min, by_min)
    max_i, max_j = get_cell(bx_max, by_max)

    # Create a bounding box for this building with buffer
    building_box = box(bx_min, by_min, bx_max, by_max)

    # Add to all overlapping grid cells
    for i in range(min_i, max_i + 1):
        for j in range(min_j, max_j + 1):
            if (i, j) not in grid:
                grid[(i, j)] = []
            grid[(i, j)].append((idx, building_box))

print(f"Grid index created with {len(grid)} non-empty cells")

# ---------------------------------------------------------------------
# OPTIMIZATION 2: Use bounding box checks before more expensive operations
# ---------------------------------------------------------------------
print("Processing points...")

# Convert points to a NumPy array for faster access
points = np.array([(lon, lat) for lon, lat in zip(ground_df['Longitude'], ground_df['Latitude'])])

# Initialize results array
near_building = np.zeros(len(points), dtype=bool)

# Process points in batches
batch_size = 10000
for batch_start in range(0, len(points), batch_size):
    batch_end = min(batch_start + batch_size, len(points))
    print(f"Processing points {batch_start} to {batch_end-1}...")

    for i in range(batch_start, batch_end):
        x, y = points[i]
        point = Point(x, y)

        # Get grid cell for this point
        cell = get_cell(x, y)

        # Check if grid cell exists (has buildings nearby)
        if cell not in grid:
            continue

        # Check against building bounding boxes in this cell
        for building_idx, building_box in grid[cell]:
            # Fast bounding box check
            if building_box.contains(point):
                # Only do precise check if bounding box check passes
                if buildings.loc[building_idx, 'geometry'].distance(point) <= near_distance:
                    near_building[i] = True
                    break  # Early termination once we find a match

# Create result DataFrame
result_df = ground_df.copy()
result_df['near_building'] = near_building

print("Analysis complete!")
result_df.head()

Creating grid index...
Grid dimensions: 261 x 255 cells
Grid index created with 15816 non-empty cells
Processing points...
Processing points 0 to 9999...
Processing points 10000 to 11228...
Analysis complete!


Unnamed: 0,Longitude,Latitude,datetime,UHI Index,near_building
0,-73.909167,40.813107,24-07-2021 15:53,1.030289,False
1,-73.909187,40.813045,24-07-2021 15:53,1.030289,False
2,-73.909215,40.812978,24-07-2021 15:53,1.023798,False
3,-73.909242,40.812908,24-07-2021 15:53,1.023798,False
4,-73.909257,40.812845,24-07-2021 15:53,1.021634,False


In [None]:
#Rename near_building to Building
ground_df = result_df.copy()
ground_df = ground_df.rename(columns={'near_building': 'Building'})
#Change value from True False to 1 0
ground_df['Building'] = ground_df['Building'].apply(lambda x: 1 if x == True else 0)

NameError: name 'result_df' is not defined

### Weather data

In [None]:
#Change datetime object
ground_df['datetime'] = pd.to_datetime(ground_df['datetime'])

In [None]:
#Reverse geocoding into ground_df by creating a new column to signify region by reverse geocoding from lat and long
ground_df['region'] = ground_df.apply(lambda x: reverse_geocode.search([(x['Latitude'], x['Longitude'])])[0]['county'], axis=1)

In [None]:
#Group into 2 areas by bronx and not bronx
ground_df['region'] = ground_df['region'].apply(lambda x: 'Bronx' if x == 'Bronx County' else 'Manhattan')

In [None]:
#Load the weather dataset
weather_bronx = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Bronx')
weather_manhattan = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Manhattan')

In [None]:
#change datetime of weather bronx and weather manhattan
weather_bronx['Date / Time'] = pd.to_datetime(weather_bronx['Date / Time'])
weather_manhattan['Date / Time'] = pd.to_datetime(weather_manhattan['Date / Time'])

In [None]:
ground_df.columns

Index(['Longitude', 'Latitude', 'datetime', 'UHI Index', 'total_building_area',
       'building_density', 'mean_height', 'Relative Humidity [percent]',
       'Avg Wind Speed [m/s]', 'Wind Direction [degrees]',
       'Solar Flux [W/m^2]', 'floor_area_ratio', 'region', 'total_floor_area',
       'individual_floor_areas', 'mean_floor_area'],
      dtype='object')

In [None]:
#Function to find the nearest hour and minute match in weather data
def nearest_time_match(ground_time, weather_df):
    #Calculate time differences in minutes
    time_diffs = abs((weather_df['Date / Time'] - ground_time).dt.total_seconds() / 60)

    #Find the index of the minimum time difference
    nearest_index = time_diffs.idxmin()

    return weather_df.loc[nearest_index]

#Apply the nearest_time_match function to each row in ground_df for Bronx region
ground_df_bronx = ground_df[ground_df['region'] == 'Bronx'].copy()
merged_data = []
for index, row in ground_df_bronx.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_bronx)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass
# Create the merged DataFrame
merged_bronx = pd.DataFrame(merged_data)

merged_bronx.head()

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
#Apply the nearest_time_match function to each row in ground_df for Manhattan region
ground_df_manhattan = ground_df[ground_df['region'] == 'Manhattan'].copy()
merged_data = []
for index, row in ground_df_manhattan.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_manhattan)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass

# Create the merged DataFrame
merged_manhattan = pd.DataFrame(merged_data)

merged_manhattan.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,total_building_area,building_density,mean_height,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
0,-73.981233,40.784337,2021-07-24 15:36:00,1.009974,39293.71586,1.250758,28.12549,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
1,-73.981257,40.784328,2021-07-24 15:36:00,1.00781,38693.159519,1.231642,28.152,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
2,-73.981285,40.784322,2021-07-24 15:36:00,1.00781,39916.262657,1.270574,28.359615,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
3,-73.981318,40.784297,2021-07-24 15:36:00,1.00781,39879.871585,1.269416,28.271154,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
4,-73.981338,40.784268,2021-07-24 15:36:00,1.00781,39032.340531,1.242438,28.522,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511


In [None]:
#combine merge_bronx and merge_manhattan
ground_df = pd.concat([merged_bronx, merged_manhattan])
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,total_building_area,building_density,mean_height,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
Unnamed 0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,26892.391463,0.856011,15.641176,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,24927.629968,0.793471,14.484375,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,24557.541148,0.781691,14.89375,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,24742.197435,0.787569,15.051515,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,24896.404772,0.792477,14.95,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621


In [None]:
#combine and clean column
ground_df = ground_df.reset_index()
#drop column
ground_df = ground_df.drop(columns=['Date / Time', 'region', 'Air Temp at Surface [degC]', 'index'])

KeyError: "['Date / Time', 'Air Temp at Surface [degC]'] not found in axis"

In [None]:
ground_df.drop(columns=['total_floor_area', 'individual_floor_areas','mean_floor_area'], inplace=True)

In [None]:
#drop index
ground_df = ground_df.drop(columns=['index'])

KeyError: "['index'] not found in axis"

### Sattelite data

#### Extracting Landsat-8 data

In [None]:
#Extracting lst data
from rasterio.windows import Window
def map_lst_data(tiff_path, csv_path):
    """
    Extract raster values at specified lat/lon coordinates from a GeoTIFF file.

    Parameters:
    -----------
    tiff_path : str
        Path to the GeoTIFF file
    csv_path : str
        Path to CSV file containing 'Latitude' and 'Longitude' columns

    Returns:
    --------
    pd.DataFrame
        DataFrame containing original coordinates and extracted values
    """
    # Read points from CSV
    df = pd.read_csv(csv_path)

    # Open the raster file and extract values
    with rio.open(tiff_path) as src:
        coords = list(zip(df['Longitude'], df['Latitude']))
        lst_values = []

        for lon, lat in coords:
            try:
                row, col = src.index(lon, lat)
                window = Window(col, row, 1, 1)
                value = src.read(1, window=window)
                lst_values.append(float(value[0][0]))
            except (IndexError, ValueError):
                lst_values.append(None)

    # Create and return output DataFrame
    return pd.DataFrame({
        'Latitude': df['Latitude'],
        'Longitude': df['Longitude'],
        'LST': lst_values
    })

In [None]:
#map satellite data from landsat
tiff_path_2 = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Landsat_LST.tiff'
lst_data = map_lst_data(tiff_path = tiff_path_2, csv_path = csv_path)

In [None]:
lst_data.describe()

Unnamed: 0,Latitude,Longitude,LST
count,11229.0,11229.0,11229.0
mean,40.8088,-73.933927,40.588086
std,0.023171,0.028253,2.752488
min,40.758792,-73.994457,32.532037
25%,40.790905,-73.955703,39.121979
50%,40.810688,-73.932968,40.827571
75%,40.824515,-73.909647,42.310992
max,40.859497,-73.879458,53.200804


#### Extracting Sentinel-2 data

In [None]:
#Extracting spectral data from geotiff image, allowing for buffer zone

def map_sent_data(tiff_path, csv_path, buffer_distance):
    # Read the CSV file using pandas
    df = pd.read_csv(csv_path)

    # Create points from coordinates
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    gdf = gpd.GeoDataFrame(df, crs='epsg:4326', geometry=geometry)

    # Initialize results DataFrame with original data
    results_df = df.copy()

    with rio.open(tiff_path) as src:
        # Transform points to raster CRS
        gdf = gdf.to_crs(src.crs)

        band_name_mapping = {
            1: 'B01',
            2: 'B02',
            3: 'B03',
            4: 'B04',
            5: 'B08',
            6: 'B12'
        }

        # Process each point individually
        for idx, point in enumerate(tqdm(gdf.geometry, desc="Processing locations")):
            # Create buffer for this specific point
            buffered_point = point.buffer(buffer_distance)

            # Get the pixel coordinates for this specific point
            row, col = src.index(point.x, point.y)

            # Calculate window size based on buffer
            buffer_pixels = int(np.ceil(buffer_distance / src.res[0]))
            window = rio.windows.Window(
                col - buffer_pixels,
                row - buffer_pixels,
                2 * buffer_pixels + 1,
                2 * buffer_pixels + 1
            )

            # Process each band for this specific point
            for band_idx, band_name in band_name_mapping.items():
                try:
                    # Read data for this window
                    data = src.read(band_idx, window=window)

                    # Create mask for the buffer
                    shapes = [(buffered_point, 1)]
                    mask = rio.features.rasterize(
                        shapes,
                        out_shape=data.shape,
                        transform=rio.windows.transform(window, src.transform),
                        fill=0,
                        dtype='uint8'
                    )

                    # Calculate mean for masked area
                    masked_data = data[mask == 1]
                    if len(masked_data) > 0:
                        mean_value = np.mean(masked_data)
                    else:
                        # Fallback to single pixel value if no pixels in buffer
                        mean_value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]

                    # Assign value to specific row and band
                    results_df.at[idx, band_name] = mean_value

                except Exception as e:
                    # Fallback to single pixel value in case of any error
                    value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]
                    results_df.at[idx, band_name] = value

    return results_df

In [None]:
#Open the GeoTIFF file
tiff_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/S2_sample.tiff'
#Call function
senti_data = map_sent_data(tiff_path = tiff_path, csv_path = csv_path, buffer_distance = 100)

Processing locations:   0%|          | 0/11229 [00:00<?, ?it/s]

#### Calculate Indexes

In [None]:
# Calculate NDVI (Normalized Difference Vegetation Index) and handle division by zero by replacing infinities with NaN.
# See the Sentinel-2 sample notebook for more information about the NDVI index
senti_data['NDVI'] = (senti_data['B08'] - senti_data['B04']) / (senti_data['B08'] + senti_data['B04'])
senti_data['NDVI'] = senti_data['NDVI'].replace([np.inf, -np.inf], np.nan)

In [None]:
#EVI
senti_data['EVI'] = (2.5*(senti_data['B08'] - senti_data['B04'])) / (senti_data['B08'] + 6*senti_data['B04'] - 7.5 * senti_data['B02'] + 1)
senti_data['EVI'] = senti_data['EVI'].replace([np.inf, -np.inf], np.nan)

In [None]:
# NDBI
senti_data['NDBI'] = (senti_data['B12'] - senti_data['B08']) / (senti_data['B12'] + senti_data['B08'])
senti_data['NDBI'] = senti_data['NDBI'].replace([np.inf, -np.inf], np.nan)

## Joining the predictor variables and response variables

In [None]:
#Combining ground data, weather data and satellite data into a single dataset.
uhi_data = pd.concat([ground_df,senti_data, lst_data], axis=1)
uhi_data = uhi_data.loc[:,~uhi_data.columns.duplicated()]
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,total_building_area,building_density,mean_height,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,26892.391463,0.856011,15.641176,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,24927.629968,0.793471,14.484375,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,24557.541148,0.781691,14.89375,47.3,2.6,165,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,1.968381,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,24742.197435,0.787569,15.051515,47.3,2.6,165,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,2.171444,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,24896.404772,0.792477,14.95,47.3,2.6,165,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,1.884342,-0.267052,41.152283


In [None]:
#Drop unnecessary column
uhi_data = uhi_data.drop(columns=['level_0'])

KeyError: "['level_0'] not found in axis"

## Cleaning data

### Remove duplicates

In [None]:
#Check for columns to clean
uhi_data.columns[3:]

Index(['UHI Index', 'total_building_area', 'building_density', 'mean_height',
       'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
       'Wind Direction [degrees]', 'Solar Flux [W/m^2]', 'floor_area_ratio',
       'region', 'B01', 'B02', 'B03', 'B04', 'B08', 'B12', 'NDVI', 'EVI',
       'NDBI', 'LST'],
      dtype='object')

In [None]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
columns_to_check = uhi_data.columns[3:]
for col in columns_to_check:
    # Check if the value is a numpy array and has more than one dimension
    uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,total_building_area,building_density,mean_height,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,26892.391463,0.856011,15.641176,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,24927.629968,0.793471,14.484375,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,24557.541148,0.781691,14.89375,47.3,2.6,165,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,1.968381,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,24742.197435,0.787569,15.051515,47.3,2.6,165,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,2.171444,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,24896.404772,0.792477,14.95,47.3,2.6,165,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,1.884342,-0.267052,41.152283


### Remove missing values

In [None]:
#check for nan
uhi_data.isna().sum()

Unnamed: 0,0
Longitude,0
Latitude,0
datetime,0
UHI Index,0
total_building_area,0
building_density,0
mean_height,295
Relative Humidity [percent],0
Avg Wind Speed [m/s],0
Wind Direction [degrees],0


In [None]:
#remove missing values
uhi_data = uhi_data.dropna()

In [None]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

In [None]:
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,total_building_area,building_density,mean_height,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,26892.391463,0.856011,15.641176,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,24927.629968,0.793471,14.484375,47.3,2.6,165,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,24557.541148,0.781691,14.89375,47.3,2.6,165,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,1.968381,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,24742.197435,0.787569,15.051515,47.3,2.6,165,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,2.171444,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,24896.404772,0.792477,14.95,47.3,2.6,165,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,1.884342,-0.267052,41.152283


In [None]:
### TEST #### include building volume
#Create a new column to calculate building volume
#Doesn't seem to improve accuracy
uhi_data['building_volume'] = uhi_data['mean_height'] * uhi_data['total_building_area']

In [None]:
uhi_data.describe()

Unnamed: 0,UHI Index,total_building_area,building_density,mean_height,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2],floor_area_ratio,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST,building_volume
count,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0,10636.0
mean,1.00045,36959.884915,1.17647,18.51144,46.368607,3.070825,159.440579,455.328977,8.287103,1015.063793,967.821032,1113.306271,1162.285493,1749.414159,1486.849332,0.205956,0.94244,-0.075598,40.738815,700789.2
std,0.016207,17583.611624,0.559704,10.927547,2.771798,0.665421,32.228141,177.71057,7.018947,323.919729,454.421638,480.152373,537.561357,605.319348,535.26404,0.198457,4.385172,0.190478,2.622671,592711.9
min,0.956122,55.286741,0.00176,2.5,40.2,1.4,75.0,128.0,0.00176,232.5,173.0,182.0,170.5,214.0,185.5,-0.173883,-193.181818,-0.740248,32.532037,138.2169
25%,0.989178,27378.815737,0.871495,11.491017,45.4,2.8,142.0,292.0,3.834421,800.0,660.5,795.0,794.5,1370.0,1126.5,0.057221,0.296564,-0.177699,39.272372,355828.5
50%,1.000838,35396.843817,1.126717,16.347894,47.3,3.2,162.0,511.0,6.028421,992.5,910.0,1049.0,1113.0,1689.0,1421.0,0.132318,0.723732,-0.05257,40.921567,541348.7
75%,1.011537,44130.733851,1.404725,22.494306,47.7,3.5,184.0,605.0,10.344792,1221.5,1165.0,1328.0,1432.0,2080.0,1773.0,0.300322,1.476636,0.04614,42.341754,866464.9
max,1.046036,213006.664486,6.780213,144.9,51.1,4.5,209.0,725.0,60.541968,2522.0,4418.0,4680.0,4838.0,4792.0,4799.5,0.887741,253.888889,0.520722,53.200804,5964797.0


## Model Building

In [None]:
#Drop the lat-lon columns
uhi_data = uhi_data.drop(columns=['Latitude', 'Longitude', 'datetime'])

### Feature selection

In [None]:
uhi_data.columns

Index(['UHI Index', 'total_building_area', 'building_density', 'mean_height',
       'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
       'Wind Direction [degrees]', 'Solar Flux [W/m^2]', 'floor_area_ratio',
       'region', 'B01', 'B02', 'B03', 'B04', 'B08', 'B12', 'NDVI', 'EVI',
       'NDBI', 'LST', 'building_volume'],
      dtype='object')

In [None]:
# Retaining only the columns for the most important features in the dataset.
uhi_final = uhi_data[['B01','B12','NDVI','NDBI','LST', 'total_building_area', 'mean_height',
                    'Avg Wind Speed [m/s]', 'Solar Flux [W/m^2]','Wind Direction [degrees]',
                     'Relative Humidity [percent]','UHI Index', 'building_density', 'floor_area_ratio']]

In [None]:
#, 'total_building_area'

### Train Test Split

In [None]:
#Split the data into features (X) and target (y), and then into training and testing sets
X = uhi_final.drop(columns=['UHI Index']).values
y = uhi_final['UHI Index'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42)

### Feature scaling

In [None]:
#Scale the training and test data using standardscaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Training

In [None]:
#Train the Random Forest model on the training data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

### Model Evaluation

#### In-sample

In [None]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.981550855949046

#### Out-sample

In [None]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.8668657131356591

In [None]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test):

    results = {}

    # Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    results['RandomForest'] = {
        'in_sample': r2_score(y_train, rf_model.predict(X_train)),
        'out_sample': r2_score(y_test, rf_model.predict(X_test))
    }

    # XGBoost Regressor
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train)
    results['XGBoost'] = {
        'in_sample': r2_score(y_train, xgb_model.predict(X_train)),
        'out_sample': r2_score(y_test, xgb_model.predict(X_test))
    }

    # AdaBoost Regressor
    ada_model = AdaBoostRegressor(random_state=42)
    ada_model.fit(X_train, y_train)
    results['AdaBoost'] = {
        'in_sample': r2_score(y_train, ada_model.predict(X_train)),
        'out_sample': r2_score(y_test, ada_model.predict(X_test))
    }

    return results


In [None]:
#Compare in&out sample  evaluation
train_and_evaluate_models(X_train, y_train, X_test, y_test)

{'RandomForest': {'in_sample': 0.9836602325532597,
  'out_sample': 0.8762860952254259},
 'XGBoost': {'in_sample': 0.9611005438408912,
  'out_sample': 0.8439501421151752},
 'AdaBoost': {'in_sample': 0.3476980864234156,
  'out_sample': 0.32508706648396146}}

#### Parameter tuning

In [None]:
#Function to tune parameters
def tune_parameters(X_train, y_train):
    """
    Tunes hyperparameters for RandomForestRegressor, XGBoostRegressor, and AdaBoostRegressor.

    Args:
        X_train: Training features.
        y_train: Training target variable.

    Returns:
        A dictionary containing the best estimators for each model.
    """

    tuned_models = {}

    #Random Forest Regressor
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [2, 4, 6, 8],
        'max_features': ['auto', 'sqrt', 'log2', None]}
    rf_model = RandomForestRegressor(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, n_iter=10, cv=10, scoring='r2', random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    tuned_models['RandomForest'] = rf_random.best_params_

    #XGBoost Regressor
    xgb_param_grid = {
    'n_estimators': [100, 150, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]}
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_grid, n_iter=10, cv=10, scoring='r2', random_state=42, n_jobs=-1)
    xgb_random.fit(X_train, y_train)
    tuned_models['XGBoost'] = xgb_random.best_params_

    #AdaBoost Regressor
    ada_param_grid = {
        'n_estimators': [50, 100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
        'estimator__max_depth': [2, 4, 6, 8, 10]  #Depth of decision trees
        }
    ada_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(random_state=42, max_depth=10), random_state=42)
    ada_random = RandomizedSearchCV(estimator=ada_model, param_distributions=ada_param_grid, n_iter=10, cv=10, scoring='r2', random_state=42, n_jobs=-1)
    ada_random.fit(X_train, y_train)
    tuned_models['AdaBoost'] = ada_random.best_params_

    return tuned_models


In [None]:
#Tune 3 models
tuned_estimators = tune_parameters(X_train, y_train)
tuned_estimators

{'RandomForest': {'n_estimators': 300,
  'min_samples_split': 5,
  'min_samples_leaf': 4,
  'max_features': 'log2',
  'max_depth': 40},
 'XGBoost': {'subsample': 0.8,
  'n_estimators': 300,
  'max_depth': 9,
  'learning_rate': 0.05,
  'colsample_bytree': 0.6},
 'AdaBoost': {'n_estimators': 500,
  'learning_rate': 0.01,
  'estimator__max_depth': 10}}

In [None]:
#Function to evaluate tuned models

def evaluate_tuned_models(X_train, y_train, X_test, y_test, tuned_params):
    """
    Trains and evaluates tuned models.

    Args:
        X_train: Training features.
        y_train: Training target variable.
        X_test: Testing features.
        y_test: Testing target variable.
        tuned_params: A dictionary containing the best hyperparameters for each model.

    Returns:
        A dictionary containing the evaluation metrics for each model.
    """
    results = {}

    # Random Forest Regressor
    rf_model = RandomForestRegressor(**tuned_params['RandomForest'], random_state=42)
    rf_model.fit(X_train, y_train)
    results['RandomForest'] = {
        'in_sample': r2_score(y_train, rf_model.predict(X_train)),
        'out_sample': r2_score(y_test, rf_model.predict(X_test))
    }

    # XGBoost Regressor
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **tuned_params['XGBoost'])
    xgb_model.fit(X_train, y_train)
    results['XGBoost'] = {
        'in_sample': r2_score(y_train, xgb_model.predict(X_train)),
        'out_sample': r2_score(y_test, xgb_model.predict(X_test))
    }

    # AdaBoost Regressor
    ada_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(random_state=42, max_depth=tuned_params['AdaBoost']['estimator__max_depth']), random_state=42, **{k:v for k, v in tuned_params['AdaBoost'].items() if k != 'estimator__max_depth'})
    ada_model.fit(X_train, y_train)
    results['AdaBoost'] = {
        'in_sample': r2_score(y_train, ada_model.predict(X_train)),
        'out_sample': r2_score(y_test, ada_model.predict(X_test))
    }

    return results


In [None]:
#Evaluate tuned model
evaluate_tuned_models(X_train, y_train, X_test, y_test, tuned_estimators)

{'RandomForest': {'in_sample': 0.9235745696151765,
  'out_sample': 0.8216702379859108},
 'XGBoost': {'in_sample': 0.9889114361432049,
  'out_sample': 0.8842730663272294},
 'AdaBoost': {'in_sample': 0.8330370826024855,
  'out_sample': 0.7540233510249127}}

In [None]:
#retest random forest regressor
model = RandomForestRegressor(max_depth=40, max_features='log2', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300, random_state=42)
model.fit(X_train, y_train)

In [None]:
#retest xgb
model = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.2, max_depth=12,
              n_estimators=200, subsample = 1, random_state=42)
model.fit(X_train, y_train)

In [None]:
#retest ada boost
model = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=10,
                                                   random_state=42),
                   learning_rate=0.01, n_estimators=500, random_state=42)
model.fit(X_train, y_train)

In [None]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.905017548162673

In [None]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.7851774248151164