<a href="https://colab.research.google.com/github/hamydang16/EY_Data-Challenge-2025/blob/main/EY_Data_Challenge__2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Urban Heat Island Challenge

## Load Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install rioxarray
!pip install stackstac
!pip install pystac_client
!pip install planetary_computer
!pip install odc-stac
!pip install rasterstats
!pip install geopy
!pip install reverse_geocode
!pip install osmnx

In [None]:
!pip install rtree

Collecting rtree
  Downloading rtree-1.4.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Downloading rtree-1.4.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (541 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/541.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.3/541.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m541.1/541.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rtree
Successfully installed rtree-1.4.0


In [None]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd
import reverse_geocode
import osmnx as ox

# Geospatial operations
import rasterio as rio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Machine Learning
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
import xgboost as xgb
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm.auto import tqdm
from shapely.geometry import Point, mapping, box, Polygon
from geopy.distance import geodesic
from rasterstats import zonal_stats
from multiprocessing import Pool
import time
from functools import partial
from rtree import index

In [None]:
pd.set_option('display.max_colwidth', None)

## Training data

In [None]:
#csv path
csv_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Training_data_uhi_index_2025-02-18.csv'

#Load the training data from csv file and display the first few rows to inspect the data
ground_df = pd.read_csv(csv_path)
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index
0,-73.909167,40.813107,24-07-2021 15:53,1.030289
1,-73.909187,40.813045,24-07-2021 15:53,1.030289
2,-73.909215,40.812978,24-07-2021 15:53,1.023798
3,-73.909242,40.812908,24-07-2021 15:53,1.023798
4,-73.909257,40.812845,24-07-2021 15:53,1.021634


## Response Variables

### Buidling data

* Building data is taken from NYC Open Data for the state of New York.

* Data is subset to only include Bronx and New York county by filter the Building Identification Number. The first digit is a borough code (1 = Manhattan, 2 = The Bronx, 3 = Brooklyn, 4 = Queens, 5 = Staten Island).  [Link](https://data.cityofnewyork.us/City-Government/Building-Footprints/5zhs-2jue/about_data)


* *Metadata*: [Link](https://github.com/CityOfNewYork/nyc-geo-metadata/blob/main/Metadata/Metadata_BuildingFootprints.md)

In [None]:
##Import NYC Open Data
buildings = gpd.read_file('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Building Footprints_20250311.geojson')


In [None]:
#convert BIN to str
buildings['bin'] = buildings['bin'].astype(str)
#subset to only manhattan and bronx through bin starts with either a 1 or 2
nyc_buildings = buildings[buildings['bin'].str.match(r'^[12]')]
nyc_buildings.head(2)

Unnamed: 0,name,base_bbl,shape_area,heightroof,mpluto_bbl,cnstrct_yr,globalid,lststatype,feat_code,groundelev,geomsource,bin,lstmoddate,doitt_id,shape_len,geometry
18,,1021210037,0.0,59.722628,1021210037,1910,{A0E56BCC-A86B-4CEF-9A42-9B4ECD61743F},Constructed,2100,154,Photogramm,1062896,2017-08-22,708881,0.0,"MULTIPOLYGON (((-73.9387 40.83782, -73.93863 40.83779, -73.93869 40.83772, -73.93876 40.83775, -73.93876 40.83776, -73.93887 40.83781, -73.93896 40.83785, -73.93892 40.8379, -73.93878 40.83785, -73.93872 40.83782, -73.9387 40.83782)))"
48,,2027810500,0.0,10.54654743,2027810500,1973,{2323D1C1-3086-4286-A469-4D0CE8D0756C},Constructed,2100,11,Photogramm,2117853,2014-07-16,813985,0.0,"MULTIPOLYGON (((-73.87287 40.80269, -73.8729 40.80264, -73.87303 40.8027, -73.87299 40.80274, -73.87287 40.80269)))"
58,,1008870001,0.0,155.49,1008870001,1935,{0B06F7A8-1F83-44F3-97DD-AC524A6374CA},Constructed,2100,37,Photogramm,1018457,2017-08-22,584238,0.0,"MULTIPOLYGON (((-73.98237 40.74524, -73.9822 40.74517, -73.98214 40.74525, -73.98212 40.74524, -73.98208 40.74522, -73.98209 40.74521, -73.9822 40.74505, -73.98221 40.74505, -73.98224 40.74506, -73.98243 40.74514, -73.98252 40.74518, -73.98245 40.74527, -73.98237 40.74524, -73.98237 40.74524)))"
68,,1013490035,0.0,167.21446831,1013490035,1931,{A091951D-C73E-4B77-9A61-716C666C5446},Constructed,2100,52,Photogramm,1039988,2017-08-22,66240,0.0,"MULTIPOLYGON (((-73.96425 40.7586, -73.96421 40.75867, -73.96419 40.75868, -73.96412 40.75865, -73.96387 40.75855, -73.96395 40.75843, -73.96403 40.75847, -73.96403 40.75847, -73.96417 40.75854, -73.96417 40.75854, -73.96427 40.75858, -73.96425 40.7586)))"
86,,1010610006,0.0,28.57,1010610006,1910,{2BE9F674-67E7-4A49-8535-F636A16D8DA5},Constructed,2100,40,Photogramm,1026714,2017-08-22,23730,0.0,"MULTIPOLYGON (((-73.99058 40.76523, -73.99065 40.76513, -73.99072 40.76516, -73.99065 40.76526, -73.99058 40.76523)))"


In [None]:
nyc_buildings.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 149591 entries, 18 to 1082183
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   name        1476 non-null    object        
 1   base_bbl    149591 non-null  object        
 2   shape_area  149591 non-null  object        
 3   heightroof  149591 non-null  object        
 4   mpluto_bbl  149591 non-null  object        
 5   cnstrct_yr  147089 non-null  object        
 6   globalid    149591 non-null  object        
 7   lststatype  149507 non-null  object        
 8   feat_code   149591 non-null  object        
 9   groundelev  149455 non-null  object        
 10  geomsource  149534 non-null  object        
 11  bin         149591 non-null  object        
 12  lstmoddate  149591 non-null  datetime64[ms]
 13  doitt_id    149591 non-null  object        
 14  shape_len   149591 non-null  object        
 15  geometry    149591 non-null  geometry      
dt

In [None]:
nyc_buildings.columns

Index(['shape_area', 'heightroof', 'feat_code', 'groundelev', 'geomsource',
       'bin', 'shape_len', 'geometry'],
      dtype='object')

In [None]:
#Drop columns in the geodataframe nyc_buildings
nyc_buildings.drop(columns=['name', 'base_bbl', 'mpluto_bbl',
       'cnstrct_yr', 'globalid', 'lststatype',
        'lstmoddate', 'doitt_id', 'geomsource', 'shape_area', 'shape_len'], inplace=True)

In [None]:
#add a centroid column from geom
nyc_buildings['centroid'] = nyc_buildings['geometry'].centroid

In [91]:
#convert unit from feet to meter for columns: shape_area, heightroof
nyc_buildings['heightroof'] = nyc_buildings['heightroof'] * 0.3048

In [92]:
nyc_buildings.head()

Unnamed: 0,shape_area,heightroof,feat_code,groundelev,geomsource,bin,shape_len,geometry,centroid
18,0.0,5.548414,2100,154.0,Photogramm,1062896,0.0,"MULTIPOLYGON (((589476.019 4521296.011, 589481.837 4521292.841, 589477.333 4521284.574, 589471.515 4521287.744, 589471.205 4521289.322, 589462.086 4521294.29, 589453.895 4521298.753, 589457.236 4521304.886, 589468.996 4521298.479, 589474.546 4521295.455, 589476.019 4521296.011)))",POINT (-73.93879 40.83781)
48,0.0,0.979806,2100,11.0,Photogramm,2117853,0.0,"MULTIPOLYGON (((595076.758 4517464.961, 595073.683 4517459.605, 595063.074 4517465.696, 595066.15 4517471.052, 595076.758 4517464.961)))",POINT (-73.87295 40.80269)
58,0.0,14.445494,2100,37.0,Photogramm,1018457,0.0,"MULTIPOLYGON (((585913.878 4510974.689, 585927.84 4510967.054, 585932.992 4510976.476, 585934.8 4510975.487, 585938.519 4510973.453, 585937.347 4510971.31, 585928.176 4510954.538, 585927.699 4510953.665, 585924.647 4510955.333, 585909.068 4510963.853, 585901.046 4510968.239, 585906.715 4510978.606, 585913.551 4510974.867, 585913.878 4510974.689)))",POINT (-73.9823 40.74517)
68,0.0,15.534732,2100,52.0,Photogramm,1039988,0.0,"MULTIPOLYGON (((587425.742 4512476.362, 587429.38 4512483.175, 587430.488 4512485.249, 587436.597 4512481.987, 587458.408 4512470.338, 587451.332 4512457.088, 587444.772 4512461.306, 587444.998 4512461.73, 587432.697 4512468.926, 587432.954 4512469.408, 587424.453 4512473.949, 587425.742 4512476.362)))",POINT (-73.96406 40.75856)
86,0.0,2.65424,2100,40.0,Photogramm,1026714,0.0,"MULTIPOLYGON (((585194.626 4513185.772, 585188.932 4513175.21, 585183.528 4513178.122, 585189.222 4513188.684, 585194.626 4513185.772)))",POINT (-73.99065 40.76519)


In [190]:
### CALCULATE BUILDING DATA ######
# Convert problematic columns to numeric, with errors='coerce' to convert invalid values to NaN
numeric_columns = ['heightroof', 'shape_area', 'groundelev']  # Added groundelev
for col in numeric_columns:
    if col in nyc_buildings.columns:
        nyc_buildings[col] = pd.to_numeric(nyc_buildings[col], errors='coerce')

# Drop rows with NaN values in critical columns
nyc_buildings = nyc_buildings.dropna(subset=['heightroof', 'shape_area', 'groundelev'])

# Convert ground_df to GeoDataFrame
geometry = [Point(xy) for xy in zip(ground_df['Longitude'], ground_df['Latitude'])]
ground_gdf = gpd.GeoDataFrame(ground_df, geometry=geometry, crs="EPSG:4326")

# Ensure both datasets use the same coordinate reference system
if nyc_buildings.crs != ground_gdf.crs:
    nyc_buildings = nyc_buildings.to_crs(ground_gdf.crs)

# Convert to a projected CRS for more accurate distance measurements
nyc_buildings = nyc_buildings.to_crs("EPSG:32618")
ground_gdf = ground_gdf.to_crs("EPSG:32618")

# Create spatial index for buildings
print("Creating spatial index...")
spatial_index = index.Index()
for idx, geom in enumerate(nyc_buildings.geometry):
    if geom is not None and geom.is_valid:
        spatial_index.insert(idx, geom.bounds)

# Add columns for metrics
ground_gdf['avg_building_height'] = np.nan
ground_gdf['num_buildings'] = 0
ground_gdf['total_building_area'] = 0.0
ground_gdf['building_density'] = 0.0  # building footprint area / buffer area
ground_gdf['floor_area_ratio'] = 0.0  # gross floor area / buffer area

# Function to calculate metrics using spatial index
def calculate_metrics(point, buildings_gdf, spatial_idx, distance=100):
    if point is None:
        return 0, 0, 0, 0, 0

    # Create buffer (in meters)
    buffer = point.buffer(distance)
    buffer_area = buffer.area  # in square meters

    # Use spatial index to find potential intersections
    potential_matches_idx = list(spatial_idx.intersection(buffer.bounds))

    if len(potential_matches_idx) > 0:
        # Get the actual buildings that intersect
        potential_matches = buildings_gdf.iloc[potential_matches_idx]
        precise_matches = potential_matches[potential_matches.intersects(buffer)]

        # Updated part of the calculate_metrics function
        if len(precise_matches) > 0:
            # Calculate the bounding box area for each building
            bounding_box_areas = []
            for idx, building in precise_matches.iterrows():
                if building.geometry.is_valid:
                    bounding_box = building.geometry.envelope  # Get the bounding box
                    bounding_box_areas.append(bounding_box.area)

            # Add bounding box areas to the dataframe
            precise_matches['bounding_box_area'] = bounding_box_areas

            # Calculate metrics
            avg_height = precise_matches['heightroof'].mean()
            num_buildings = len(precise_matches)
            total_building_area = sum(bounding_box_areas)  # Use bounding box areas
            building_density = total_building_area / buffer_area

            # Use groundelev as the number of floors (assuming it represents floor count)
            floor_counts = precise_matches['groundelev'].fillna(1).clip(lower=1)
            gross_floor_area = sum(precise_matches['bounding_box_area'] * floor_counts)
            floor_area_ratio = gross_floor_area / buffer_area

            return avg_height, num_buildings, total_building_area, building_density, floor_area_ratio


    return 0, 0, 0, 0, 0  # No buildings within buffer

# Process each point with progress tracking
print("Calculating metrics...")
for idx in tqdm(range(len(ground_gdf)), desc="Processing points"):
    try:
        point = ground_gdf.iloc[idx].geometry
        avg_height, num_bldgs, total_area, density, far = calculate_metrics(point, nyc_buildings, spatial_index, distance=700)

        ground_gdf.loc[idx, 'avg_building_height'] = avg_height
        ground_gdf.loc[idx, 'num_buildings'] = num_bldgs
        ground_gdf.loc[idx, 'total_building_area'] = total_area
        ground_gdf.loc[idx, 'building_density'] = density
        ground_gdf.loc[idx, 'floor_area_ratio'] = far
    except Exception as e:
        print(f"Error processing point {idx}: {e}")
        # Set values to NaN on error
        ground_gdf.loc[idx, ['avg_building_height', 'total_building_area', 'building_density', 'floor_area_ratio']] = np.nan
        ground_gdf.loc[idx, 'num_buildings'] = 0

# Convert back to original CRS for saving
ground_gdf = ground_gdf.to_crs("EPSG:4326")

# Save results
ground_gdf.head()

Creating spatial index...
Calculating metrics...


Processing points: 100%|██████████| 11229/11229 [40:54<00:00,  4.58it/s]


Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2],geometry
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,3.553857,1393,891568.994888,0.580105,18.64157,47.3,2.6,165,621,POINT (-73.90917 40.81311)
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,3.554523,1386,886637.714314,0.576897,18.509338,47.3,2.6,165,621,POINT (-73.90919 40.81304)
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,3.566601,1370,875459.768802,0.569624,18.289382,47.3,2.6,165,621,POINT (-73.90922 40.81298)
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,3.567464,1362,874184.939669,0.568794,18.306696,47.3,2.6,165,621,POINT (-73.90924 40.81291)
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,3.569945,1360,879534.36073,0.572275,18.428039,47.3,2.6,165,621,POINT (-73.90926 40.81284)


In [191]:
ground_gdf.describe()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
count,11229.0,11229.0,11229,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0,11229.0
mean,-73.933927,40.8088,2021-07-24 15:34:29.056906240,1.000001,5.290654,1178.740493,1016687.0,0.661514,36.009802,46.391237,3.079731,159.998753,454.357823
min,-73.994457,40.758792,2021-07-24 15:01:00,0.956122,1.797684,109.0,425138.3,0.276619,4.966554,40.2,1.4,75.0,128.0
25%,-73.955703,40.790905,2021-07-24 15:22:00,0.988577,3.675129,814.0,832656.0,0.541773,19.24282,45.4,2.8,146.0,236.0
50%,-73.932968,40.810688,2021-07-24 15:36:00,1.000237,4.926012,1159.0,982364.9,0.639182,33.376172,47.3,3.2,163.0,511.0
75%,-73.909647,40.824515,2021-07-24 15:48:00,1.011176,6.798871,1532.0,1176198.0,0.765301,50.793946,47.7,3.5,184.0,605.0
max,-73.879458,40.859497,2021-07-24 15:59:00,1.046036,15.901457,2665.0,1798141.0,1.169973,120.862339,51.1,4.5,209.0,725.0
std,0.028253,0.023171,,0.016238,2.08248,489.312918,250060.3,0.162703,19.924555,2.759809,0.657176,32.266041,178.521571


In [192]:
#drop geometry
#Turn back into a dataframe
ground_df = ground_gdf.drop(columns='geometry')

In [132]:
ground_df.columns

Index(['Longitude', 'Latitude', 'datetime', 'UHI Index', 'avg_building_height',
       'num_buildings', 'total_building_area', 'building_density',
       'floor_area_ratio', 'Relative Humidity [percent]',
       'Avg Wind Speed [m/s]', 'Wind Direction [degrees]',
       'Solar Flux [W/m^2]'],
      dtype='object')

### Weather data

In [75]:
#Change datetime object
ground_df['datetime'] = pd.to_datetime(ground_df['datetime'])

In [76]:
#Reverse geocoding into ground_df by creating a new column to signify region by reverse geocoding from lat and long
ground_df['region'] = ground_df.apply(lambda x: reverse_geocode.search([(x['Latitude'], x['Longitude'])])[0]['county'], axis=1)

In [77]:
#Group into 2 areas by bronx and not bronx
ground_df['region'] = ground_df['region'].apply(lambda x: 'Bronx' if x == 'Bronx County' else 'Manhattan')

In [78]:
#Load the weather dataset
weather_bronx = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Bronx')
weather_manhattan = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Manhattan')

In [79]:
#change datetime of weather bronx and weather manhattan
weather_bronx['Date / Time'] = pd.to_datetime(weather_bronx['Date / Time'])
weather_manhattan['Date / Time'] = pd.to_datetime(weather_manhattan['Date / Time'])

In [80]:
ground_df.columns

Index(['Longitude', 'Latitude', 'datetime', 'UHI Index', 'avg_building_height',
       'num_buildings', 'total_building_area', 'building_density',
       'floor_area_ratio', 'region'],
      dtype='object')

In [81]:
#Function to find the nearest hour and minute match in weather data
def nearest_time_match(ground_time, weather_df):
    #Calculate time differences in minutes
    time_diffs = abs((weather_df['Date / Time'] - ground_time).dt.total_seconds() / 60)

    #Find the index of the minimum time difference
    nearest_index = time_diffs.idxmin()

    return weather_df.loc[nearest_index]

#Apply the nearest_time_match function to each row in ground_df for Bronx region
ground_df_bronx = ground_df[ground_df['region'] == 'Bronx'].copy()
merged_data = []
for index, row in ground_df_bronx.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_bronx)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass
# Create the merged DataFrame
merged_bronx = pd.DataFrame(merged_data)

merged_bronx.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
Unnamed 0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,49.1164,30,9666.517432,0.30819,10.922979,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,52.181926,27,9414.838308,0.300166,10.576054,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,52.397067,30,9219.622457,0.293942,10.280267,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,52.7304,30,9111.018945,0.290479,10.083872,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,51.62975,32,9141.532829,0.291452,10.076775,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621


In [82]:
#Apply the nearest_time_match function to each row in ground_df for Manhattan region
ground_df_manhattan = ground_df[ground_df['region'] == 'Manhattan'].copy()
merged_data = []
for index, row in ground_df_manhattan.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_manhattan)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass

# Create the merged DataFrame
merged_manhattan = pd.DataFrame(merged_data)

merged_manhattan.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
0,-73.981233,40.784337,2021-07-24 15:36:00,1.009974,89.399867,37,13054.906172,0.416219,26.704496,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
1,-73.981257,40.784328,2021-07-24 15:36:00,1.00781,88.491489,37,13107.479396,0.417895,26.808539,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
2,-73.981285,40.784322,2021-07-24 15:36:00,1.00781,87.605397,38,13163.826619,0.419691,26.904472,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
3,-73.981318,40.784297,2021-07-24 15:36:00,1.00781,86.789617,39,13272.601042,0.423159,27.251787,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
4,-73.981338,40.784268,2021-07-24 15:36:00,1.00781,85.556377,40,13361.049477,0.425979,27.612176,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511


In [83]:
#combine merge_bronx and merge_manhattan
ground_df = pd.concat([merged_bronx, merged_manhattan])
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
Unnamed 0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,49.1164,30,9666.517432,0.30819,10.922979,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,52.181926,27,9414.838308,0.300166,10.576054,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,52.397067,30,9219.622457,0.293942,10.280267,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,52.7304,30,9111.018945,0.290479,10.083872,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,51.62975,32,9141.532829,0.291452,10.076775,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621


In [84]:
#combine and clean column
ground_df = ground_df.reset_index()
#drop column
ground_df = ground_df.drop(columns=['Date / Time', 'region', 'Air Temp at Surface [degC]', 'index'])

In [None]:
ground_df.drop(columns=['total_floor_area', 'individual_floor_areas','mean_floor_area'], inplace=True)

### Sattelite data

#### Extracting Landsat-8 data

In [85]:
#Extracting lst data
from rasterio.windows import Window
def map_lst_data(tiff_path, csv_path):
    """
    Extract raster values at specified lat/lon coordinates from a GeoTIFF file.

    Parameters:
    -----------
    tiff_path : str
        Path to the GeoTIFF file
    csv_path : str
        Path to CSV file containing 'Latitude' and 'Longitude' columns

    Returns:
    --------
    pd.DataFrame
        DataFrame containing original coordinates and extracted values
    """
    # Read points from CSV
    df = pd.read_csv(csv_path)

    # Open the raster file and extract values
    with rio.open(tiff_path) as src:
        coords = list(zip(df['Longitude'], df['Latitude']))
        lst_values = []

        for lon, lat in coords:
            try:
                row, col = src.index(lon, lat)
                window = Window(col, row, 1, 1)
                value = src.read(1, window=window)
                lst_values.append(float(value[0][0]))
            except (IndexError, ValueError):
                lst_values.append(None)

    # Create and return output DataFrame
    return pd.DataFrame({
        'Latitude': df['Latitude'],
        'Longitude': df['Longitude'],
        'LST': lst_values
    })

In [86]:
#map satellite data from landsat
tiff_path_2 = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Landsat_LST.tiff'
lst_data = map_lst_data(tiff_path = tiff_path_2, csv_path = csv_path)

In [87]:
lst_data.describe()

Unnamed: 0,Latitude,Longitude,LST
count,11229.0,11229.0,11229.0
mean,40.8088,-73.933927,40.588086
std,0.023171,0.028253,2.752488
min,40.758792,-73.994457,32.532037
25%,40.790905,-73.955703,39.121979
50%,40.810688,-73.932968,40.827571
75%,40.824515,-73.909647,42.310992
max,40.859497,-73.879458,53.200804


#### Extracting Sentinel-2 data

In [88]:
#Extracting spectral data from geotiff image, allowing for buffer zone

def map_sent_data(tiff_path, csv_path, buffer_distance):
    # Read the CSV file using pandas
    df = pd.read_csv(csv_path)

    # Create points from coordinates
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    gdf = gpd.GeoDataFrame(df, crs='epsg:4326', geometry=geometry)

    # Initialize results DataFrame with original data
    results_df = df.copy()

    with rio.open(tiff_path) as src:
        # Transform points to raster CRS
        gdf = gdf.to_crs(src.crs)

        band_name_mapping = {
            1: 'B01',
            2: 'B02',
            3: 'B03',
            4: 'B04',
            5: 'B08',
            6: 'B12'
        }

        # Process each point individually
        for idx, point in enumerate(tqdm(gdf.geometry, desc="Processing locations")):
            # Create buffer for this specific point
            buffered_point = point.buffer(buffer_distance)

            # Get the pixel coordinates for this specific point
            row, col = src.index(point.x, point.y)

            # Calculate window size based on buffer
            buffer_pixels = int(np.ceil(buffer_distance / src.res[0]))
            window = rio.windows.Window(
                col - buffer_pixels,
                row - buffer_pixels,
                2 * buffer_pixels + 1,
                2 * buffer_pixels + 1
            )

            # Process each band for this specific point
            for band_idx, band_name in band_name_mapping.items():
                try:
                    # Read data for this window
                    data = src.read(band_idx, window=window)

                    # Create mask for the buffer
                    shapes = [(buffered_point, 1)]
                    mask = rio.features.rasterize(
                        shapes,
                        out_shape=data.shape,
                        transform=rio.windows.transform(window, src.transform),
                        fill=0,
                        dtype='uint8'
                    )

                    # Calculate mean for masked area
                    masked_data = data[mask == 1]
                    if len(masked_data) > 0:
                        mean_value = np.mean(masked_data)
                    else:
                        # Fallback to single pixel value if no pixels in buffer
                        mean_value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]

                    # Assign value to specific row and band
                    results_df.at[idx, band_name] = mean_value

                except Exception as e:
                    # Fallback to single pixel value in case of any error
                    value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]
                    results_df.at[idx, band_name] = value

    return results_df

In [89]:
#Open the GeoTIFF file
tiff_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/S2_sample.tiff'
#Call function
senti_data = map_sent_data(tiff_path = tiff_path, csv_path = csv_path, buffer_distance = 700)

Processing locations: 100%|██████████| 11229/11229 [14:54<00:00, 12.55it/s]


#### Calculate Indexes

In [103]:
# Calculate NDVI (Normalized Difference Vegetation Index) and handle division by zero by replacing infinities with NaN.
# See the Sentinel-2 sample notebook for more information about the NDVI index
senti_data['NDVI'] = (senti_data['B08'] - senti_data['B04']) / (senti_data['B08'] + senti_data['B04'])
senti_data['NDVI'] = senti_data['NDVI'].replace([np.inf, -np.inf], np.nan)

In [210]:
#EVI
senti_data['EVI'] = (2.5*(senti_data['B08'] - senti_data['B04'])) / ((senti_data['B08'] + 6*senti_data['B04'] - 7.5 * senti_data['B02'] + 1)) /10000
senti_data['EVI'] = senti_data['EVI'].replace([np.inf, -np.inf], np.nan)

In [105]:
# NDBI
senti_data['NDBI'] = (senti_data['B12'] - senti_data['B08']) / (senti_data['B12'] + senti_data['B08'])
senti_data['NDBI'] = senti_data['NDBI'].replace([np.inf, -np.inf], np.nan)

## Joining the predictor variables and response variables

In [211]:
#Combining ground data, weather data and satellite data into a single dataset.
uhi_data = pd.concat([ground_df,senti_data, lst_data], axis=1)
uhi_data = uhi_data.loc[:,~uhi_data.columns.duplicated()]
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,3.553857,1393,891568.994888,0.580105,18.64157,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,0.000184,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,3.554523,1386,886637.714314,0.576897,18.509338,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,0.000184,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,3.566601,1370,875459.768802,0.569624,18.289382,47.3,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,0.000197,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,3.567464,1362,874184.939669,0.568794,18.306696,47.3,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,0.000217,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,3.569945,1360,879534.36073,0.572275,18.428039,47.3,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,0.000188,-0.267052,41.152283


In [None]:
#Drop unnecessary column
uhi_data = uhi_data.drop(columns=['level_0'])

KeyError: "['level_0'] not found in axis"

## Cleaning data

### Remove duplicates

In [162]:
#Check for columns to clean
uhi_data.columns[3:]

Index(['UHI Index', 'avg_building_height', 'num_buildings',
       'total_building_area', 'building_density', 'floor_area_ratio',
       'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
       'Wind Direction [degrees]', 'Solar Flux [W/m^2]', 'B01', 'B02', 'B03',
       'B04', 'B08', 'B12', 'NDVI', 'EVI', 'NDBI', 'LST'],
      dtype='object')

In [212]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
columns_to_check = uhi_data.columns[3:]
for col in columns_to_check:
    # Check if the value is a numpy array and has more than one dimension
    uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,3.553857,1393,891568.994888,0.580105,18.64157,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,0.000184,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,3.554523,1386,886637.714314,0.576897,18.509338,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,0.000184,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,3.566601,1370,875459.768802,0.569624,18.289382,47.3,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,0.000197,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,3.567464,1362,874184.939669,0.568794,18.306696,47.3,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,0.000217,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,3.569945,1360,879534.36073,0.572275,18.428039,47.3,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,0.000188,-0.267052,41.152283


### Remove missing values

In [213]:
#check for nan
uhi_data.isna().sum()

Unnamed: 0,0
Longitude,0
Latitude,0
datetime,0
UHI Index,0
avg_building_height,0
num_buildings,0
total_building_area,0
building_density,0
floor_area_ratio,0
Relative Humidity [percent],0


In [None]:
#remove missing values
uhi_data = uhi_data.dropna()

In [None]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

In [196]:
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],...,B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,3.553857,1393,891568.994888,0.580105,18.64157,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,3.554523,1386,886637.714314,0.576897,18.509338,47.3,...,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,1.836293,-0.004606,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,3.566601,1370,875459.768802,0.569624,18.289382,47.3,...,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,1.968381,-0.35628,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,3.567464,1362,874184.939669,0.568794,18.306696,47.3,...,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,2.171444,-0.33308,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,3.569945,1360,879534.36073,0.572275,18.428039,47.3,...,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,1.884342,-0.267052,41.152283


In [None]:
### TEST #### include building volume
#Create a new column to calculate building volume
#Doesn't seem to improve accuracy
uhi_data['building_volume'] = uhi_data['mean_height'] * uhi_data['total_building_area']

In [223]:
uhi_data.describe()

Unnamed: 0,UHI Index,avg_building_height,num_buildings,total_building_area,building_density,floor_area_ratio,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2],B01,B02,B03,B04,B08,B12,NDVI,EVI,NDBI,LST
count,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0,11220.0
mean,0.999996,5.291206,1179.175847,1016790.0,0.661581,36.007439,46.391676,3.080053,160.003922,454.352496,999.495053,954.371301,1100.024064,1145.400223,1767.843048,1471.569296,0.217177,9.9e-05,-0.085345,40.587407
std,0.016235,2.082756,489.1912,250085.1,0.16272,19.91088,2.759364,0.656979,32.265931,178.483533,334.63345,457.29901,481.765654,542.431927,616.088403,538.789588,0.21112,0.000442,0.199669,2.752298
min,0.956122,1.797684,109.0,425138.3,0.276619,4.966554,40.2,1.4,75.0,128.0,147.0,173.0,182.0,170.5,214.0,185.5,-0.173883,-0.019318,-0.740248,32.532037
25%,0.988577,3.675324,814.0,832872.8,0.541914,19.260922,45.4,2.8,146.0,236.0,785.875,644.5,776.5,770.5,1376.75,1098.0,0.058188,3e-05,-0.189971,39.121125
50%,1.000237,4.925882,1160.0,982425.0,0.639221,33.37767,47.3,3.2,163.0,511.0,983.5,898.0,1038.0,1097.0,1703.0,1406.0,0.136333,7.4e-05,-0.056833,40.827571
75%,1.011176,6.799236,1532.25,1176491.0,0.765491,50.7932,47.7,3.5,184.0,605.0,1215.0,1158.0,1322.0,1424.25,2118.0,1759.0,0.316857,0.000154,0.043958,42.310992
max,1.046036,15.901457,2665.0,1798141.0,1.169973,120.862339,51.1,4.5,209.0,725.0,2522.0,4418.0,4680.0,4838.0,4792.0,4799.5,0.887741,0.025389,0.520722,53.200804


## Model Building

In [215]:
#Drop the lat-lon columns
uhi_data = uhi_data.drop(columns=['Latitude', 'Longitude', 'datetime'])

### Feature selection

In [199]:
uhi_data.columns

Index(['UHI Index', 'avg_building_height', 'num_buildings',
       'total_building_area', 'building_density', 'floor_area_ratio',
       'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
       'Wind Direction [degrees]', 'Solar Flux [W/m^2]', 'B01', 'B02', 'B03',
       'B04', 'B08', 'B12', 'NDVI', 'EVI', 'NDBI', 'LST'],
      dtype='object')

In [224]:
# Retaining only the columns for the most important features in the dataset.
uhi_final = uhi_data[['B01','B12','NDVI','NDBI','LST', 'EVI',
                     'Avg Wind Speed [m/s]', 'Solar Flux [W/m^2]','Wind Direction [degrees]',
                     'Relative Humidity [percent]','UHI Index', 'avg_building_height', 'num_buildings', 'total_building_area',
                     'building_density', 'floor_area_ratio']]

## Retaining only the columns for the most important features in the dataset.
#uhi_final = uhi_data[['B01','B12','NDVI','NDBI','LST', 'total_building_area', 'mean_height',
                    #'Avg Wind Speed [m/s]', 'Solar Flux [W/m^2]','Wind Direction [degrees]',
                     #'Relative Humidity [percent]','UHI Index', 'building_density', 'floor_area_ratio']]

In [None]:
#, 'total_building_area'

### Train Test Split

In [225]:
#Split the data into features (X) and target (y), and then into training and testing sets
X = uhi_final.drop(columns=['UHI Index']).values
y = uhi_final['UHI Index'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

### Feature scaling

In [226]:
#Scale the training and test data using standardscaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Training

In [None]:
#Train the Random Forest model on the training data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

### Model Evaluation

#### In-sample

In [None]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.981550855949046

#### Out-sample

In [None]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.8668657131356591

In [118]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test):

    results = {}

    # Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    results['RandomForest'] = {
        'in_sample': r2_score(y_train, rf_model.predict(X_train)),
        'out_sample': r2_score(y_test, rf_model.predict(X_test))
    }

    # XGBoost Regressor
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train)
    results['XGBoost'] = {
        'in_sample': r2_score(y_train, xgb_model.predict(X_train)),
        'out_sample': r2_score(y_test, xgb_model.predict(X_test))
    }
    return results


In [227]:
#Compare in&out sample  evaluation
train_and_evaluate_models(X_train, y_train, X_test, y_test)

{'RandomForest': {'in_sample': 0.9928163510998617,
  'out_sample': 0.9539125080925288},
 'XGBoost': {'in_sample': 0.980107855206931, 'out_sample': 0.9322559260770802}}

#### Parameter tuning

In [119]:
#Function to tune parameters
def tune_parameters(X_train, y_train):
    """
    Tunes hyperparameters for RandomForestRegressor, XGBoostRegressor, and AdaBoostRegressor.

    Args:
        X_train: Training features.
        y_train: Training target variable.

    Returns:
        A dictionary containing the best estimators for each model.
    """

    tuned_models = {}

    #Random Forest Regressor
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [2, 4, 6, 8],
        'max_features': ['auto', 'sqrt', 'log2', None]}
    rf_model = RandomForestRegressor(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, n_iter=10, cv=10, scoring='r2', random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    tuned_models['RandomForest'] = rf_random.best_params_

    #XGBoost Regressor
    xgb_param_grid = {
    'n_estimators': [100, 150, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]}
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_grid, n_iter=10, cv=10, scoring='r2', random_state=42, n_jobs=-1)
    xgb_random.fit(X_train, y_train)
    tuned_models['XGBoost'] = xgb_random.best_params_

    return tuned_models


In [228]:
#Tune 2 models
tuned_estimators = tune_parameters(X_train, y_train)
tuned_estimators

{'RandomForest': {'n_estimators': 300,
  'min_samples_split': 5,
  'min_samples_leaf': 4,
  'max_features': 'log2',
  'max_depth': 40},
 'XGBoost': {'subsample': 0.8,
  'n_estimators': 300,
  'max_depth': 9,
  'learning_rate': 0.05,
  'colsample_bytree': 0.6}}

In [122]:
#Function to evaluate tuned models

def evaluate_tuned_models(X_train, y_train, X_test, y_test, tuned_params):
    """
    Trains and evaluates tuned models.

    Args:
        X_train: Training features.
        y_train: Training target variable.
        X_test: Testing features.
        y_test: Testing target variable.
        tuned_params: A dictionary containing the best hyperparameters for each model.

    Returns:
        A dictionary containing the evaluation metrics for each model.
    """
    results = {}

    # Random Forest Regressor
    rf_model = RandomForestRegressor(**tuned_params['RandomForest'], random_state=42)
    rf_model.fit(X_train, y_train)
    results['RandomForest'] = {
        'in_sample': r2_score(y_train, rf_model.predict(X_train)),
        'out_sample': r2_score(y_test, rf_model.predict(X_test))
    }

    # XGBoost Regressor
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **tuned_params['XGBoost'])
    xgb_model.fit(X_train, y_train)
    results['XGBoost'] = {
        'in_sample': r2_score(y_train, xgb_model.predict(X_train)),
        'out_sample': r2_score(y_test, xgb_model.predict(X_test))
    }

    return results


In [229]:
#Evaluate tuned model
evaluate_tuned_models(X_train, y_train, X_test, y_test, tuned_estimators)

{'RandomForest': {'in_sample': 0.9630161207555645,
  'out_sample': 0.9236182105223147},
 'XGBoost': {'in_sample': 0.9941807131620548,
  'out_sample': 0.9502992179619453}}

In [176]:
#retest random forest regressor
model = RandomForestRegressor(max_depth=40, max_features='log2', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300, random_state=42)
model.fit(X_train, y_train)

In [None]:
#retest xgb
model = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.2, max_depth=12,
              n_estimators=200, subsample = 1, random_state=42)
model.fit(X_train, y_train)

In [177]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9605852446130947

In [178]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.9172287740052498