<a href="https://colab.research.google.com/github/hamydang16/EY_Data-Challenge-2025/blob/main/EY_Data_Challenge__2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Urban Heat Island Challenge

## Load Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install rioxarray
!pip install stackstac
!pip install pystac_client
!pip install planetary_computer
!pip install odc-stac
!pip install rasterstats
!pip install geopy
!pip install reverse_geocode

Collecting rioxarray
  Downloading rioxarray-0.18.2-py3-none-any.whl.metadata (5.4 kB)
Collecting rasterio>=1.3.7 (from rioxarray)
  Downloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio>=1.3.7->rioxarray)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting cligj>=0.5 (from rasterio>=1.3.7->rioxarray)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio>=1.3.7->rioxarray)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Downloading rioxarray-0.18.2-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rasterio-1.4.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hD

In [73]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd
import reverse_geocode

# Geospatial operations
import rasterio
from rasterio import windows
from rasterio import features
from rasterio import warp
from rasterio.warp import transform_bounds
from rasterio.windows import from_bounds

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

# Machine Learning
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
import xgboost as xgb
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import os
from tqdm import tqdm
from shapely.geometry import Point
from geopy.distance import geodesic
from rasterstats import zonal_stats

## Response Variables

### Training data

In [4]:
#csv path
csv_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Training_data_uhi_index_2025-02-18.csv'

#Load the training data from csv file and display the first few rows to inspect the data
ground_df = pd.read_csv(csv_path)
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index
0,-73.909167,40.813107,24-07-2021 15:53,1.030289
1,-73.909187,40.813045,24-07-2021 15:53,1.030289
2,-73.909215,40.812978,24-07-2021 15:53,1.023798
3,-73.909242,40.812908,24-07-2021 15:53,1.023798
4,-73.909257,40.812845,24-07-2021 15:53,1.021634


### Weather data

In [5]:
#Change datetime object
ground_df['datetime'] = pd.to_datetime(ground_df['datetime'])

In [26]:
#Reverse geocoding into ground_df by creating a new column to signify region by reverse geocoding from lat and long
ground_df['region'] = ground_df.apply(lambda x: reverse_geocode.search([(x['Latitude'], x['Longitude'])])[0]['county'], axis=1)

In [30]:
#Group into 2 areas by bronx and not bronx
ground_df['region'] = ground_df['region'].apply(lambda x: 'Bronx' if x == 'Bronx County' else 'Manhattan')

In [32]:
#Load the weather dataset
weather_bronx = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Bronx')
weather_manhattan = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/NY_Mesonet_Weather.xlsx', sheet_name='Manhattan')

In [33]:
#change datetime of weather bronx and weather manhattan
weather_bronx['Date / Time'] = pd.to_datetime(weather_bronx['Date / Time'])
weather_manhattan['Date / Time'] = pd.to_datetime(weather_manhattan['Date / Time'])

In [35]:
#Function to find the nearest hour and minute match in weather data
def nearest_time_match(ground_time, weather_df):
    #Calculate time differences in minutes
    time_diffs = abs((weather_df['Date / Time'] - ground_time).dt.total_seconds() / 60)

    #Find the index of the minimum time difference
    nearest_index = time_diffs.idxmin()

    return weather_df.loc[nearest_index]

#Apply the nearest_time_match function to each row in ground_df for Bronx region
ground_df_bronx = ground_df[ground_df['region'] == 'Bronx'].copy()
merged_data = []
for index, row in ground_df_bronx.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_bronx)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass

# Create the merged DataFrame
merged_bronx = pd.DataFrame(merged_data)

merged_bronx.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
Unnamed 0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621


In [36]:
#Apply the nearest_time_match function to each row in ground_df for Manhattan region
ground_df_manhattan = ground_df[ground_df['region'] == 'Manhattan'].copy()
merged_data = []
for index, row in ground_df_manhattan.iterrows():
    try:
      nearest_weather = nearest_time_match(row['datetime'], weather_manhattan)
      merged_row = pd.concat([row, nearest_weather])
      merged_data.append(merged_row)
    except:
      pass

# Create the merged DataFrame
merged_manhattan = pd.DataFrame(merged_data)

merged_manhattan.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
0,-73.981233,40.784337,2021-07-24 15:36:00,1.009974,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
1,-73.981257,40.784328,2021-07-24 15:36:00,1.00781,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
2,-73.981285,40.784322,2021-07-24 15:36:00,1.00781,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
3,-73.981318,40.784297,2021-07-24 15:36:00,1.00781,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511
4,-73.981338,40.784268,2021-07-24 15:36:00,1.00781,Manhattan,2021-07-24 15:35:00,26.8,47.6,2.4,209,511


In [37]:
#combine merge_bronx and merge_manhattan
ground_df = pd.concat([merged_bronx, merged_manhattan])
ground_df.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,region,Date / Time,Air Temp at Surface [degC],Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2]
Unnamed 0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621
Unnamed 4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,Bronx,2021-07-24 15:55:00,27.2,47.3,2.6,165,621


In [39]:
#combine and clean column
ground_df = ground_df.reset_index()
#drop column
ground_df = ground_df.drop(columns=['Date / Time', 'region', 'Air Temp at Surface [degC]', 'index'])

### Sattelite data

#### Extracting Landsat-8 data

In [40]:
#Extracting lst data
from rasterio.windows import Window
def map_lst_data(tiff_path, csv_path):
    """
    Extract raster values at specified lat/lon coordinates from a GeoTIFF file.

    Parameters:
    -----------
    tiff_path : str
        Path to the GeoTIFF file
    csv_path : str
        Path to CSV file containing 'Latitude' and 'Longitude' columns

    Returns:
    --------
    pd.DataFrame
        DataFrame containing original coordinates and extracted values
    """
    # Read points from CSV
    df = pd.read_csv(csv_path)

    # Open the raster file and extract values
    with rasterio.open(tiff_path) as src:
        coords = list(zip(df['Longitude'], df['Latitude']))
        lst_values = []

        for lon, lat in coords:
            try:
                row, col = src.index(lon, lat)
                window = Window(col, row, 1, 1)
                value = src.read(1, window=window)
                lst_values.append(float(value[0][0]))
            except (IndexError, ValueError):
                lst_values.append(None)

    # Create and return output DataFrame
    return pd.DataFrame({
        'Latitude': df['Latitude'],
        'Longitude': df['Longitude'],
        'LST': lst_values
    })

In [41]:
#map satellite data from landsat
tiff_path_2 = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/Landsat_LST.tiff'
lst_data = map_lst_data(tiff_path = tiff_path_2, csv_path = csv_path)

In [43]:
lst_data.describe()

Unnamed: 0,Latitude,Longitude,LST
count,11229.0,11229.0,11229.0
mean,40.8088,-73.933927,40.588086
std,0.023171,0.028253,2.752488
min,40.758792,-73.994457,32.532037
25%,40.790905,-73.955703,39.121979
50%,40.810688,-73.932968,40.827571
75%,40.824515,-73.909647,42.310992
max,40.859497,-73.879458,53.200804


#### Extracting Sentinel-2 data

In [42]:
#Extracting spectral data from geotiff image, allowing for buffer zone
import rasterio as rio
from shapely.geometry import Point, mapping


def map_sent_data(tiff_path, csv_path, buffer_distance):
    # Read the CSV file using pandas
    df = pd.read_csv(csv_path)

    # Create points from coordinates
    geometry = [Point(lon, lat) for lon, lat in zip(df['Longitude'], df['Latitude'])]
    gdf = gpd.GeoDataFrame(df, crs='epsg:4326', geometry=geometry)

    # Initialize results DataFrame with original data
    results_df = df.copy()

    with rio.open(tiff_path) as src:
        # Transform points to raster CRS
        gdf = gdf.to_crs(src.crs)

        band_name_mapping = {
            1: 'B01',
            2: 'B02',
            3: 'B03',
            4: 'B04',
            5: 'B08',
            6: 'B12'
        }

        # Process each point individually
        for idx, point in enumerate(tqdm(gdf.geometry, desc="Processing locations")):
            # Create buffer for this specific point
            buffered_point = point.buffer(buffer_distance)

            # Get the pixel coordinates for this specific point
            row, col = src.index(point.x, point.y)

            # Calculate window size based on buffer
            buffer_pixels = int(np.ceil(buffer_distance / src.res[0]))
            window = rio.windows.Window(
                col - buffer_pixels,
                row - buffer_pixels,
                2 * buffer_pixels + 1,
                2 * buffer_pixels + 1
            )

            # Process each band for this specific point
            for band_idx, band_name in band_name_mapping.items():
                try:
                    # Read data for this window
                    data = src.read(band_idx, window=window)

                    # Create mask for the buffer
                    shapes = [(buffered_point, 1)]
                    mask = rasterio.features.rasterize(
                        shapes,
                        out_shape=data.shape,
                        transform=rio.windows.transform(window, src.transform),
                        fill=0,
                        dtype='uint8'
                    )

                    # Calculate mean for masked area
                    masked_data = data[mask == 1]
                    if len(masked_data) > 0:
                        mean_value = np.mean(masked_data)
                    else:
                        # Fallback to single pixel value if no pixels in buffer
                        mean_value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]

                    # Assign value to specific row and band
                    results_df.at[idx, band_name] = mean_value

                except Exception as e:
                    # Fallback to single pixel value in case of any error
                    value = src.read(band_idx, window=((row, row+1), (col, col+1)))[0][0]
                    results_df.at[idx, band_name] = value

    return results_df

In [44]:
#Open the GeoTIFF file
tiff_path = '/content/drive/MyDrive/Colab Notebooks/EY Data Challenge/S2_sample.tiff'
#Call function
senti_data = map_sent_data(tiff_path = tiff_path, csv_path = csv_path, buffer_distance = 50)

Processing locations: 100%|██████████| 11229/11229 [11:23<00:00, 16.44it/s]


#### Calculate Indexes

In [45]:
# Calculate NDVI (Normalized Difference Vegetation Index) and handle division by zero by replacing infinities with NaN.
# See the Sentinel-2 sample notebook for more information about the NDVI index
senti_data['NDVI'] = (senti_data['B08'] - senti_data['B04']) / (senti_data['B08'] + senti_data['B04'])
senti_data['NDVI'] = senti_data['NDVI'].replace([np.inf, -np.inf], np.nan)

In [47]:
#EVI
senti_data['EVI'] = (2.5*(senti_data['B08'] - senti_data['B04'])) / (senti_data['B08'] + 6*senti_data['B04'] - 7.5 * senti_data['B02'] + 1)
senti_data['EVI'] = senti_data['EVI'].replace([np.inf, -np.inf], np.nan)

In [46]:
# NDBI
senti_data['NDBI'] = (senti_data['B12'] - senti_data['B08']) / (senti_data['B12'] + senti_data['B08'])
senti_data['NDBI'] = senti_data['NDBI'].replace([np.inf, -np.inf], np.nan)

## Joining the predictor variables and response variables

In [54]:
#Combining ground data, weather data and satellite data into a single dataset.
uhi_data = pd.concat([ground_df,senti_data, lst_data], axis=1)
uhi_data = uhi_data.loc[:,~uhi_data.columns.duplicated()]
uhi_data.head()

Unnamed: 0,level_0,Longitude,Latitude,datetime,UHI Index,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2],B01,B02,B03,B04,B08,B12,NDVI,NDBI,EVI,LST
0,0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,47.3,2.6,165,621,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,-0.004606,1.836293,42.345172
1,1,-73.909187,40.813045,2021-07-24 15:53:00,1.030289,47.3,2.6,165,621,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,-0.004606,1.836293,42.345172
2,2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,47.3,2.6,165,621,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,-0.35628,1.968381,41.442815
3,3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,47.3,2.6,165,621,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,-0.33308,2.171444,41.442815
4,4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,47.3,2.6,165,621,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,-0.267052,1.884342,41.152283


In [55]:
#Drop unnecessary column
uhi_data = uhi_data.drop(columns=['level_0'])

## Cleaning data

### Remove duplicates

In [58]:
#Check for columns to clean
uhi_data.columns[3:]

Index(['UHI Index', 'Relative Humidity [percent]', 'Avg Wind Speed [m/s]',
       'Wind Direction [degrees]', 'Solar Flux [W/m^2]', 'B01', 'B02', 'B03',
       'B04', 'B08', 'B12', 'NDVI', 'NDBI', 'EVI', 'LST'],
      dtype='object')

In [59]:
# Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
columns_to_check = uhi_data.columns[3:]
for col in columns_to_check:
    # Check if the value is a numpy array and has more than one dimension
    uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# Now remove duplicates
uhi_data = uhi_data.drop_duplicates(subset=columns_to_check, keep='first')
uhi_data.head()

Unnamed: 0,Longitude,Latitude,datetime,UHI Index,Relative Humidity [percent],Avg Wind Speed [m/s],Wind Direction [degrees],Solar Flux [W/m^2],B01,B02,B03,B04,B08,B12,NDVI,NDBI,EVI,LST
0,-73.909167,40.813107,2021-07-24 15:53:00,1.030289,47.3,2.6,165,621,730.5,703.5,775.5,775.5,1199.5,1188.5,0.214684,-0.004606,1.836293,42.345172
2,-73.909215,40.812978,2021-07-24 15:53:00,1.023798,47.3,2.6,165,621,730.5,564.0,760.5,650.5,1852.0,879.0,0.48012,-0.35628,1.968381,41.442815
3,-73.909242,40.812908,2021-07-24 15:53:00,1.023798,47.3,2.6,165,621,730.5,491.5,652.0,552.5,1757.0,879.0,0.521541,-0.33308,2.171444,41.442815
4,-73.909257,40.812845,2021-07-24 15:53:00,1.021634,47.3,2.6,165,621,730.5,500.0,601.5,585.0,1644.0,951.0,0.475101,-0.267052,1.884342,41.152283
6,-73.909312,40.81271,2021-07-24 15:53:00,1.015143,47.3,2.6,165,621,730.5,484.0,671.0,575.0,2156.0,951.0,0.578909,-0.387834,1.999241,41.152283


In [60]:
# Resetting the index of the dataset
uhi_data=uhi_data.reset_index(drop=True)

## Model Building

In [61]:
#Drop the lat-lon columns
uhi_data = uhi_data.drop(columns=['Latitude', 'Longitude', 'datetime'])

### Feature selection

In [64]:
# Retaining only the columns for the most important features in the dataset.
uhi_final = uhi_data[['B01','B12','NDVI','NDBI','LST',
                    'Avg Wind Speed [m/s]', 'Solar Flux [W/m^2]',
                     'Relative Humidity [percent]','UHI Index']]

### Train Test Split

In [65]:
#Split the data into features (X) and target (y), and then into training and testing sets
X = uhi_final.drop(columns=['UHI Index']).values
y = uhi_final['UHI Index'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42)

### Feature scaling

In [66]:
#Scale the training and test data using standardscaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Training

In [67]:
#Train the Random Forest model on the training data
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train,y_train)

### Model Evaluation

#### In-sample

In [68]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9669994609875968

#### Out-sample

In [69]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.7773556766157927

In [71]:
def train_and_evaluate_models(X_train, y_train, X_test, y_test):

    results = {}

    # Random Forest Regressor
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    results['RandomForest'] = {
        'in_sample': r2_score(y_train, rf_model.predict(X_train)),
        'out_sample': r2_score(y_test, rf_model.predict(X_test))
    }

    # XGBoost Regressor
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, y_train)
    results['XGBoost'] = {
        'in_sample': r2_score(y_train, xgb_model.predict(X_train)),
        'out_sample': r2_score(y_test, xgb_model.predict(X_test))
    }

    # AdaBoost Regressor
    ada_model = AdaBoostRegressor(random_state=42)
    ada_model.fit(X_train, y_train)
    results['AdaBoost'] = {
        'in_sample': r2_score(y_train, ada_model.predict(X_train)),
        'out_sample': r2_score(y_test, ada_model.predict(X_test))
    }

    return results


In [72]:
#Compare in&out sample  evaluation
train_and_evaluate_models(X_train, y_train, X_test, y_test)

{'RandomForest': {'in_sample': 0.9669994609875968,
  'out_sample': 0.7773556766157927},
 'XGBoost': {'in_sample': 0.9042733616383822,
  'out_sample': 0.7200212836429974},
 'AdaBoost': {'in_sample': 0.16238186779406538,
  'out_sample': 0.16394642133253634}}

#### Parameter tuning

In [78]:
#Function to tune parameters
def tune_parameters(X_train, y_train):
    """
    Tunes hyperparameters for RandomForestRegressor, XGBoostRegressor, and AdaBoostRegressor.

    Args:
        X_train: Training features.
        y_train: Training target variable.

    Returns:
        A dictionary containing the best estimators for each model.
    """

    tuned_models = {}

    # Random Forest Regressor
    rf_param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [2, 4, 6, 8],
        'max_features': ['auto', 'sqrt', 'log2', None]}
    rf_model = RandomForestRegressor(random_state=42)
    rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_param_grid, n_iter=10, cv=5, scoring='r2', random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    tuned_models['RandomForest'] = rf_random.best_estimator_

    # XGBoost Regressor
    xgb_param_grid = {
    'n_estimators': [100, 150, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 6, 9, 12],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]}
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_grid, n_iter=10, cv=5, scoring='r2', random_state=42, n_jobs=-1)
    xgb_random.fit(X_train, y_train)
    tuned_models['XGBoost'] = xgb_random.best_estimator_

    # AdaBoost Regressor
    ada_param_grid = {
        'n_estimators': [50, 100, 200, 500],
        'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
        'estimator__max_depth': [2, 4, 6, 8, 10]  #Depth of decision trees
        }
    ada_model = AdaBoostRegressor(estimator=DecisionTreeRegressor(random_state=42, max_depth=10), random_state=42)
    ada_random = RandomizedSearchCV(estimator=ada_model, param_distributions=ada_param_grid, n_iter=10, cv=5, scoring='r2', random_state=42, n_jobs=-1)
    ada_random.fit(X_train, y_train)
    tuned_models['AdaBoost'] = ada_random.best_estimator_

    return tuned_models


In [77]:
# Example usage (assuming X_train and y_train are defined)
tuned_estimators = tune_parameters(X_train, y_train)
tuned_estimators

{'RandomForest': RandomForestRegressor(max_depth=40, max_features='log2', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300, random_state=42),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=12, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...),
 'AdaBoost': AdaBoostRegressor(estimator=DecisionTreeRegressor(

In [81]:
#retest random forest regressor
model = RandomForestRegressor(max_depth=40, max_features='log2', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300, random_state=42)
model.fit(X_train, y_train)

In [86]:
#retest xgb
model = xgb.XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.2, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=12, max_leaves=None,
              min_child_weight=None, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42)
model.fit(X_train, y_train)

In [89]:
#retest ada boost
model = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=10,
                                                   random_state=42),
                   learning_rate=0.01, n_estimators=500, random_state=42)
model.fit(X_train, y_train)

In [90]:
#Make predictions on the training data
insample_predictions = model.predict(X_train)
#calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.7707022535669924

In [91]:
#Make predictions on the test data
outsample_predictions = model.predict(X_test)

#calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.6533551641526014