In [1]:
# %pip install seaborn
# %pip install xgboost
# %pip install catboost
# %pip install missingno

## Load In Dependencies

In [2]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd

# Multi-dimensional arrays and datasets
import xarray as xr

# Geospatial raster data handling
import rioxarray as rxr

# Geospatial data analysis
import geopandas as gpd

# Geospatial operations
import rasterio
from rasterio import windows  
from rasterio import features  
from rasterio import warp
from rasterio.warp import transform_bounds 
from rasterio.windows import from_bounds 

# Image Processing
from PIL import Image

# Coordinate transformations
from pyproj import Proj, Transformer, CRS

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, KFold

# Evaluation Metrics
from sklearn.metrics import r2_score

# Machine Learning
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Planetary Computer Tools
import pystac_client
import planetary_computer as pc
from pystac.extensions.eo import EOExtension as eo

# Others
import missingno as msno
import os
from tqdm import tqdm

## Response Variable

Before building the model, we need to load in the Urban Heat Island (UHI) index training dataset. We have curated data for the New York region. The dataset consists of geo-locations (Longitude and Latitude), with additional fields including date & time of data collection and the UHI index for each location. 

In [None]:
# Load the training data from csv file and display the first few rows to inspect the data
ground_df = pd.read_csv("data/Training_data_uhi_index_UHI2025-v2.csv")
ground_df.head()

## Predictor Variables

### Downloading GeoTIFF Image

In [None]:
# Open the GeoTIFF file
tiff_path = "data_pipeline/data/tiff/landsat_8.tiff"

# Read the bands from the GeoTIFF file
with rasterio.open(tiff_path) as dst:
    red = dst.read(1)
    blue = dst.read(2)
    green = dst.read(3)
    nir08 = dst.read(4)
    swir16 = dst.read(5)
    swir22 = dst.read(6)
    coastal = dst.read(7) 
    lwir11 = dst.read(8)

band_name = ['red', 'blue', 'green', 'nir08', 'swir16', 'swir22', 'coastal', 'lwir11'] 
band_lst = [red, blue, green, nir08, swir16, swir22, coastal, lwir11]
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

# Flatten the axes for easier indexing
axes = axes.flatten()

for i in range(len(band_lst)):
    im = axes[i].imshow(band_lst[i], cmap='viridis')
    axes[i].set_title(band_name[i])
    fig.colorbar(im, ax=axes[i])

plt.tight_layout()
plt.show()


In [None]:
# Open the GeoTIFF file
tiff_path = "data_pipeline/data/tiff/building_res30.tiff"

# Read the bands from the GeoTIFF file
with rasterio.open(tiff_path) as dst:
    building_height = dst.read(1)
    building_sd = dst.read(2)
    building_year = dst.read(3)
    building_area = dst.read(4)

band_name = ['height', 'sd', 'year', 'area']
band_lst = [building_height, building_sd, building_year, building_area]
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Flatten the axes for easier indexing
axes = axes.flatten()

for i in range(len(band_lst)):
    im = axes[i].imshow(band_lst[i], cmap='viridis')
    axes[i].set_title(band_name[i])
    fig.colorbar(im, ax=axes[i])

plt.tight_layout()
plt.show()


In [None]:
# Open the GeoTIFF file
tiff_path = "data_pipeline/data/tiff/street_res30.tiff"

# Read the bands from the GeoTIFF file
with rasterio.open(tiff_path) as dst:
    street_width = dst.read(1)
    traffic_dir = dst.read(2)
    street_lane = dst.read(3)
    orientation = dst.read(4)

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

band_lst = [street_width, traffic_dir, street_lane, orientation]
band_name = ['width', 'traffic', 'lane', 'orientation']

# Flatten the axes for easier indexing
axes = axes.flatten()

for i in range(len(band_lst)):
    im = axes[i].imshow(band_lst[i], cmap='viridis')
    axes[i].set_title(band_name[i])
    fig.colorbar(im, ax=axes[i])

plt.tight_layout()
plt.show()


### Extracting Band Values from the GeoTIFF Image


In [7]:
# Extracts satellite band values from a GeoTIFF based on coordinates from a csv file and returns them in a DataFrame.

def map_satellite_data(tiff_path, csv_path, col_name):
    
    # Load the GeoTIFF data
    data = rxr.open_rasterio(tiff_path)
    layer_num = 0
    with rasterio.open(tiff_path) as dts:
        layer_num = dts.count

    # Read the Excel file using pandas
    df = pd.read_csv(csv_path)
    latitudes = df['Latitude'].values
    longitudes = df['Longitude'].values   

    df = pd.DataFrame()
    for i in tqdm(range(layer_num), desc="Go through layer"):
        values = []
        # Iterate over the latitudes and longitudes, and extract the corresponding band values
        for lat, lon in tqdm(zip(latitudes, longitudes), total=len(latitudes), desc="Mapping values"):
        # Assuming the correct dimensions are 'y' and 'x' (replace these with actual names from data.coords)
            cell_value = data.sel(x=lon, y=lat,  band=i+1, method="nearest").values
            values.append(cell_value)
        # Add column of feature
        df[col_name[i]] = values
    
    return df


In [None]:
# Mapping satellite data with training data.
data_dir = "data_pipeline/data/tiff/"
landsat_bands = ['red', 'blue', 'green', 'nir08', 'swir16', 'swir22', 'coastal', 'lwir11']
landsat_data = map_satellite_data(data_dir+'landsat_8.tiff', 'data/Training_data_uhi_index_UHI2025-v2.csv',
                                landsat_bands)
landsat_data.head()

In [None]:
# Mapping satellite data with training data.
building_bands = ['building_height', 'building_sd', 'building_year', 'building_area']
building_data = map_satellite_data(data_dir+'building_res30.tiff', 'data/Training_data_uhi_index_UHI2025-v2.csv',
                                building_bands)
building_data.head()

In [None]:
# Mapping satellite data with training data.
street_bands = ['street_width', 'street_traffic', 'street_lane', 'street_orientation']
street_data = map_satellite_data(data_dir+'street_res30.tiff', 'data/Training_data_uhi_index_UHI2025-v2.csv',
                                street_bands)
street_data.head()

In [None]:
df = pd.read_csv('data/Training_data_uhi_index_UHI2025-v2.csv')
df = pd.concat([df, landsat_data, building_data, street_data], axis=1)
df.head()

### NDVI, NDBI, NDWI and albedo calculation


In [12]:
# Calculate NDVI (Normalized Difference Vegetation Index) and handle division by zero by replacing infinities with NaN.
df['ndvi'] = (df['nir08'] - df['red']) / (df['nir08'] + df['red'])
df['ndvi'] = df['ndvi'].replace([np.inf, -np.inf], np.nan)

# Calculate NDBI (Normalized Difference Building Index)
df['ndbi'] = (df['swir16'] - df['nir08']) / (df['swir16'] + df['nir08'])
df['ndbi'] = df['ndbi'].replace([np.inf, -np.inf], np.nan)

# Calculate NDWI (Normalized Difference Water Index)
df['ndwi'] = (df['green'] - df['nir08']) / (df['green'] + df['nir08'])
df['ndwi'] = df['ndwi'].replace([np.inf, -np.inf], np.nan)

df['albedo'] = 0.356 * df['blue'] + 0.130 * df['red'] + 0.373 * df['nir08'] + 0.085 * df['swir16'] + 0.072 * df['swir22'] - 0.018

### VAR (Vertical Aspect Ratio) and Building Area per Pixel


In [13]:
# Calculate the VAR (Vertical Aspect Ratio)
df['VAR'] = df['building_height'] / df['street_width']
df['VAR'] = df['VAR'].replace([np.inf, -np.inf], np.nan)

# Calculate the building's area per pixel area (30 x 30 m^2)
df['Building_area_per_pixel'] = df['building_area'] / (900 * 10.764)
df['Building_area_per_pixel'] = df['Building_area_per_pixel'].replace([np.inf, -np.inf], np.nan)

## EDA

In [None]:
df.info()

In [15]:
excluded_cols = ['Longitude', 'Latitude', 'datetime', 'UHI Index']
categorical_cols = ['street_traffic', 'street_orientation']
numerical_cols = df.columns.difference(categorical_cols + excluded_cols)


In [16]:
df[numerical_cols] = df[numerical_cols].astype('float')

In [17]:
df[categorical_cols] = df[categorical_cols].astype('float')
df[categorical_cols] = df[categorical_cols].astype('category')

In [None]:
df.info()

### Mising values

In [None]:
msno.matrix(df)

### UHI Index

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data=df['UHI Index'], bins='auto', kde=True)
plt.title('Distribution of UHI Index')
plt.xlabel('UHI Index')
plt.ylabel('Frequency')
plt.show()

### Bla blaa

## Data Preprocesing

In [None]:
uhi_data = df.drop(columns=['Longitude', 'Latitude', 'datetime'])
uhi_data

### Missing values

In [23]:
uhi_data['VAR'].fillna(uhi_data['VAR'].mean(), inplace=True)

### Removing duplicates

In [None]:
# Check the duplicated records
print(f'Duplicated Rows: {uhi_data.duplicated().sum()}')

In [None]:
uhi_data.columns

In [None]:
# # Remove duplicate rows from the DataFrame based on specified columns and keep the first occurrence
# columns = ['lwir11', 'building_height', 'building_year', 'ndvi', 'ndbi', 'ndwi', 'street_width', 'building_area']
# for col in columns:
#     # Check if the value is a numpy array and has more than one dimension
#     uhi_data[col] = uhi_data[col].apply(lambda x: tuple(x) if isinstance(x, np.ndarray) and x.ndim > 0 else x)

# # Now remove duplicates
uhi_data = uhi_data.drop_duplicates(keep='first')
uhi_data


In [27]:
# Resetting the index of the dataset
uhi_data = uhi_data.reset_index(drop=True)

### Encoding

In [None]:
# One-hot encodiing
uhi_data = pd.get_dummies(data=uhi_data, columns=categorical_cols)
uhi_data

## Model Building

In [29]:
# Retaining only the columns for B01, B06, NDVI, and UHI Index in the dataset.
# uhi_data = uhi_data.drop(columns=['Longitude', 'Latitude', 'datetime'])

In [None]:
uhi_data

### Train and Test Split 

<p align="justify">We will now split the data into 70% training data and 30% test data. Scikit-learn alias “sklearn” is a robust library for machine learning in Python. The scikit-learn library has a <i><b>model_selection</b></i> module in which there is a splitting function <i><b>train_test_split</b></i>. You can use the same.</p>

In [75]:
# Split the data into features (X) and target (y), and then into training and testing sets
# columns = uhi_data.drop(columns=['UHI Index']).columns
# X = uhi_data[columns].values
X = uhi_data.drop(columns=['UHI Index'])
y = uhi_data['UHI Index']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

### Feature Scaling 

In [76]:
# Scale the training and test data using standardscaler
sc = StandardScaler()
X_train[numerical_cols] = sc.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = sc.transform(X_test[numerical_cols])

In [None]:
X_train.columns

In [None]:
feat_corr = uhi_data[numerical_cols].corr()
plt.figure(figsize=(15, 15))
sns.heatmap(feat_corr, annot=True)

### Model Training

#### Random Forest

In [None]:
# Random Forest 
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [136]:
rf_param_grid = {
    "n_estimators":[100, 200, 300],
    "max_depth": [3, 5, 7],
    "min_samples_split": [2, 5, 10], 
    "min_samples_leaf": [1, 2, 4],    
    "criterion": ['absolute_error', 'squared_error'],
}

rf_grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')
rf_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


KeyboardInterrupt: 

In [35]:
print("RF Best Parameters:", rf_grid_search.best_params_)
print("RF Best Score:", rf_grid_search.best_score_)

In [115]:
rf.fit(X_train, y_train)

##### In-Sample Evaluation

In [116]:
# Make predictions on the training data
insample_predictions = rf.predict(X_train)

In [117]:
# calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9433375907868949

##### Out-Sample Evaluation

In [118]:
# Make predictions on the test data
outsample_predictions = rf.predict(X_test)

In [119]:
# calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.6220974048788308

#### XGBoost

In [110]:
# Xtreme Gradient Boosting
xgb = XGBRegressor(random_state=42)

In [None]:
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, cv=5, n_jobs=-1, verbose=2, scoring='r2')
xgb_grid_search.fit(X_train, y_train)

In [None]:
print("XGB Best Parameters:", xgb_grid_search.best_params_)
print("XGB Best Score:", xgb_grid_search.best_score_)

In [111]:
xgb.fit(X_train, y_train)

AttributeError: 'super' object has no attribute '__sklearn_tags__'

AttributeError: 'super' object has no attribute '__sklearn_tags__'

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)

##### In-Sample Evaluation

In [112]:
# Make predictions on the training data
insample_predictions = xgb.predict(X_train)

In [113]:
# calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.9466014504432678

##### Out-Sample Evaluation

In [120]:
# Make predictions on the test data
outsample_predictions = xgb.predict(X_test)

In [121]:
# calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.628976583480835

#### CatBoost


In [126]:
cb = CatBoostRegressor(n_estimators=100, random_state=42)

In [127]:
cb.fit(X_train, y_train)

Learning rate set to 0.346492
0:	learn: 0.0152059	total: 167ms	remaining: 16.5s
1:	learn: 0.0146160	total: 171ms	remaining: 8.37s
2:	learn: 0.0143133	total: 177ms	remaining: 5.73s
3:	learn: 0.0141293	total: 181ms	remaining: 4.35s
4:	learn: 0.0139324	total: 186ms	remaining: 3.54s
5:	learn: 0.0136144	total: 191ms	remaining: 2.99s
6:	learn: 0.0134963	total: 195ms	remaining: 2.58s
7:	learn: 0.0134075	total: 199ms	remaining: 2.29s
8:	learn: 0.0132750	total: 203ms	remaining: 2.05s
9:	learn: 0.0131480	total: 206ms	remaining: 1.86s
10:	learn: 0.0130859	total: 211ms	remaining: 1.71s
11:	learn: 0.0130334	total: 216ms	remaining: 1.58s
12:	learn: 0.0129231	total: 220ms	remaining: 1.47s
13:	learn: 0.0128302	total: 227ms	remaining: 1.39s
14:	learn: 0.0127397	total: 234ms	remaining: 1.33s
15:	learn: 0.0126868	total: 240ms	remaining: 1.26s
16:	learn: 0.0126301	total: 245ms	remaining: 1.19s
17:	learn: 0.0125594	total: 250ms	remaining: 1.14s
18:	learn: 0.0124989	total: 255ms	remaining: 1.09s
19:	learn: 

<catboost.core.CatBoostRegressor at 0x1629fc62cd0>

##### In-Sample Evaluation

In [128]:
# Make predictions on the training data
insample_predictions = cb.predict(X_train)

In [129]:
# calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

0.7270071224900289

##### Out-Sample Evaluation

In [130]:
# Make predictions on the test data
outsample_predictions = cb.predict(X_test)

In [131]:
# calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

0.49984256791697235

## Model Evaluation

<p align="justify">Now that we have trained our model, the next step is to evaluate its performance. For evaluation, we will use the R² score, a common metric for regression models that measures how well the model explains the variance in the response variable (UHI index). Scikit-learn provides many other metrics that can be used for evaluation, and you can also write custom code for more specific evaluation needs.</p>


### In-Sample Evaluation
<p align="justify">We will be evaluating our model's performance using the R² score on the training data. It is important to note that this is in-sample performance testing, which involves evaluating the model on the training dataset. These metrics are not truly indicative of the model's ability to generalize. You should reserve testing on the test data before drawing final conclusions about your model's performance.</p>


<p align="justify">In this section, we make predictions on the training set and store them in the <b><i>insample_predictions</i></b> variable. The R² score is then calculated to gauge the model's performance on the training data. It is important to keep in mind that this evaluation is for the training set, and further testing on the test set is necessary to assess the model's generalizability.</p>


In [43]:
# Make predictions on the training data
insample_predictions = model.predict(X_train)

In [None]:
# calculate R-squared score for in-sample predictions
Y_train = y_train.tolist()
r2_score(Y_train, insample_predictions)

### Out-Sample Evaluation

When evaluating a machine learning model, it is essential to correctly and fairly evaluate the model's ability to generalize. This is because models have a tendency to overfit the dataset they are trained on. To estimate the out-of-sample performance, we will predict on the test data now. 

In [45]:
# Make predictions on the test data
outsample_predictions = model.predict(X_test)

In [None]:
# calculate R-squared score for out-sample predictions
Y_test = y_test.tolist()
r2_score(Y_test, outsample_predictions)

In [None]:
from sklearn.inspection import permutation_importance

result = permutation_importance(model, X_test, Y_test, n_repeats=10, random_state=42)
print(result.importances_mean)

In [None]:
features = uhi_data.drop(columns=['UHI Index']).columns
importance_mean = result.importances_mean

In [None]:
plt.barh(features, importance_mean, color='skyblue')
plt.xlabel("Mean Permutation Importance")
plt.ylabel("Feature")
plt.title("Permutation Feature Importance")
plt.show()

## Submission

Once you are satisfied with your model, you can proceed to make a submission. To do this, you will need to use your model to predict the Urban Heat Island (UHI) index for a set of test coordinates provided in the <b>"Submission_template.csv"</b> file and upload the results to the challenge platform.

In [None]:
#Reading the coordinates for the submission
test_file = pd.read_csv('data/Submission_template_UHI2025-v2.csv')
test_file

In [None]:
# Mapping satellite data for submission.
val_data1 = map_satellite_data(data_dir + 'landsat_8.tiff', 'data/Submission_template_UHI2025-v2.csv', landsat_bands)
val_data2 = map_satellite_data(data_dir + 'building_res30.tiff', 'data/Submission_template_UHI2025-v2.csv', building_bands)
val_data3 = map_satellite_data(data_dir + 'street_res30.tiff', 'data/Submission_template_UHI2025-v2.csv', street_bands)
val_data = pd.concat([val_data1, val_data2, val_data3], axis=1)

In [54]:
val_data['ndvi'] = (val_data['nir08'] - val_data['red']) / (val_data['nir08'] + val_data['red'])
val_data['ndvi'] = val_data['ndvi'].replace([np.inf, -np.inf], np.nan)

val_data['ndbi'] = (val_data['swir16'] - val_data['nir08']) / (val_data['swir16'] + val_data['nir08'])
val_data['ndbi'] = val_data['ndbi'].replace([np.inf, -np.inf], np.nan)

val_data['ndwi'] = (val_data['green'] - val_data['nir08']) / (val_data['green'] + val_data['nir08'])
val_data['ndwi'] = val_data['ndwi'].replace([np.inf, -np.inf], np.nan)

val_data['VAR'] = val_data['building_height'] / val_data['street_width']
val_data['VAR'] = val_data['VAR'].replace([np.inf, -np.inf], np.nan)

val_data['Building_area_per_pixel'] = val_data['building_area'] / (900 * 10.764)
val_data['Building_area_per_pixel'] = val_data['Building_area_per_pixel'].replace([np.inf, -np.inf], np.nan)

val_data['albedo'] = 0.356 * val_data['blue'] + 0.130 * val_data['red'] + 0.373 * val_data['nir08'] + 0.085 * val_data['swir16'] + 0.072 * val_data['swir22'] - 0.018

In [55]:
val_data[categorical_cols] = val_data[categorical_cols].astype('float')
val_data[categorical_cols] = val_data[categorical_cols].astype('category')

In [None]:
val_data = pd.get_dummies(data=val_data, columns=categorical_cols)

In [None]:
val_data

In [None]:
# Feature Scaling 
# submission_val_data = val_data[numerical_cols]
# transformed_submission_data = sc.transform(submission_val_data)
val_data[numerical_cols] = sc.transform(val_data[numerical_cols])

In [None]:
transformed_submission_data = val_data.copy()
transformed_submission_data = transformed_submission_data[X_train.columns]

In [132]:
#Making predictions
final_predictions = cb.predict(transformed_submission_data)
final_prediction_series = pd.Series(final_predictions)

In [133]:
#Combining the results into dataframe
submission_df = pd.DataFrame({'Longitude':test_file['Longitude'].values, 'Latitude':test_file['Latitude'].values, 'UHI Index':final_prediction_series.values})

In [134]:
#Displaying the sample submission dataframe
submission_df.head()

Unnamed: 0,Longitude,Latitude,UHI Index
0,-73.971665,40.788763,0.97954
1,-73.971928,40.788875,0.97527
2,-73.96708,40.78908,0.985543
3,-73.97255,40.789082,0.986214
4,-73.969697,40.787953,0.980597


In [135]:
#Dumping the predictions into a csv file.
submission_df.to_csv("submissions/landsat_all_feature_using_catboost_(encoded).csv",index = False)

### Upload submission file on platform

Upload the submission.csv on the <a href ="https://challenge.ey.com">platform</a> to get score generated on scoreboard.

## Conclusion

<div align ="justify">Now that you have learned a basic approach to model training, it’s time to try your own approach! Feel free to modify any of the functions presented in this notebook. We look forward to seeing your version of the model and the results. Best of luck with the challenge!</div>