In [1]:
import os
import sys
sys.path.append('..')

from baseline.utilities import *
from pandarallel import pandarallel
import pandas as pd
import geopandas as gpd
import json

In [2]:
# -----------------------------------------------------------------------------
# Configuration: run mode, raster paths, and parallel-processing setup
# -----------------------------------------------------------------------------
# Selects which ground CSV is loaded and which output sub-folder is written.
MODE = 'submission'  # Options: 'submission', 'train'

# TIFF inputs located next to the baseline code.
SENTINEL_TIFF_PATH = '../baseline/S2_sample.tiff'
LANDSAT_TIFF_PATH = '../baseline/Landsat_LST.tiff'

# Initialise pandarallel with a fixed pool of 8 workers and no progress bar.
pandarallel.initialize(nb_workers=8, progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# -----------------------------------------------------------------------------
# Load Data: look up the CSV that belongs to MODE and read it
# -----------------------------------------------------------------------------
# One input file per supported mode; anything else is rejected below.
csv_by_mode = {
    'train': "../baseline/Training_data_uhi_index.csv",
    'submission': "../baseline/Submission_template.csv",
}

# Guard clause: fail early on an unsupported MODE instead of reading nothing.
if MODE not in csv_by_mode:
    raise ValueError("MODE should be either 'train' or 'submission'")

ground_df = pd.read_csv(csv_by_mode[MODE])

# Summary statistics of the coordinate columns as a quick sanity check.
coordinate_summary = ground_df[['Longitude', 'Latitude']].describe()
display(coordinate_summary)

Unnamed: 0,Longitude,Latitude
count,1040.0,1040.0
mean,-73.934816,40.807991
std,0.028661,0.0232
min,-73.993163,40.758877
25%,-73.95703,40.790802
50%,-73.934618,40.809553
75%,-73.910655,40.823054
max,-73.879537,40.859243


In [16]:
# -----------------------------------------------------------------------------
# Data Visualization: Convert dataset to GeoDataFrame and save as GeoJSON
# -----------------------------------------------------------------------------
# Build one point geometry per row from the Longitude/Latitude columns.
gdf = gpd.GeoDataFrame(
    ground_df, geometry=gpd.points_from_xy(ground_df['Longitude'], ground_df['Latitude']),
    crs='EPSG:4326'  # Latitude-Longitude coordinate reference system
)

# Make sure the per-MODE output folder exists so the export does not depend on
# the directory having been created by hand (e.g. on a fresh checkout or the
# first run with a new MODE).
geojson_path = f"../pipeline/data/processed/{MODE}/ground_dataset.json"
os.makedirs(os.path.dirname(geojson_path), exist_ok=True)

gdf.to_file(geojson_path, driver='GeoJSON')
print("GeoDataFrame shape:", gdf.shape)

GeoDataFrame shape: (1040, 4)


In [17]:
# -----------------------------------------------------------------------------
# Data Preprocessing: Generate geographic bounding boxes
# -----------------------------------------------------------------------------

# Read the buffer radii from the shared JSON config. The `with` block closes
# the file handle deterministically instead of leaving it to the garbage
# collector.
radius_list_path = '../pipeline/data/radius_list.json'
with open(radius_list_path, 'r') as f:
    radius_list = json.load(f)['radius_list']

# Compute the bounding boxes for every point and every radius in radius_list.
# NOTE(review): radii are presumably in metres, judging by the
# `buffer_<r>m_bbox_4326` column names in the recorded output; confirm against
# compute_geographic_bounding_boxes.
bbox_dataset = compute_geographic_bounding_boxes(ground_df[['Longitude', 'Latitude']], radius_list)
print("Bounding box dataset columns:", bbox_dataset.columns)

# Write the radius list back to the same file with 4-space indentation.
# Only the "radius_list" key is written, so any other top-level keys that the
# file contained are dropped by this step.
with open(radius_list_path, 'w') as f:
    json.dump({"radius_list": radius_list}, f, indent=4)

# Make sure the per-MODE output folder exists so this cell can also be run on
# its own, without relying on an earlier cell having created the directory.
bbox_parquet_path = f'../pipeline/data/processed/{MODE}/bbox_dataset.parquet'
os.makedirs(os.path.dirname(bbox_parquet_path), exist_ok=True)
bbox_dataset.to_parquet(bbox_parquet_path)


Computing bounding boxes: 100%|██████████| 16/16 [00:00<00:00, 29.78it/s]

Bounding box dataset columns: Index(['buffer_50m_bbox_4326', 'buffer_100m_bbox_4326',
       'buffer_150m_bbox_4326', 'buffer_200m_bbox_4326',
       'buffer_250m_bbox_4326', 'buffer_275m_bbox_4326',
       'buffer_300m_bbox_4326', 'buffer_350m_bbox_4326',
       'buffer_400m_bbox_4326', 'buffer_450m_bbox_4326',
       'buffer_500m_bbox_4326', 'buffer_600m_bbox_4326',
       'buffer_700m_bbox_4326', 'buffer_800m_bbox_4326',
       'buffer_900m_bbox_4326', 'buffer_1000m_bbox_4326'],
      dtype='object')



