In [7]:
import sys
sys.path.append('..')

from baseline.utilities import *
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=False, nb_workers=8)

# -------------------------------
# INITIAL CONFIGURATION
# -------------------------------
SENTINEL_TIFF_PATH = '../baseline/S2_sample.tiff'
LANDSAT_TIFF_PATH = '../baseline/Landsat_LST.tiff'
MODE = 'submission'  # 'submission' 'train'

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
# -------------------------------
# DATA LOADING
# -------------------------------
if MODE == 'train':
    ground_df = pd.read_csv("../baseline/Training_data_uhi_index.csv")
elif MODE == 'submission':
    ground_df = pd.read_csv("../baseline/Submission_template.csv")
else:
    raise ValueError("MODE should be either 'train' or 'submission'")

display(ground_df[['Longitude', 'Latitude']].describe())

Unnamed: 0,Longitude,Latitude
count,1040.0,1040.0
mean,-73.934816,40.807991
std,0.028661,0.0232
min,-73.993163,40.758877
25%,-73.95703,40.790802
50%,-73.934618,40.809553
75%,-73.910655,40.823054
max,-73.879537,40.859243


In [9]:
# -------------------------------
# FEATURE EXTRACTION FROM SENTINEL-2
# -------------------------------
# Mapping satellite data with training data
sentinel2_bands_df = map_satellite_data(SENTINEL_TIFF_PATH, ground_df)
display(sentinel2_bands_df.head())

<class 'xarray.core.dataarray.DataArray'>


Mapping values: 100%|██████████| 1040/1040 [00:04<00:00, 243.14it/s]


Unnamed: 0,B01,B04,B06,B08,B02,B03,B05,B07,B8A,B11,B12
0,835.0,546.0,2106.0,2300.0,494.0,697.0,1136.0,2264.0,2394.0,1629.0,1208.0
1,1241.0,568.0,2576.0,4050.0,545.0,713.0,1261.0,2977.0,2944.0,2074.0,1456.0
2,940.0,907.0,1228.0,1160.0,696.0,850.0,835.0,1378.0,1446.0,1402.0,1066.0
3,1147.0,1112.0,1743.0,1310.0,871.0,963.0,2056.0,2122.0,1753.0,2227.0,2148.0
4,955.0,893.0,2277.0,3278.0,659.0,1088.0,1137.0,2641.0,2627.0,1839.0,1426.0


In [10]:
# -------------------------------
# SPECTRAL INDICES CALCULATION
# -------------------------------
# NDVI (Normalized Difference Vegetation Index)
sentinel2_bands_df['NDVI'] = (sentinel2_bands_df['B08'] - sentinel2_bands_df['B04']) / (sentinel2_bands_df['B08'] + sentinel2_bands_df['B04'])
sentinel2_bands_df['NDVI'] = sentinel2_bands_df['NDVI'].replace([np.inf, -np.inf], np.nan) 

# gNDBI (Generalized Normalized Difference Built-up Index)
sentinel2_bands_df['gNDBI'] = (sentinel2_bands_df['B08'] - sentinel2_bands_df['B03']) / (sentinel2_bands_df['B08'] + sentinel2_bands_df['B03'])
sentinel2_bands_df['gNDBI'] = sentinel2_bands_df['gNDBI'].replace([np.inf, -np.inf], np.nan) 

# UI (Urban Index)
sentinel2_bands_df['UI'] = (sentinel2_bands_df['B11'] - sentinel2_bands_df['B08']) / (sentinel2_bands_df['B11'] + sentinel2_bands_df['B08'])
sentinel2_bands_df['UI'] = sentinel2_bands_df['UI'].replace([np.inf, -np.inf], np.nan) 

# NDBI (Normalized Difference Built-up Index)
sentinel2_bands_df['NDBI'] = (sentinel2_bands_df['B12'] - sentinel2_bands_df['B08']) / (sentinel2_bands_df['B12'] + sentinel2_bands_df['B08'])
sentinel2_bands_df['NDBI'] = sentinel2_bands_df['NDBI'].replace([np.inf, -np.inf], np.nan) 

# NBI (New Built-up Index)
sentinel2_bands_df['NBI'] = (sentinel2_bands_df['B04'] * sentinel2_bands_df['B12']) / (sentinel2_bands_df['B08'])
sentinel2_bands_df['NBI'] = sentinel2_bands_df['NBI'].replace([np.inf, -np.inf], np.nan) 

# BRBA (Band Ratio for Built-up Area)
sentinel2_bands_df['BRBA'] = (sentinel2_bands_df['B04']) / (sentinel2_bands_df['B12'])
sentinel2_bands_df['BRBA'] = sentinel2_bands_df['BRBA'].replace([np.inf, -np.inf], np.nan)

# NBAI (Normalized Built-up Area Index)
sentinel2_bands_df['NBAI'] = ((sentinel2_bands_df['B12'] - sentinel2_bands_df['B11']) / sentinel2_bands_df['B03']) / ((sentinel2_bands_df['B12'] + sentinel2_bands_df['B11']) / sentinel2_bands_df['B03'])
sentinel2_bands_df['NBAI'] = sentinel2_bands_df['NBAI'].replace([np.inf, -np.inf], np.nan)

# MBI (Modified Built-up Index)
sentinel2_bands_df['MBI'] = (sentinel2_bands_df['B11'] * sentinel2_bands_df['B04'] - (sentinel2_bands_df['B08'] * sentinel2_bands_df['B08'])) / (sentinel2_bands_df['B04'] + sentinel2_bands_df['B08'] + sentinel2_bands_df['B11'])
sentinel2_bands_df['MBI'] = sentinel2_bands_df['MBI'].replace([np.inf, -np.inf], np.nan)

# BAEI (Built-up Area Extraction Index)
sentinel2_bands_df['BAEI'] = (sentinel2_bands_df['B04'] + 0.3) / (sentinel2_bands_df['B03'] + sentinel2_bands_df['B11'])
sentinel2_bands_df['BAEI'] = sentinel2_bands_df['BAEI'].replace([np.inf, -np.inf], np.nan)

# gCI (Green Chlorophyll Index)
sentinel2_bands_df['gCI'] = (sentinel2_bands_df['B08'] / sentinel2_bands_df['B03']) - 1
sentinel2_bands_df['gCI'] = sentinel2_bands_df['gCI'].replace([np.inf, -np.inf], np.nan)

sentinel_2_indices = ['NDVI', 'gNDBI', 'UI', 'NDBI', 'NBI', 'BRBA', 'NBAI', 'MBI', 'BAEI', 'gCI']
display(sentinel2_bands_df.head())
print(sentinel2_bands_df.dtypes)

for col in sentinel2_bands_df.columns:
    sentinel2_bands_df[col] = sentinel2_bands_df[col].astype(float)

print(sentinel2_bands_df.std())

Unnamed: 0,B01,B04,B06,B08,B02,B03,B05,B07,B8A,B11,B12,NDVI,gNDBI,UI,NDBI,NBI,BRBA,NBAI,MBI,BAEI,gCI
0,835.0,546.0,2106.0,2300.0,494.0,697.0,1136.0,2264.0,2394.0,1629.0,1208.0,0.616304,0.534868,-0.170781,-0.311288,286.768696,0.451987,-0.148396,-983.366704,0.234867,2.299857
1,1241.0,568.0,2576.0,4050.0,545.0,713.0,1261.0,2977.0,2944.0,2074.0,1456.0,0.754006,0.700609,-0.322665,-0.471122,204.199506,0.39011,-0.175071,-2275.025105,0.203911,4.680224
2,940.0,907.0,1228.0,1160.0,696.0,850.0,835.0,1378.0,1446.0,1402.0,1066.0,0.1224,0.154229,0.094457,-0.042228,833.501724,0.850844,-0.136143,-21.32776,0.402886,0.364706
3,1147.0,1112.0,1743.0,1310.0,871.0,963.0,2056.0,2122.0,1753.0,2227.0,2148.0,0.081751,0.152662,0.259259,0.242337,1823.340458,0.517691,-0.018057,163.545709,0.348683,0.360332
4,955.0,893.0,2277.0,3278.0,659.0,1088.0,1137.0,2641.0,2627.0,1839.0,1426.0,0.571805,0.501603,-0.281219,-0.393707,388.47407,0.626227,-0.126493,-1514.651747,0.305193,2.012868


B01       object
B04       object
B06       object
B08       object
B02       object
B03       object
B05       object
B07       object
B8A       object
B11       object
B12       object
NDVI     float64
gNDBI    float64
UI       float64
NDBI     float64
NBI      float64
BRBA     float64
NBAI     float64
MBI      float64
BAEI     float64
gCI      float64
dtype: object
B01      332.515564
B04      573.624399
B06      558.657066
B08      755.739168
B02      483.157503
B03      485.293577
B05      485.101804
B07      648.458224
B8A      679.065698
B11      491.031757
B12      532.515527
NDVI       0.215990
gNDBI      0.173087
UI         0.146128
NDBI       0.192714
NBI      666.476803
BRBA       0.209684
NBAI       0.073233
MBI      618.920896
BAEI       0.095323
gCI        1.075769
dtype: float64


In [11]:
# -------------------------------
# DATA SAVING
# -------------------------------
os.makedirs(f'../pipeline/data/processed/{MODE}', exist_ok=True)
sentinel2_bands_df.to_parquet(f'../pipeline/data/processed/{MODE}/sentinel2_bands.parquet')