In [4]:
%load_ext autoreload
%autoreload 2
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import time

from utils.load import load_radar_dataset, load_raingauge_dataset, load_cml_dataset, get_gauge_coordinate_mappings, read_config
from benchmarks.models.idw import run_IDW_benchmark
from benchmarks.models.kriging import kriging_external_drift

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Read the notebook-wide settings from the YAML configuration file.
config_path = 'config.yaml'
config = read_config(config_path)

In [None]:
# Load the three observation sources used in this notebook.
radar_df = load_radar_dataset(folder_name='sg_radar_data')
raingauge_df = load_raingauge_dataset('rainfall_data.csv', N=0)
cml_df = load_cml_dataset('CML_data_processed_2025.nc')

# Sanity check: report the dimensions of each loaded dataset, in load order.
for loaded in (radar_df, raingauge_df, cml_df):
    print(loaded.shape)

The size of dataset is 8334


In [None]:
# Show the index label of the first rain-gauge record.
first_record = raingauge_df.iloc[0]
print(first_record.name)

# IDW interpolation with rain gauge

In [None]:
# --- Rain-gauge rates at two temporal resolutions ---------------------------
# x12: presumably scales 5-minute totals to an mm/hour rate
# (assumes the raw frame is at 5-minute resolution -- TODO confirm).
raingauge_df_5mins = raingauge_df * 12
# Resample to 15-minute totals, then x4 to express them as an mm/hour rainfall rate.
raingauge_df_15mins = raingauge_df.resample('15min').sum() * 4
station_dict = get_gauge_coordinate_mappings()

# Frame that is handed to the benchmark below.
raingauge_choice_df = raingauge_df_15mins
training_ratio = 0.7

# --- Reproducible train / validation split of the gauge stations ------------
random.seed(111)
n_training = math.floor(len(station_dict) * training_ratio)
training_stations = random.sample(list(station_dict), n_training)
training_lookup = set(training_stations)
validation_stations = [name for name in station_dict if name not in training_lookup]

# --- Interpolation grid (0.01 spacing on both axes) --------------------------
gridx = np.arange(103.605, 104.05, 0.01)
gridy = np.arange(1.145, 1.51, 0.01)

# Plotting variant (disabled): create the axes with
#   fig, ax = plt.subplots(3, 3, sharex=True, sharey=True, figsize=(10, 10))
# and additionally pass
#   ax=ax, axis_cols=3, axis_rows=3,
#   plot_time_start=pd.Timestamp("2025-02-05 04:15:00"), n_nearest=15
# to run_IDW_benchmark.

# --- Run the IDW benchmark without plotting ---------------------------------
idw_options = dict(
    coordinates=station_dict,
    training_stations=training_stations,
    validation_stations=validation_stations,
    power=2,
    loss_hist=False,
    x_grid=gridx,
    y_grid=gridy,
    n_nearest=5,
)
idw_RMSE = run_IDW_benchmark(raingauge_choice_df, **idw_options)

# Kriging interpolation with rain gauge

In [None]:
# Kriging benchmark: for every timestep, krige the training gauges onto a grid
# and score the result against the held-out validation gauges.

# --- Reproducible train / validation split -----------------------------------
random.seed(111)
training_ratio = config['dataset_parameters']['train_size']
station_dict = get_gauge_coordinate_mappings()
station_names = list(station_dict)

training_stations = random.sample(station_names, int(len(station_names) * training_ratio))
training_lookup = set(training_stations)
validation_stations = [s for s in station_names if s not in training_lookup]
if not validation_stations:
  raise ValueError("No validation stations left after the split; lower the training ratio.")

# The loop length and the rows read inside the loop must come from the SAME
# frame. `raingauge_choice_df` is the frame selected in the IDW cell, so both
# benchmarks are evaluated on identical timesteps.
kriging_df = raingauge_choice_df

total_RMSE_loss = 0.0   # sum of per-timestep RMSE values over valid kriges
invalid_kriges = 0      # timesteps for which kriging returned no result
loss_arr = []           # absolute error of every (timestep, station) pair, for the histogram

start = time.time()

for i in tqdm(range(len(kriging_df))):
  # NOTE(review): missing gauge readings are treated as 0 rain, which also
  # affects the validation targets -- confirm this is intended.
  df = kriging_df.iloc[i].fillna(0)

  # The second return value (kriging variance) is not used here.
  kriging_result, _ = kriging_external_drift(df=df,
                                             station_names=training_stations,
                                             station_dict=station_dict,
                                             variogram_model='exponential',
                                             method='ordinary')
  if kriging_result is None:
    invalid_kriges += 1
    continue

  # Score this timestep against the held-out stations.
  squared_errors = []
  for validation_station in validation_stations:
    rain_gauge_value = df[validation_station]
    lat, long = station_dict[validation_station]
    # Map the station coordinate to a grid cell: origin (1.14, 103.6), 0.01 spacing.
    # NOTE(review): assumes every station lies inside the kriged grid; a
    # negative index would silently wrap around -- confirm against the grid extent.
    row = math.floor((lat - 1.14) / 0.01)
    col = math.floor((long - 103.6) / 0.01)
    residual = kriging_result[row][col] - rain_gauge_value

    squared_errors.append(residual ** 2)
    loss_arr.append(abs(residual))

  # True root-mean-square error: square root of the MEAN squared error
  # (averaging sqrt(err**2) per station would give the mean absolute error).
  RMSE = float(np.sqrt(np.mean(squared_errors)))
  total_RMSE_loss += RMSE

end = time.time()

n_timesteps = len(kriging_df)
n_valid = n_timesteps - invalid_kriges
# Guard the averages so an empty frame or all-invalid run reports nan instead of crashing.
mean_loss_valid = total_RMSE_loss / n_valid if n_valid else float('nan')
mean_loss_all = total_RMSE_loss / n_timesteps if n_timesteps else float('nan')

print(f"invalid kriges: {invalid_kriges}")
print(f"final average loss: {mean_loss_valid}")
print(f"final average loss (0 rain = 0 loss): {mean_loss_all}")
print(f"Time taken = {end - start}")

plt.figure(figsize=(15,8))
plt.hist(loss_arr, bins=30, log=True)
plt.xlabel("Absolute error per validation station")
plt.ylabel("Count (log scale)")
plt.title("Kriging validation error distribution")
plt.show()


In [None]:
# Inspect `df`: the zero-filled rain-gauge row assigned in the last iteration
# of the Kriging loop in the previous cell.
print(df)