# Libraries

In [None]:
%load_ext autoreload
%autoreload 2
import random
import math
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from models.kriging import kriging_external_drift
import tqdm as tqdm

from utils.load import load_radar_dataset, load_raingauge_dataset, load_cml_dataset, read_config, get_gauge_coordinate_mappings
from utils.visualisation import *

In [None]:
# Notebook-wide settings read from config.yaml; the kriging-loss cell reads
# config['dataset_parameters']['train_size'] from it.
config = read_config('config.yaml')

# Load Dataset

In [None]:
# Load the three data sources used throughout the notebook.
radar_df = load_radar_dataset(folder_name='sg_radar_data')
raingauge_df = load_raingauge_dataset('rainfall_data.csv', N=0)
cml_df = load_cml_dataset('CML_data_processed_2025.nc')

# Quick sanity check on what was loaded: one (rows, cols) tuple per source.
for loaded_frame in (radar_df, raingauge_df, cml_df):
  print(loaded_frame.shape)

In [None]:
# Work on a copy so the resampling below cannot touch raingauge_df.
temp = raingauge_df.copy()

# Sum the readings into 15-minute bins, then scale by 4 (four 15-minute
# windows per hour) to express the totals as an hourly rate (mm/hour).
fifteen_min_total = temp.resample('15min').sum() * 4
raingauge_rate_df_15mins = fifteen_min_total

# Scale the native readings by 12 (twelve 5-minute windows per hour) to get
# the rate at the 5-minute interval.
raingauge_rate_df_5mins = raingauge_df * 12

# Filter for only stations whose coordinates we know

In [None]:
# Which rain-rate frame to use downstream: "5mins" or "15mins".
raingauge_sampling_method = "15mins"

station_dict = get_gauge_coordinate_mappings()
# NOTE(review): this currently keeps every gauge column; no filtering against
# station_dict is applied here - confirm whether that is intended.
filter_cols = list(raingauge_df.columns)

# Select the rate frame explicitly and fail loudly on an unsupported method,
# instead of leaving station_aligned_raingauge_df undefined (or stale from a
# previous run of this cell).
rate_frames_by_method = {
  "5mins": raingauge_rate_df_5mins,
  "15mins": raingauge_rate_df_15mins,
}
if raingauge_sampling_method not in rate_frames_by_method:
  raise ValueError(
    f"Unsupported raingauge_sampling_method {raingauge_sampling_method!r}; "
    f"expected one of {sorted(rate_frames_by_method)}"
  )
station_aligned_raingauge_df = rate_frames_by_method[raingauge_sampling_method][filter_cols]
print(filter_cols)

raingauge_station_count = len(filter_cols)

# Count the number of non-zero readings per station column.
# (np.count_nonzero also counts NaN entries, since NaN != 0.)
nonzero_counts = [
  np.count_nonzero(station_aligned_raingauge_df[col].values)
  for col in filter_cols
]
# Legacy alias: the original name was misleading (these are not NaN counts).
nan_values = nonzero_counts

print(station_aligned_raingauge_df)
plt.figure()
plt.title("number of non-zero per station")
plt.hist(nonzero_counts, bins=10)
plt.show()

# MERGE RADAR DATA AND RAIN GAUGE DATA

In [None]:
# Inner-join gauge rates with radar frames on the shared timestamp.
merged_df = pd.merge(station_aligned_raingauge_df, radar_df, on='time_sgt', how='inner').fillna(0) # Hacky solutions for now. Need to figure out what to do with na values

# print(merged_df)
# Gauge readings only: skip the first column and the last four columns
# (presumably 'time_sgt' and the radar fields - later cells treat them so).
gauge_values = merged_df.values[:, 1:-4]

# Do NOT name these `min` / `max`: rebinding the builtins breaks every later
# cell that calls max(...) / min(...) after this one has run.
avg = gauge_values.mean()
min_rate = gauge_values.min()
max_rate = gauge_values.max()

print(f"min: {min_rate}, max: {max_rate}, average: {avg}")
flattened_arr = gauge_values.flatten()

plt.hist(flattened_arr, bins=100, log=True)
plt.title(f"Raingauge readings sampled at {raingauge_sampling_method}")
plt.xlabel("Rainfall rate (mm/h)")
plt.ylabel("Count (Log scale)")

# ATTEMPT AT KRIGING BASELINE

In [None]:
from pykrige.uk import UniversalKriging

# Single-timestep comparison on one row of merged_df:
#   panel 0 - raw radar grid
#   panel 1 - kriging with the radar grid as external drift (KED)
#   panel 2 - universal kriging with a regional linear drift
fig, axes = plt.subplots(nrows=1, ncols=3)

variogram_model = "gaussian"

row_data = merged_df.iloc[2].dropna()
print(row_data)
# Everything except the last four entries (the radar fields read below).
station_names = list(row_data.index[:-4])

# Build [lon, lat, value] triples for the gauges. The first index entry is
# skipped (presumably 'time_sgt'; the loss cell removes it by name).
data = []

for s in station_names[1:]:
  lat, long = station_dict[s]
  data.append([long, lat, row_data[s]])

gauge_data = np.array(data)

# Output grid for the kriged fields (cell centres, 0.01 degree spacing).
gridx = np.arange(103.605, 104.05, 0.01)
gridy = np.arange(1.145, 1.51, 0.01)

#RADAR FOR USE IN EXTERNAL DRIFT

radar_grid = row_data['data']
bounds = row_data['bounds']
transform = row_data['transform']
x_min = bounds.left
y_max = bounds.top
pixel_width = transform[0]
pixel_height = -transform[4]

# NOTE(review): the coordinates below place row 0 of radar_grid at bounds.top,
# so origin='lower' likely draws the radar flipped relative to the kriged
# panels - confirm and switch to origin='upper' if so.
axes[0].imshow(radar_grid, origin='lower')
axes[0].set_title("radar")

# Cell-centre coordinates of the radar grid.
# x: one value per COLUMN (shape[1]); the previous loop used shape[0], which
#    is only correct for square grids.
# y: one value per ROW, descending from the top edge; the centre of row r is
#    half a pixel BELOW its upper edge (the previous code added the half
#    pixel, shifting the whole radar field one pixel north).
e_dx = x_min + (np.arange(radar_grid.shape[1]) + 0.5) * pixel_width
e_dy = y_max - (np.arange(radar_grid.shape[0]) + 0.5) * pixel_height

KED = UniversalKriging(
    x=gauge_data[:, 0],
    y=gauge_data[:, 1],
    z=gauge_data[:, 2],
    variogram_model=variogram_model,
    drift_terms=["external_Z"],
    external_drift=radar_grid,
    external_drift_x=e_dx,
    external_drift_y=e_dy,
    pseudo_inv=True
)

UK = UniversalKriging(
    gauge_data[:, 0],
    gauge_data[:, 1],
    gauge_data[:, 2],
    variogram_model=variogram_model,
    drift_terms=["regional_linear"],
    pseudo_inv=True
)


z,ss = KED.execute("grid", gridx, gridy)
z2,ss2 = UK.execute("grid", gridx, gridy)

axes[1].imshow(z, origin='lower') #UKriging with external drift
axes[1].set_title("KED")
axes[2].imshow(z2, origin='lower') #Universal kriging
axes[2].set_title("universal kriging")
plt.show()

print(z.shape)

# Calculate Kriging Loss

In [None]:
# Hold-out evaluation of kriging: for every timestep, krige from the training
# gauges and score the prediction at the held-out (validation) gauges.
random.seed(42)

# Geometry used to map a station's (lat, lon) to a cell of the kriged grid.
# NOTE(review): presumably matches the grid built inside
# kriging_external_drift (origin 1.14 / 103.6, 0.01 degree cells) - confirm.
GRID_LAT_MIN = 1.14
GRID_LON_MIN = 103.6
GRID_STEP = 0.01

total_RMSE_loss = 0.0
invalid_kriges = 0
training_ratio = config['dataset_parameters']['train_size']
station_names = list(merged_df.columns[:-4])  # last four columns are not gauges
station_names.remove('time_sgt')
training_stations = random.sample(station_names, int(len(station_names) * training_ratio))
training_station_set = set(training_stations)
validation_stations = [s for s in station_names if s not in training_station_set]
if not validation_stations:
  # Previously this surfaced as a ZeroDivisionError deep inside the loop.
  raise ValueError("No validation stations left: train_size leaves nothing to evaluate on")
loss_arr = []

for i in tqdm.tqdm(range(len(merged_df))):
  df = merged_df.iloc[i]

  kriging_result, kriging_variance = kriging_external_drift(df=df,
                                                            station_names=training_stations,
                                                            station_dict=station_dict,
                                                            variogram_model='hole-effect',
                                                            method='universal')
  # print(kriging_result) #kriging_result[row][col]
  # plt.imshow(kriging_result, origin='lower')
  if kriging_result is None:
    invalid_kriges += 1
    continue

  # Squared error at every held-out gauge for this timestep.
  squared_errors = []
  for validation_station in validation_stations:
    rain_gauge_value = df[validation_station]
    lat, long = station_dict[validation_station]
    row = math.floor((lat - GRID_LAT_MIN) / GRID_STEP)
    col = math.floor((long - GRID_LON_MIN) / GRID_STEP)
    kriged_value = kriging_result[row][col]
    squared_errors.append((kriged_value - rain_gauge_value) ** 2)

  # True RMSE: root of the MEAN squared error. The previous code averaged
  # sqrt(err**2) == |err|, i.e. it reported MAE under the name RMSE.
  RMSE = math.sqrt(sum(squared_errors) / len(squared_errors))
  loss_arr.append(RMSE)
  total_RMSE_loss += RMSE
  # print(f"RMSE: {RMSE}")

# Average over the timesteps that actually produced a kriged field.
valid_kriges = len(loss_arr)
final_average_loss = total_RMSE_loss / valid_kriges if valid_kriges else float('nan')
print(f"final average loss: {final_average_loss}")


In [None]:
# Snapshot of the per-timestep losses currently held in loss_arr, labelled as
# the kriging-with-external-drift run.
# NOTE(review): this only holds KED results if the loss cell was last run with
# the KED method - confirm before comparing against the other snapshots.
KED_Loss = list(loss_arr)
# np.max is immune to the builtin `max` being shadowed by a notebook variable.
print(np.max(KED_Loss))
plt.hist(KED_Loss, bins=100)
plt.title("KED loss per timestep")
plt.xlabel("Loss")
plt.ylabel("Count")
plt.show()

In [None]:
# Snapshot of the per-timestep losses currently held in loss_arr, labelled as
# the universal-kriging run.
# NOTE(review): this only holds universal-kriging results if the loss cell was
# last run with method='universal' - confirm before comparing snapshots.
Universal_Loss = list(loss_arr)
# np.max is immune to the builtin `max` being shadowed by a notebook variable.
print(np.max(Universal_Loss))
plt.hist(Universal_Loss, bins=100)
plt.title("Universal kriging loss per timestep")
plt.xlabel("Loss")
plt.ylabel("Count")
plt.show()

In [None]:
# Snapshot of the per-timestep losses currently held in loss_arr, labelled as
# the ordinary-kriging run.
# NOTE(review): this only holds ordinary-kriging results if the loss cell was
# last run with that method - confirm before comparing snapshots.
ordinary_Loss = list(loss_arr)
# np.max is immune to the builtin `max` being shadowed by a notebook variable.
print(np.max(ordinary_Loss))
plt.hist(ordinary_Loss, bins=100)
plt.title("Ordinary kriging loss per timestep")
plt.xlabel("Loss")
plt.ylabel("Count")
plt.show()

# Plot raingauge and radar data on the same grid

# General plotting function (Hard coded for max 9 plots for now)

In [None]:
# Subplot layout (rows x columns); only the single-plot path is active.
R = 1
C = 1

fig, ax = plt.subplots(R, C, figsize=(17,12))

# Lon/lat window passed as the zoom extent for the radar overlay.
bounds_singapore = dict(left=103.6, right=104.05, top=1.5, bottom=1.188)

# #iterrate through rows
# for index, row in merged_df.head(R * C).iterrows():
#   node_df = pandas_to_geodataframe(row)
#   radar_input = row[['data', 'bounds', 'crs', 'transform']]
#   visualise_gauge_grid(node_df=node_df, ax=ax[int(index / 2)][index % 2])
#   visualise_radar_grid(data=radar_input, ax=ax[int(index / 2)][index % 2], zoom=None, scaling=None, alpha=0.5, legend=False)
#   cx.add_basemap(ax, crs=4326, source=cx.providers.CartoDB.Voyager)

plt.plot()

# Single plot: draw gauges, radar and a basemap for the third row of merged_df.
radar_fields = ['data', 'bounds', 'crs', 'transform']
for index, row in merged_df.iloc[2:3].iterrows():
  node_df = pandas_to_geodataframe(row)
  radar_input = row[radar_fields]
  visualise_gauge_grid(node_df=node_df, ax=ax)
  visualise_radar_grid(data=radar_input, ax=ax, zoom=bounds_singapore, scaling=None, alpha=0.5, legend=False)
  cx.add_basemap(ax, crs=4326, source=cx.providers.CartoDB.Voyager)


  

# TEMP: Load CML 

In [None]:
# Rename the CML time column to 'time_sgt' (the key the later merge with the
# gauge/radar frame uses). This mutates cml_df in place.
cml_df.rename(columns={'time': 'time_sgt'}, inplace=True)
# Represent each link by a single point: the arithmetic midpoint of its two
# end sites (site A and site B).
cml_df['lat'] = (cml_df['site_a_latitude'] + cml_df['site_b_latitude'])/2
cml_df['lon'] = (cml_df['site_a_longitude'] + cml_df['site_b_longitude'])/2

# #vals = ['trsl', 'wet', 'baseline', 'waa', 'A', 'R','frequency', 'length', 'lat', 'lon']
# vals = ['R', 'frequency', 'lat','lon']

In [None]:
# Every column except the identifier columns. In the visible notebook `vals`
# is only referenced by commented-out code.
vals = list(cml_df.columns.drop(['time_sgt', 'link_id', 'station']))
# Wide layout: one row per timestamp, one column per (link_id, station) pair,
# holding the 'R' column (presumably the rainfall rate - confirm).
df_pivoted = cml_df.pivot(index='time_sgt', columns=['link_id', 'station'], values='R')
print(df_pivoted)

In [None]:
#df_result contains the average rainfall rate between station A, B for each location

# Average the per-station columns of each link into one column per link_id.
# groupby(..., axis=1) is deprecated in pandas >= 2.1, so group the transposed
# frame by the first column level (link_id) and transpose back.
df_result = df_pivoted.T.groupby(level=0).mean().T
# Turn the time_sgt index back into a column so it can be used as a merge key.
df_result = df_result.reset_index()
print(df_result)

In [None]:
# df_pivoted = cml_df.pivot(index='time_sgt', columns=['link_id', 'station'], values=vals)

# def process_row(row):
#     x = row.tolist()
#     new_row = []
#     print(x)
#     new_row.append(x[0]) #freq
#     new_row.append(x[2]) #lat
#     new_row.append(x[4]) #lon
#     new_row.append((x[6] + x[7]) / 2)
#     return new_row

# def combine_stations_and_values_v1(df_pivoted):
#     """
#     Combine stations and aggregate values into arrays using groupby
#     (I don't really know how this works. But the output is [R, R, frequency, frequency, lat, lat, lon, lon, trslA, trslB])
#     """
#     # Stack to convert columns to rows, keeping time_sgt as index
#     stacked = df_pivoted.stack(level=[0, 1, 2])  # Stack all column levels
#     stacked = stacked.reset_index()
    
#     # Rename columns for clarity
#     stacked.columns = ['time_sgt', 'value_type', 'link_id', 'station', 'value']
    
#     # Group by time_sgt and link_id, then aggregate all values into lists
#     result = stacked.groupby(['time_sgt', 'link_id'])['value'].apply(
#         process_row  # Remove NaN values and convert to list
#     ).unstack(level='link_id')
    
#     return result

# df_result = combine_stations_and_values_v1(df_pivoted)


In [None]:
# df_result = df_result.reset_index()
# print(df_result.reset_index().columns)

In [None]:
# Independent copy of the gauge + radar frame; used as the left side of the
# merge with the CML frame in the next cell.
copied_df = merged_df.copy()

In [None]:
# Attach the per-link CML columns to the gauge + radar frame, keeping only
# timestamps present in both.
all_sources_df = copied_df.merge(df_result, on='time_sgt', how='inner')
print(all_sources_df)

In [None]:
# df_pivoted = cml_df.pivot_table(index='time', columns=['link_id', 'station'], values='R')
# df_pivoted.columns = [f'{link_id}_{station}' for link_id, station in df_pivoted.columns]
# df_pivoted = df_pivoted.reset_index()
# df_pivoted.rename(columns={'time': 'time_sgt'}, inplace=True)

# copied_df = merged_df.copy()
# copied_df = pd.merge(copied_df, df_pivoted, on='time_sgt', how='inner')
# print(copied_df.iloc[0])


# pd_df.rename(columns={'time_sg': 'time_sgt'}, inplace=True)
# print(pd_df)
# print(merged_df)
# merged_df = pd.merge(pd_df, merged_df, on='time_sgt', how='inner')
# print(merged_df)

In [None]:
#get coordinate points for each station in CML
# print(cml_df)

# One (lat, lon) midpoint per link: average the per-row midpoints by link_id.
cml_coordinate_info = cml_df[['link_id', 'lat', 'lon']]
cml_coordinate_info = cml_coordinate_info.groupby('link_id').agg('mean')

# .loc is an indexer and must be subscripted with []; calling it with ()
# treats the argument as an axis name and raises ValueError.
print(cml_coordinate_info.loc['AMK_1012_1655_59-ODU_25-ODU'])

In [None]:
def interpolate_scattered_to_grid(scattered_x, scattered_y, scattered_values, 
                                  grid_x, grid_y, method='linear'):
    """
    Resample scattered (x, y, value) samples onto a rectilinear grid.

    Parameters:
        scattered_x, scattered_y: coordinates of the known samples.
        scattered_values: value observed at each sample.
        grid_x, grid_y: 1-D axes of the target grid.
        method: interpolation method forwarded to scipy's ``griddata``.

    Returns:
        Array of shape (len(grid_y), len(grid_x)); cells that ``griddata``
        cannot interpolate are filled with NaN.
    """
    from scipy.interpolate import griddata

    # Target locations: every (x, y) node of the grid, flattened row by row.
    mesh_x, mesh_y = np.meshgrid(grid_x, grid_y)
    target_xy = np.stack((mesh_x, mesh_y), axis=-1).reshape(-1, 2)

    # Known locations as an (n, 2) array of (x, y) pairs.
    sample_xy = np.column_stack([scattered_x, scattered_y])

    flat_result = griddata(
        sample_xy,
        scattered_values,
        target_xy,
        method=method,
        fill_value=np.nan,
    )

    # Restore the 2-D grid layout.
    return flat_result.reshape(mesh_x.shape)

In [None]:
from pykrige.uk import UniversalKriging

# Single-timestep comparison on one row of all_sources_df:
#   [0][0] raw radar grid
#   [0][1] universal kriging (regional linear drift)
#   [1][0] kriging with the radar grid as external drift
#   [1][1] kriging with the gridded CML field as external drift
variogram_model = "gaussian"

fig, axes = plt.subplots(nrows=2, ncols=2)
row_data = all_sources_df.iloc[3].dropna()

# Split the row's index positionally into gauge columns and CML link columns.
# NOTE(review): the offsets (count, +5, and the [2:] below) are hard-coded
# against the current column layout - confirm if the merge order changes.
raingauge_station_names = list(row_data.index[:raingauge_station_count])
cml_station_names = list(row_data.index[raingauge_station_count + 5:])

# Gauges as [lon, lat, value] triples.
data = [] #initialise array

for s in raingauge_station_names[2:]:
  lat, long = station_dict[s]
  data.append([long, lat, row_data[s]])

gauge_data = np.array(data)

# CML links as [lon, lat, value] triples, located at the link midpoint.
data = [] #initialise array

for s in cml_station_names:
   lat, lon = cml_coordinate_info.loc[s]
   data.append([lon, lat, row_data[s]])

cml_data = np.array(data)

# Output grid for the kriged fields (cell centres, 0.01 degree spacing).
gridx = np.arange(103.605, 104.05, 0.01)
gridy = np.arange(1.145, 1.51, 0.01)

#RADAR FOR USE IN EXTERNAL DRIFT

radar_grid = row_data['data']
bounds = row_data['bounds']
transform = row_data['transform']
x_min = bounds.left
y_max = bounds.top
pixel_width = transform[0]
pixel_height = -transform[4]

# NOTE(review): the coordinates below place row 0 of radar_grid at bounds.top,
# so origin='lower' likely draws the radar flipped relative to the kriged
# panels - confirm and switch to origin='upper' if so.
axes[0][0].imshow(radar_grid, origin='lower')
axes[0][0].set_title("radar")

# Cell-centre coordinates of the radar grid.
# x: one value per COLUMN (shape[1]); the previous loop used shape[0], which
#    is only correct for square grids.
# y: one value per ROW, descending from the top edge; the centre of row r is
#    half a pixel BELOW its upper edge (the previous code added the half
#    pixel, shifting the whole radar field one pixel north).
e_dx = x_min + (np.arange(radar_grid.shape[1]) + 0.5) * pixel_width
e_dy = y_max - (np.arange(radar_grid.shape[0]) + 0.5) * pixel_height


#Need to interpolate the cml values to the grid in order to use function

second_var_grid = interpolate_scattered_to_grid(
   cml_data[:, 0],
   cml_data[:, 1],
   cml_data[:, 2],
   e_dx,
   e_dy
)

# Cells the interpolation could not fill come back as NaN; treat them as 0.
second_var_grid = np.nan_to_num(np.array(second_var_grid), nan=0.0)

# Radar and CML grids side by side. Not consumed by any model in this cell.
cml_radar_grid = np.column_stack([radar_grid, second_var_grid])


KED_with_radar = UniversalKriging(
    x=gauge_data[:, 0],
    y=gauge_data[:, 1],
    z=gauge_data[:, 2],
    variogram_model=variogram_model,
    drift_terms=["external_Z"],
    external_drift=radar_grid,
    external_drift_x=e_dx,
    external_drift_y=e_dy,
    pseudo_inv=True
)

# NOTE(review): despite the name, the external drift here is the CML grid
# alone (second_var_grid); the radar grid is not part of this model.
KED_with_radar_cml = UniversalKriging(
    x=gauge_data[:, 0],
    y=gauge_data[:, 1],
    z=gauge_data[:, 2],
    variogram_model=variogram_model,
    drift_terms=['external_Z'],
    external_drift=second_var_grid,
    external_drift_x=e_dx,
    external_drift_y=e_dy,
    pseudo_inv=True
)

UK = UniversalKriging(
    gauge_data[:, 0],
    gauge_data[:, 1],
    gauge_data[:, 2],
    variogram_model=variogram_model,
    drift_terms=["regional_linear"],
    pseudo_inv=True
)


z,ss = KED_with_radar.execute("grid", gridx, gridy)
z2,ss2 = UK.execute("grid", gridx, gridy)
z3,ss3 = KED_with_radar_cml.execute("grid", gridx, gridy)

axes[0][1].imshow(z2, origin='lower') #Universal kriging
axes[0][1].set_title("universal kriging")
axes[1][0].imshow(z, origin='lower') #UKriging with external drift
axes[1][0].set_title("KED")
axes[1][1].imshow(z3, origin='lower') #Kriging with external drift CML + radar
axes[1][1].set_title("KED + cml")
plt.show()