# TAMSAT Pertinence Analysis

In [None]:
#| default_exp tamsat

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from pathlib import Path
import xarray as xr
import geopandas as gpd
import hvplot.pandas
import pandas as pd
import matplotlib.pyplot as plt

from ombs_senegal.region import get_region_mask


# Directory holding the input and derived data files used below. The path is
# relative, so it resolves against the current working directory.
DATA_PATH = Path("../../data")

## TAMSAT Data preprocessing

This section preprocesses TAMSAT rainfall data. First, we load the TAMSAT data and mask it to the region of interest.
 



In [None]:
#| skip_export
# Open the TAMSAT daily rainfall-estimate NetCDF file.
tamsat = xr.open_dataset(DATA_PATH/"01-tamsatDaily.v3.1-20100101-20250531-20250603_-16.85_-6.05_10.15_18.95.nc")
# Read the shapefile describing the region of interest.
roi_gdf = gpd.read_file(DATA_PATH/"point_ajustement/sub_poly.shp")
# Build the region mask for the TAMSAT dataset; see `get_region_mask` in
# `ombs_senegal.region` for the exact mask semantics.
mask = get_region_mask(tamsat, roi_gdf)

In [None]:
#| skip_export
# Keep values where `mask` is true; `where` replaces the others with NaN.
roi_tamsat = tamsat.where(mask)
# Drop everything after 2024-12-31 (label-based slices include the end label).
roi_tamsat = roi_tamsat.sel(time=slice(None, "2024-12-31"))

Since we're interested in the total rainfall across the basin rather than its spatial distribution, we'll sum up all rainfall values within the basin area. We'll save this aggregated data to avoid repeating the preprocessing steps.

In [None]:
#| skip_export
# Collapse the spatial dimensions, leaving one total per time step.
daily_total = roi_tamsat.sum(["lat", "lon"])
# Cache the aggregate on disk so the masking/summing need not be repeated.
daily_total.to_netcdf(DATA_PATH/"tamsat_sub_poly_daily_total.nc")

## TAMSAT estimate to in situ correlation

We will analyze the correlation between TAMSAT rainfall estimates and observed river discharge (débit).
To reduce noise and identify long-term patterns, we'll aggregate the data annually. This will help us:
1. Evaluate how well TAMSAT rainfall estimates correspond to actual river flow
2. Assess the potential effectiveness of using TAMSAT data in our benchmark model
3. Account for seasonal patterns and lag effects between rainfall and discharge

The correlation analysis will provide insights into whether TAMSAT data can be a reliable predictor for river discharge in our study area.

In [None]:
#| skip_export
# Load the in-situ table: semicolon-separated, indexed by `time` (parsed to
# datetimes), keeping only the `débit_insitu` and `P_mean` columns.
insitu_df = pd.read_csv(
    DATA_PATH/'data_cumul.csv', 
    sep=';', 
    usecols=['time', 'débit_insitu', 'P_mean'], 
    index_col='time',
    converters={"time": pd.to_datetime}
    )

In [None]:
# Read the cached basin-wide totals back; `load_dataset` loads them fully into memory.
tamsat_daily_total = xr.load_dataset(DATA_PATH/"tamsat_sub_poly_daily_total.nc")

In [None]:
#| skip_export
# Join the in-situ table and the TAMSAT `rfe` series on their indexes.
# `pd.merge` defaults to an inner join, so only shared timestamps remain.
combined_df = pd.merge(insitu_df, tamsat_daily_total["rfe"].to_dataframe(), left_index=True, right_index=True)
# Sum every column per calendar year ("YS" bins start at each year start).
yearly_df = combined_df.resample("YS").sum()
# Min-max scale each column to [0, 1] so the variables share a common range.
yearly_df = (yearly_df - yearly_df.min())/(yearly_df.max() - yearly_df.min())

In [None]:
def r2(x, y):
    """Return the coefficient of determination of `y` with respect to `x`.

    `x` is the reference series: its mean defines the total sum of squares.
    `y` is the series compared against it. Both are expected to be
    pandas objects with aligned indexes.
    """
    residual_ss = ((x - y) ** 2).sum()
    total_ss = ((x - x.mean()) ** 2).sum()
    return 1 - residual_ss / total_ss


In [None]:
from sklearn.metrics import r2_score

# R² of the yearly, min-max scaled discharge against TAMSAT `rfe` and against
# `P_mean`; the resulting tuple is displayed as the cell output.
r2(yearly_df["débit_insitu"], yearly_df["rfe"]), r2(yearly_df["débit_insitu"], yearly_df["P_mean"])

In [None]:
#| skip_export
# Scatter the yearly discharge against both rainfall series. Each series gets
# a legend label so the two point clouds can be told apart.
plt.figure(figsize=(7,6))
plt.scatter(yearly_df['débit_insitu'], yearly_df['rfe'], label='rfe (TAMSAT)')
plt.scatter(yearly_df['débit_insitu'], yearly_df['P_mean'], label='P_mean')

# Add year labels to each point
for idx, row in yearly_df.iterrows():
    plt.annotate(idx.year, (row['débit_insitu'], row['rfe']), xytext=(5,5), textcoords='offset points')
    plt.annotate(idx.year, (row['débit_insitu'], row['P_mean']), xytext=(5,5), textcoords='offset points')

plt.legend()
# `yearly_df` holds min-max scaled values, so the axes are unitless.
plt.xlabel('Débit in-situ (normalized)')
plt.ylabel('Rainfall Estimate (normalized)')
plt.title('Débit vs Rainfall')

As expected, the graph above shows that, despite some outliers such as 2020, there is a strong correlation between the rainfall estimate and the river flow.

We will now take a closer look by plotting the smoothed and normalized daily data.


In [None]:
#| skip_export
from ombs_senegal.benchmark_model import normalize

In [None]:
#| export
def smooth(df, window=7, missing_values=0):
    """Return the rolling-window sum of `df` with NaN entries replaced.

    Note that this is a rolling *sum*, not a rolling mean. The input is not
    modified.

    Args:
        df: pandas Series or DataFrame to aggregate.
        window: Size of the rolling window, forwarded to `rolling`.
        missing_values: Value substituted for NaN in the result, which
            includes the leading positions where the window is not yet full.

    Returns:
        A new pandas object of the same shape as `df`.
    """
    rolling_total = df.rolling(window=window).sum()
    return rolling_total.fillna(missing_values)

In [None]:
#| skip_export
# Work on a copy so `combined_df` itself stays unchanged.
processed_df = combined_df.copy()
# Replace `rfe` with its rolling sum over a 7-row window.
processed_df["rfe"] = smooth(combined_df["rfe"], window=7)
# Normalize with the project helper; see `ombs_senegal.benchmark_model.normalize`
# for the scaling it applies.
normalized_df = normalize(processed_df)

# Line plot of all normalized columns; the plot is the cell output.
normalized_df.hvplot.line()

## Model Benchmark with TAMSAT 

Based on the strong correlation observed between TAMSAT rainfall estimates and river flow, we will now evaluate the benchmark model using TAMSAT data. We will conduct two analyses:
1. Using only TAMSAT rainfall estimates and MGB water flow predictions as input features
2. Using all available parameters (TAMSAT rainfall, MGB flow, and other variables) as input features

Similar to our previous analysis with IMERG data, we will:
- Test different time window sizes to capture temporal patterns
- Evaluate multiple polynomial degrees to model non-linear relationships
- Compare model performance using standard metrics (RMSE, MAE) and visual analysis

This will allow us to:
- Assess TAMSAT's effectiveness as a predictor
- Compare results with the IMERG-based models
- Determine optimal model parameters

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from ombs_senegal.benchmark_model import FeatureGenerator, SimpleRegressionModel
from ombs_senegal.benchmark_model import mae, rmse

from ombs_senegal.benchmark_model import plot_benchmark_scores

In [None]:
#| skip_export
# Load the benchmark table: semicolon-separated, indexed by `time` (parsed to
# datetimes), keeping `P_cumul_7j`, `débit_insitu` and `débit_mgb`.
df = pd.read_csv(
    DATA_PATH/'data_cumul.csv', 
    sep=';', 
    usecols=['time', 'P_cumul_7j', 'débit_insitu', 'débit_mgb'], 
    index_col='time',
    converters={"time": pd.to_datetime}
    )

# Read the cached basin-wide TAMSAT totals.
tamsat_daily_total = xr.load_dataset(DATA_PATH/"tamsat_sub_poly_daily_total.nc")

# Join on the indexes. `pd.merge` defaults to an inner join, so `data` only
# keeps timestamps present in both sources.
data = pd.merge(df, tamsat_daily_total["rfe"].to_dataframe(), left_index=True, right_index=True)


#### Select feature and target columns

In [None]:
# Feature columns (MGB discharge and TAMSAT rainfall) and the target column
# (in-situ discharge), both kept as lists of column names.
x_col, y_col = ["débit_mgb", "rfe"], ['débit_insitu']


#### Smooth data

In [None]:
# Replace `rfe` in place with its rolling sum over a 7-row window.
data["rfe"] = smooth(data["rfe"], window=7)

#### Scale data

In [None]:
from sklearn.preprocessing import RobustScaler
# Scale the features with RobustScaler
features_scaler = RobustScaler()

features = data[x_col]
# NOTE(review): the scaler is fitted on the whole of `data`, i.e. before the
# train/validation split, so validation rows influence the scaling
# statistics. Confirm this is intended.
data[x_col] = features_scaler.fit_transform(features)


In [None]:
#| skip_export
# Chronological split: rows before 2019-01-01 train, the rest validate.
# The mask must come from `data.index`, the frame it is applied to. `data` is
# the result of an inner merge and may hold fewer rows than `df`, in which
# case a mask derived from `df` would not line up with `data`.
train_mask = data.index < '2019-01-01'

# normalized_data = normalize(data[x_col])
# normalized_data[y_col[0]] = data[y_col[0]]

train = data[train_mask]
valid = data[~train_mask]

In [None]:
#| hide
#| eval: false
# Grid search over degree (1 to 3) and context window (10 to 50, step 10).
predictions = []
scores = []  # NOTE(review): never filled or read in the visible code
for degree in range(1, 4):
    for window in range(10, 51, 10):
        # Build features for this (window, degree) pair; target_window stays at 10.
        feature_generator = FeatureGenerator(context_window=window, target_window=10, degree=degree)        
        train_x, train_y = feature_generator.generate(train, x_col, y_col)
        valid_x, valid_y = feature_generator.generate(valid, x_col, y_col)

        # Fit on the training split, predict on the validation split.
        model = SimpleRegressionModel()
        model.fit(train_x, train_y)
        pred = model.predict(valid_x)

        # Tag the predictions with their hyper-parameters and append both tags
        # to the existing index (presumably `time`; see the reorder below).
        pred_df = pred.copy()
        pred_df['degree'] = degree
        pred_df['window'] = window
        pred_df = pred_df.set_index(['degree', 'window'], append=True)
        predictions.append(pred_df)

# Stack all runs into one Dataset indexed by (degree, window, time).
predictions_ds = pd.concat(predictions).reorder_levels(['degree', 'window', 'time']).to_xarray()
# Restrict the observations to the time span covered by the predictions.
observations = valid[y_col].to_xarray().sel(time=slice(predictions_ds.time.min(), predictions_ds.time.max()))
# Keep predictions and observations together in a single Dataset.
results_ds = predictions_ds.merge(observations)

In [None]:


# Score the predictions against the observed discharge with the project's
# `mae` and `rmse` helpers. `to_array()` stacks the prediction variables
# along a new `variable` dimension.
mae_ds = mae(predictions_ds.to_array(), observations["débit_insitu"])
rmse_ds = rmse(predictions_ds.to_array(), observations["débit_insitu"])


In [None]:
#| hide
#| eval: false
# Combine both score arrays into one Dataset with variables "mae" and "rmse".
scores_ds = mae_ds.to_dataset(name="mae").merge(rmse_ds.to_dataset(name="rmse"))

In [None]:
#| hide
#| eval: false
import xarray as xr
def find_min_coords(ds):
    """Find coordinates of minimum values in dataset.

    Args:
        ds: Named `xarray.DataArray` with `window` and `degree` dimensions.

    Returns:
        A `pandas.DataFrame` with one row per position where `ds` equals its
        minimum over `window` and `degree`. The single column is named after
        `ds.name`.
    """
    # Reduce once and reuse the result for the comparison below.
    min_val = ds.min(dim=["window", "degree"])
    # Mask everything but the minima; `drop=True` trims labels holding none.
    min_coords = ds.where(ds == min_val, drop=True)
    return min_coords.to_dataframe(name=ds.name).dropna()

# Locate the lowest-MAE and lowest-RMSE configurations.
min_mae_coords = find_min_coords(mae_ds)
min_rmse_coords = find_min_coords(rmse_ds)

# Put both results side by side, aligned on their index.
min_coords = pd.concat([min_mae_coords, min_rmse_coords], axis=1)

In [None]:
# Drop rows that are NaN for every metric, then order and sort the index
# levels for display.
min_coords.dropna(how="all").reorder_levels(['variable', 'degree', 'window']).sort_index()

In [None]:
# Plot the scores for degree 2 and windows 40 to 50 only (label-based slices
# include both ends).
plot_benchmark_scores(scores_ds.sel(degree=slice(2, 2), window=slice(40, 50)).to_dataframe(),)# xlim=(None, 120), ylim=(None, 225))

With IMERG: Degree: 1, Window: 50, MSE: 19955.6, MAE: 83.0

Without IMERG: Degree: 2, Window: 50, MSE: 20931.7, MAE: 78.3


In [None]:
#| hide
#| eval: false
# Keep the degree-2 results for windows 30 to 50 as the benchmark to persist.
benchmark_ds = results_ds.sel(degree=2, window=slice(30,50))
# Write the file under DATA_PATH, alongside the other data files.
benchmark_ds.to_netcdf(DATA_PATH/'regression_benchmark.nc')

In [None]:
#| hide
# Run nbdev's export step to write the exported cells to the package.
import nbdev; nbdev.nbdev_export()