In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import xarray as xr
import geopandas as gpd
import hvplot.pandas
import pandas as pd
import matplotlib.pyplot as plt

from ombs_senegal.region import get_region_mask


DATA_PATH = Path("../../data")

In [None]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from ombs_senegal.benchmark_model import FeatureGenerator, SimpleRegressionModel, BenchmarkScores
from ombs_senegal.benchmark_model import plot_interactive_benchmark_scores, plot_prediction_comparison
from ombs_senegal.season import SeasonalityHandler

In [None]:
# Discharge table: the in-situ and MGB discharge columns, indexed by the
# parsed `time` column.
df = pd.read_csv(
    DATA_PATH / 'data_cumul.csv',
    sep=';',
    usecols=['time', 'débit_insitu', 'débit_mgb'],
    index_col='time',
    converters={"time": pd.to_datetime},
)

# TAMSAT dataset; only its `rfe` variable is used below.
tamsat_daily_total = xr.load_dataset(DATA_PATH / "tamsat_sub4_senegal_daily_total.nc")
rfe_frame = tamsat_daily_total["rfe"].to_dataframe()

# Join both sources on their index (default inner join).
data = df.merge(rfe_frame, left_index=True, right_index=True)
# Duplicate the in-situ discharge under a separate name so the same series can
# be used both as feature and as target.
data["débit_insitu_x"] = data["débit_insitu"].copy()


#### Preprocess data

Select feature and target columns

In [None]:
# Columns fed to the feature generator as inputs.
x_col = ["débit_insitu_x"]
# Columns the models are trained to predict.
y_col = ["débit_insitu"]


Split the data

In [None]:
# Chronological split: everything up to the end of 2018 is used for training,
# everything from 2019 onwards for testing.
TRAIN_END, TEST_START = "2018-12-31", "2019-01-01"
train = data.loc[:TRAIN_END]
test = data.loc[TEST_START:]

Add target season column

In [None]:
# Seasonality handling is currently disabled: every statement in this cell is
# commented out, so `season_handler` is never created and `train`/`test` are
# left unchanged by this step.
# season_handler =SeasonalityHandler()
# _ = season_handler.compute_seasonal_pattern(train[y_col])
# train = season_handler.remove_seasonality(train)
# train = season_handler.append_season(train)
# test = season_handler.append_season(test)

Smooth data

In [None]:
def smooth(df, window=7, missing_val=0):
    """Return the rolling sum of `df` with missing results filled in.

    Args:
        df: pandas Series or DataFrame to aggregate.
        window: Size of the rolling window passed to `rolling`.
        missing_val: Value substituted for every NaN in the rolling sum
            (e.g. the first `window - 1` positions, where the window is
            not yet full).

    Returns:
        An object of the same type as `df` holding the filled rolling sum.
    """
    rolling_total = df.rolling(window=window).sum()
    return rolling_total.fillna(missing_val)

# Replace the `rfe` column of `data` by its 7-step rolling sum.
# NOTE(review): `train` and `test` were sliced from `data` in an earlier cell,
# so this assignment presumably does not reach them — confirm whether the
# smoothing is meant to run before the split. Re-running this cell also
# smooths the already-smoothed column again.
data["rfe"] = smooth(data["rfe"], window=7)

Scale data

In [None]:
# Scaler intended for the feature columns. It is only instantiated here: the
# fit/transform calls below are commented out, so this cell applies no scaling.
features_scaler = RobustScaler()

#train[x_col] = features_scaler.fit_transform(train[x_col])
#test[x_col] = features_scaler.transform(test[x_col])


In [None]:

# Grid explored by the benchmark: one model is fitted per
# (degree, context window) pair.
# The context-window grid runs up to 10 inclusive because the "Save results"
# section selects `ctx_window=10`; with `range(1, 10)` that label was never
# produced and the selection failed on a fresh "Restart & Run All".
DEGREES = range(1, 10)
CONTEXT_WINDOWS = range(1, 11)
TARGET_WINDOW = 10

# Collect the per-configuration prediction frames under their own name so that
# `predictions` only ever refers to the final xarray object.
prediction_frames = []
for degree in DEGREES:
    for window in CONTEXT_WINDOWS:
        feature_generator = FeatureGenerator(
            context_window=window, target_window=TARGET_WINDOW, degree=degree
        )
        train_x, train_y = feature_generator.generate(train, x_col, y_col)
        # Test targets are not needed here: observations are taken from `test` below.
        test_x, _ = feature_generator.generate(test, x_col, y_col)

        model = SimpleRegressionModel()
        model.fit(train_x, train_y)
        prediction_frames.append(
            model.predict_as_dataframe(test_x, degree=degree, ctx_window=window)
        )

# Stack every configuration into one object indexed by (degree, ctx_window, time).
predictions = (
    pd.concat(prediction_frames)
    .reorder_levels(['degree', 'ctx_window', 'time'])
    .to_xarray()
)
# Restrict the observations to the time span covered by the predictions.
observations = test[y_col[0]].to_xarray().sel(
    time=slice(predictions.time.min(), predictions.time.max())
)


In [None]:
def reseasonalize(ds, season_handler):
    """Add the seasonal component back to a set of predictions.

    Args:
        ds: Object exposing `to_dataframe()` whose frame is indexed by
            `degree`, `ctx_window` and at least one more level.
        season_handler: Object exposing `add_seasonality(frame)`.

    Returns:
        The result of `to_xarray()` on the re-seasonalised frame, with
        `degree` and `ctx_window` appended back to the index.
    """
    config_dims = ["degree", "ctx_window"]
    frame = ds.to_dataframe()

    # Keep the configuration labels aside: the handler receives a frame that
    # is indexed without them.
    config_labels = frame.reset_index(config_dims)[config_dims]
    restored = season_handler.add_seasonality(frame.reset_index(config_dims, drop=True))

    # Re-attach the labels and move them back into the index.
    restored[config_dims] = config_labels
    return restored.set_index(config_dims, append=True).to_xarray()

# predictions = reseasonalize(predictions, season_handler)

In [None]:
# Metrics to compute, mapped to the direction in which each one improves
# (errors are minimised, efficiency scores are maximised).
metric_directions = {"mae": "min", "rmse": "min", "nse": "max", "kge": "max"}

benchmark_scores = BenchmarkScores()
# Score every configuration against the observations.
scores_ds = benchmark_scores.compute_scores(
    predictions, observations, list(metric_directions)
)
# Keep only the single best entry per metric.
best_scores = benchmark_scores.find_nbest_scores(
    scores_ds, how=metric_directions, n=1
)

In [None]:
# Interactive plot of the `n=1` best scores selected in the previous cell.
plot_interactive_benchmark_scores(best_scores)

To choose the best model, we will analyze three possibilities:
- Average classic scores such as MAE and RMSE
- Average Hydrological scores such as NSE and KGE
- Average all the metric scores. To make them comparable, MAE and RMSE are first min-max normalized and inverted, so that 1 is the best value and 0 the worst.

In [None]:
def get_best_average_score(scores, how="max"):
    """Pick the best model configuration from the mean of several metrics.

    The metrics in `scores` are averaged as they are (no normalisation is
    applied here), and the best entry is looked up with the notebook-level
    `benchmark_scores` object.

    Args:
        scores: Dataset of metric scores; its variables are averaged together.
        how: Either "max" or "min", i.e. whether a higher or a lower averaged
            score is better.

    Returns:
        A dict mapping each forecast horizon to
        `{"degree": ..., "ctx_window": ...}` for the selected configuration.
    """
    # Collapse the metrics into a single averaged score per configuration.
    averaged = scores.to_array().mean(dim="variable").to_dataset(name="score")

    winners = benchmark_scores.find_nbest_scores(averaged, how={"score": how}, n=1)

    return {
        row["forecast_horizon"]: {
            "degree": row["degree"],
            "ctx_window": row["ctx_window"],
        }
        for _, row in winners.reset_index().iterrows()
    }

We can now get the best models for classic scores and hydrological scores

In [None]:
# Error metrics: lower is better.
classic_metrics = ["mae", "rmse"]
# Hydrological efficiency metrics: higher is better.
hydro_metrics = ["nse", "kge"]

best_model_classic = get_best_average_score(scores_ds[classic_metrics], how="min")
best_model_hydro = get_best_average_score(scores_ds[hydro_metrics], how="max")

Finally we will get the best models based on average score

In [None]:

def normalize_metrics(ds):
    """Min-max normalise `ds` over the model grid and invert the result.

    The minimum and maximum are taken over the `degree` and `ctx_window`
    dimensions, so the smallest value maps to 1 and the largest to 0.

    Args:
        ds: Object supporting `min(dim=...)`, `max(dim=...)` and arithmetic.

    Returns:
        `1 - (ds - min) / (max - min)`.
    """
    grid_dims = ["degree", "ctx_window"]
    lowest = ds.min(dim=grid_dims)
    highest = ds.max(dim=grid_dims)
    span = highest - lowest
    return 1 - (ds - lowest) / span

# Only the error metrics need rescaling: after inversion a higher value is
# better for them too, like for the other metrics in `scores_ds`.
error_metrics = ["mae", "rmse"]

normalized_scores = scores_ds.copy()
normalized_scores[error_metrics] = normalize_metrics(scores_ds[error_metrics])

# With every metric oriented the same way, the best model maximises the mean.
best_model_avg = get_best_average_score(normalized_scores, how="max")

We can now plot the data

In [None]:
# MGB discharge over the test period, passed to the plot alongside the
# regression predictions.
mgb_test = test["débit_mgb"].to_xarray()

_ = plot_prediction_comparison(
    observed=observations,
    predicted=predictions,
    best_model=best_model_avg,
    mgb=mgb_test,
    scores=scores_ds,
)


The analysis reveals distinct patterns in model performance across different metrics. While the model optimized for hydrological scores shows unique behavior, the model selected based on classic metrics closely aligns with the averaged score model's predictions. This alignment may be attributed to similarities in how these scores are calculated.

Based on our visual analysis, we observe two key patterns:

- With a 15-day smoothing window, the best performing model varies depending on the forecast horizon, though it generally corresponds to the model with the highest averaged score
- With a 60-day smoothing window, the model with the best averaged score consistently outperforms other models across all forecast horizons

The visual inspection further validates that the 60-day smoothing window, which shows the strongest correlation with hydrological metrics, produces the most accurate predictions overall.

## Save results

In [None]:
# Display the configuration (degree, ctx_window) selected for each forecast
# horizon when averaging all normalised scores.
best_model_avg

In [None]:
# Configuration exported as the regression benchmark, declared once so that
# predictions and scores are guaranteed to refer to the same model.
SELECTED_MODEL = {"degree": 2, "ctx_window": 10}

# Fail early with an explicit message when the selected configuration is not
# part of the evaluated grid, instead of an opaque KeyError from `.sel`.
not_evaluated = {
    dim: value
    for dim, value in SELECTED_MODEL.items()
    if value not in predictions[dim].values
}
if not_evaluated:
    raise ValueError(
        f"Selected configuration {not_evaluated} was not evaluated; "
        "available values: "
        + ", ".join(f"{dim}={predictions[dim].values.tolist()}" for dim in not_evaluated)
    )

benchmark_predictions = (
    predictions.sel(**SELECTED_MODEL)
    .to_array("forecast_horizon", name="pred")
    .expand_dims({"model": ["Regression"]})
)

# `rename` returns a new object, so re-running this cell is harmless.
observations = observations.rename("obs")

# Deliberately NOT named `benchmark_scores`: that name holds the
# BenchmarkScores instance used by `get_best_average_score`, and rebinding it
# here would break any earlier cell that is re-run afterwards.
selected_scores = (
    scores_ds.sel(**SELECTED_MODEL)
    .to_array("score", name="scores")
    .expand_dims({"model": ["Regression"]})
)

benchmark_results = xr.merge([
    benchmark_predictions,
    observations,
    selected_scores])
benchmark_results.to_netcdf(DATA_PATH/'tamsat_regression_benchmark.nc')