# NIR EDA

NIR is a new feature in the glacier mapper and needs to be tested. This notebook is mainly for visualization and some basic correlations; the goal is to find out which features are most likely to be useful.

## Imports

In [None]:
import os
import sys
from typing import Dict, Any, Optional

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add the parent directory to sys.path to allow imports from monthly_forecasting
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

## Config

In [None]:
# Input locations and HRU identifiers for this notebook. The same dictionary is
# passed to `load_data` below as both of its arguments.
# All file and directory paths are written relative (they start with "../").
data_config = {
    "path_discharge": "../../../../data/discharge/digitized_kyrgyz_hydromet/kyrgyz_hydromet_discharge_daily_2000_2023_kgz_filtered_v2.csv",
    "path_forcing": "../../../../data/forcing/ERA5_krg/HRU00003_forcing_2000-2023.csv",
    "path_static_data": "../../../../GIS/ML_Sandro/ML_basin_attributes_v2.csv",
    "path_to_sla": "../../../../data/sla_silvan/fsc_sla_timeseries_gapfilled.csv",
    "path_to_nir": "../../../../data/sla_silvan/meanNIR_TS_allBasins.csv",
    # None is forwarded unchanged to dl.load_data as `path_to_sca`.
    "path_to_sca": None,
    # NOTE(review): not read anywhere in the visible part of this notebook.
    "path_to_hru_shp": None,
    "path_to_swe": "../../../../data/snow/kyrgyzstan_ts/SWE",
    "path_to_hs": "../../../../data/snow/kyrgyzstan_ts/HS",
    "path_to_rof": "../../../../data/snow/kyrgyzstan_ts/RoF",
    "HRU_SWE": "HRU_00003",
    "HRU_HS": "HRU_00003",
    "HRU_ROF": "HRU_00003",
    # NOTE(review): not read anywhere in the visible part of this notebook.
    "model_home_path": "../../monthly_forecasting_models/GlacierMapper_Based",
}

## Data Loading

In [None]:
# Project data-loading helpers, used by `load_data` below.
from monthly_forecasting.scr import data_loading as dl

# suppress logging from matplotlib
import logging

# matplotlib's logger only emits WARNING and above; the root logger is
# configured at INFO level.
logging.getLogger("matplotlib").setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)
# Module-level logger for this notebook.
logger = logging.getLogger(__name__)


def load_data(data_config: Dict[str, Any], path_config: Dict[str, Any]) -> tuple:
    """
    Read the time-series data and the static attributes via ``dl.load_data``.

    After loading, the result is normalised: the ``log_discharge`` column is
    removed when present, rows are ordered by ``date``, a ``CODE`` column in
    the static frame is renamed to ``code``, and ``code`` is cast to ``int``
    in both frames.

    Args:
        data_config: Not read by this function.
        path_config: Mapping with the input paths and HRU identifiers.
            ``path_to_sla`` and ``path_to_nir`` are optional and default to
            ``None``; every other key listed below must be present.

    Returns:
        Tuple of (time-series frame, static frame).

    Raises:
        KeyError: If a required key is missing from ``path_config``.
    """
    # Keys that must exist in path_config; looked up in this order.
    required_keys = (
        "path_discharge",
        "path_forcing",
        "path_static_data",
        "path_to_sca",
        "path_to_swe",
        "path_to_hs",
        "path_to_rof",
        "HRU_SWE",
        "HRU_HS",
        "HRU_ROF",
    )
    loader_kwargs = {key: path_config[key] for key in required_keys}
    # Optional inputs fall back to None when absent.
    loader_kwargs["path_to_sla"] = path_config.get("path_to_sla", None)
    loader_kwargs["path_to_nir"] = path_config.get("path_to_nir", None)

    timeseries, attributes = dl.load_data(**loader_kwargs)

    # The log-transformed discharge column is not wanted downstream.
    if "log_discharge" in timeseries.columns:
        timeseries.drop(columns=["log_discharge"], inplace=True)

    timeseries = timeseries.sort_values("date")
    timeseries["code"] = timeseries["code"].astype(int)

    # Harmonise the identifier column name between both frames.
    if "CODE" in attributes.columns:
        attributes.rename(columns={"CODE": "code"}, inplace=True)
    attributes["code"] = attributes["code"].astype(int)

    return timeseries, attributes


# Run the loader; the same dictionary is used for both arguments.
hydro_ca, static_df = load_data(data_config, data_config)
print("Data loaded successfully.")

# Quick look at the loaded time series: dimensions first ...
print(f"hydro_ca shape: {hydro_ca.shape}")

# ... then the dtype of each column followed by the first rows.
print(hydro_ca.dtypes, hydro_ca.head(), sep="\n")

## Visualization

In [None]:
# Per-variable feature specification; passed to FE.StreamflowFeatureExtractor
# as `feature_configs` further down. Each variable maps to a list of entries
# giving an operation, the window lengths to apply it over, and (here always
# empty) lags.
# The feature columns referenced later in this notebook follow the pattern
# "<variable>_roll_<operation>_<window>" (e.g. "NIR_roll_last_value_30");
# presumably they are generated from these entries -- confirm against
# FeatureExtractor.
feature_config = {
    "discharge": [
        {
            "operation": "mean",
            "windows": [
                15,
                30,
            ],
            "lags": {},
        },
    ],
    "P": [{"operation": "sum", "windows": [15, 30], "lags": {}}],
    "T": [{"operation": "mean", "windows": [15, 30], "lags": {}}],
    "fsc_basin": [{"operation": "last_value", "windows": [30], "lags": {}}],
    # NIR is extracted twice: as the last value and as the mean of the window.
    "NIR": [
        {"operation": "last_value", "windows": [30], "lags": {}},
        {"operation": "mean", "windows": [30], "lags": {}},
    ],
    "SWE": [{"operation": "mean", "windows": [15, 30], "lags": {}}],
}

In [None]:
def plot_monthly_correlation(
    data: pd.DataFrame,
    feature_col: str,
    target_col: str,
    *,
    month_col: str = "month",
    jitter: float = 0.2,
    point_alpha: float = 0.45,
    figsize: tuple[int, int] = (12, 6),
) -> None:
    """Plot monthly correlation distribution between a feature and a target.

    For every (month, code) group the correlation between ``feature_col`` and
    ``target_col`` is computed with ``Series.corr``. Groups whose correlation
    is NaN are dropped. The remaining values are shown per month as a boxplot
    with the individual group values overlaid as jittered points.

    Args:
        data: Frame containing ``feature_col``, ``target_col``, ``code`` and
            either ``month_col`` or a datetime ``date`` column.
        feature_col: Name of the feature column.
        target_col: Name of the target column.
        month_col: Name of the month column. When absent it is derived from
            ``data["date"].dt.month`` on a copy of ``data``.
        jitter: Jitter passed to ``sns.stripplot``.
        point_alpha: Alpha passed to ``sns.stripplot``.
        figsize: Figure size passed to ``plt.figure``.

    Raises:
        ValueError: If neither ``month_col`` nor ``date`` is present, or if
            any other required column is missing.
    """
    if month_col not in data.columns:
        if "date" not in data.columns:
            raise ValueError(
                "DataFrame must contain either 'date' or the month column."
            )
        # Work on a copy so the caller's frame does not gain a new column.
        data = data.copy()
        data[month_col] = data["date"].dt.month

    required = {feature_col, target_col, month_col, "code"}
    missing = required.difference(data.columns)
    if missing:
        missing_str = ", ".join(sorted(missing))
        raise ValueError(f"DataFrame is missing required columns: {missing_str}")

    # Select the value columns explicitly before `apply`: applying to the full
    # frame also operates on the grouping columns, which pandas >= 2.2 flags
    # with a DeprecationWarning. De-duplicate so feature_col == target_col
    # does not produce a frame with two identically named columns.
    value_cols = list(dict.fromkeys((feature_col, target_col)))
    correlations = (
        data.groupby([month_col, "code"], dropna=False)[value_cols]
        .apply(lambda group: group[feature_col].corr(group[target_col]))
        .reset_index(name="correlation")
        .dropna(subset=["correlation"])
    )

    plt.figure(figsize=figsize)
    sns.boxplot(
        data=correlations,
        x=month_col,
        y="correlation",
    )
    # Overlay the individual (month, code) correlations on top of the boxes.
    sns.stripplot(
        data=correlations,
        x=month_col,
        y="correlation",
        color="black",
        alpha=point_alpha,
        jitter=jitter,
    )
    plt.title(f"Correlation between {feature_col} and {target_col} by Month")
    plt.xlabel("Month")
    plt.ylabel("Correlation Coefficient")
    plt.ylim(-1, 1)
    # Zero line as a visual reference for the sign of the correlation.
    plt.axhline(0, color="red", linestyle="--")
    plt.show()

In [None]:
from monthly_forecasting.scr import FeatureExtractor as FE

# Use FeatureExtractor for time series features
extractor = FE.StreamflowFeatureExtractor(
    feature_configs=feature_config,
    prediction_horizon=30,
    offset=30,
)

data = extractor.create_all_features(hydro_ca)

# Only keep the last day of each month. The explicit copy makes `data` an
# independent frame, so adding the "month" column below does not write into a
# filtered slice of the extractor output (avoids SettingWithCopyWarning).
data = data[data["date"].dt.is_month_end].copy()

print("Features extracted successfully.")
print(f"Feature data shape: {data.shape}")
# columns
print(f"Feature data columns: {data.columns.tolist()}")

data["month"] = data["date"].dt.month

# Shape again, now including the added "month" column.
print(f"Feature data shape: {data.shape}")

# plot the distribution of months against NIR values
plt.figure(figsize=(10, 6))
sns.boxplot(x=data["month"], y=data["NIR_roll_last_value_30"])
plt.title("Distribution of NIR Values by Month")
plt.xlabel("Month")
plt.ylabel("NIR Value")
plt.show()


# (feature, target) pairs for which the per-month, per-basin correlation is
# plotted, in display order.
correlation_pairs = [
    # NIR against the target and against the snow-cover feature
    ("NIR_roll_last_value_30", "target"),
    ("NIR_roll_last_value_30", "fsc_basin_roll_last_value_30"),
    ("fsc_basin_roll_last_value_30", "target"),
    # meteorological forcing
    ("T_roll_mean_30", "target"),
    ("P_roll_sum_30", "target"),
    # rolling discharge means
    ("discharge_roll_mean_30", "target"),
    ("discharge_roll_mean_15", "target"),
    # swe
    ("SWE_roll_mean_30", "target"),
    ("SWE_roll_mean_15", "target"),
]

for feature_name, target_name in correlation_pairs:
    plot_monthly_correlation(
        data, feature_col=feature_name, target_col=target_name
    )

In [None]:
# Pairwise correlation matrix of NIR, fsc_basin, temperature, precipitation
# and the target, drawn as an annotated heatmap.

features = [
    "NIR_roll_last_value_30",
    "fsc_basin_roll_last_value_30",
    "T_roll_mean_30",
    "P_roll_sum_30",
    "target",
]

corr_matrix = data.loc[:, features].corr()

plt.figure(figsize=(8, 6))
# Fix the colour scale to the full correlation range so the map is centred on 0.
sns.heatmap(
    data=corr_matrix,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
)
plt.gca().set_title("Correlation Matrix")
plt.show()

In [None]:
# Daily runoff CSV (absolute, user-specific path).
discharge_data_path = "/Users/sandrohunziker/hydrosolutions Dropbox/Sandro Hunziker/SAPPHIRE_Central_Asia_Technical_Work/data/kyg_data_forecast_tools/intermediate_data/runoff_day.csv"

# Codes to keep and the earliest date (inclusive) to keep.
select_code = [16059]
cutoff_date = pd.to_datetime("2025-09-01")

# Read the file, keep the selected code(s), then keep rows dated on or after
# the cutoff date.
discharge_df = (
    pd.read_csv(discharge_data_path, parse_dates=["date"])
    .loc[lambda frame: frame["code"].isin(select_code)]
    .loc[lambda frame: frame["date"] >= cutoff_date]
)

# print the dataframe
print(discharge_df)