<a href="https://colab.research.google.com/github/kharlescim/ERT_Project/blob/main/FI_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

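Quick code to test which features are most informative for USDM. Each feature is scored by its mutual information with the target class, normalized by the entropy of the target (FI).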

In [None]:
!pip install xarray netCDF4
%pip install bottleneck

Collecting xarray
  Downloading xarray-2025.7.1-py3-none-any.whl.metadata (12 kB)
Collecting netCDF4
  Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading xarray-2025.7.1-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cfti

In [None]:

import pandas as pd
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
import collections

import xarray as xr
import numpy as np

# Open the target (LTD) and the two predictor datasets from local NetCDF files.
LTD_ds = xr.open_dataset('LTD05.nc')
spei_ds = xr.open_dataset('spei_obs_3D.nc')
obs_ds = xr.open_dataset('obs.nc')

LTD = LTD_ds['LTD']

# The predictors are resampled onto the time axis of the LTD target.
weekly_time = LTD_ds.time.values

# Sentinel used for missing data (same value as to_percentile's default).
MISSING_VALUE = -999.0

# Mask the sentinel BEFORE interpolating. Linear interpolation between a real
# value and -999 produces a blended value (e.g. -499.3) that is no longer equal
# to the sentinel, so a later `!= -999` check cannot remove it and it would
# distort the percentile ranks. With NaN in place of the sentinel, the affected
# weekly points come out as NaN and are dropped downstream.
spei_weekly = spei_ds.where(spei_ds != MISSING_VALUE).interp(time=weekly_time, method="linear")
obs_weekly = obs_ds.where(obs_ds != MISSING_VALUE).interp(time=weekly_time, method="linear")

# Function to convert raw values to percentiles
# missing values = -999 in obs - might need to alter for proper percentile (7-4)
def to_percentile(ds, dim='time', missing_val = -999.0):
    """Rank values along ``dim`` as fractions, ignoring a missing-value sentinel.

    Parameters
    ----------
    ds : object exposing ``where`` and ``rank`` (used here with xarray datasets)
        Raw values to convert.
    dim : str
        Dimension along which each series is ranked.
    missing_val : float
        Entries exactly equal to this value are masked out before ranking.

    Returns
    -------
    The result of ``rank(dim=dim, pct=True)`` on the masked input.
    """
    is_present = ds != missing_val
    masked = ds.where(is_present)
    ranked = masked.rank(dim=dim, pct=True)
    return ranked

# Convert both predictor datasets to percentile ranks along time.
percentiles_obs = to_percentile(obs_weekly)
percentiles_spei = to_percentile(spei_weekly)

# Flatten every dataset into a long table keyed by (time, lat, lon).
# Per the original note, ens = 1, so the 'ens' column is dropped from obs.
grid_keys = ['time', 'lat', 'lon']
LTD_df = LTD.to_dataframe().reset_index()
spei_df = percentiles_spei.to_dataframe().reset_index()
obs_df = percentiles_obs.to_dataframe().reset_index().drop(columns=['ens'])

# Inner joins keep only the grid cells / dates present in all three tables.
merged_df = (
    spei_df
    .merge(obs_df, on=grid_keys, how='inner')
    .merge(LTD_df, on=grid_keys, how='inner')
)

# Discard rows with any NaN, then recode the -1 class as 5
# so it works with the feature space.
df = merged_df.dropna().assign(LTD=lambda frame: frame["LTD"].replace(-1, 5))

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler


# Separate the class label from the predictors; the coordinate columns
# (time, lat, lon) are not used as features.
y = df["LTD"]
X = df.drop(columns=["LTD", "time", "lat", "lon"])

# Rescale every feature to [0, 1] if needed (helps the MI estimator).
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Mutual information between each continuous feature and the class label;
# random_state is fixed so the estimate is reproducible.
mi_scores = mutual_info_classif(
    X_scaled,
    y,
    discrete_features=False,
    random_state=42,
)

from sklearn.metrics import mutual_info_score
from scipy.stats import entropy

# Shannon entropy of the target class distribution, used to normalize MI.
target_probs = df["LTD"].value_counts(normalize=True).values
H_y = entropy(target_probs, base=2)  # bits

# mutual_info_classif's k-NN estimator works with natural logarithms, so its
# scores are in nats. Convert them to bits before dividing by H_y (bits);
# mixing the two units understates every FI score by a factor of ln(2).
mi_scores_bits = np.asarray(mi_scores) / np.log(2)

# FI = fraction of the target's entropy explained by each feature.
fi_scores = mi_scores_bits / H_y

for name, fi in zip(X.columns, fi_scores):
    print(f"{name}: FI = {fi:.4f}")



SPEI1: FI = 0.0046
SPEI3: FI = 0.0153
SPEI6: FI = 0.0378
SPEI12: FI = 0.0746
SPEI24: FI = 0.0691
SPEI60: FI = 0.0355
SPEI2: FI = 0.0097
SPEI9: FI = 0.0619
SPEI36: FI = 0.0590
SPEI48: FI = 0.0460
SPEI72: FI = 0.0300
SPI1: FI = 0.0004
SPI3: FI = 0.0049
SPI6: FI = 0.0220
SPI9: FI = 0.0465
SPI12: FI = 0.0633
SPI24: FI = 0.0599
SPI60: FI = 0.0260
SMP1: FI = 0.0402
SMP3: FI = 0.0526
SMP6: FI = 0.0643
SMP9: FI = 0.0676
SMP12: FI = 0.0634
SMP24: FI = 0.0401
SMP60: FI = 0.0181
SRI1: FI = 0.0158
SRI3: FI = 0.0282
SRI6: FI = 0.0463
SRI9: FI = 0.0615
SRI12: FI = 0.0646
SRI24: FI = 0.0404
SRI60: FI = 0.0127
