<a href="https://colab.research.google.com/github/kharlescim/ERT_Project/blob/main/FI_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

quick code to test which features are most informative for USDM

In [1]:
!pip install xarray netCDF4
%pip install bottleneck

Collecting xarray
  Downloading xarray-2025.7.1-py3-none-any.whl.metadata (12 kB)
Collecting netCDF4
  Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netCDF4)
  Downloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading xarray-2025.7.1-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m121.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cft

In [3]:

import pandas as pd
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
import collections

import xarray as xr
import numpy as np

usdm_ds = xr.open_dataset('USDM05_2000_2024.nc')
spei_ds = xr.open_dataset('spei_obs_3D.nc')
obs_ds = xr.open_dataset('obs.nc')

USDM = usdm_ds['USDM']

# Creating new time coordinate
weekly_time = usdm_ds.time.values

# Converting to weekly (method = linear)
spei_weekly = spei_ds.interp(time=weekly_time, method="linear")
obs_weekly = obs_ds.interp(time=weekly_time, method="linear")

# Function to convert raw values to percentiles
# missing values = -999 in obs - might need to alter for proper percentile (7-4)
def to_percentile(ds, dim='time', missing_val = -999.0):

    valid = ds.where(ds != missing_val)
    # Convert each grid point's time series to percentile values.
    return valid.rank(dim=dim, pct=True)

percentiles_spei = to_percentile(spei_weekly)
percentiles_obs = to_percentile(obs_weekly)

# ens = 1, so safe to ignore it from dataset
# testing flattening out entire dataset
spei_df = percentiles_spei.to_dataframe().reset_index()
usdm_df = USDM.to_dataframe().reset_index()
obs_df = (percentiles_obs.to_dataframe().reset_index()).drop(columns=['ens'])
merged_df = pd.merge(spei_df, obs_df, on=['time', 'lat', 'lon'], how='inner')
merged_df = pd.merge(merged_df, usdm_df, on=['time', 'lat', 'lon'], how='inner')

# cleaning up NaN entries
df = merged_df.dropna().copy()

# changing -1 classification to 5 to work with featurespace
df["USDM"] = df["USDM"].replace(-1, 5)

from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler


# Extract features and target
X = df.drop(columns=["USDM", "time", "lat", "lon"])
y = df["USDM"]

# Scale inputs to [0, 1] if needed (helps MI estimator)
X_scaled = pd.DataFrame(MinMaxScaler().fit_transform(X), columns=X.columns)

# Compute MI
mi_scores = mutual_info_classif(X_scaled, y, discrete_features=False, random_state=42)

from sklearn.metrics import mutual_info_score
from scipy.stats import entropy

# Calculate entropy of the target
target_probs = df["USDM"].value_counts(normalize=True).values
H_y = entropy(target_probs, base=2)  # bits

fi_scores = mi_scores / H_y

for name, fi in zip(X.columns, fi_scores):
    print(f"{name}: FI = {fi:.4f}")



SPEI1: FI = 0.0239
SPEI3: FI = 0.0592
SPEI6: FI = 0.0778
SPEI12: FI = 0.0812
SPEI24: FI = 0.0573
SPEI60: FI = 0.0231
SPEI2: FI = 0.0451
SPEI9: FI = 0.0829
SPEI36: FI = 0.0438
SPEI48: FI = 0.0320
SPEI72: FI = 0.0206
SPI1: FI = 0.0016
SPI3: FI = 0.0284
SPI6: FI = 0.0602
SPI9: FI = 0.0713
SPI12: FI = 0.0745
SPI24: FI = 0.0519
SPI60: FI = 0.0166
SMP1: FI = 0.0633
SMP3: FI = 0.0779
SMP6: FI = 0.0690
SMP9: FI = 0.0581
SMP12: FI = 0.0480
SMP24: FI = 0.0263
SMP60: FI = 0.0093
SRI1: FI = 0.0332
SRI3: FI = 0.0593
SRI6: FI = 0.0635
SRI9: FI = 0.0598
SRI12: FI = 0.0536
SRI24: FI = 0.0282
SRI60: FI = 0.0064


In [5]:
fi_df = (
    pd.DataFrame({'feature': X.columns, 'fi': fi_scores})
      .sort_values('fi', ascending=False)      # highest first
      .reset_index(drop=True)
)

print(fi_df)

   feature        fi
0    SPEI9  0.082912
1   SPEI12  0.081244
2     SMP3  0.077915
3    SPEI6  0.077818
4    SPI12  0.074513
5     SPI9  0.071336
6     SMP6  0.069040
7     SRI6  0.063462
8     SMP1  0.063260
9     SPI6  0.060166
10    SRI9  0.059781
11    SRI3  0.059325
12   SPEI3  0.059228
13    SMP9  0.058059
14  SPEI24  0.057324
15   SRI12  0.053650
16   SPI24  0.051876
17   SMP12  0.048050
18   SPEI2  0.045057
19  SPEI36  0.043776
20    SRI1  0.033166
21  SPEI48  0.031953
22    SPI3  0.028436
23   SRI24  0.028221
24   SMP24  0.026305
25   SPEI1  0.023884
26  SPEI60  0.023054
27  SPEI72  0.020588
28   SPI60  0.016559
29   SMP60  0.009324
30   SRI60  0.006359
31    SPI1  0.001580
