feature selection pre filter

In [1]:
# =========================================================
# 01. Data setup + global pre-filtering of covariates
# =========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---------------------------------------------------------
# STEP 1 — Set up inputs
# ---------------------------------------------------------

# Example input file: your SOC sample data joined with covariates
# Each row = sample point; includes coordinates, SOC value, and 75 covariates
# Example columns: ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock', 'cov1', 'cov2', ..., 'cov75']
input_csv = "/Users/inesschwartz/Desktop/final_training_dataset.csv"
df = pd.read_csv(input_csv)

# Quick check
print(f"Initial number of samples: {len(df)}")
print(f"Initial number of covariates: {df.shape[1] - 4}")  # assuming first 4 cols = id, coords, log_soc_stock

Initial number of samples: 909
Initial number of covariates: 74


In [2]:
df.head()

Unnamed: 0,log_soc_stock,site_info_id,X_coord,Y_coord,year,faosoil_id,landsurface_value,litho_value,formation,conv_rangeland_1950,...,slope_height,slope_length,standardized_height,terrain_surf_convexity,terrain_surf_texture,total_curve,twi,valley_depth,valley_index,watershed_basins
0,1.260593,2139.0,637881.888723,8608926.0,1956.0,43.0,2.0,2.0,97.0,0.0,...,79.74369,3000.0,952.486816,49.512749,77.167595,1.178243e-09,5.82737,64.605621,0.0,4340.0
1,1.659299,1927.0,385725.69329,8669325.0,1958.0,120.0,2.0,2.0,123.0,0.0,...,209.757782,3414.213623,50.189114,43.61615,72.120461,8.122644e-10,8.748525,501.486071,0.0,3570.0
2,0.0,17.0,248538.63635,9488118.0,1959.0,8.0,2.0,2.0,66.0,17.13189,...,167.649002,0.0,72.041985,39.723015,71.185776,4.263759e-10,5.255396,260.212341,0.0,7588.0
3,0.972043,1701.0,840008.131292,8731220.0,1963.0,40.0,1.0,1.0,96.0,0.0,...,92.101753,4828.427246,509.570862,44.910591,72.663414,2.168373e-10,7.329942,111.684311,0.0,5267.0
4,1.333861,1934.0,384135.495798,8666721.0,1958.0,120.0,2.0,2.0,123.0,0.0,...,229.889511,0.0,69.800308,45.813786,71.682632,2.845996e-10,5.054585,501.486071,0.0,3570.0


In [7]:


# ---------------------------------------------------------
# STEP 2 — Hold out 20% test data (independent validation set)
# ---------------------------------------------------------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save for later use
train_df.to_csv("/Users/inesschwartz/Desktop/model/train_data.csv", index=False)
test_df.to_csv("/Users/inesschwartz/Desktop/model/test_data.csv", index=False)

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")

# ---------------------------------------------------------
# STEP 3 — Global pre-filtering of covariates (with protected variables)
# ---------------------------------------------------------

# Define which columns are covariates
non_covariates = ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock']
covariate_cols = [c for c in df.columns if c not in non_covariates]

# List of expert-recommended covariates to KEEP no matter what
protected_vars = [
    "annual_precip",
    "tmax_mean",
    "twi",
    "MRVBF",
    "terrain_surf_convexity",
    "terrain_surf_texture",
    "normalized_height"
]

# --- 3A. Remove near-zero variance predictors ---
var_threshold = 1e-5
variances = train_df[covariate_cols].var()
low_var = variances[variances < var_threshold].index.tolist()
low_var = [v for v in low_var if v not in protected_vars]  # never drop protected
print(f"Removed {len(low_var)} near-zero variance variables.")

filtered_covs = [c for c in covariate_cols if c not in low_var]

# --- 3B. Remove highly correlated variables (|r| > 0.8) ---
corr_matrix = train_df[filtered_covs].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [col for col in upper_tri.columns if any(upper_tri[col] > 0.8)]
# keep protected variables
to_drop_corr = [v for v in to_drop_corr if v not in protected_vars]
print(f"Removed {len(to_drop_corr)} highly correlated variables (|r| > 0.8).")

filtered_covs = [c for c in filtered_covs if c not in to_drop_corr]

# --- 3C. VIF filtering (relaxed, protected variables never dropped) ---
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df_subset):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = df_subset.columns
    vif_data["VIF"] = [
        variance_inflation_factor(df_subset.values, i)
        for i in range(df_subset.shape[1])
    ]
    return vif_data

X = train_df[filtered_covs].dropna().copy()
vif_threshold = 20  # relaxed threshold for predictive models

vif = calculate_vif(X)

while vif["VIF"].max() > vif_threshold:
    remove_var = vif.loc[vif["VIF"].idxmax(), "Variable"]
    # skip removal if variable is protected
    if remove_var in protected_vars:
        print(f"Skipping removal of protected variable: {remove_var} (VIF = {vif['VIF'].max():.2f})")
        # artificially lower its VIF to break potential infinite loop
        vif.loc[vif['Variable'] == remove_var, 'VIF'] = vif_threshold - 0.1
        continue
    print(f"Removing {remove_var} (VIF = {vif['VIF'].max():.2f})")
    filtered_covs.remove(remove_var)
    vif = calculate_vif(X[filtered_covs])

print(f"Remaining covariates after VIF filtering: {len(filtered_covs)}")

# Ensure all protected variables are included
for v in protected_vars:
    if v not in filtered_covs and v in covariate_cols:
        filtered_covs.append(v)
        print(f"Re-added protected variable: {v}")

print(f"Final covariate count (after protections): {len(filtered_covs)}")

# Save outputs
pd.Series(filtered_covs, name="covariate").to_csv("/Users/inesschwartz/Desktop/model/filtered_covariates.csv", index=False)
train_filtered = train_df[non_covariates + filtered_covs]
train_filtered.to_csv("/Users/inesschwartz/Desktop/model/train_filtered.csv", index=False)

print("\n✅ Pre-filtering complete! Protected variables retained:")
print(protected_vars)

Training samples: 727, Test samples: 182
Removed 9 near-zero variance variables.
Removed 21 highly correlated variables (|r| > 0.8).
Removing year (VIF = 16758.97)
Removing max_temp_warmest_month (VIF = 3729.61)
Removing hillshade (VIF = 2351.14)
Skipping removal of protected variable: tmax_mean (VIF = 1341.61)
Removing mean_temp_coldest_quarter (VIF = 845.74)
Removing grazing_1950 (VIF = 570.07)
Skipping removal of protected variable: tmax_mean (VIF = 525.95)
Removing isothermality (VIF = 361.79)
Skipping removal of protected variable: tmax_mean (VIF = 358.40)
Skipping removal of protected variable: normalized_height (VIF = 271.55)
Skipping removal of protected variable: annual_precip (VIF = 186.32)
Removing precip_wettest_month (VIF = 178.46)
Skipping removal of protected variable: tmax_mean (VIF = 356.57)
Skipping removal of protected variable: normalized_height (VIF = 262.38)
Skipping removal of protected variable: terrain_surf_convexity (VIF = 142.82)
Skipping removal of protected