feature selection pre filter

In [1]:
# =========================================================
# 01. Data setup + global pre-filtering of covariates
# =========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---------------------------------------------------------
# STEP 1 — Set up inputs
# ---------------------------------------------------------

# Example input file: your SOC sample data joined with covariates
# Each row = sample point; includes coordinates, SOC value, and 75 covariates
# Example columns: ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock', 'cov1', 'cov2', ..., 'cov75']
input_csv = "/Users/inesschwartz/Desktop/final_training_dataset1.csv"
df = pd.read_csv(input_csv)

# Quick check
print(f"Initial number of samples: {len(df)}")
print(f"Initial number of covariates: {df.shape[1] - 4}")  # assuming first 4 cols = id, coords, log_soc_stock

Initial number of samples: 920
Initial number of covariates: 49


In [2]:
df.columns

Index(['log_soc_stock', 'DEM', 'MRRTF', 'MRVBF', 'TWI', 'X_coord', 'Y_coord',
       'aspect', 'bio1', 'bio12', 'bio13', 'bio14', 'bio15', 'bio17', 'bio18',
       'bio19', 'bio2', 'bio3', 'bio4', 'bio5', 'bio6', 'bio7', 'cropland',
       'district', 'ecoforms', 'faosoil_id', 'flow_accumulation', 'gen_curve',
       'grazing', 'hillshade', 'landsurface_forms', 'lithology', 'max_curve',
       'midslope_position', 'min_curve', 'normalized_height', 'pasture',
       'plan_curve', 'precip_sum', 'profile_curve', 'rangeland', 'ridge_level',
       'site_info_id', 'slope', 'slope_height', 'slope_length',
       'standardized_height', 'terrain_surf_texture', 'terrain_surv_conv',
       'tmax', 'total_curve', 'valley_depth', 'year'],
      dtype='object')

In [3]:
##outliers

import numpy as np
import pandas as pd

summary = {}

for col in df.select_dtypes(include=[np.number]).columns:
    data = df[col]

    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    summary[col] = {
        "min": data.min(),
        "max": data.max(),
        "mean": data.mean(),
        "n_outliers_iqr": ((data < lower) | (data > upper)).sum(),
        "lower_bound": lower,
        "upper_bound": upper
    }

outlier_summary = pd.DataFrame(summary).T
print(outlier_summary.sort_values("n_outliers_iqr", ascending=False))


                               min           max          mean  \
ecoforms              0.000000e+00  2.550000e+02  8.959565e+01   
lithology             1.000000e+00  2.550000e+02  6.558696e+00   
flow_accumulation     1.011001e+06  5.814976e+10  4.019374e+08   
bio14                 0.000000e+00  5.000000e+00  2.668495e-01   
landsurface_forms     1.000000e+00  7.000000e+00  2.083696e+00   
total_curve           1.738455e-13  9.945471e-09  2.400502e-10   
bio17                 0.000000e+00  4.500000e+01  4.326930e+00   
profile_curve        -6.646460e-05  6.648619e-05 -9.088714e-07   
plan_curve           -5.025024e-03  6.411329e-03  1.007710e-04   
hillshade             1.804313e+02  2.143575e+02  1.956461e+02   
cropland              0.000000e+00  1.947732e+01  2.910628e+00   
max_curve            -5.007607e-06  9.994370e-05  6.942218e-06   
min_curve            -7.372308e-05  2.322387e-05 -6.363116e-06   
gen_curve            -1.375308e-04  1.581954e-04  1.158205e-06   
slope_leng

In [4]:
# ---------------------------------------------------------
# STEP 2 — Hold out 20% test data (independent validation set)
# ---------------------------------------------------------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save for later use
train_df.to_csv("/Users/inesschwartz/Desktop/model/train_data.csv", index=False)
test_df.to_csv("/Users/inesschwartz/Desktop/model/test_data.csv", index=False)

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")

# ---------------------------------------------------------
# STEP 3 — Global pre-filtering of covariates (with protected variables)
# ---------------------------------------------------------

# Define which columns are covariates
non_covariates = ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock', 'year', 'district']
covariate_cols = [c for c in df.columns if c not in non_covariates]

# List of expert-recommended covariates to KEEP no matter what
protected_vars = [
    "bio12",
    "tmax",
    "TWI",
    "MRVBF",
    "terrain_surf_conv",
    "terrain_surf_texture",
    "normalized_height",
    "slope_height",
    "lithology",
    "bio5",
    "standardized_height",
    "bio7"
]

# --- 3A. Remove near-zero variance predictors ---
var_threshold = 1e-5
variances = train_df[covariate_cols].var()
low_var = variances[variances < var_threshold].index.tolist()
low_var = [v for v in low_var if v not in protected_vars]  # never drop protected
print(f"Removed {len(low_var)} near-zero variance variables.")

filtered_covs = [c for c in covariate_cols if c not in low_var]

# --- 3B. Remove highly correlated variables (|r| > 0.8) ---
corr_matrix = train_df[filtered_covs].corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [col for col in upper_tri.columns if any(upper_tri[col] > 0.8)]
# keep protected variables
to_drop_corr = [v for v in to_drop_corr if v not in protected_vars]
print(f"Removed {len(to_drop_corr)} highly correlated variables (|r| > 0.8).")

filtered_covs = [c for c in filtered_covs if c not in to_drop_corr]

# --- 3C. VIF filtering (relaxed, protected variables never dropped) ---
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df_subset):
    vif_data = pd.DataFrame()
    vif_data["Variable"] = df_subset.columns
    vif_data["VIF"] = [
        variance_inflation_factor(df_subset.values, i)
        for i in range(df_subset.shape[1])
    ]
    return vif_data

X = train_df[filtered_covs].dropna().copy()
vif_threshold = 40  # relaxed threshold for predictive models

vif = calculate_vif(X)

while vif["VIF"].max() > vif_threshold:
    remove_var = vif.loc[vif["VIF"].idxmax(), "Variable"]
    # skip removal if variable is protected
    if remove_var in protected_vars:
        print(f"Skipping removal of protected variable: {remove_var} (VIF = {vif['VIF'].max():.2f})")
        # artificially lower its VIF to break potential infinite loop
        vif.loc[vif['Variable'] == remove_var, 'VIF'] = vif_threshold - 0.1
        continue
    print(f"Removing {remove_var} (VIF = {vif['VIF'].max():.2f})")
    filtered_covs.remove(remove_var)
    vif = calculate_vif(X[filtered_covs])

print(f"Remaining covariates after VIF filtering: {len(filtered_covs)}")

# Ensure all protected variables are included
for v in protected_vars:
    if v not in filtered_covs and v in covariate_cols:
        filtered_covs.append(v)
        print(f"Re-added protected variable: {v}")

print(f"Final covariate count (after protections): {len(filtered_covs)}")

pd.Series(filtered_covs, name="covariate").to_csv("/Users/inesschwartz/Desktop/model/filtered_covariates.csv", index=False)
train_filtered = train_df[non_covariates + filtered_covs]

Training samples: 736, Test samples: 184
Removed 6 near-zero variance variables.
Removed 4 highly correlated variables (|r| > 0.8).
Skipping removal of protected variable: bio7 (VIF = 8993.28)
Removing bio2 (VIF = 7714.14)
Skipping removal of protected variable: tmax (VIF = 3962.73)
Removing hillshade (VIF = 3143.02)
Skipping removal of protected variable: tmax (VIF = 3711.48)
Skipping removal of protected variable: bio5 (VIF = 2907.67)
Removing bio3 (VIF = 723.22)
Skipping removal of protected variable: tmax (VIF = 3095.17)
Skipping removal of protected variable: bio5 (VIF = 2903.71)
Removing bio15 (VIF = 545.85)
Skipping removal of protected variable: bio5 (VIF = 2691.20)
Skipping removal of protected variable: tmax (VIF = 2385.91)
Skipping removal of protected variable: normalized_height (VIF = 287.12)
Skipping removal of protected variable: bio7 (VIF = 265.71)
Skipping removal of protected variable: TWI (VIF = 257.10)
Removing grazing (VIF = 246.82)
Skipping removal of protected va

In [5]:
#double check for nulls
train_filtered.isna().sum().sort_values(ascending=False)

site_info_id            0
cropland                0
terrain_surf_texture    0
standardized_height     0
slope_length            0
slope_height            0
slope                   0
rangeland               0
pasture                 0
normalized_height       0
midslope_position       0
lithology               0
landsurface_forms       0
flow_accumulation       0
faosoil_id              0
ecoforms                0
bio7                    0
X_coord                 0
bio5                    0
bio19                   0
bio18                   0
bio17                   0
bio14                   0
bio12                   0
aspect                  0
TWI                     0
MRVBF                   0
MRRTF                   0
district                0
year                    0
log_soc_stock           0
Y_coord                 0
tmax                    0
dtype: int64

In [6]:
# Save outputs
train_filtered.to_csv("/Users/inesschwartz/Desktop/model/train_filtered.csv", index=False)