Feature selection pre-filter

In [1]:
# =========================================================
# 01. Data setup + global pre-filtering of covariates
# =========================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---------------------------------------------------------
# STEP 1 — Set up inputs
# ---------------------------------------------------------

# Example input file: SOC sample data joined with covariates.
# Each row = one sample point; the table holds an id, coordinates, the SOC
# target (log_soc_stock) and the covariate columns, in no guaranteed order.
input_csv = "/Users/inesschwartz/Desktop/final_training_dataset1.csv"
df = pd.read_csv(input_csv)

# Identifier / coordinate / target columns that are NOT covariates.
# Counting by name (instead of subtracting a fixed 4) stays correct even if
# one of these columns is absent or the column order changes.
id_target_cols = ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock']
n_initial_covariates = sum(col not in id_target_cols for col in df.columns)

# Quick check
print(f"Initial number of samples: {len(df)}")
print(f"Initial number of covariates: {n_initial_covariates}")

Initial number of samples: 907
Initial number of covariates: 62


In [2]:
# Display every column name of the loaded table
df.columns

Index(['log_soc_stock', 'MRRTF', 'MRVBF', 'X_coord', 'Y_coord',
       'annual_mean_temp', 'annual_precip', 'aspect', 'aspect_cos',
       'aspect_sin', 'conv_rangeland_1950', 'conv_rangeland_1960',
       'cropland_1950', 'cropland_1960', 'dem_1km_utm33s', 'faosoil_id',
       'flow_accumulation', 'flow_directions', 'flowline_curve', 'formation',
       'general_curve', 'grazing_1950', 'hill_height', 'hillshade',
       'hillslope_index', 'isothermality', 'landsurface_value',
       'length_slope_factor', 'litho_value', 'max_curve',
       'max_temp_warmest_month', 'max_temp_warmest_month.1',
       'mean_temp_coldest_quarter', 'midslope_position', 'min_curve',
       'normalized_height', 'pasture_1950', 'pasture_1960', 'plan_curve',
       'precip_coldest_quarter', 'precip_driest_month',
       'precip_driest_quarter', 'precip_seasonality', 'precip_sum',
       'precip_warmest_quarter', 'precip_wettest_month', 'profile_curve',
       'rangeland_1950', 'rangeland_1960', 'relief_TRI', 

In [3]:
# ---------------------------------------------------------
# STEP 2 — Hold out 20% of the samples as an independent test set
# ---------------------------------------------------------
# Fixed random_state so the same rows land in each partition on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both partitions to disk so later steps can reload them
for split_frame, split_path in (
    (train_df, "/Users/inesschwartz/Desktop/model/train_data.csv"),
    (test_df, "/Users/inesschwartz/Desktop/model/test_data.csv"),
):
    split_frame.to_csv(split_path, index=False)

print(f"Training samples: {len(train_df)}, Test samples: {len(test_df)}")

# ---------------------------------------------------------
# STEP 3 — Global pre-filtering of covariates (protected variables are exempt)
# ---------------------------------------------------------

# Everything that is not an id, a coordinate or the target counts as a covariate
non_covariates = ['site_info_id', 'X_coord', 'Y_coord', 'log_soc_stock']
covariate_cols = [name for name in df.columns if name not in non_covariates]

# Expert-recommended covariates that no filter below is allowed to drop
protected_vars = [
    "annual_precip", "tmax_mean_mean", "twi", "MRVBF",
    "terrain_surf_convexity", "terrain_surf_texture",
    "normalized_height", "slope_height", "litho_value",
    "max_temp_warmest_month", "standardized_height", "temp_annual_range",
]

# --- 3A. Near-zero variance filter (statistics from the training split only) ---
var_threshold = 1e-5
variances = train_df[covariate_cols].var()
low_var = [
    name
    for name, variance in variances.items()
    if variance < var_threshold and name not in protected_vars
]
print(f"Removed {len(low_var)} near-zero variance variables.")

filtered_covs = [name for name in covariate_cols if name not in low_var]

# --- 3B. Pairwise correlation filter (|r| > 0.8) ---
corr_matrix = train_df[filtered_covs].corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and a
# column is compared solely against the columns that precede it.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(strict_upper_mask)
exceeds_threshold = (upper_tri > 0.8).any(axis=0)
# NOTE(review): a protected column flagged here is kept together with the
# earlier column it correlates with, so |r| > 0.8 pairs can survive this step.
to_drop_corr = [
    name
    for name, flagged in exceeds_threshold.items()
    if flagged and name not in protected_vars
]
print(f"Removed {len(to_drop_corr)} highly correlated variables (|r| > 0.8).")

filtered_covs = [name for name in filtered_covs if name not in to_drop_corr]

# --- 3C. VIF filtering (relaxed, protected variables never dropped) ---
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calculate_vif(df_subset):
    """Compute the variance inflation factor (VIF) of every column of ``df_subset``.

    A column of ones is prepended to the predictor matrix before it is handed
    to statsmodels' ``variance_inflation_factor``. That function regresses one
    column on all the others exactly as given and does not add an intercept
    itself, so without the constant the auxiliary regressions are uncentred
    and the resulting VIFs are distorted. The constant's own VIF is not
    reported.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        One column per predictor; values must be convertible to float.
        Presumably free of missing values (the caller applies ``dropna``).

    Returns
    -------
    pandas.DataFrame
        Two columns, "Variable" and "VIF", with one row per column of
        ``df_subset`` in the original column order.
    """
    predictors = df_subset.to_numpy(dtype=float)
    # Design matrix layout: [intercept | predictor_0 | predictor_1 | ...]
    design = np.column_stack([np.ones(len(df_subset)), predictors])
    # Start at 1 to skip the intercept column added above
    vif_values = [
        variance_inflation_factor(design, position)
        for position in range(1, design.shape[1])
    ]
    return pd.DataFrame({"Variable": list(df_subset.columns), "VIF": vif_values})

# Rows with a missing value in any remaining covariate are left out of the VIF step
X = train_df[filtered_covs].dropna().copy()
vif_threshold = 20  # relaxed threshold for predictive models

vif = calculate_vif(X)

# Repeatedly drop the worst offender until nothing exceeds the threshold.
worst_vif = vif["VIF"].max()
while worst_vif > vif_threshold:
    remove_var = vif.loc[vif["VIF"].idxmax(), "Variable"]
    if remove_var not in protected_vars:
        print(f"Removing {remove_var} (VIF = {worst_vif:.2f})")
        filtered_covs.remove(remove_var)
        # VIFs depend on the full set, so recompute after every removal
        vif = calculate_vif(X[filtered_covs])
    else:
        print(f"Skipping removal of protected variable: {remove_var} (VIF = {worst_vif:.2f})")
        # Protected: keep the variable but overwrite its VIF just below the
        # threshold so the next pass moves on to another candidate instead
        # of selecting this one forever.
        vif.loc[vif["Variable"] == remove_var, "VIF"] = vif_threshold - 0.1
    worst_vif = vif["VIF"].max()

print(f"Remaining covariates after VIF filtering: {len(filtered_covs)}")

# Safety net: put back any protected covariate that exists in the data but
# is no longer in the selection
for protected_name in protected_vars:
    already_kept = protected_name in filtered_covs
    if already_kept or protected_name not in covariate_cols:
        continue
    filtered_covs.append(protected_name)
    print(f"Re-added protected variable: {protected_name}")

print(f"Final covariate count (after protections): {len(filtered_covs)}")

# Persist the selected covariate names and the reduced training table
selected_covariates = pd.Series(filtered_covs, name="covariate")
selected_covariates.to_csv("/Users/inesschwartz/Desktop/model/filtered_covariates.csv", index=False)
train_filtered = train_df[non_covariates + filtered_covs]
train_filtered.to_csv("/Users/inesschwartz/Desktop/model/train_filtered.csv", index=False)

print("\n✅ Pre-filtering complete! Protected variables retained:")
print(protected_vars)

Training samples: 725, Test samples: 182
Removed 7 near-zero variance variables.
Removed 6 highly correlated variables (|r| > 0.8).


  vif = 1. / (1. - r_squared_i)


Removing annual_mean_temp (VIF = inf)


  vif = 1. / (1. - r_squared_i)


Skipping removal of protected variable: max_temp_warmest_month (VIF = inf)
Skipping removal of protected variable: temp_annual_range (VIF = inf)
Remaining covariates after VIF filtering: 48
Final covariate count (after protections): 48

✅ Pre-filtering complete! Protected variables retained:
['annual_precip', 'tmax_mean_mean', 'twi', 'MRVBF', 'terrain_surf_convexity', 'terrain_surf_texture', 'normalized_height', 'slope_height', 'litho_value', 'max_temp_warmest_month', 'standardized_height', 'temp_annual_range']


In [4]:
# Sanity check: missing-value count per column, largest first
null_counts = train_filtered.isna().sum()
null_counts.sort_values(ascending=False)

site_info_id              0
X_coord                   0
normalized_height         0
pasture_1950              0
pasture_1960              0
precip_coldest_quarter    0
precip_driest_month       0
precip_driest_quarter     0
precip_seasonality        0
precip_sum                0
precip_warmest_quarter    0
precip_wettest_month      0
rangeland_1950            0
relief_TRI                0
slope_height              0
slope_length              0
standardized_height       0
temp_annual_range         0
terrain_surf_convexity    0
terrain_surf_texture      0
tmax_mean_mean            0
tot_irri_1950             0
twi                       0
valley_depth              0
valley_index              0
midslope_position         0
max_temp_warmest_month    0
litho_value               0
cropland_1950             0
Y_coord                   0
log_soc_stock             0
MRRTF                     0
MRVBF                     0
annual_precip             0
aspect                    0
aspect_cos          