**Variable Screening and Transformation**

Goals:
- remove redundant covariates
- stabilize distributions
- produce a single candidate covariate set that is identical for all models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [2]:
# Read the training table from disk and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer="/Users/inesschwartz/Desktop/training_data.csv",
)
df.head(n=5)

Unnamed: 0,site_info_id,X_coord,Y_coord,profile,district,MRVBF,RLD,aspect,aspect_cos,aspect_sin,...,precip_warmest_quarter,precip_wettest_month,precip_wettest_quarter32733,temp_annual_range,temp_seasonality32733,landsurface_value,formation,faosoil_id,litho_value,log_soc_stock
0,2770,12.161278,-15.222598,1_57,Namibe,3.154941,56.44121,193.738,0.002505,-0.099684,...,47,24,50,15.4,25.000542,1,204,131.0,1,0.123792
1,48,12.575774,-4.866985,1_59,Cabinda,1.30931,87.68109,167.89279,-0.00529,0.026958,...,525,196,525,13.382994,18.245573,5,1,112.0,5,1.344501
2,881,17.081955,-9.274587,1_63,Malanje,-0.007585,226.94301,152.63528,0.142316,0.283865,...,542,227,588,19.820923,4.635531,2,113,62.0,2,1.309453
3,2698,13.455059,-14.977228,10_55,Huila,1.245921,71.276505,188.58809,-0.412049,-0.074023,...,163,177,438,20.374357,17.226534,2,113,62.0,2,1.263857
4,2139,16.269295,-12.580465,100_56,Huambo,3.043891,95.82083,188.93633,0.271031,-0.047712,...,322,250,654,21.457123,15.427565,2,97,43.0,2,1.260593


In [3]:
# Remove obviously redundant covariates:
#    - constant columns (a single unique value carries no information)
#    - columns that are perfectly correlated (|r| = 1) with an earlier column

# Drop columns with at most one unique value (nunique() ignores NaN by default).
constant_cols = [c for c in df.columns if df[c].nunique() <= 1]
df_reduced = df.drop(columns=constant_cols)

# Absolute pairwise correlations between the numeric columns.
# NOTE(review): this covers every numeric column in df_reduced, not only
# covariates (e.g. IDs, coordinates, the response) — confirm that is intended.
corr_matrix = df_reduced.corr(numeric_only=True).abs()

# Keep only the strict upper triangle so each pair is inspected once; the
# later column of a pair is therefore the one that gets flagged.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Correlation coefficients are floating-point results, so a truly collinear
# pair can come out as 0.9999999999999999 rather than exactly 1.0. Compare
# with a small absolute tolerance instead of `== 1.0`. Masked (NaN) cells
# never compare as close, so the lower triangle is ignored.
perfect_corr_tol = 1e-9
is_perfect = np.isclose(
    upper.to_numpy(dtype=float), 1.0, rtol=0.0, atol=perfect_corr_tol
)
perfect_corr = upper.columns[is_perfect.any(axis=0)].tolist()

# Columns you want to retain even if flagged as perfectly correlated
keep_cols = [
    "aspect_sin",
    "TWI",
    "annual_mean_temp",
    "temp_annual_range",
    "temp_seasonality32733"
]

# Drop perfectly correlated columns except the ones you want to keep
perfect_corr_to_drop = [col for col in perfect_corr if col not in keep_cols]
df_reduced = df_reduced.drop(columns=perfect_corr_to_drop)


print("Dropped constant cols:", constant_cols)
print("Dropped perfectly correlated cols:", perfect_corr_to_drop)


Dropped constant cols: []
Dropped perfectly correlated cols: ['roughness', 'max_temp_warmest_month', 'mean_temp_driest_quarter', 'mean_temp_warmest_quarter', 'mean_temp_wettest_quarter', 'min_temp_coldest_month']


In [5]:
# ----------------------------
# 2. Numeric columns
# ----------------------------
numeric_cols = df_reduced.select_dtypes(include=[np.number]).columns
if len(numeric_cols) == 0:
    # plt.subplots() cannot build a grid with zero rows; fail with a clear
    # message instead of an obscure matplotlib error.
    raise ValueError("df_reduced has no numeric columns to plot")

# ----------------------------
# 3. Output folder
# ----------------------------
out_dir = Path("/Users/inesschwartz/Desktop/training_eda_histograms")
out_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 4. Plot histograms in a grid
# ----------------------------
n_cols = 4  # columns in grid
n_rows = int(np.ceil(len(numeric_cols) / n_cols))

# squeeze=False always returns a 2-D array of Axes, so flatten() is valid for
# any grid shape.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows), squeeze=False
)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    # Plot from df_reduced, the frame the column list was derived from, so the
    # figure does not depend on whatever `df` is bound to when the cell runs.
    sns.histplot(df_reduced[col], kde=True, bins=30, ax=ax, color="steelblue")
    ax.set_title(col, fontsize=10)
    ax.tick_params(axis='both', labelsize=8)

# Remove the unused subplots at the end of the grid
for ax in axes[len(numeric_cols):]:
    fig.delaxes(ax)

fig.tight_layout()
fig.savefig(out_dir / "histograms_grid.png", dpi=150)
plt.close(fig)
print(f"Histograms saved to: {out_dir}")

Histograms saved to: /Users/inesschwartz/Desktop/training_eda_histograms


In [33]:
# Display the column labels currently present in df_reduced.
df_reduced.columns

Index(['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'MRVBF',
       'RLD', 'aspect', 'aspect_cos', 'aspect_sin', 'DEM', 'flow_accumulation',
       'relief', 'ridge_levels', 'slope', 'TWI', 'valleydepth',
       'mean_temp_coldest_quarter', 'annual_mean_temp', 'annual_precip',
       'isothermality', 'precip_coldest_quarter', 'precip_driest_month',
       'precip_driest_quarter', 'precip_seasonality', 'precip_warmest_quarter',
       'precip_wettest_month', 'precip_wettest_quarter32733',
       'temp_annual_range', 'temp_seasonality32733', 'landsurface_value',
       'formation', 'faosoil_id', 'litho_value', 'log_soc_stock'],
      dtype='object')

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# ----------------------------
# 2. List topographic / DEM-derived columns
# ----------------------------
topo_cols = [
    "MRVBF",
    "RLD",
    "aspect",
    "aspect_cos",
    "aspect_sin",
    "slope",
    "TWI",
    "valleydepth",
    "ridge_levels",
    "DEM",
    "flow_accumulation"
]

# Fail early and name every missing column at once, rather than hitting a
# KeyError on the first one part-way through the loop below.
missing_topo_cols = [col for col in topo_cols if col not in df_reduced.columns]
if missing_topo_cols:
    raise KeyError(f"topo_cols not found in df_reduced: {missing_topo_cols}")

# ----------------------------
# 3. Replace known NoData placeholders with NaN
# ----------------------------
# NOTE(review): values such as 9999 or -999 might be legitimate in some of
# these columns (e.g. flow_accumulation) — confirm they are sentinels in
# every listed column.
nodata_values = [-99999, -9999, -999, 99999, 9999]  # extend if needed
# This overwrites the columns of df_reduced in place.
for col in topo_cols:
    df_reduced[col] = df_reduced[col].replace(nodata_values, np.nan)

# ----------------------------
# 4. Clip outliers based on percentiles
# ----------------------------
def clip_outliers(series, lower_pct=1, upper_pct=99):
    """Clip a numeric Series to a percentile range computed from its own values.

    Parameters
    ----------
    series : pandas.Series
        Values to clip. NaN entries are ignored when computing the bounds and
        stay NaN in the result.
    lower_pct, upper_pct : float
        Percentiles (0-100) used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.Series
        A new Series with values below the lower percentile raised to it and
        values above the upper percentile lowered to it. If the input has no
        non-NaN values, an unmodified copy is returned.
    """
    valid = series.dropna()
    if valid.empty:
        # No data to derive bounds from; skip np.percentile on an empty input.
        return series.copy()
    # One call computes both bounds from the same NaN-free values.
    lower, upper = np.percentile(valid, [lower_pct, upper_pct])
    return series.clip(lower, upper)

# Clip every topographic column to its 1st-99th percentile range.
# This overwrites the columns of df_reduced in place.
for col in topo_cols:
    df_reduced[col] = clip_outliers(df_reduced[col], lower_pct=1, upper_pct=99)

# ----------------------------
# 5. Optional: plot histograms after clipping
# ----------------------------
out_dir = Path("/Users/inesschwartz/Desktop/topo_histograms")
out_dir.mkdir(parents=True, exist_ok=True)

for col in topo_cols:
    fig, ax = plt.subplots(figsize=(5, 4))
    # Plot the clipped values held in df_reduced. `df` still contains the
    # unclipped data, which would contradict the "(clipped)" title.
    sns.histplot(df_reduced[col].dropna(), bins=30, kde=True, color='steelblue', ax=ax)
    ax.set_title(f"Histogram of {col} (clipped)")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(out_dir / f"hist_{col}.png", dpi=150)
    # Close each figure so the loop does not accumulate open figures.
    plt.close(fig)

print(f"Clipping done. Histograms saved in: {out_dir}")

# ----------------------------
# 6. Save cleaned dataset
# ----------------------------
clean_csv_path = "/Users/inesschwartz/Desktop/training_data_topo_cleaned.csv"
df_reduced.to_csv(clean_csv_path, index=False)
print(f"Cleaned CSV saved: {clean_csv_path}")


Clipping done. Histograms saved in: /Users/inesschwartz/Desktop/topo_histograms
Cleaned CSV saved: /Users/inesschwartz/Desktop/training_data_topo_cleaned.csv


In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# ----------------------------
# 1. Load training data
# ----------------------------
# Reads the CSV written by the topographic cleaning step.
# NOTE: this rebinds `df`, which earlier in the notebook held the raw
# training table.
csv_path = "/Users/inesschwartz/Desktop/training_data_topo_cleaned.csv"
df = pd.read_csv(csv_path)

# ----------------------------
# 2. List bioclimatic variables
# ----------------------------
bioclim_cols = [
    'mean_temp_coldest_quarter', 'annual_mean_temp', 'annual_precip',
    'isothermality', 'precip_coldest_quarter',
    'precip_driest_month', 'precip_driest_quarter', 'precip_seasonality',
    'precip_warmest_quarter', 'precip_wettest_month',
    'precip_wettest_quarter32733', 'temp_annual_range',
    'temp_seasonality32733'
]

# ----------------------------
# 3. Replace extreme placeholder values with NaN
# ----------------------------
# Values outside [-1e5, 1e5] are treated as placeholders. These bounds are a
# coarse sentinel filter, far wider than any plausible temperature or
# precipitation value; they are not a physical range check.
# Vectorised mask: in-range values are kept, everything else (including
# existing NaN) becomes NaN.
for col in bioclim_cols:
    df[col] = df[col].where(df[col].between(-1e5, 1e5))

# ----------------------------
# 4. Function for percentile-based clipping
# ----------------------------
def clip_percentiles(series, lower=1, upper=99):
    """Clip a numeric Series to the [lower, upper] percentile range of its values.

    Same computation as `clip_outliers` defined earlier in this notebook;
    consider consolidating the two.

    Parameters
    ----------
    series : pandas.Series
        Values to clip. NaN entries are excluded when computing the bounds
        and remain NaN in the result.
    lower, upper : float
        Percentiles (0-100) used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.Series
        A new, clipped Series. If the input has no non-NaN values, an
        unmodified copy is returned.
    """
    series_clean = series.dropna()
    if series_clean.empty:
        # Nothing to compute bounds from; skip np.percentile on empty input.
        return series.copy()
    lower_val, upper_val = np.percentile(series_clean, [lower, upper])
    return series.clip(lower_val, upper_val)

# ----------------------------
# 5. Plot and save histograms
# ----------------------------
out_dir = Path("/Users/inesschwartz/Desktop/bioclim_histograms")
out_dir.mkdir(parents=True, exist_ok=True)

for col in bioclim_cols:
    # The clipped copy is used for the plot only; df itself is not modified.
    clipped = clip_percentiles(df[col])

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.histplot(clipped, bins=30, kde=True, color='darkgreen', ax=ax)
    ax.set_title(f"Histogram of {col} (clipped 1-99%)")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    fig.tight_layout()
    fig.savefig(out_dir / f"hist_{col}.png", dpi=150)
    plt.close(fig)

print(f"Bioclimatic histograms saved in: {out_dir}")

# ----------------------------
# 6. Save cleaned dataset
# ----------------------------
# NOTE(review): the frame written here holds the bioclim columns *without*
# percentile clipping (clipping above only affects the histograms), unlike
# the topographic step, which writes clipped values — confirm this is intended.
clean_csv_path = "/Users/inesschwartz/Desktop/training_data_bioclim_cleaned.csv"
df.to_csv(clean_csv_path, index=False)
print(f"Cleaned bioclim dataset saved: {clean_csv_path}")


Bioclimatic histograms saved in: /Users/inesschwartz/Desktop/bioclim_histograms
Cleaned bioclim dataset saved: /Users/inesschwartz/Desktop/training_data_bioclim_cleaned.csv


In [37]:
# Display the column labels of the frame currently bound to df.
df.columns

Index(['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'MRVBF',
       'RLD', 'aspect', 'aspect_cos', 'aspect_sin', 'DEM', 'flow_accumulation',
       'relief', 'ridge_levels', 'slope', 'TWI', 'valleydepth',
       'mean_temp_coldest_quarter', 'annual_mean_temp', 'annual_precip',
       'isothermality', 'precip_coldest_quarter', 'precip_driest_month',
       'precip_driest_quarter', 'precip_seasonality', 'precip_warmest_quarter',
       'precip_wettest_month', 'precip_wettest_quarter32733',
       'temp_annual_range', 'temp_seasonality32733', 'landsurface_value',
       'formation', 'faosoil_id', 'litho_value', 'log_soc_stock'],
      dtype='object')